In [3]:
import pandas as pd
import numpy as np


In [9]:
train=pd.read_csv("train.csv",sep="\t")
test=pd.read_csv("test.csv",sep="\t")

In [10]:
list(train)

['Form_ID',
 'School_type',
 'Soil_type',
 'Area',
 'Budget',
 'Population_density',
 'Latency',
 'Disability',
 'Property_Area',
 'Target']

In [11]:
train.head(10)

Unnamed: 0,Form_ID,School_type,Soil_type,Area,Budget,Population_density,Latency,Disability,Property_Area,Target
0,LP001002,0.0,0,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,0.0,1,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,0.0,0,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,0.0,0,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,0.0,0,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,0.0,2,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,0.0,0,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,0.0,3+,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,0.0,2,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,0.0,1,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [12]:
train.dtypes

Form_ID                object
School_type           float64
Soil_type              object
Area                    int64
Budget                float64
Population_density    float64
Latency               float64
Disability            float64
Property_Area          object
Target                 object
dtype: object

In [13]:
train.describe()

Unnamed: 0,School_type,Area,Budget,Population_density,Latency,Disability
count,601.0,614.0,614.0,592.0,600.0,564.0
mean,0.186356,5403.459283,1621.245798,146.412162,342.0,0.842199
std,0.389718,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,0.0,150.0,0.0,9.0,12.0,0.0
25%,0.0,2877.5,0.0,100.0,360.0,1.0
50%,0.0,3812.5,1188.5,128.0,360.0,1.0
75%,0.0,5795.0,2297.25,168.0,360.0,1.0
max,1.0,81000.0,41667.0,700.0,480.0,1.0


In [14]:
#DATA CLEANING AND PREPROCESSING
#Find missing values
train.isnull().sum()
test.isnull().sum()

#Impute missing values with mean (numerical variables)
train.fillna(train.mean(),inplace=True) 
train.isnull().sum() 

#Test data
test.fillna(test.mean(),inplace=True) 
test.isnull().sum()

Form_ID                0
School_type            0
Soil_type             10
Area                   0
Budget                 0
Population_density     0
Latency                0
Disability             0
Property_Area          0
dtype: int64

In [16]:
train.School_type.fillna(train.School_type.mode()[0],inplace=True)
train.Soil_type.fillna(train.Soil_type.mode()[0],inplace=True) 


In [17]:
test.School_type.fillna(test.School_type.mode()[0],inplace=True)
test.Soil_type.fillna(test.Soil_type.mode()[0],inplace=True) 

In [18]:
#Treatment of outliers
train.Latency=np.log(train.Latency)


In [19]:
#PREDICTIVE MODELLING
#Remove Form_ID variable - Irrelevant
train=train.drop('Form_ID',axis=1)
test=test.drop('Form_ID',axis=1)


In [20]:
#Create target variable
X=train.drop('Target',1)
y=train.Target

In [21]:
#Build dummy variables for categorical variables
X=pd.get_dummies(X)
train=pd.get_dummies(train)
test=pd.get_dummies(test)

In [22]:
#Split train data for cross validation
from sklearn.model_selection import train_test_split
x_train,x_cv,y_train,y_cv = train_test_split(X,y,test_size=0.2)

In [24]:
#(a)LOGISTIC REGRESSION ALGORITHM
#Fit model
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(x_train,y_train)

#Predict values for cv data
pred_cv=model.predict(x_cv)

#Evaluate accuracy of model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
print("Accuracy: ", accuracy_score(y_cv,pred_cv))
matrix=confusion_matrix(y_cv,pred_cv)
print(matrix)




Accuracy:  0.8292682926829268
[[19 20]
 [ 1 83]]




In [27]:
#(b)DECISION TREE ALGORITHM
#Fit model
from sklearn import tree
dt=tree.DecisionTreeClassifier(criterion='gini')
dt.fit(x_train,y_train)

#Predict values for cv data
pred_cv1=dt.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv1))
matrix1=confusion_matrix(y_cv,pred_cv1)
print(matrix1)




Accuracy:  0.7154471544715447
[[21 18]
 [17 67]]


In [28]:
#(c)RANDOM FOREST ALGORITHM
#Fit model
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
rf.fit(x_train,y_train)

#Predict values for cv data
pred_cv2=rf.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv2))
matrix2=confusion_matrix(y_cv,pred_cv2)
print(matrix2)




Accuracy:  0.7642276422764228
[[22 17]
 [12 72]]




In [29]:
#(d)SUPPORT VECTOR MACHINE (SVM) ALGORITHM
from sklearn import svm
svm_model=svm.SVC()
svm_model.fit(x_train,y_train)

#Predict values for cv data
pred_cv3=svm_model.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv3))
matrix3=confusion_matrix(y_cv,pred_cv3)
print(matrix3)




Accuracy:  0.6829268292682927
[[ 0 39]
 [ 0 84]]




In [30]:
#(e)NAIVE BAYES ALGORITHM
from sklearn.naive_bayes import GaussianNB 
nb=GaussianNB()
nb.fit(x_train,y_train)

#Predict values for cv data
pred_cv4=nb.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv4))
matrix4=confusion_matrix(y_cv,pred_cv4)
print(matrix4)




Accuracy:  0.8130081300813008
[[20 19]
 [ 4 80]]


In [31]:
#(f)K-NEAREST NEIGHBOR(kNN) ALGORITHM
from sklearn.neighbors import KNeighborsClassifier
kNN=KNeighborsClassifier()
kNN.fit(x_train,y_train)

#Predict values for cv data
pred_cv5=kNN.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv5))
matrix5=confusion_matrix(y_cv,pred_cv5)
print(matrix5)




Accuracy:  0.6504065040650406
[[ 9 30]
 [13 71]]


In [32]:
#(g) GRADIENT BOOSTING MACHINE ALGORITHM
from sklearn.ensemble import GradientBoostingClassifier
gbm=GradientBoostingClassifier()
gbm.fit(x_train,y_train)

#Predict values for cv data
pred_cv6=gbm.predict(x_cv)

#Evaluate accuracy of model
print("Accuracy: ", accuracy_score(y_cv,pred_cv6))
matrix6=confusion_matrix(y_cv,pred_cv6)
print(matrix6)



Accuracy:  0.7967479674796748
[[19 20]
 [ 5 79]]


In [33]:
#Predict values using test data (Naive Bayes)
pred_test=nb.predict(test)

#Write test results in csv file
predictions=pd.DataFrame(pred_test, columns=['predictions']).to_csv('Credit_Predictions.csv')
