In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Imputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
mydata = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
                      header = None)

In [3]:
#rename columns based off column names from original dataset
mydata.columns = ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12',
                  'A13', 'A14', 'A15', 'Y']

In [4]:
#replace any missing data with "Nan" value instead of "?"
mydata = mydata.replace('?', np.nan)

In [5]:
mydata.shape

(690, 16)

In [6]:
#drop data rows that are missing multiple feature values
mydata = mydata.drop(mydata.index[[206, 270, 330, 456, 479, 592, 601]])

In [7]:
mydata.shape

(683, 16)

In [8]:
mydata.isnull().sum()

A1     10
A2     12
A3      0
A4      1
A5      1
A6      2
A7      2
A8      0
A9      0
A10     0
A11     0
A12     0
A13     0
A14     8
A15     0
Y       0
dtype: int64

In [9]:
#A2 and A14 are continuous features that will be scaled.  Change from object data type to float
mydata['A2'] = mydata.A2.astype(float)
mydata['A14'] = mydata.A14.astype(float)

In [10]:
#remaining Nan values will not affect outcome if dropped
mydata = mydata.dropna()

In [11]:
mydata.isnull().sum()

A1     0
A2     0
A3     0
A4     0
A5     0
A6     0
A7     0
A8     0
A9     0
A10    0
A11    0
A12    0
A13    0
A14    0
A15    0
Y      0
dtype: int64

In [12]:
mydata.head(100)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,Y
0,b,30.83,0.000,u,g,w,v,1.250,t,t,1,f,g,202.0,0,+
1,a,58.67,4.460,u,g,q,h,3.040,t,t,6,f,g,43.0,560,+
2,a,24.50,0.500,u,g,q,h,1.500,t,f,0,f,g,280.0,824,+
3,b,27.83,1.540,u,g,w,v,3.750,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.710,t,f,0,f,s,120.0,0,+
5,b,32.08,4.000,u,g,m,v,2.500,t,f,0,t,g,360.0,0,+
6,b,33.17,1.040,u,g,r,h,6.500,t,f,0,t,g,164.0,31285,+
7,a,22.92,11.585,u,g,cc,v,0.040,t,f,0,f,g,80.0,1349,+
8,b,54.42,0.500,y,p,k,h,3.960,t,f,0,f,g,180.0,314,+
9,b,42.50,4.915,y,p,w,v,3.165,t,f,0,t,g,52.0,1442,+


In [13]:
mydata.dtypes

A1      object
A2     float64
A3     float64
A4      object
A5      object
A6      object
A7      object
A8     float64
A9      object
A10     object
A11      int64
A12     object
A13     object
A14    float64
A15      int64
Y       object
dtype: object

In [14]:
mydata['A4'].value_counts()

u    499
y    152
l      2
Name: A4, dtype: int64

In [15]:
#create seperate dataframes for continuous features and categorical features
mydataCats = mydata[['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13', 'Y']]
mydataNum = mydata[['A2', 'A3', 'A8', 'A11', 'A14', 'A15']]

In [16]:
#use labelencoder to convert character and string features to numerical 
mydataCats = mydataCats.apply(LabelEncoder().fit_transform)

In [17]:
#use onehotencoder to create dummy variables
onehotencoder = OneHotEncoder(categorical_features = "all")
mydataCats = onehotencoder.fit_transform(mydataCats).toarray()

In [18]:
#convert from numpy array back to dataframe
mydataCats = pd.DataFrame(mydataCats)

In [19]:
#rename newly constructed dummy variable features
mydataCats = mydataCats.rename(index=str, columns={0: "A1_a", 1: "A1_b", 2:"A2_l", 3: 'A2_u',
                                                  4: 'A2_y', 5: 'A5_g', 6: 'A5_gg', 7: 'A5_p',
                                                  8: 'A6_aa', 9: 'A6_c', 10: 'A6_cc', 11: 'A6_d',
                                                  12: 'A6_e', 13: 'A6_ff', 14: 'A6_i', 15: 'A6_j',
                                                  16: 'A6_k', 17: 'A6_m', 18: 'A6_q', 19: 'A6_r', 
                                                  20: 'A6_w', 21: 'A6_x', 22: 'A7_bb', 23: 'A7_dd', 
                                                  24: 'A7_ff', 25: 'A7_h', 26: 'A7_j', 27: 'A7_n',
                                                  28: 'A7_o', 29: 'A7_v', 30: 'A7_z', 31: 'A9_f',
                                                  32: 'A9_t', 33: 'A10_f', 34: 'A10_t', 35: 'A12_f',
                                                  36: 'A12_t', 37: 'A13_g', 38: 'A13_p', 39: 'A13_s',
                                                  40: 'y_+', 41: 'y_-'})

In [20]:
mydataCats.head(100)

Unnamed: 0,A1_a,A1_b,A2_l,A2_u,A2_y,A5_g,A5_gg,A5_p,A6_aa,A6_c,...,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s,y_+,y_-
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
6,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
9,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


In [21]:
#drop one dummy variable for each feature to simplify and increase interpretability 
mydataCats = mydataCats.drop(columns=['A1_b', 'A9_f', 'A10_f', 'A12_f', 'y_-'])

In [22]:
mydataCats.shape

(653, 37)

In [23]:
#apply standard scaler to numerical features to account for difference in Euclidian distance between 
#non-standardized X features. 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
mydataNum = sc.fit_transform(mydataNum)

In [24]:
#convert numpy array back to dataframe
mydataNum = pd.DataFrame(mydataNum)

In [25]:
mydataNum.head()

Unnamed: 0,0,1,2,3,4,5
0,-0.056962,-0.96144,-0.295171,-0.302596,0.128682,-0.193125
1,2.296536,-0.073565,0.236217,0.704516,-0.816802,-0.086443
2,-0.592078,-0.861903,-0.220955,-0.504019,0.592504,-0.03615
3,-0.310572,-0.654865,0.44699,0.503093,-0.477855,-0.192553
4,-0.958122,0.158358,-0.158613,-0.504019,-0.358926,-0.193125


In [26]:
mydataNum = mydataNum.rename(index=str, columns={0:'A2', 1:'A3', 2:'A8', 3:'A11', 
                                                 4:'A14', 5:'A15'})

In [27]:
mydataNum.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15
0,-0.056962,-0.96144,-0.295171,-0.302596,0.128682,-0.193125
1,2.296536,-0.073565,0.236217,0.704516,-0.816802,-0.086443
2,-0.592078,-0.861903,-0.220955,-0.504019,0.592504,-0.03615
3,-0.310572,-0.654865,0.44699,0.503093,-0.477855,-0.192553
4,-0.958122,0.158358,-0.158613,-0.504019,-0.358926,-0.193125


In [28]:
#concatenate two dataframes now that dummy variable creating and feature scaling is complete
CreditData = pd.concat([mydataNum, mydataCats], axis = 1)

In [29]:
CreditData_X = CreditData.iloc[:, :-1].values
CreditData_y = CreditData.iloc[:, 42].values

In [30]:
CreditData.head()

Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A2_l,A2_u,A2_y,...,A7_o,A7_v,A7_z,A9_t,A10_t,A12_t,A13_g,A13_p,A13_s,y_+
0,-0.056962,-0.96144,-0.295171,-0.302596,0.128682,-0.193125,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1,2.296536,-0.073565,0.236217,0.704516,-0.816802,-0.086443,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,-0.592078,-0.861903,-0.220955,-0.504019,0.592504,-0.03615,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,-0.310572,-0.654865,0.44699,0.503093,-0.477855,-0.192553,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
4,-0.958122,0.158358,-0.158613,-0.504019,-0.358926,-0.193125,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(CreditData_X,CreditData_y, test_size = .2,
                                                   random_state = 23)

In [32]:
#define fucntion to run a baseline score for classifiers using default hyperparameter values
def scores(models, X, y):
    for model in models:
        y_pred = model.predict(X)
        acc = accuracy_score(y, y_pred)
        print("Accuracy Score: {0:0.2f} %".format(acc * 100))

In [33]:
#classification models used with default hyperparameters
models = [LogisticRegression(random_state=23),
          KNeighborsClassifier(),
          GaussianNB(),
          SVC(random_state = 23),
          RandomForestClassifier(random_state = 23),]

In [34]:
#fit each model using training data
for model in models:
    model.fit(X_train, y_train)

In [35]:
#training scores for each model using default hyperparameters
scores(models, X_train, y_train)

Accuracy Score: 88.31 %
Accuracy Score: 85.63 %
Accuracy Score: 71.84 %
Accuracy Score: 85.25 %
Accuracy Score: 99.62 %


In [36]:
#test scores for each model using default hyperparameters
scores(models, X_test, y_test)

Accuracy Score: 92.37 %
Accuracy Score: 86.26 %
Accuracy Score: 63.36 %
Accuracy Score: 92.37 %
Accuracy Score: 87.79 %


In [37]:
#create dictionary of parameters used for hypertuning
LR_grid_parameters = {'C': [0.1, .01, 1, 10, 50, 100], 'penalty': ['l1', 'l2'], 'random_state': [23]}
KNN_grid_parameters = {'n_neighbors': [3, 5, 10, 20], 'weights': ['uniform', 'distance']}
SVM_poly_grid_parameters = {'kernel': ['poly'], 'degree': [1, 2, 3, 4], 'gamma': [.5, .2, .05, .01, .001],
                            'C': [.01, 0.1, .5, 1, 10, 50], 'random_state': [23]}
SVM_grid_parameters = {'kernel': ['linear', 'rbf', 'sigmoid'], 'gamma': [.5, .2, .05, .01, .001],
                       'C': [.01, 0.1, .5, 1, 10, 50], 'random_state': [23]}
RF_grid_parameters = {'n_estimators': [10, 50, 100], 'criterion': ['gini', 'entropy'],
               'max_depth': [None, 10, 25, 50], 'random_state': [23]}

In [38]:
#implement GridSearch to iterate through dictionary and select best cross-validated training score
LR_grid = GridSearchCV(estimator = LogisticRegression(), param_grid = LR_grid_parameters, 
                        scoring = 'accuracy')
LR_grid.fit(X_train, y_train)
print("Best score: %0.3f" % LR_grid.best_score_)
print("Best parameters set:")
best_parameters=LR_grid.best_estimator_.get_params()
for param_name in sorted(LR_grid_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.852
Best parameters set:
	C: 1
	penalty: 'l1'
	random_state: 23


In [39]:
LR_grid.grid_scores_



[mean: 0.85057, std: 0.04181, params: {'C': 0.1, 'penalty': 'l1', 'random_state': 23},
 mean: 0.84674, std: 0.03469, params: {'C': 0.1, 'penalty': 'l2', 'random_state': 23},
 mean: 0.56130, std: 0.00138, params: {'C': 0.01, 'penalty': 'l1', 'random_state': 23},
 mean: 0.80077, std: 0.01408, params: {'C': 0.01, 'penalty': 'l2', 'random_state': 23},
 mean: 0.85249, std: 0.03025, params: {'C': 1, 'penalty': 'l1', 'random_state': 23},
 mean: 0.85249, std: 0.03793, params: {'C': 1, 'penalty': 'l2', 'random_state': 23},
 mean: 0.84291, std: 0.02760, params: {'C': 10, 'penalty': 'l1', 'random_state': 23},
 mean: 0.84483, std: 0.02859, params: {'C': 10, 'penalty': 'l2', 'random_state': 23},
 mean: 0.84483, std: 0.02494, params: {'C': 50, 'penalty': 'l1', 'random_state': 23},
 mean: 0.84483, std: 0.02494, params: {'C': 50, 'penalty': 'l2', 'random_state': 23},
 mean: 0.84291, std: 0.02426, params: {'C': 100, 'penalty': 'l1', 'random_state': 23},
 mean: 0.84291, std: 0.02426, params: {'C': 100, 

In [40]:
KNN_grid = GridSearchCV(estimator = KNeighborsClassifier(), param_grid = KNN_grid_parameters, 
                        scoring = 'accuracy')
KNN_grid.fit(X_train, y_train)
print("Best score: %0.3f" % KNN_grid.best_score_)
print("Best parameters set:")
best_parameters=KNN_grid.best_estimator_.get_params()
for param_name in sorted(KNN_grid_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.816
Best parameters set:
	n_neighbors: 20
	weights: 'distance'


In [41]:
KNN_grid.grid_scores_



[mean: 0.77395, std: 0.02495, params: {'weights': 'uniform', 'n_neighbors': 3},
 mean: 0.77395, std: 0.02570, params: {'weights': 'distance', 'n_neighbors': 3},
 mean: 0.80460, std: 0.04132, params: {'weights': 'uniform', 'n_neighbors': 5},
 mean: 0.80077, std: 0.04368, params: {'weights': 'distance', 'n_neighbors': 5},
 mean: 0.78927, std: 0.02262, params: {'weights': 'uniform', 'n_neighbors': 10},
 mean: 0.81226, std: 0.02273, params: {'weights': 'distance', 'n_neighbors': 10},
 mean: 0.81034, std: 0.03759, params: {'weights': 'uniform', 'n_neighbors': 20},
 mean: 0.81609, std: 0.02994, params: {'weights': 'distance', 'n_neighbors': 20}]

In [42]:
SVM_poly_grid = GridSearchCV(estimator = SVC(), param_grid = SVM_poly_grid_parameters, 
                             scoring = 'accuracy')
SVM_poly_grid.fit(X_train, y_train)
print("Best score: %0.3f" % SVM_poly_grid.best_score_)
print("Best parameters set:")
best_parameters=SVM_poly_grid.best_estimator_.get_params()
for param_name in sorted(SVM_poly_grid_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.877
Best parameters set:
	C: 0.1
	degree: 2
	gamma: 0.5
	kernel: 'poly'
	random_state: 23


In [43]:
SVM_poly_grid.grid_scores_



[mean: 0.76245, std: 0.02218, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.5, 'random_state': 23, 'degree': 1},
 mean: 0.64559, std: 0.02546, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.2, 'random_state': 23, 'degree': 1},
 mean: 0.56130, std: 0.00138, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.05, 'random_state': 23, 'degree': 1},
 mean: 0.56130, std: 0.00138, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.01, 'random_state': 23, 'degree': 1},
 mean: 0.56130, std: 0.00138, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.001, 'random_state': 23, 'degree': 1},
 mean: 0.85249, std: 0.01614, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.5, 'random_state': 23, 'degree': 2},
 mean: 0.71456, std: 0.01301, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.2, 'random_state': 23, 'degree': 2},
 mean: 0.56513, std: 0.00378, params: {'kernel': 'poly', 'C': 0.01, 'gamma': 0.05, 'random_state': 23, 'degree': 2},
 mean: 0.56130, std: 0.00138, params: {'kernel': 'poly', 'C': 0.01,

In [44]:
SVM_grid = GridSearchCV(estimator = SVC(), param_grid = SVM_grid_parameters, 
                             scoring = 'accuracy')
SVM_grid.fit(X_train, y_train)
print("Best score: %0.3f" % SVM_grid.best_score_)
print("Best parameters set:")
best_parameters=SVM_grid.best_estimator_.get_params()
for param_name in sorted(SVM_grid_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.866
Best parameters set:
	C: 10
	gamma: 0.05
	kernel: 'rbf'
	random_state: 23


In [45]:
RF_grid = GridSearchCV(estimator = RandomForestClassifier(), param_grid = RF_grid_parameters, 
                             scoring = 'accuracy')
RF_grid.fit(X_train, y_train)
print("Best score: %0.3f" % RF_grid.best_score_)
print("Best parameters set:")
best_parameters=RF_grid.best_estimator_.get_params()
for param_name in sorted(RF_grid_parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Best score: 0.885
Best parameters set:
	criterion: 'entropy'
	max_depth: 10
	n_estimators: 100
	random_state: 23


In [46]:
RF_grid.grid_scores_



[mean: 0.86398, std: 0.01417, params: {'n_estimators': 10, 'max_depth': None, 'criterion': 'gini', 'random_state': 23},
 mean: 0.88314, std: 0.02852, params: {'n_estimators': 50, 'max_depth': None, 'criterion': 'gini', 'random_state': 23},
 mean: 0.87931, std: 0.02836, params: {'n_estimators': 100, 'max_depth': None, 'criterion': 'gini', 'random_state': 23},
 mean: 0.85632, std: 0.01734, params: {'n_estimators': 10, 'max_depth': 10, 'criterion': 'gini', 'random_state': 23},
 mean: 0.88123, std: 0.02894, params: {'n_estimators': 50, 'max_depth': 10, 'criterion': 'gini', 'random_state': 23},
 mean: 0.87931, std: 0.02165, params: {'n_estimators': 100, 'max_depth': 10, 'criterion': 'gini', 'random_state': 23},
 mean: 0.86398, std: 0.01417, params: {'n_estimators': 10, 'max_depth': 25, 'criterion': 'gini', 'random_state': 23},
 mean: 0.88314, std: 0.02852, params: {'n_estimators': 50, 'max_depth': 25, 'criterion': 'gini', 'random_state': 23},
 mean: 0.87931, std: 0.02836, params: {'n_estima

In [48]:
#overall it appears that the RandomForest classifier generalizes the training set and test set the most
#accurately.  We'll do one more measurement and get an accuracy score for the hypertuned model against 
#the test set 

RF_2 = RandomForestClassifier(criterion = 'entropy', max_depth = 10, n_estimators = 100, random_state = 23)
RF_2.fit(X_train, y_train)
RF_2.score(X_train, y_train)

0.9865900383141762

In [49]:
RF_2.score(X_test, y_test)

0.9083969465648855