<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#PCA-using-pipeline" data-toc-modified-id="PCA-using-pipeline-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>PCA using pipeline</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Grid Search</a></span><ul class="toc-item"><li><span><a href="#LogisticRegression" data-toc-modified-id="LogisticRegression-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>LogisticRegression</a></span></li><li><span><a href="#Support-Vector-Machine" data-toc-modified-id="Support-Vector-Machine-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Support Vector Machine</a></span></li><li><span><a href="#Decision-Tree" data-toc-modified-id="Decision-Tree-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Decision Tree</a></span></li></ul></li></ul></div>

# Homework 3: PCA/Hyperparameter/CV
Data source: http://archive.ics.uci.edu/ml/datasets/Polish+companies+bankruptcy+data

In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
data = arff.loadarff('./4year.arff')
df = pd.DataFrame(data[0])

In [3]:
df['bankruptcy'] = (df['class']==b'1')
df.drop(columns=['class'], inplace=True)
df.columns = ['X{0:02d}'.format(k) for k in range(1,65)] + ['bankruptcy']

In [4]:
df.describe()

Unnamed: 0,X01,X02,X03,X04,X05,X06,X07,X08,X09,X10,...,X55,X56,X57,X58,X59,X60,X61,X62,X63,X64
count,9791.0,9791.0,9791.0,9749.0,9771.0,9791.0,9791.0,9773.0,9792.0,9791.0,...,9792.0,9771.0,9791.0,9776.0,9791.0,9178.0,9760.0,9771.0,9749.0,9561.0
mean,0.043019,0.596404,0.130959,8.1366,64.65164,-0.059273,0.059446,19.884016,1.882296,0.38904,...,7686.33,-0.992263,0.035022,1.133287,0.856053,118.156064,25.19443,2015.157,8.660813,35.949619
std,0.359321,4.587122,4.559074,290.647281,14759.39,6.812754,0.533344,698.697015,17.67465,4.590299,...,76052.61,77.007971,8.945365,8.038201,26.393305,3230.316692,1099.260821,117146.1,60.838202,483.318623
min,-12.458,0.0,-445.91,-0.045319,-379460.0,-486.82,-12.458,-1.8482,-0.032371,-445.91,...,-713220.0,-7522.1,-597.42,-30.892,-284.38,0.0,-12.656,-14965.0,-0.02439,-1.5e-05
25%,0.001321,0.263145,0.020377,1.047,-51.217,-0.000578,0.003004,0.4283,1.006675,0.29444,...,21.84,0.003121,0.008768,0.885722,0.0,5.356325,4.2677,43.234,2.9388,2.0129
50%,0.041364,0.46774,0.19929,1.5918,-0.055576,0.0,0.04882,1.0887,1.1613,0.51045,...,950.33,0.043679,0.098026,0.958305,0.002129,9.482,6.28355,74.729,4.8489,4.0416
75%,0.11113,0.689255,0.41067,2.8804,55.732,0.065322,0.12694,2.691,1.970225,0.71429,...,4694.55,0.11717,0.24268,0.996163,0.21179,19.506,9.9382,123.345,8.3638,9.4135
max,20.482,446.91,22.769,27146.0,1034100.0,322.2,38.618,53209.0,1704.8,12.602,...,6123700.0,112.02,226.76,668.75,1661.0,251570.0,108000.0,10779000.0,5662.4,21153.0


In [5]:
sum(df.bankruptcy == True)

515

In [6]:
df.fillna(df.mean(), inplace=True)
df.isna().sum()
X_imp = df.values

In [7]:
from sklearn.model_selection import train_test_split
X, y = X_imp[:, :-1], X_imp[:, -1].astype('int')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [8]:
import sklearn.preprocessing as skpre
stdsc = skpre.StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)
X_test_std = stdsc.transform(X_test)
print(X_test_std.shape)

(7833, 64)
(1959, 64)


## PCA using pipeline

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [10]:
pipe_lr = make_pipeline(PCA(n_components=3), LogisticRegression(penalty='l1', C=1, solver='liblinear'))
pipe_lr.fit(X_train_std, y_train)
pipe_lr.score(X_test_std, y_test)

0.9464012251148545

In [11]:
pipe_svm = make_pipeline(PCA(n_components=3), LinearSVC(C=0.5))
pipe_svm.fit(X_train_std, y_train)
pipe_svm.score(X_test_std, y_test)



0.9464012251148545

In [12]:
pipe_dt = make_pipeline(PCA(n_components=3), DecisionTreeClassifier(max_depth=5))
pipe_dt.fit(X_train_std, y_train)
pipe_dt.score(X_test_std, y_test)

0.9458907605921388

## Grid Search 

`GridSearchCV`'s default cross-validation splitting strategy is 5-fold cross validation.

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix

In [14]:
X_train_std = PCA(n_components=3).fit_transform(X_train_std)
X_test_std = PCA(n_components=3).fit_transform(X_test_std)

### LogisticRegression

In [15]:
clf = LogisticRegression(solver='liblinear')
grid_values = {'penalty': ['l1', 'l2'],'C':[1e-7,1e-6,1e-5,1e-4,1e-3,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1',n_jobs=-1)
grid_clf_acc.fit(X_train_std, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_std)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))
print('Best estimator : '+ str(grid_clf_acc.best_estimator_))

confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.9469116896375702
Precision Score : 0.3333333333333333
Recall Score : 0.009708737864077669
F1 Score : 0.018867924528301886
Best estimator : LogisticRegression(C=1e-07, solver='liblinear')


array([[1854,    2],
       [ 102,    1]], dtype=int64)

### Support Vector Machine

In [16]:
clf = LinearSVC()
grid_values = {'C':[1e-7,1e-6,1e-5,1e-4,1e-3,.009,0.01,.09,1,5,10,25]}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1',n_jobs=-1)
grid_clf_acc.fit(X_train_std, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_std)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))
print('Best estimator : '+ str(grid_clf_acc.best_estimator_))

confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.9469116896375702
Precision Score : 0.3333333333333333
Recall Score : 0.009708737864077669
F1 Score : 0.018867924528301886
Best estimator : LinearSVC(C=1e-06)


array([[1854,    2],
       [ 102,    1]], dtype=int64)

### Decision Tree

In [17]:
clf = DecisionTreeClassifier(criterion="entropy")
grid_values = {'max_depth':np.arange(1,50)}
grid_clf_acc = GridSearchCV(clf, param_grid = grid_values,scoring = 'f1',n_jobs=-1)
grid_clf_acc.fit(X_train_std, y_train)

y_pred_acc = grid_clf_acc.predict(X_test_std)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))
print('Best estimator : '+ str(grid_clf_acc.best_estimator_))

confusion_matrix(y_test,y_pred_acc)

Accuracy Score : 0.6850433894844309
Precision Score : 0.08006535947712418
Recall Score : 0.47572815533980584
F1 Score : 0.13706293706293707
Best estimator : DecisionTreeClassifier(criterion='entropy', max_depth=33)


array([[1293,  563],
       [  54,   49]], dtype=int64)