In [1]:
from scipy.io import arff
import pandas as pd
import numpy as np

In [2]:
# Read the ARFF file; the first element of the returned value holds the
# data records, which become the rows of the DataFrame.
data = arff.loadarff('4year.arff')
records = data[0]
df = pd.DataFrame(records)

In [3]:
# Turn the raw byte-string label into a boolean target (True when the label
# is b'1'), removing the original 'class' column in the same step.
df['bankruptcy'] = df.pop('class') == b'1'

# Rename the 64 feature columns to X01..X64; the target stays last.
feature_names = ['X%02d' % idx for idx in range(1, 65)]
df.columns = feature_names + ['bankruptcy']

In [4]:
# Replace every missing value with the mean of its column.
# NOTE(review): the means are taken over the whole frame, before the
# train/test split, so held-out rows influence the imputed values.
df = df.fillna(df.mean())

### Split Dataset

In [5]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows, stratified on the target, with a fixed seed so
# the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

### Standardization

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaling to both splits.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

### PCA

In [7]:
from sklearn.decomposition import PCA

In [8]:
# Fit a 3-component PCA on the standardized training data, then project the
# standardized test data with the same fitted components.
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

### Training Model: Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Train logistic regression on the PCA-reduced training data; fit() returns
# the estimator itself, so construction and training are chained.
lr = LogisticRegression(random_state=1).fit(X_train_pca, y_train)



In [10]:
# Accuracy of the logistic model on the held-out PCA features.
lr_test_accuracy = lr.score(X_test_pca, y_test)
print('Logistic Test Accuracy: %.3f' % lr_test_accuracy)

Logistic Test Accuracy: 0.946


### Training Model: Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited Gini tree trained on the PCA-reduced training features.
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=4,
    random_state=1,
).fit(X_train_pca, y_train)

tree_test_accuracy = tree.score(X_test_pca, y_test)
print('DecisionTree Test accuracy:', tree_test_accuracy)

DecisionTree Test accuracy: 0.9472430224642614


### Training Model: SVM

In [12]:
from sklearn.svm import SVC

# C is the penalty on misclassified training points: the larger C is, the
# less misclassification the fitted model tolerates.
svm = SVC(kernel='linear', C=1, random_state=1).fit(X_train_pca, y_train)

svm_test_accuracy = svm.score(X_test_pca, y_test)
print('SVM Test accuracy:', svm_test_accuracy)

SVM Test accuracy: 0.9472430224642614


### Training Model Using Pipeline

In [13]:
from sklearn.pipeline import make_pipeline

# Chain scaling, 3-component PCA and logistic regression so the whole
# sequence is fitted on the raw training features in one call.
lr_steps = (
    StandardScaler(),
    PCA(n_components=3),
    LogisticRegression(random_state=1),
)
pipe_lr = make_pipeline(*lr_steps)
pipe_lr.fit(X_train, y_train)

pipe_lr_accuracy = pipe_lr.score(X_test, y_test)
print('Logistic Test Accuracy: %.3f' % pipe_lr_accuracy)

Logistic Test Accuracy: 0.946




In [14]:
# Scaling and 3-component PCA feeding a depth-4 Gini decision tree, fitted
# end to end on the raw training features.
tree_steps = (
    StandardScaler(),
    PCA(n_components=3),
    DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1),
)
pipe_tree = make_pipeline(*tree_steps)
pipe_tree.fit(X_train, y_train)

pipe_tree_accuracy = pipe_tree.score(X_test, y_test)
print('Tree Test Accuracy: %.3f' % pipe_tree_accuracy)

Tree Test Accuracy: 0.947


In [15]:
# Scaling and 3-component PCA feeding a linear-kernel SVM, fitted end to end
# on the raw training features.
pipe_svm = make_pipeline(StandardScaler(),
                        PCA(n_components=3),
                        SVC(kernel='linear', C=1, random_state=1))

pipe_svm.fit(X_train, y_train)
# Score the SVM pipeline itself; scoring pipe_tree here would only repeat
# the decision-tree result under the SVM label.
print('SVM Test Accuracy: %.3f' % pipe_svm.score(X_test, y_test))

SVM Test Accuracy: 0.947


### Grid Search using SVM

In [16]:
from sklearn.model_selection import GridSearchCV

# Candidate values shared by C and gamma.
param_range = [1, 2]
# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned
# over C and gamma. The 'svc__' prefix routes each setting to the pipeline's
# SVC step.
param_grid = [{'svc__C': param_range, 
               'svc__kernel': ['linear']},
              {'svc__C': param_range, 
               'svc__gamma': param_range, 
               'svc__kernel': ['rbf']}]

# 10-fold cross-validated search over pipe_svm, scored by accuracy;
# n_jobs=-1 runs the candidate fits in parallel.
gs = GridSearchCV(estimator=pipe_svm, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=-1)
# Fit on the raw training features: pipe_svm standardizes and applies PCA
# itself, so passing X_train_pca would run both steps a second time on
# already-reduced data.
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.9474759264662971
{'svc__C': 1, 'svc__kernel': 'linear'}
