# Preamble and Datasets

In [36]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.decomposition import PCA

In [37]:
data = pd.read_csv('cardiac_arrhythmia.csv')


In [38]:
data.head()

Unnamed: 0,75,0,190,80,91,193,371,174,121,-16,...,0.150,9,-0.9,0.151,0.152,0.9.3,2.9.1,23.3,49.4,8
0,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0,0.2,2.1,20.4,38.8,6
1,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0,0.3,3.4,12.3,49.0,10
2,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0,0.4,2.6,34.6,61.6,1
3,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0,-0.1,3.9,25.4,62.8,7
4,13,0,169,51,100,167,321,174,91,107,...,-0.6,12.2,-2.8,0.0,0,0.9,2.2,13.5,31.1,14


In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Columns: 280 entries, 75 to 8
dtypes: float64(116), int64(159), object(5)
memory usage: 986.6+ KB


In [40]:
data.describe()

Unnamed: 0,75,0,190,80,91,193,371,174,121,-16,...,0.150,9,-0.9,0.151,0.152,0.9.3,2.9.1,23.3,49.4,8
count,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,...,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0
mean,46.407982,0.552106,166.135255,68.144124,88.915743,155.068736,367.199557,169.940133,89.935698,33.78714,...,-0.279601,9.048115,-1.458537,0.003991,0.0,0.513969,1.218625,19.317295,29.429047,3.871397
std,16.429846,0.49783,37.194646,16.599841,15.381143,44.856534,33.422017,35.67213,25.813912,45.421423,...,0.549328,3.476718,2.004481,0.050173,0.0,0.347441,1.425438,13.517617,18.490566,4.407706
min,0.0,0.0,105.0,6.0,55.0,0.0,232.0,108.0,0.0,-172.0,...,-4.1,0.0,-28.6,0.0,0.0,-0.8,-6.0,-44.2,-38.6,1.0
25%,36.0,0.0,160.0,59.0,80.0,142.0,350.0,148.0,79.0,4.0,...,-0.45,6.6,-2.1,0.0,0.0,0.4,0.5,11.4,17.5,1.0
50%,47.0,1.0,164.0,68.0,86.0,157.0,367.0,162.0,91.0,40.0,...,0.0,8.8,-1.1,0.0,0.0,0.5,1.3,18.1,27.9,1.0
75%,58.0,1.0,170.0,78.5,94.0,174.5,384.0,179.0,102.0,66.0,...,0.0,11.2,0.0,0.0,0.0,0.7,2.1,25.85,41.05,6.0
max,83.0,1.0,780.0,176.0,188.0,524.0,509.0,381.0,205.0,169.0,...,0.0,23.6,0.0,0.8,0.0,2.4,6.0,88.8,115.9,16.0


In [41]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Columns: 280 entries, 75 to 8
dtypes: float64(116), int64(159), object(5)
memory usage: 986.6+ KB


In [42]:
data.head()

Unnamed: 0,75,0,190,80,91,193,371,174,121,-16,...,0.150,9,-0.9,0.151,0.152,0.9.3,2.9.1,23.3,49.4,8
0,56,1,165,64,81,174,401,149,39,25,...,0.0,8.5,0.0,0.0,0,0.2,2.1,20.4,38.8,6
1,54,0,172,95,138,163,386,185,102,96,...,0.0,9.5,-2.4,0.0,0,0.3,3.4,12.3,49.0,10
2,55,0,175,94,100,202,380,179,143,28,...,0.0,12.2,-2.2,0.0,0,0.4,2.6,34.6,61.6,1
3,75,0,190,80,88,181,360,177,103,-16,...,0.0,13.1,-3.6,0.0,0,-0.1,3.9,25.4,62.8,7
4,13,0,169,51,100,167,321,174,91,107,...,-0.6,12.2,-2.8,0.0,0,0.9,2.2,13.5,31.1,14


In [43]:
data.tail()

Unnamed: 0,75,0,190,80,91,193,371,174,121,-16,...,0.150,9,-0.9,0.151,0.152,0.9.3,2.9.1,23.3,49.4,8
446,53,1,160,70,80,199,382,154,117,-37,...,0.0,4.3,-5.0,0.0,0,0.7,0.6,-4.4,-0.5,1
447,37,0,190,85,100,137,361,201,73,86,...,0.0,15.6,-1.6,0.0,0,0.4,2.4,38.0,62.4,10
448,36,0,166,68,108,176,365,194,116,-85,...,0.0,16.3,-28.6,0.0,0,1.5,1.0,-44.2,-33.2,2
449,32,1,155,55,93,106,386,218,63,54,...,-0.4,12.0,-0.7,0.0,0,0.5,2.4,25.0,46.6,1
450,78,1,160,70,79,127,364,138,78,28,...,0.0,10.4,-1.8,0.0,0,0.5,1.6,21.3,32.8,1


### Missing Data
Because question mark '?' is of object type, this value is stringified and replaced with NaN before addressing missing data. 

Both 'ffill' and 'bfill' method are used to fill to cover every cells of NaN.

In [44]:
data[(data.astype(str) == '?')] = np.nan

In [45]:
# Columns with missing data( '?')
null_columns=data.columns[data.isnull().any()] 
data[null_columns].isnull().sum()
null_columns

Index(['13', '64', '-2', '?', '63'], dtype='object')

In [46]:
data= data.fillna( method='ffill') 
data= data.fillna( method='bfill')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Columns: 280 entries, 75 to 8
dtypes: float64(116), int64(159), object(5)
memory usage: 986.6+ KB


In [47]:
# Missing value double-check
null_columns=data.columns[data.isnull().any()] 
data[null_columns].isnull().sum()
null_columns

Index([], dtype='object')

### Data preprocessing

In [48]:
X = data.loc[:, '75':'49.4']
y= data['8']

In [49]:
from sklearn.preprocessing import MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 10)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Part I: GridSeachCV, Bagging, Boosting, Voting Classifier
- Evaluation strategy is GridSearchCV because it is fundamental to choose the best parameters.

## 1) GridSearchCV & Hard Voting ( Pure Estimators)


#### K Nearest Neighbors ( K=5)

In [225]:

param = {'n_neighbors':[1,3,5,7,15,55]} #create dict of parameters

grid_search_knn = GridSearchCV(KNeighborsClassifier(),param, cv=10) # each parameter has a cv of 10 folds.
grid_search_knn.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_knn.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_knn.best_score_)) 
print("Best estimator:\n{}".format(grid_search_knn.best_estimator_))

results = pd.DataFrame(grid_search_knn.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_knn.score(X_train,y_train)
test_score = grid_search_knn.score(X_test,y_test)
print('KNN score( training): {:.6f}'
     .format(train_score))
print('KNN score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'n_neighbors': 5}
Best cross-validation score: 0.57
Best estimator:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
KNN score( training): 0.627219
KNN score (testing): 0.637168




#### Logistic Regression( penalty='l1')

In [60]:

paramlog = {'penalty':['l1','l2']} #create dict of parameters

grid_search_log = GridSearchCV(LogisticRegression(),paramlog, cv=10) # each parameter has a cv of 10 folds.
grid_search_log.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_log.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_log.best_score_)) 
print("Best estimator:\n{}".format(grid_search_log.best_estimator_))

results = pd.DataFrame(grid_search_log.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_log.score(X_train,y_train)
test_score = grid_search_log.score(X_test,y_test)
print('LogReg score( training): {:.6f}'
     .format(train_score))
print('LogReg score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'penalty': 'l1'}
Best cross-validation score: 0.69
Best estimator:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
LogReg score( training): 0.778107
LogReg score (testing): 0.734513




#### Linear SVC 
Best Estimatorswithout Ensemble Method

In [268]:

paramC = {'C':[0.01,0.1,1,10,100]} #create dict of parameters

grid_search_svc = GridSearchCV(LinearSVC(),paramC, cv=10) # each parameter has a cv of 10 folds.
grid_search_svc.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_svc.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_svc.best_score_)) 
print("Best estimator:\n{}".format(grid_search_svc.best_estimator_))

results = pd.DataFrame(grid_search_svc.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_svc.score(X_train,y_train)
test_score = grid_search_svc.score(X_test,y_test)
print('Linear svc score( training): {:.6f}'
     .format(train_score))
print('Linear svc score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'C': 0.1}
Best cross-validation score: 0.70
Best estimator:
LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Linear svc score( training): 0.825444
Linear svc score (testing): 0.761062




#### Kernel SVC

In [267]:

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]} #create dict of parameters

grid_search_rbf = GridSearchCV(SVC(kernel = 'rbf'),param_grid, cv=10) # each parameter has a cv of 10 folds.
grid_search_rbf.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_rbf.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_rbf.best_score_)) 
print("Best estimator:\n{}".format(grid_search_rbf.best_estimator_))

results = pd.DataFrame(grid_search_rbf.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_rbf.score(X_train,y_train)
test_score = grid_search_rbf.score(X_test,y_test)
print('rbf score( training): {:.6f}'
     .format(train_score))
print('rbf score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'C': 100, 'gamma': 0.01}
Best cross-validation score: 0.69
Best estimator:
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
rbf score( training): 0.955621
rbf score (testing): 0.716814




#### Decision tree

In [53]:
depth = {'max_depth':[8,58,108,138,158,208,258,279]} #create dict of parameters
grid_search_dtc = GridSearchCV(dtc(),depth, cv=10) # each parameter has a cv of 10 folds.
grid_search_dtc.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_dtc.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_dtc.best_score_)) 
print("Best estimator:\n{}".format(grid_search_dtc.best_estimator_))

results = pd.DataFrame(grid_search_dtc.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_dtc.score(X_train,y_train)
test_score = grid_search_dtc.score(X_test,y_test)
print('Decision Tree score( training): {:.6f}'
     .format(train_score))
print('Decision Tree score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'max_depth': 8}
Best cross-validation score: 0.66
Best estimator:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Decision Tree score( training): 0.905325
Decision Tree score (testing): 0.690265




#### Random Forest

In [55]:
maxfeat = {'max_features':[8,10,50,75,105,235,255,275,279]} #create dict of parameters

grid_search_rfc = GridSearchCV(rfc(),maxfeat, cv=10) # each parameter has a cv of 10 folds.
grid_search_rfc.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search_rfc.best_params_)) 
print("Best cross-validation score: {:.2f}".format(grid_search_rfc.best_score_)) 
print("Best estimator:\n{}".format(grid_search_rfc.best_estimator_))

results = pd.DataFrame(grid_search_rfc.cv_results_)
scores = np.array(results.mean_test_score)
train_score = grid_search_rfc.score(X_train,y_train)
test_score = grid_search_rfc.score(X_test,y_test)
print('Random Forest score( training): {:.6f}'
     .format(train_score))
print('Random Forest score (testing): {:.6f}'
     .format(test_score))




Best parameters: {'max_features': 235}
Best cross-validation score: 0.74
Best estimator:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=235, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Random Forest score( training): 0.982249
Random Forest score (testing): 0.761062




#### Hard voting
In this section, 6 Estimators are used for final hard voting. 

Soft voting is not included since probability option is unavailable in Linear SVC. Also, It takes too long to calculate in Kernelized SVC. 

Best parameters chosen for each estimator:
- KNN Classifier: k=5
- Logistic Regressor: penalty='l1'
- Linear SVC: C=0.1
- Kernelized SVC: C=100, gamma =0.01
- Decision Tree Classifier: Max_depth=8
- Random Forest Classifier: Max_features= 279 ( may change due to PCA reduction)



In [50]:
knn_clf= KNeighborsClassifier(n_neighbors = 5)
log_clf = LogisticRegression(penalty='l1',random_state=10)
svm_clf = LinearSVC(C=0.1,random_state=10)
rbf_clf= SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10)
dtc_clf= dtc(max_depth=8, random_state=10)
rfc_clf = rfc(max_features= 279, random_state=10)

voting_clf = VotingClassifier(estimators=[('knn',knn_clf),('lr', log_clf), ('svc', svm_clf),
                                          ('rbf',rbf_clf),('dtc',dtc_clf),('rfc', rfc_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(voting_clf.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(voting_clf.score(X_test,y_test)))

Train score: 0.843195
Test score: 0.752212


  if diff:
  if diff:


##  2) Bagging Ensemble( excluding KNN)
Because of the nature of K-nearest neighbor classifier, the sample weight is not supported for Bagging Ensemble.

In [49]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

#### Logistic Regression

In [50]:
bag_log= BaggingClassifier(LogisticRegression(penalty='l1'), 
                           n_estimators=500, max_samples=100, bootstrap=True,oob_score=True, n_jobs=-1, random_state=10)
bag_log.fit(X,y)
print('Train score: {0:0.6f}'.format(bag_log.score(X,y)))
print('Out of bag score: {0:0.6f}'.format(bag_log.oob_score_))

Train score: 0.811530
Out of bag score: 0.709534


### Linear SVC - The Best Estimator with 96% for Out-of-bag Score

In [111]:
bag_svc= BaggingClassifier(LinearSVC(C=0.1, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_svc.fit(X, y)
print('Train score: {0:0.6f}'.format(bag_svc.score(X,y)))
print('Out of bag score: {0:0.6f}'.format(bag_svc.oob_score_)) # Best one so far



Train score: 0.964523
Out of bag score: 0.962306


#### Kernelized SVC

In [109]:
bag_rbf=BaggingClassifier(SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_rbf.fit(X, y)
print('Train score: {0:0.6f}'.format(bag_rbf.score(X,y)))
print('Out of bag score: {0:0.6f}'.format(bag_rbf.oob_score_))

Train score: 0.543237
Out of bag score: 0.543237


#### Decision tree

In [108]:
bag_dtc = BaggingClassifier(dtc(max_depth=8, random_state=10), 
                            n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_dtc.fit(X, y)
print('Train score: {0:0.6f}'.format(bag_dtc.score(X,y)))
print('Out of bag score: {0:0.6f}'.format(bag_dtc.oob_score_))

Train score: 0.824834
Out of bag score: 0.722838


#### Random Forest

In [107]:
bag_random_forest = BaggingClassifier(rfc(max_features= 279, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_random_forest.fit(X, y)
print('Train score: {0:0.6f}'.format(bag_random_forest.score(X,y)))
print('Out of bag score: {0:0.6f}'.format(bag_random_forest.oob_score_))

Train score: 0.789357
Out of bag score: 0.671840


#### Bagging Ensemble: Hard Voting

In [181]:
voting_bag_clf = VotingClassifier(estimators=[('lr',bag_log), ('svc',bag_svc),
                                          ('rbf',bag_rbf),('dtc',bag_dtc),('rfc', bag_random_forest)], voting='hard')
voting_bag_clf.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(voting_bag_clf.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(voting_bag_clf.score(X_test,y_test)))

  if diff:


Train score: 0.766272
Test score: 0.663717


  if diff:


## 3) Boosting Ensemble (excluding KNN)
Because of the nature of K-nearest neighbor classifier, the sample weight is not supported for Boosting Ensemble.

In [115]:
from sklearn.ensemble import AdaBoostClassifier

#### Logistic Regression

In [128]:
ada_log = AdaBoostClassifier(
    LogisticRegression(penalty='l1'), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_log.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(ada_log.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(ada_log.score(X_test,y_test)))

Train score: 0.529586
Test score: 0.584071


#### Linear SVC

In [126]:
ada_svc = AdaBoostClassifier(
    LinearSVC(C=0.1), n_estimators=500,
    algorithm="SAMME", learning_rate=0.5, random_state=10)
ada_svc.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(ada_svc.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(ada_svc.score(X_test,y_test)))

Train score: 0.825444
Test score: 0.761062


#### Kernelized SVC

In [129]:
ada_rbf = AdaBoostClassifier(
    SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10), n_estimators=500,
    algorithm="SAMME", learning_rate=0.5, random_state=10)
ada_rbf.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(ada_rbf.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(ada_rbf.score(X_test,y_test)))

Train score: 0.535503
Test score: 0.584071


#### Decision tree

In [132]:
ada_dtc = AdaBoostClassifier(
    dtc(max_depth=8), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_dtc.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(ada_dtc.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(ada_dtc.score(X_test,y_test)))


Train score: 1.000000
Test score: 0.681416


In [135]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(learning_rate = 0.05)
gb_clf.fit(X_train,y_train)
print('Train score: {0:0.6f}'.format(gb_clf.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(gb_clf.score(X_test,y_test)))
print('Feature Importance:'.format(gb_clf.feature_importances_))

Train score: 1.000000
Test score: 0.769912
Feature Importance:


#### Random Forest

In [133]:
ada_rfc = AdaBoostClassifier(
    rfc(max_features= 279, random_state=10), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_rfc.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(ada_rfc.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(ada_rfc.score(X_test,y_test)))

Train score: 1.000000
Test score: 0.716814


#### Adaboost Ensemble: Hard voting

In [182]:
voting_ada_clf = VotingClassifier(estimators=[('lr',ada_log), ('svc',ada_svc),
                                          ('rbf',ada_rbf),('dtc',ada_dtc),('rfc', ada_rfc)], voting='hard')
voting_ada_clf.fit(X_train, y_train)
print('Train score: {0:0.6f}'.format(voting_ada_clf.score(X_train,y_train)))
print('Test score: {0:0.6f}'.format(voting_ada_clf.score(X_test,y_test)))

  if diff:


Train score: 0.828402
Test score: 0.663717


  if diff:


# Part II: Data Reduction ( Bagging/Boosting/Pure Estimators)

In this section, PCA is calculated Bagging/ Boosting, and Original Estimators. X_train and X_test sets are used for Boosting and Original Estimators. For Bagging Ensemble, X_combined is created for the purpose of generating Out-of-bag set.

In [26]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

print(pca.n_components_)
print(np.sum(pca.explained_variance_ratio_))

82
0.950684113118704


In [27]:
pca = PCA(n_components = 82)
X_reduced = pca.fit_transform(X_train)
X_test_reduced= pca.transform(X_test)

In [30]:
X_combined = np.concatenate((X_reduced, X_test_reduced), axis=0)
X_combined.shape

### 1) Bagging (PCA)


In [36]:
# Logistic Regression
bag_log= BaggingClassifier(LogisticRegression(penalty='l1'), 
                           n_estimators=500, max_samples=100, bootstrap=True,oob_score=True, n_jobs=-1, random_state=10)
bag_log.fit(X_combined,y)
print('Train score: {0:0.6f}'.format(bag_log.score(X_combined,y)))
print('Out of bag score: {0:0.6f}'.format(bag_log.oob_score_))

Train score: 0.543237
Out of bag score: 0.543237


In [37]:
#Linear SVC
bag_svc= BaggingClassifier(LinearSVC(C=0.1, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_svc.fit(X_combined, y)
print('Train score: {0:0.6f}'.format(bag_svc.score(X_combined,y)))
print('Out of bag score: {0:0.6f}'.format(bag_svc.oob_score_)) 

Train score: 0.578714
Out of bag score: 0.578714


In [38]:
#Kernelized SVC
bag_rbf=BaggingClassifier(SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_rbf.fit(X_combined, y)
print('Train score: {0:0.6f}'.format(bag_rbf.score(X_combined,y)))
print('Out of bag score: {0:0.6f}'.format(bag_rbf.oob_score_))

Train score: 0.545455
Out of bag score: 0.543237


In [39]:
#Decision Tree Classifier
bag_dtc = BaggingClassifier(dtc(max_depth=8, random_state=10), 
                            n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_dtc.fit(X_combined, y)
print('Train score: {0:0.6f}'.format(bag_dtc.score(X_combined,y)))
print('Out of bag score: {0:0.6f}'.format(bag_dtc.oob_score_))

Train score: 0.572062
Out of bag score: 0.543237


In [41]:
# Random Forest (features are 82)
bag_random_forest = BaggingClassifier(rfc(max_features= 82, random_state=10), 
                           n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, random_state=10,oob_score=True)
bag_random_forest.fit(X_combined, y)
print('Train score: {0:0.6f}'.format(bag_random_forest.score(X_combined,y)))
print('Out of bag score: {0:0.6f}'.format(bag_random_forest.oob_score_))

Train score: 0.547672
Out of bag score: 0.543237


### 2) Boosting (PCA)

Boosting uses splitted training and testing datasets of predictors( X_train/ X_test)

In [157]:
from sklearn.ensemble import AdaBoostClassifier

In [160]:
# Logistic Regression
ada_log = AdaBoostClassifier(
    LogisticRegression(penalty='l1'), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_log.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(ada_log.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(ada_log.score(X_test_reduced,y_test)))

Train score: 0.529586
Test score: 0.584071


In [161]:
#Linear SVC
ada_svc = AdaBoostClassifier(
    LinearSVC(C=0.1), n_estimators=500,
    algorithm="SAMME", learning_rate=0.5, random_state=10)
ada_svc.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(ada_svc.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(ada_svc.score(X_test_reduced,y_test)))

Train score: 0.801775
Test score: 0.761062


In [165]:
#Kernelized SVC
ada_rbf = AdaBoostClassifier(
    SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10), n_estimators=500,
    algorithm="SAMME", learning_rate=0.5, random_state=10)
ada_rbf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(ada_rbf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(ada_rbf.score(X_test_reduced,y_test)))

Train score: 0.535503
Test score: 0.584071


In [132]:
# Decision tree Classifier
ada_dtc = AdaBoostClassifier(
    dtc(max_depth=8), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_dtc.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(ada_dtc.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(ada_dtc.score(X_test_reduced,y_test)))


Train score: 1.000000
Test score: 0.681416


In [163]:
# Gradient Boosting Classifier( Decision Tree)
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(learning_rate = 0.05)
gb_clf.fit(X_reduced,y_train)
print('Train score: {0:0.6f}'.format(gb_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(gb_clf.score(X_test_reduced,y_test)))
print('Feature Importance:'.format(gb_clf.feature_importances_))

Train score: 1.000000
Test score: 0.672566
Feature Importance:


In [164]:
# Random Forest
ada_rfc = AdaBoostClassifier(
    rfc(max_features= 82, random_state=10), n_estimators=500,
    algorithm="SAMME.R", learning_rate=0.5, random_state=10)
ada_rfc.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(ada_rfc.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(ada_rfc.score(X_test_reduced,y_test)))

Train score: 1.000000
Test score: 0.637168


### 3) Original Estimators in PCA ( Including KNN Classifier)


In [191]:
# K Nearest Neighbor
knn_clf= KNeighborsClassifier(n_neighbors = 5)
knn_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(knn_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(knn_clf.score(X_test_reduced,y_test)))

Train score: 0.636095
Test score: 0.654867


In [192]:
# Logistic Regression
log_clf = LogisticRegression(penalty='l1',random_state=10)
log_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(log_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(log_clf.score(X_test_reduced,y_test)))


Train score: 0.760355
Test score: 0.734513


In [193]:
# Linear SVC
svm_clf = LinearSVC(C=0.1,random_state=10)
svm_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(svm_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(svm_clf.score(X_test_reduced,y_test)))


Train score: 0.801775
Test score: 0.761062


In [194]:
#Kernelized SVC
rbf_clf= SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10)
rbf_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(rbf_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(rbf_clf.score(X_test_reduced,y_test)))


Train score: 0.926036
Test score: 0.743363


In [195]:
# Decision Tree Classifier
dtc_clf= dtc(max_depth=8, random_state=10)
dtc_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(dtc_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(dtc_clf.score(X_test_reduced,y_test)))


Train score: 0.846154
Test score: 0.566372


In [196]:
# Random Forest( features are 82)
rfc_clf = rfc(max_features= 82, random_state=10)
rfc_clf.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(rfc_clf.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(rfc_clf.score(X_test_reduced,y_test)))


Train score: 0.976331
Test score: 0.619469


### Hard Voting in PCA ( Original estimators)

In [198]:
voting_clf_reduced = VotingClassifier(estimators=[('knn',knn_clf),('lr', log_clf), ('svc', svm_clf),
                                          ('rbf',rbf_clf),('dtc',dtc_clf),('rfc', rfc_clf)], voting='hard')
voting_clf_reduced.fit(X_reduced, y_train)
print('Train score: {0:0.6f}'.format(voting_clf_reduced.score(X_reduced,y_train)))
print('Test score: {0:0.6f}'.format(voting_clf_reduced.score(X_test_reduced,y_test)))

Train score: 0.819527
Test score: 0.716814


  if diff:
  if diff:


# Part III: Evaluation Strategy ( Recall and Micro-average)

- The purpose of this section is to recommend treatment to people who really have symptoms of Cardiac Arrhythmia. In another word, we do not want to miss people that are predicted to be normal yet actually have symptoms. Therefore, in this situation, I propose using Recall as the classification metric and Micro-Average to address imbalanced data.

- In this section, I have plotted out pie chart to prove an imbalanced data, where the majority part is normal cases for Cardiac Arrhythmia. Then a binary code is assigned to normal case as 1 versus 0 to the rest as having symptoms. From there I preprocess  binary imbalanced data with the same parameters. All the estimators( with the best chosen parameters) will then train and test new data to boil down to confusion matrix and a classification report.  

In [18]:
unique, counts = np.unique(y, return_counts=True) 
plt.pie(counts, labels = unique, autopct='%1.1f%%', shadow=True, startangle=90)

<IPython.core.display.Javascript object>

([<matplotlib.patches.Wedge at 0x258fdb4f320>,
  <matplotlib.patches.Wedge at 0x258fdb4fcf8>,
  <matplotlib.patches.Wedge at 0x258fdb52748>,
  <matplotlib.patches.Wedge at 0x258fdb43160>,
  <matplotlib.patches.Wedge at 0x258fdb43b70>,
  <matplotlib.patches.Wedge at 0x258fdb385c0>,
  <matplotlib.patches.Wedge at 0x258fdb38fd0>,
  <matplotlib.patches.Wedge at 0x258fdb2ca20>,
  <matplotlib.patches.Wedge at 0x258fdbde470>,
  <matplotlib.patches.Wedge at 0x258fdbdee80>,
  <matplotlib.patches.Wedge at 0x258fdb9beb8>,
  <matplotlib.patches.Wedge at 0x258fdbca2e8>,
  <matplotlib.patches.Wedge at 0x258fdbcacf8>],
 [Text(-1.08987,-0.148958,'1'),
  Text(0.601137,-0.921214,'2'),
  Text(0.919115,-0.604341,'3'),
  Text(1.02449,-0.400516,'4'),
  Text(1.08269,-0.194366,'5'),
  Text(1.09583,0.0956594,'6'),
  Text(1.05652,0.306228,'7'),
  Text(1.04757,0.335543,'8'),
  Text(1.02168,0.407643,'9'),
  Text(0.773743,0.781871,'10'),
  Text(0.432428,1.01144,'14'),
  Text(0.368211,1.03654,'15'),
  Text(0.167914

In [51]:
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

print('Original labels:\t', y[1:17])
print('New binary labels:\t', y_binary_imbalanced[1:17])

Original labels:	 1     10
2      1
3      7
4     14
5      1
6      1
7      1
8     10
9      3
10     1
11    10
12     6
13     1
14     1
15    10
16     1
Name: 8, dtype: int64
New binary labels:	 1     0
2     1
3     0
4     0
5     1
6     1
7     1
8     0
9     0
10    1
11    0
12    0
13    1
14    1
15    0
16    1
Name: 8, dtype: int64


In [52]:
np.bincount(y_binary_imbalanced) # class 1 is the most frequent 245 observations where status is normal

array([206, 245], dtype=int64)

In [53]:
from sklearn.preprocessing import MinMaxScaler
X_train, X_test, y_train, y_test = train_test_split(X,y_binary_imbalanced, random_state = 10)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### K-Nearest Neighbor`

In [54]:
knn_clf = KNeighborsClassifier(n_neighbors = 5).fit(X_train,y_train)
y_predicted= knn_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[15 32]
 [ 1 65]]
             precision    recall  f1-score   support

      not 1       0.94      0.32      0.48        47
          1       0.67      0.98      0.80        66

avg / total       0.78      0.71      0.66       113



#### Logistic Regression

In [55]:
log_clf = LogisticRegression(penalty='l1',random_state=10).fit(X_train,y_train)
y_predicted= log_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[31 16]
 [ 7 59]]
             precision    recall  f1-score   support

      not 1       0.82      0.66      0.73        47
          1       0.79      0.89      0.84        66

avg / total       0.80      0.80      0.79       113



#### Linear SVC

In [56]:
svm_clf = LinearSVC(C=0.1,random_state=10).fit(X_train,y_train)
y_predicted= svm_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[29 18]
 [ 7 59]]
             precision    recall  f1-score   support

      not 1       0.81      0.62      0.70        47
          1       0.77      0.89      0.83        66

avg / total       0.78      0.78      0.77       113



#### Kernelized SVC

In [57]:
rbf_clf= SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10).fit(X_train,y_train)
y_predicted= rbf_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[32 15]
 [ 8 58]]
             precision    recall  f1-score   support

      not 1       0.80      0.68      0.74        47
          1       0.79      0.88      0.83        66

avg / total       0.80      0.80      0.79       113



#### Decision Tree

In [58]:
dtc_clf= dtc(max_depth=8, random_state=10).fit(X_train,y_train)
y_predicted= dtc_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[33 14]
 [16 50]]
             precision    recall  f1-score   support

      not 1       0.67      0.70      0.69        47
          1       0.78      0.76      0.77        66

avg / total       0.74      0.73      0.74       113



#### Random Forest

In [59]:
rfc_clf = rfc(max_features= 279, random_state=10).fit(X_train,y_train)
y_predicted= rfc_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[37 10]
 [ 4 62]]
             precision    recall  f1-score   support

      not 1       0.90      0.79      0.84        47
          1       0.86      0.94      0.90        66

avg / total       0.88      0.88      0.87       113



#### Hard Voting Classifier

In [60]:
knn_clf= KNeighborsClassifier(n_neighbors = 5)
log_clf = LogisticRegression(penalty='l1',random_state=10)
svm_clf = LinearSVC(C=0.1,random_state=10)
rbf_clf= SVC(kernel = 'rbf', C =100, gamma =0.01, random_state=10)
dtc_clf= dtc(max_depth=8, random_state=10)
rfc_clf = rfc(max_features= 279, random_state=10)

voting_clf = VotingClassifier(estimators=[('knn',knn_clf),('lr', log_clf), ('svc', svm_clf),
                                          ('rbf',rbf_clf),('dtc',dtc_clf),('rfc', rfc_clf)], voting='hard')
voting_clf.fit(X_train, y_train)
y_predicted= voting_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_predicted)
print('confusion regression classifier \n', confusion)
print(classification_report(y_test, y_predicted, target_names=['not 1', '1']))

confusion regression classifier 
 [[35 12]
 [ 6 60]]
             precision    recall  f1-score   support

      not 1       0.85      0.74      0.80        47
          1       0.83      0.91      0.87        66

avg / total       0.84      0.84      0.84       113



  if diff:


# Part IV: SUMMARY

 This summary is a brief guide to all justifications. Results in part III are based on test scores including out-of-bag outputs.

## Overall Performance

#### Best Performers

 Linear SVC is the best estimator among all algorithms in this project in term of Accuracy. However, in term of Recall, Random Forest has done the best job to recommend treatment to most people.

- Linear SVC scores highest in Bagging Ensemble with out-of-bag accuracy of 96% and  has the lowest of 76% with advanced methods
- Random Forest has the highest Recall of 88% among other estimators in pure method.

#### Medium Performers 
By descending maximum test score:

- Decision tree with Gradient Boosting( before PCA) scores the highest among the medium with 77%.

- Kernelized SVC in PCA and Random Forest in pure method both score 76%.

- Hard voting of pure estimators scores 75%.

- Logistic Regressor scores 73% in pure method. 

- Random Forest has maximum score of 72% in pure method, and generally worsened with all new methods including PCA. 

#### Low Performers
By descending maximum test score:

- K-Nearest Neighbor Classifier has the lowest score of 65%( in PCA), and can not be supported by Bagging/Boosting.


## Ensemble learning, Voting Classifier, and PCA
- In short, both bagging and boosting see signfinicant changes for all estimators. The reason is some estimators have done its best in training and testing; however, bagging/boosting even overfits the data as a whole. In this scenario, bagging works the best for Linear SVC, and boosting tremendously support decision tree.

- For PCA, results are almost equal on average compared to all estimators before data reduction. Bagging/boosting/voting with PCA do not optimize for the highest performance as existing in previous methods. In my opinion, the reason for no change is because PCA is not an appropriate method to capture the sigfificance of features for this dataset. Also, interdependency among variables is yet to be fully addressed in this project.

 #### Bagging:
- Improve the most with Linear SVC with 96%, and worsen Kernelized SVC with 54%.
#### Boosting:
- Gradient Boosting is a huge advantage for Decision Tree with 77% , and AdaBoost is the worst for Random Forest with 64% after PCA.
#### Hard Voting:
- The maximum score is 75% in pure method. The minimum score is 66% in both Boosting and Bagging( before PCA)
#### PCA:
- The maximum score is 76% Linear SVC and 74% for Kernel SVC in pure method. The worst is Kernelized SVC and Random Forest in Bagging Ensemble 54%.

## Evaluation Strategy 

- The accuracy is used in most methods of classfiers for simplicity of usage in comparing different method. Linear SVC seems to out perform other methods with 96 % out-of-bag score in Bagging ensemble.

- However, when it comes to situation when we wants to look at a different classification metric like Recall, Random Forest has proved to be the optimal one. In this imbalanced data, Random forest has highest score in Recall of 88% by Micro-average method. 

- The reason why I do not apply Bagging/Boosting/PCA in term of Recall is, generally, these methods on average have worsened performances as being seen with Accuracy.