In [92]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

# 1. Load the Data and read the column description and ensure you understand each attribute well

Attribute Information:

* name - ASCII subject name and recording number
* MDVP:Fo(Hz) - Average vocal fundamental frequency
* MDVP:Fhi(Hz) - Maximum vocal fundamental frequency
* MDVP:Flo(Hz) - Minimum vocal fundamental frequency
* MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several 
* measures of variation in fundamental frequency
* MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude
* NHR,HNR - Two measures of ratio of noise to tonal components in the voice
* status - Health status of the subject (one) - Parkinson's, (zero) - healthy
* RPDE,D2 - Two nonlinear dynamical complexity measures
* DFA - Signal fractal scaling exponent
* spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation 

In [93]:
df = pd.read_csv("Parkinson-disease-data-updated",index_col=0)

In [94]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MDVP:Fo(Hz),195.0,154.228641,41.390065,88.333,117.572,148.79,182.769,260.105
MDVP:Fhi(Hz),195.0,197.104918,91.491548,102.145,134.8625,175.829,224.2055,592.03
MDVP:Flo(Hz),195.0,116.324631,43.521413,65.476,84.291,104.315,140.0185,239.17
MDVP:Jitter(%),195.0,0.00622,0.004848,0.00168,0.00346,0.00494,0.007365,0.03316
MDVP:Jitter(Abs),195.0,4.4e-05,3.5e-05,7e-06,2e-05,3e-05,6e-05,0.00026
MDVP:RAP,195.0,0.003306,0.002968,0.00068,0.00166,0.0025,0.003835,0.02144
MDVP:PPQ,195.0,0.003446,0.002759,0.00092,0.00186,0.00269,0.003955,0.01958
Jitter:DDP,195.0,0.00992,0.008903,0.00204,0.004985,0.00749,0.011505,0.06433
MDVP:Shimmer,195.0,0.029709,0.018857,0.00954,0.016505,0.02297,0.037885,0.11908
MDVP:Shimmer(dB),195.0,0.282251,0.194877,0.085,0.1485,0.221,0.35,1.302


In [95]:
features = df.loc[:, df.columns != 'status'].values[:, 1:]
labels = df.loc[:, 'status'].values


In [96]:
df.isnull().any()

name                False
MDVP:Fo(Hz)         False
MDVP:Fhi(Hz)        False
MDVP:Flo(Hz)        False
MDVP:Jitter(%)      False
MDVP:Jitter(Abs)    False
MDVP:RAP            False
MDVP:PPQ            False
Jitter:DDP          False
MDVP:Shimmer        False
MDVP:Shimmer(dB)    False
Shimmer:APQ3        False
Shimmer:APQ5        False
MDVP:APQ            False
Shimmer:DDA         False
NHR                 False
HNR                 False
status              False
RPDE                False
DFA                 False
spread1             False
spread2             False
D2                  False
PPE                 False
dtype: bool

# 2. Split the data into training and test set in the ratio of 70:30 respectively

In [97]:
### DATA NOT PRE-PROCESSED

from sklearn.model_selection import train_test_split
to_drop = ['status','name']
X = df.drop(to_drop, axis=1)
y = df['status']
X_train, X_test, y_train,  y_test = cross_validation.train_test_split(X, y,train_size=0.7, test_size=0.3, random_state=42)

In [98]:
### DATA PRE-PROCESSING

from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn import cross_validation


to_drop = ['name','status']
df1 = df.drop(to_drop, axis=1)

Xss = df1.values

## Intentional - as scalar is not be applied on the status column. 
yss = df.status

scaler = MinMaxScaler(feature_range=(-1,1))
scaled = scaler.fit_transform(Xss)

Xss_train, Xss_test, yss_train, yss_test = cross_validation.train_test_split(Xss, labels, test_size=0.14)

# 3. Classification algorithms and compare the models to find the best mode

Two families of ensemble methods are usually distinguished:

In averaging methods, the driving principle is to build several estimators independently and then to average their predictions. On average, the combined estimator is usually better than any of the single base estimator because its variance is reduced.

Examples: Bagging methods, Forests of randomized trees, …

By contrast, in boosting methods, base estimators are built sequentially and one tries to reduce the bias of the combined estimator. The motivation is to combine several weak models to produce a powerful ensemble.

Examples: AdaBoost, Gradient Tree Boosting, …

In [99]:
from sklearn.metrics import accuracy_score

In [100]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),max_samples=0.5, max_features=0.5,random_state=42)
bagging = bagging.fit(X_train, y_train)
predictions = bagging.predict(X_test)
accuracy = accuracy_score(y_test, predictions)*100
train_score = bagging.score(X_train, y_train)

print (predictions)        
print (accuracy)    
print (train_score) 

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
88.13559322033898
0.8529411764705882


In [102]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging_ss = BaggingClassifier(KNeighborsClassifier(),max_samples=0.3, max_features=0.5,random_state=42)
bagging_ss = bagging.fit(Xss_train, yss_train)
predictions_ss = bagging.predict(Xss_test)
accuracy_ss = accuracy_score(yss_test, predictions_ss)*100
train_score_ss = bagging.score(Xss_train, yss_train)*100
print (predictions)        
print (accuracy_ss)    
print (train_score) 

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
89.28571428571429
0.8529411764705882


In [106]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=25)
clf = clf.fit(Xss, yss)
predictions_ss = clf.predict(Xss_test)
accuracy_ss = accuracy_score(yss_test, predictions_ss)*100
train_score_ss = clf.score(Xss_train, yss_train)*100
print (predictions)        
print (accuracy_ss)    
print (train_score) 

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
100.0
0.8529411764705882


In [140]:
#####

#CHECK CLF SCORE CALCULATION
#####


from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier


print("/n/nDECISION TREE CLASSIFIER")
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=1)
clf = clf.fit(Xss, yss)
predictions_ss = clf.predict(Xss_test)
accuracy_ss = accuracy_score(yss_test, predictions_ss)*100
train_score_ss = clf.score(Xss_train, yss_train)*100
print (predictions)        
print (accuracy_ss)    
print (train_score) 
scores = cross_val_score(clf, Xss_train, yss_train)
print (scores.mean())                           


print("/n/nRANDOM FOREST CLASSIFIER")
clf = RandomForestClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=1)
clf = clf.fit(Xss, yss)
predictions_ss = clf.predict(Xss_test)
accuracy_ss = accuracy_score(yss_test, predictions_ss)*100
train_score_ss = clf.score(Xss_train, yss_train)*100
print (predictions)        
print (accuracy_ss)    
print (train_score) 
scores = cross_val_score(clf, Xss_test, yss_test)
print(scores.mean())                            

print("/n/nEXTRA TREE CLASSIFIER")
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None,min_samples_split=2, random_state=1)
clf = clf.fit(Xss, yss)
predictions_ss = clf.predict(Xss_test)
accuracy_ss = accuracy_score(yss_test, predictions_ss)*100
train_score_ss = clf.score(Xss_train, yss_train)*100
print (predictions)        
print (accuracy_ss)    
print (train_score) 
scores = cross_val_score(clf, Xss_test, yss_test)
print(scores.mean())


/n/nDECISION TREE CLASSIFIER
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
100.0
0.8529411764705882
0.8561688311688312
/n/nRANDOM FOREST CLASSIFIER
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
100.0
0.8529411764705882
0.8592592592592592
/n/nEXTRA TREE CLASSIFIER
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 0 1 0]
100.0
0.8529411764705882
0.8592592592592593


In [141]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [142]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, Xss_test, yss_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label))

Accuracy: 86.67 (+/- 0.07) [Logistic Regression]
Accuracy: 86.67 (+/- 0.12) [Random Forest]
Accuracy: 90.00 (+/- 0.08) [Naive Bayes]
Accuracy: 90.00 (+/- 0.08) [Ensemble]


In [143]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from itertools import product
from sklearn.ensemble import VotingClassifier

In [144]:
# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2), ('svc', clf3)], voting='soft', weights=[2,1,2])

clf1 = clf1.fit(Xss_train,yss_train)
clf2 = clf2.fit(Xss_train,yss_train)
clf3 = clf3.fit(Xss_train,yss_train)
eclf = eclf.fit(Xss_train,yss_train)

In [145]:
from sklearn.model_selection import GridSearchCV
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(Xss_train, yss_train)

In [146]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, Xss_test, yss_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label))

Accuracy: 86.67 (+/- 0.07) [Logistic Regression]
Accuracy: 86.67 (+/- 0.12) [Random Forest]
Accuracy: 90.00 (+/- 0.08) [Naive Bayes]
Accuracy: 90.00 (+/- 0.08) [Ensemble]


In [147]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft', weights=[2,5,1])

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, Xss_test, yss_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std(), label))

Accuracy: 86.67 (+/- 0.07) [Logistic Regression]
Accuracy: 86.67 (+/- 0.12) [Random Forest]
Accuracy: 90.00 (+/- 0.08) [Naive Bayes]
Accuracy: 86.67 (+/- 0.07) [Ensemble]


In [149]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, Xss_train, yss_train)
scores.mean()                             

0.8683982683982684