In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
parkinson_data = pd.read_csv("Parkinson-disease-data-updated")

In [3]:
parkinson_data.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
name,phon_R01_S01_1,phon_R01_S01_2,phon_R01_S01_3,phon_R01_S01_4,phon_R01_S01_5
MDVP:Fo(Hz),119.992,122.4,116.682,116.676,116.014
MDVP:Fhi(Hz),157.302,148.65,131.111,137.871,141.781
MDVP:Flo(Hz),74.997,113.819,111.555,111.366,110.655
MDVP:Jitter(%),0.00784,0.00968,0.0105,0.00997,0.01284
MDVP:Jitter(Abs),7e-05,8e-05,9e-05,9e-05,0.00011
MDVP:RAP,0.0037,0.00465,0.00544,0.00502,0.00655
MDVP:PPQ,0.00554,0.00696,0.00781,0.00698,0.00908
Jitter:DDP,0.01109,0.01394,0.01633,0.01505,0.01966


In [4]:
labels = parkinson_data.iloc[:, 0].values
features = parkinson_data.iloc[:, 1:].values

In [10]:
df = parkinson_data.copy()

In [11]:
features = df.loc[:, df.columns != 'status'].values[:, 1:]
labels = df.loc[:, 'status'].values

In [12]:
print(features.shape)
print(labels[0])


(195, 23)
1


In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
scaler = StandardScaler()
scled_features = scaler.fit_transform(features)

ValueError: could not convert string to float: 'phon_R01_S50_6'

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(scled_features, labels, test_size=0.3)

In [None]:
print("X_train shape -- > {}".format(X_train.shape))
print("y_train shape -- > {}".format(y_train.shape))
print("X_test shape -- > {}".format(X_test.shape))
print("y_test shape -- > {}".format(y_test.shape))

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
etc = ExtraTreesClassifier(n_estimators=300)

In [None]:
etc.fit(X_train, y_train)

In [None]:
print(etc.feature_importances_)
indices = np.argsort(etc.feature_importances_)[::-1]

In [None]:
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w')
plt.title("Feature importances")
plt.bar(range(features.shape[1]), etc.feature_importances_[indices],
       color="r", align="center")
plt.xticks(range(features.shape[1]), indices)
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

In [None]:
print("KNN with k=5 got {}% accuracy on the test set.".format(accuracy_score(y_test, knn.predict(X_test))*100))

In [None]:
params_dict = {'n_neighbors':[3, 5, 9, 15], 'p':[1, 2, 3], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']}
gs = GridSearchCV(knn, param_grid=params_dict, verbose=10, cv=10)

In [None]:
gs.fit(X_train, y_train)

In [None]:
print(gs.best_estimator_)

In [None]:
new_knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=1,
           weights='uniform')
new_knn.fit(X_train, y_train)

In [None]:
print("KNN - fine tuned, got {}% accuracy on the test set.".format(accuracy_score(y_test, new_knn.predict(X_test))*100))

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
print("Logistic regression - default, got {}% accuracy on the test set.".format(accuracy_score(y_test, lr.predict(X_test))*100))

In [None]:
lr_tuned = LogisticRegression(C=1000, penalty='l2')
lr_tuned.fit(X_train, y_train)

In [None]:
print("Logistic regression - tuned, got {}% accuracy on the test set.".format(accuracy_score(y_test, lr_tuned.predict(X_test))*100))

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)


In [None]:
print("Decision tree classifier, got {}% accuracy on the test set.".format(accuracy_score(y_test, dtc.predict(X_test))*100))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rfc = RandomForestClassifier(n_estimators=350)
rfc.fit(X_train, y_train)

In [None]:
print("Random forest classifier, got {}% accuracy on the test set.".format(accuracy_score(y_test, rfc.predict(X_test))*100))

In [None]:
accuracy_tree = cross_val_score(dtc, scled_features, labels, scoring='accuracy', cv=10)

In [None]:
print(np.mean(accuracy_tree)*100)