## Classify Parkinson's disease from voice recordings

In [33]:
# imports
import numpy as np
import pandas as pd
import os, sys
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_validate, cross_val_score

In [2]:
# Load the voice-measurement dataset and preview its first rows
df = pd.read_csv(filepath_or_buffer='parkinsons.data')
df.head(n=5)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [3]:
# Get the features and labels
# The 'name' identifier and the 'status' target are dropped by column label rather than
# by position, so the selection does not depend on 'name' being the first column and the
# string column is removed before the frame is converted to an array.
features = df.drop(columns=['name', 'status']).values
labels = df['status'].values

In [11]:
# Count how many samples carry each label: first status == 1, then status == 0
positive_count = np.count_nonzero(labels == 1)
negative_count = np.count_nonzero(labels == 0)
print(positive_count, negative_count)

147 48


In [14]:
# Rescale every feature column into the range [-1, 1]
# NOTE(review): the scaler is fitted on all rows before the train/test split in the next
# cell, so the test rows' min/max also influence the transform.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(features)
x = scaler.transform(features)
y = labels

In [17]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=7,
)

In [18]:
# Fit a baseline XGBoost classifier (library-default hyperparameters) on the training split
model = XGBClassifier()
model.fit(X=x_train, y=y_train)

In [None]:
# Predict labels for the held-out test rows with the baseline model
# (the accuracy itself is computed in the following cell)
y_pred = model.predict(x_test)

In [19]:
# get the accuracy score for test predictions of the baseline model
# The result is stored under its own name: assigning it to `accuracy_score` would rebind
# the imported sklearn function to a float and make every later call to it fail.
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(baseline_accuracy*100,2)}%')

94.87179487179486


Try to optimize the XGBoost hyperparameters to get a better accuracy score:

In [20]:
# Cross-validation helper for the XGBoost classifier: scores one hyperparameter
# configuration and reports the mean score over the folds.
def xgboost_cv(n_estimators, max_depth, gamma, subsample, data, targets):
    """Evaluate one XGBoost configuration with 5-fold cross-validation.

    Args:
        n_estimators: Value forwarded to ``XGBClassifier(n_estimators=...)``.
        max_depth: Value forwarded to ``XGBClassifier(max_depth=...)``.
        gamma: Value forwarded to ``XGBClassifier(gamma=...)``.
        subsample: Value forwarded to ``XGBClassifier(subsample=...)``.
        data: Feature matrix handed to ``cross_val_score``.
        targets: Labels handed to ``cross_val_score``.

    Returns:
        The mean of the per-fold scores returned by ``cross_val_score``
        (called with ``cv=5`` and its default scoring).
    """
    hyperparams = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        gamma=gamma,
        subsample=subsample,
        random_state=2,  # fixed seed so repeated evaluations use the same classifier seed
    )
    fold_scores = cross_val_score(XGBClassifier(**hyperparams), data, targets, cv=5)
    return fold_scores.mean()

In [21]:
# Using a Bayesian optimizer, evaluate the cross-validation score for different
# hyperparameter values and return the best result found.
#
# The black-box function, xgboost_crossval, is defined inside optimize_xgboost because it
# must close over the data/targets that were passed to optimize_xgboost.
def optimize_xgboost(data, targets, random_state=None, init_points=10, n_iter=25):
    """Search XGBoost hyperparameters with Bayesian optimization.

    Args:
        data: Feature matrix forwarded to ``xgboost_cv`` for every evaluation.
        targets: Labels forwarded to ``xgboost_cv`` for every evaluation.
        random_state: Seed forwarded to ``BayesianOptimization``. The default
            ``None`` keeps the original unseeded behaviour; pass an int to make
            the search repeatable.
        init_points: Number of initial exploration points passed to ``maximize``.
        n_iter: Number of optimization iterations passed to ``maximize``.

    Returns:
        ``optimizer.max`` — the best result recorded by the optimizer. The same
        value is also printed, as before.
    """
    def xgboost_crossval(n_estimators, max_depth, gamma, subsample):
        # The optimizer proposes floats; the tree-count and depth settings are
        # truncated to ints before they are handed to the classifier.
        return xgboost_cv(
            n_estimators=int(n_estimators),
            max_depth=int(max_depth),
            gamma=gamma,
            subsample=subsample,
            data=data,
            targets=targets,
        )

    optimizer = BayesianOptimization(
        f=xgboost_crossval,
        pbounds={
            "n_estimators": (100, 500),
            "max_depth": (6, 15),
            "gamma": (0, 10),
            "subsample": (0.8, 1.0),
        },
        random_state=random_state,
        verbose=2,
    )
    optimizer.maximize(n_iter=n_iter, init_points=init_points)

    best = optimizer.max
    print("Final result:", best)
    return best

In [26]:
# Run the Bayesian hyperparameter search for XGBoost on the training split only
print("--- Optimizing XGBoost ---")
optimize_xgboost(data=x_train, targets=y_train)

--- Optimizing XGBoost ---
|   iter    |  target   |   gamma   | max_depth | n_esti... | subsample |
-------------------------------------------------------------------------
| [0m1        [0m | [0m0.878    [0m | [0m3.809    [0m | [0m6.203    [0m | [0m188.1    [0m | [0m0.9653   [0m |
| [0m2        [0m | [0m0.8587   [0m | [0m5.679    [0m | [0m8.334    [0m | [0m230.8    [0m | [0m0.9452   [0m |
| [0m3        [0m | [0m0.8141   [0m | [0m9.455    [0m | [0m8.365    [0m | [0m182.9    [0m | [0m0.9678   [0m |
| [0m4        [0m | [0m0.8718   [0m | [0m4.578    [0m | [0m13.71    [0m | [0m366.4    [0m | [0m0.8013   [0m |
| [0m5        [0m | [0m0.8524   [0m | [0m6.587    [0m | [0m8.341    [0m | [0m362.5    [0m | [0m0.9403   [0m |
| [0m6        [0m | [0m0.8397   [0m | [0m6.015    [0m | [0m12.49    [0m | [0m226.2    [0m | [0m0.8075   [0m |
| [0m7        [0m | [0m0.8714   [0m | [0m2.652    [0m | [0m8.298    [0m | [0m477.1  

In [27]:
# Build the XGBoost classifier with the tuned hyperparameters
# NOTE(review): these values are hard-coded here rather than read from the optimizer's
# result — confirm they match the output of the search run.
xg = XGBClassifier(
    gamma=0.15,
    max_depth=7,
    n_estimators=300,
    subsample=1,
)

In [28]:
# Train the tuned XGBoost classifier on the training split
xg.fit(X=x_train, y=y_train)

In [29]:
# predict results for test data with the tuned classifier
# (this rebinds y_pred, replacing the predictions made earlier by the default model)
y_pred = xg.predict(x_test)

In [30]:
# get the accuracy score for test predictions of the tuned model
# Accuracy is computed directly as the fraction of predictions that equal the true label
# and stored under its own name, so this cell neither relies on nor rebinds the imported
# `accuracy_score` name (rebinding it to a float makes any later call to it fail).
tuned_accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {round(tuned_accuracy*100,2)}%')

Accuracy: 92.31%


In [34]:
# Build the confusion matrix for the tuned model's test predictions
# (rows are true labels, columns are predicted labels, both in the order [0, 1])
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])

array([[ 5,  2],
       [ 1, 31]], dtype=int64)