## CLASSIFICATION OF PARKISON'S DISEASE DATA

Equipe :
    * Jessica Vilar - 1613176
    * Fernando Tancini - 1711799
    * Andrea Mourelo - 1820000

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline 
plt.style.use('seaborn-whitegrid')
plt.rc('font', family='Arial')
plt.rc('xtick', labelsize=12) 
plt.rc('ytick', labelsize=12) 
plt.rc('font', size=12) 
plt.rc('figure', figsize = (12, 5))

In [2]:
# Set a seed value: 
seed_value= 1001004  
# 1. Set PYTHONHASHSEED environment variable at a fixed value: 
import os
os.environ['PYTHONHASHSEED']=str(seed_value) 
# 2. Set python built-in pseudo-random generator at a fixed value:
import random
random.seed(seed_value) 
# 3. Set numpy pseudo-random generator at a fixed value:
np.random.seed(seed_value) 

# Importando os dados

Data Set Information:

This dataset is composed of a range of biomedical voice measurements from 31 people, 23 with Parkinson's disease (PD). Each column in the table is a particular voice measure, and each row corresponds one of 195 voice recording from these individuals ("name" column). The main aim of the data is to discriminate healthy people from those with PD, according to "status" column which is set to 0 for healthy and 1 for PD. 

=> dados anonimizados mas cada linha nao corresponde a um individuo diferente

In [3]:
filename = 'parkinsons.data'
dfParkinson = pd.read_csv(filename)
#dfParkinson.info()
#dfParkinson.shape  # da (195,24)
dfParkinson.head(15)

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335
5,phon_R01_S01_6,120.552,131.162,113.787,0.00968,8e-05,0.00463,0.0075,0.01388,0.04701,...,0.06985,0.01222,21.378,1,0.415564,0.825069,-4.242867,0.299111,2.18756,0.357775
6,phon_R01_S02_1,120.267,137.244,114.82,0.00333,3e-05,0.00155,0.00202,0.00466,0.01608,...,0.02337,0.00607,24.886,1,0.59604,0.764112,-5.634322,0.257682,1.854785,0.211756
7,phon_R01_S02_2,107.332,113.84,104.315,0.0029,3e-05,0.00144,0.00182,0.00431,0.01567,...,0.02487,0.00344,26.892,1,0.63742,0.763262,-6.167603,0.183721,2.064693,0.163755
8,phon_R01_S02_3,95.73,132.068,91.754,0.00551,6e-05,0.00293,0.00332,0.0088,0.02093,...,0.03218,0.0107,21.812,1,0.615551,0.773587,-5.498678,0.327769,2.322511,0.231571
9,phon_R01_S02_4,95.056,120.103,91.226,0.00532,6e-05,0.00268,0.00332,0.00803,0.02838,...,0.04324,0.01022,21.862,1,0.547037,0.798463,-5.011879,0.325996,2.432792,0.271362


# Seleccionando as colunas

In [4]:
outcome_column = 'status'
# status: 0: healthy, 1: Parkinson's
outcome_labels = {0: 'healthy', 1:'Parkinson`s'}

label_counts = np.bincount(dfParkinson.status)
print(label_counts)

[ 48 147]


Mais pessoas com parkinsons (1) do que sem (0)

In [5]:
# Selecionamos todas as colunas como features, menos 'name' e 'status', essa última sendo nosso target.
features = [col for col in dfParkinson.columns if col not in ['name', 'status']]
features

# Isso pode ser modificado para outros modelos mais tarde, ao por exemplo escolher menos colunas!!

['MDVP:Fo(Hz)',
 'MDVP:Fhi(Hz)',
 'MDVP:Flo(Hz)',
 'MDVP:Jitter(%)',
 'MDVP:Jitter(Abs)',
 'MDVP:RAP',
 'MDVP:PPQ',
 'Jitter:DDP',
 'MDVP:Shimmer',
 'MDVP:Shimmer(dB)',
 'Shimmer:APQ3',
 'Shimmer:APQ5',
 'MDVP:APQ',
 'Shimmer:DDA',
 'NHR',
 'HNR',
 'RPDE',
 'DFA',
 'spread1',
 'spread2',
 'D2',
 'PPE']

In [6]:
# convert feature dataframe and label series to arrays
X = np.array(dfParkinson[features]) # X = array de dados dos pacientes
print(X[0, :])

Y = np.array(dfParkinson[outcome_column]) # Y = outcome "status" dos pacientes
print(Y[0])

[ 1.199920e+02  1.573020e+02  7.499700e+01  7.840000e-03  7.000000e-05
  3.700000e-03  5.540000e-03  1.109000e-02  4.374000e-02  4.260000e-01
  2.182000e-02  3.130000e-02  2.971000e-02  6.545000e-02  2.211000e-02
  2.103300e+01  4.147830e-01  8.152850e-01 -4.813031e+00  2.664820e-01
  2.301442e+00  2.846540e-01]
1


# K-Nearest Neighbor

In [7]:
# Separamos dados de treino e teste (20% de teste) para todos os modelos de K-NN

from sklearn.model_selection import train_test_split

dfTrain, dfTest = train_test_split(dfParkinson, test_size=0.2, 
                                   stratify=dfParkinson[outcome_column],random_state = seed_value)

# convertemos dataframe de features e labels em arrays
X_train = np.array(dfTrain[features])
Y_train = np.array(dfTrain[outcome_column])

# Imprimimos as dimensões das massas de treino
print("Dados de treino: ", X_train.shape[0], Y_train.shape[0])

# convertemos dataframe de features e labels em arrays
X_test = np.array(dfTest[features])
Y_test = np.array(dfTest[outcome_column])

# Imprimimos as dimensões das massas de teste
print("Dados de teste: ", X_test.shape[0], Y_test.shape[0])

Dados de treino:  156 156
Dados de teste:  39 39


Vamos criar varios modelos, variando o numero de vizinhos. Para isso, criamos um algoritmo geral para a implementacao do modelo:

In [8]:
from sklearn import neighbors
from sklearn import datasets
from sklearn import metrics

def knn_model (n_neighbors):
    # Create an instance of K-nearest neighbor classifier
    knn_model_n = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)

    # Train the classifier with the train data
    knn_model_n.fit(X_train,Y_train)

    # Compute the prediction over the test data set according to the model
    Yhat_n = knn_model_n.predict(X_test)

    print('Numero de vizinhos: ', n_neighbors)
    print('Valores Preditos: ', Yhat_n)
    print ('Valor Predito: ' + str(Yhat_n[-1]), 
           '; Valor Real: ' + str(Y[-1]))
    print('\n')

    return knn_model_n, Yhat_n

### Modelos

In [9]:
# Criacao de varios modelos : 

# Criacao de uma lista com os modelos e os valores preditos:
knn_models = []
Yhat_models = []

for i in range (8):
    knn_model_i, Yhati = knn_model(i+1)
    knn_models.append(knn_model_i)
    Yhat_models.append(Yhati)


Numero de vizinhos:  1
Valores Preditos:  [1 1 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  2
Valores Preditos:  [0 0 1 1 0 1 1 1 0 1 0 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  3
Valores Preditos:  [1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  4
Valores Preditos:  [1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  5
Valores Preditos:  [1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  6
Valores Preditos:  [1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 1 1 1 1
 1 1]
Valor Predito: 1 ; Valor Real: 0


Numero de vizinhos:  7
Valores Preditos:  [1 0 1 1 1

### Avaliando a qualidade dos modelos

Criamos um algoritmo geral para a avaliacao dos modelos :

In [10]:


def avaliacao_knn(knn_model_n, n_neighbors, Yhat_n):
    print('Numero de vizinhos: ', n_neighbors)
    
    # Accuracy
    accuracy_train = knn_model_n.score(X_train, Y_train)
    print('accuracy in training data:', '{:6.4f}'.format(accuracy_train))
    accuracy_test = knn_model_n.score(X_test, Y_test)
    print('accuracy in test data:    ', '{:6.4f}'.format(accuracy_test))
    
    # Confusion matrix
    confm = metrics.confusion_matrix(Y_test, Yhat_n)
    confmT = confm.T
    dfConfusionMatrix = pd.DataFrame(confmT)
    dfConfusionMatrix.columns = ['true ' + str(val) for val in outcome_labels]
    dfConfusionMatrix.index   = ['pred ' + str(val) for val in outcome_labels]
    dfCM = dfConfusionMatrix.iloc[[1,0],[1,0]]  # Positivos primeiro
    TP = dfCM.loc['pred 1', 'true 1'] 
    TN = dfCM.loc['pred 0', 'true 0']
    FP = dfCM.loc['pred 1', 'true 0'] # aka false alarm aka Type II error
    FN = dfCM.loc['pred 0', 'true 1'] # aka miss aka Type I error
    print('true positive (TP): ', TP)
    print('true negative (TN): ', TN)
    print('false positive (FP):', FP)
    print('false negative (FN):', FN)

    # Precision and recall
    
    # precision aka positive predictive value (PPV)
    # = what fraction of the cases that my model got are true positive?
    precision = TP / (TP + FP)
    print('precision   ', '{:7.4f}'.format(precision))
    # recall aka sensitivity aka hit rate aka true positive rate (TPR) = TP / P
    # = what fraction of the positive cases did my model get?
    recall = TP / (TP + FN)
    print('recall      ', '{:7.4f}'.format(recall))
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    print('accuracy    ', '{:7.4f}'.format(accuracy))
    F1_score = 2 * precision * recall / (precision + recall)
    print('F1_score    ', '{:7.4f}'.format(F1_score))
    # specificity aka true negative rate (TNR) = TN / N
    specificity = TN / (TN + FP)
    print('specificity ', '{:7.4f}'.format(specificity))

    # Visao geral
    # relato de várias métricas de qualidade
    print(metrics.classification_report(Y_test, Yhat_n))
    print(pd.DataFrame(metrics.classification_report(Y_test, Yhat_n, output_dict = True)))

    # support = number of samples of the true response that lie in the class
    # avg/total = weighted average (weights are the support values)
    print('\n')

In [11]:
for i in range(len(knn_models)):
    avaliacao_knn(knn_models[i], i+1, Yhat_models[i])

Numero de vizinhos:  1
accuracy in training data: 1.0000
accuracy in test data:     0.8205
true positive (TP):  26
true negative (TN):  6
false positive (FP): 4
false negative (FN): 3
precision     0.8667
recall        0.8966
accuracy      0.8205
F1_score      0.8814
specificity   0.6000
              precision    recall  f1-score   support

           0       0.67      0.60      0.63        10
           1       0.87      0.90      0.88        29

   micro avg       0.82      0.82      0.82        39
   macro avg       0.77      0.75      0.76        39
weighted avg       0.82      0.82      0.82        39

                   0          1  micro avg  macro avg  weighted avg
f1-score    0.631579   0.881356   0.820513   0.756467      0.817311
precision   0.666667   0.866667   0.820513   0.766667      0.815385
recall      0.600000   0.896552   0.820513   0.748276      0.820513
support    10.000000  29.000000  39.000000  39.000000     39.000000


Numero de vizinhos:  2
accuracy in trainin

Os melhores modelos parecem ser o 3 ou o 6, pois tem as melhores acuracias

In [12]:
avaliacao_knn(knn_models[2], 3, Yhat_models[2])
avaliacao_knn(knn_models[5], 6, Yhat_models[5])

Numero de vizinhos:  3
accuracy in training data: 0.8910
accuracy in test data:     0.8462
true positive (TP):  28
true negative (TN):  5
false positive (FP): 5
false negative (FN): 1
precision     0.8485
recall        0.9655
accuracy      0.8462
F1_score      0.9032
specificity   0.5000
              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.85      0.97      0.90        29

   micro avg       0.85      0.85      0.85        39
   macro avg       0.84      0.73      0.76        39
weighted avg       0.84      0.85      0.83        39

                   0          1  micro avg  macro avg  weighted avg
f1-score    0.625000   0.903226   0.846154   0.764113      0.831886
precision   0.833333   0.848485   0.846154   0.840909      0.844600
recall      0.500000   0.965517   0.846154   0.732759      0.846154
support    10.000000  29.000000  39.000000  39.000000     39.000000


Numero de vizinhos:  6
accuracy in trainin

Observamos que os três modelos têm a mesma Confusion matrix, e os três modelos sao bons, pois : 

* A acuracia nos dados de teste é melhor do que todos os outros modelos, mesmo que a acuracia nos dados de treino nao o seja (melhor no modelo 1 ou 2)

* O recall é muito alto, o que é bom porque significa que a maioria dos pacientes com PD sao categorizados como PD

* O F1_score, usado como avaliador do modelo, é o mais alto também

Porém,

* A specificity, que mede a precisao na avaliacao de nao PD, nao é tao boa...

Para saber um pouco mais qual é o melhor modelo dentre esses 2 modelos, vamos variar os datasets de teste e treino

In [13]:
PRC = 0.2
knn_good_models = [knn_models[2],knn_models[5]]  ## Modelos 3 e 6
for j in range(2):
    knn_model = knn_good_models[j]

    acc = np.zeros((10,))
    precision = np.zeros((10,))
    recall = np.zeros((10,))
    f1_score = np.zeros((10,))
    specificity = np.zeros((10,))
    for i in range(10):
        dfTrain, dfTest = train_test_split(dfParkinson, test_size=PRC, stratify=dfParkinson[outcome_column], random_state = seed_value)
        # convertemos dataframe de features e labels em arrays
        X_train = np.array(dfTrain[features])
        Y_train = np.array(dfTrain[outcome_column])
        
        # convertemos dataframe de features e labels em arrays
        X_test = np.array(dfTest[features])
        Y_test = np.array(dfTest[outcome_column])
        
        #knn_model = neighbors.KNeighborsClassifier(n_neighbors=8)
        knn_model.fit(X_train, Y_train)
        Yhat = knn_model.predict(X_test)
        acc[i] = metrics.accuracy_score(Y_test, Yhat)
        precision[i] = metrics.precision_score(Y_test, Yhat)
        recall[i] = metrics.recall_score(Y_test, Yhat)
        f1_score[i] = metrics.f1_score(Y_test, Yhat)
        
    acc.shape=(1,10)
    print('Numero de vizinhos: ' + str((j+1)*3))
    print('=> Mean accuracy: ' + str(np.mean(acc[0])))
    print('=> Mean precision: ' + str(np.mean(precision[0])))
    print('=> Mean recall: ' + str(np.mean(recall[0])))
    print('=> Mean f1_score: ' + str(np.mean(f1_score[0])))
    print('\n')

    
    

Numero de vizinhos: 3
=> Mean accuracy: 0.8461538461538461
=> Mean precision: 0.8484848484848485
=> Mean recall: 0.9655172413793104
=> Mean f1_score: 0.9032258064516129


Numero de vizinhos: 6
=> Mean accuracy: 0.8461538461538461
=> Mean precision: 0.8709677419354839
=> Mean recall: 0.9310344827586207
=> Mean f1_score: 0.9




Os modelos sao bastante similares, mas tenderia a dizer que o melhor é o modelo com 3 vizinhos, pois o recall e f1_score sao melhores