In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.utils import np_utils




data = pd.read_csv("Alzheimer's.csv")

#Converted the group to dummy variables
data.loc[data.Group=='Nondemented', 'Group'] = 0
data.loc[data.Group=='Demented', 'Group'] = 1
data.loc[data.Group=='Converted', 'Group'] = 2

#Converted the M/F to dummy variables
data.loc[data.Sex=='M', 'Sex'] = 0
data.loc[data.Sex=='F', 'Sex'] = 1

#ToDo
#Fill in SES and Mini Mental State missing sample values
#Use the mean of that category for the value
# Get rid of SES data 

data

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,Sex,Hand,Age,EDUC,SES,Mini Mental State,Clinical Dementia Rating,Estimated total Intracranial Volume,Normalize Whole Brain Volume,Atlas Scaling Factor
0,OAS2_0001,OAS2_0001_MR1,0,1,0,0,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,0,2,457,0,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,1,1,0,0,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,1,2,560,0,R,76,12,,28.0,0.5,1738,0.713,1.010
4,OAS2_0002,OAS2_0002_MR3,1,3,1895,0,R,80,12,,22.0,0.5,1698,0.701,1.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,OAS2_0185,OAS2_0185_MR2,1,2,842,0,R,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,OAS2_0185,OAS2_0185_MR3,1,3,2297,0,R,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,OAS2_0186,OAS2_0186_MR1,0,1,0,1,R,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,OAS2_0186,OAS2_0186_MR2,0,2,763,1,R,63,13,2.0,30.0,0.0,1327,0.796,1.323


In [2]:
#PCA might be a good technique to select predictors 

#note that PCA performs best when data is normalized (range b/w 0 and 1)

#It is possible to use categorical and continuous predictors 
#for a regression problem. My understanding is you need to make 
#dummy variables for the binary predictors. 

#Variables that we will need to deal with: 
# Hand, Visit, Subject ID, MRI ID

In [3]:
#Attempting PCA on data
#Hand is completely useless as it is identical for all samples
data_drop = data.drop(['Hand','Visit','Subject ID','MRI ID'], axis = 1) #axis = 1 means to drop column not row

#get rid of row 360 and 359 bc they are missing alot of data (both SES and MMS)
data_drop = data_drop.drop([360, 359])


#delete all data points that dont have SES in them (this is where they have NaN)
data_drop = data_drop.dropna()

#dementia status is what we want to predict - change this to single target 
group = data_drop[['Group']] 



data_drop = data_drop.drop(['Group'], axis = 1) #axis = 1 means to drop column not row




In [4]:
#get a list of columns in pandas object 
names_of_data = data_drop.columns.tolist()

#shuffle = false prevents data split being different everytime
X_train, X_test, y_train, y_test = train_test_split(data_drop, group, test_size=0.2, shuffle = False)

#split test into validate and test, again making sure the data is always the same for consistency
#X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, test_size=0.25, shuffle = False)

#Normalizing the data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#running the actual PCA
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


y_train = y_train.astype('int')
y_test = y_test.astype('int')


from sklearn.neighbors import KNeighborsClassifier
accuracy = list()

#Find which one gets the highest accuracy
for K in range(1,30):
    #K = K+1
    classifier = KNeighborsClassifier(n_neighbors = K)

    classifier.fit(X_train, y_train)  #fit the model
    y_pred = classifier.predict(X_test)
    

    accuracy.append(accuracy_score(y_test,y_pred)) #store rmse values
    print('accuracy value for k= ' , K , 'is:', accuracy[K-1])
    
max_value = max(accuracy)
best_k_value = accuracy.index(max_value) + 1
print(best_k_value)
classifier = KNeighborsClassifier(n_neighbors=best_k_value)
classifier.fit(X_train, y_train) 
#relief f algorithm - sorting features 

accuracy value for k=  1 is: 0.7605633802816901
accuracy value for k=  2 is: 0.7746478873239436
accuracy value for k=  3 is: 0.7746478873239436
accuracy value for k=  4 is: 0.7887323943661971
accuracy value for k=  5 is: 0.7605633802816901
accuracy value for k=  6 is: 0.7605633802816901
accuracy value for k=  7 is: 0.7605633802816901
accuracy value for k=  8 is: 0.7605633802816901
accuracy value for k=  9 is: 0.7605633802816901
accuracy value for k=  10 is: 0.7605633802816901
accuracy value for k=  11 is: 0.7605633802816901
accuracy value for k=  12 is: 0.7605633802816901
accuracy value for k=  13 is: 0.7464788732394366
accuracy value for k=  14 is: 0.7605633802816901
accuracy value for k=  15 is: 0.7746478873239436
accuracy value for k=  16 is: 0.7605633802816901
accuracy value for k=  17 is: 0.7605633802816901
accuracy value for k=  18 is: 0.7605633802816901
accuracy value for k=  19 is: 0.7464788732394366
accuracy value for k=  20 is: 0.7464788732394366
accuracy value for k=  21 is:



KNeighborsClassifier(n_neighbors=4)

In [5]:
pd.set_option("display.max_rows", None, "display.max_columns", None)


display(y_train)

y_pred

Unnamed: 0,Group
0,0
1,0
5,0
6,0
7,0
8,0
9,0
13,0
14,0
15,1


array([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0])

In [6]:
y_pred = classifier.predict(X_test)


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
results = classification_report(y_test, y_pred)
results[0]

accuracy_score(y_test,y_pred)

[[38  0  0]
 [ 5 17  4]
 [ 3  3  1]]
              precision    recall  f1-score   support

           0       0.83      1.00      0.90        38
           1       0.85      0.65      0.74        26
           2       0.20      0.14      0.17         7

    accuracy                           0.79        71
   macro avg       0.63      0.60      0.60        71
weighted avg       0.77      0.79      0.77        71



0.7887323943661971