In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



data = pd.read_csv("Alzheimer's.csv")

# Integer-encode the two categorical columns. This is plain label encoding
# (one integer per category), not one-hot "dummy" variables.
# Each column is rebuilt in a single pass with Series.map instead of writing
# integers into a text column one mask at a time: the result is a numeric
# column rather than a mixed object column, and it does not rely on a text
# column accepting non-string values.
label_codes = {
    'Group': {'Nondemented': 0, 'Demented': 1, 'Converted': 2},
    'Sex': {'M': 0, 'F': 1},
}
for column, codes in label_codes.items():
    encoded = data[column].map(codes)
    # Missing values stay missing; only a non-missing label without a code is
    # an error (it would otherwise turn into NaN without any warning).
    unmapped = data.loc[encoded.isna() & data[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected {column} labels: {list(unmapped)}")
    data[column] = encoded

#ToDo
#Fill in SES and Mini Mental State missing sample values
#Use the mean of that category for the value
# Get rid of SES data 

data

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,Sex,Hand,Age,EDUC,SES,Mini Mental State,Clinical Dementia Rating,Estimated total Intracranial Volume,Normalize Whole Brain Volume,Atlas Scaling Factor
0,OAS2_0001,OAS2_0001_MR1,0,1,0,0,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,0,2,457,0,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,1,1,0,0,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,1,2,560,0,R,76,12,,28.0,0.5,1738,0.713,1.010
4,OAS2_0002,OAS2_0002_MR3,1,3,1895,0,R,80,12,,22.0,0.5,1698,0.701,1.034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,OAS2_0185,OAS2_0185_MR2,1,2,842,0,R,82,16,1.0,28.0,0.5,1693,0.694,1.037
369,OAS2_0185,OAS2_0185_MR3,1,3,2297,0,R,86,16,1.0,26.0,0.5,1688,0.675,1.040
370,OAS2_0186,OAS2_0186_MR1,0,1,0,1,R,61,13,2.0,30.0,0.0,1319,0.801,1.331
371,OAS2_0186,OAS2_0186_MR2,0,2,763,1,R,63,13,2.0,30.0,0.0,1327,0.796,1.323


In [2]:
#Variables that we will need to deal with: 
# Hand, Visit, Subject ID, MRI ID

In [3]:
#Attempting PCA on data
# Build the feature table and the prediction target from `data`.
#
# Columns removed up front: 'Hand' is identical for all samples, and 'Visit',
# 'Subject ID' and 'MRI ID' are not used as features.
#
# Rows with any missing value are removed with dropna(). This also covers the
# rows that lack both SES and Mini Mental State (previously dropped by their
# hard-coded labels 359 and 360), so no row labels need to be spelled out here
# and the cell no longer raises KeyError if those labels are absent.
#
# NOTE(review): 'Clinical Dementia Rating' stays in the feature set. In the rows
# displayed above it is 0.0 for Group 0 and 0.5 for Group 1, so it may leak the
# target into the features - confirm whether it should be excluded.
complete_rows = (
    data
    .drop(columns=['Hand', 'Visit', 'Subject ID', 'MRI ID'])
    .dropna()
)

# Dementia status is what we want to predict. It is kept as a one-column
# DataFrame because the later cells flatten it with .to_numpy().ravel().
group = complete_rows[['Group']]

# Everything except the target is a feature.
data_drop = complete_rows.drop(columns=['Group'])




In [4]:
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature column names as a plain Python list.
names_of_data = list(data_drop.columns)

# Hold out the last 20% of the rows as the test set. shuffle=False keeps the
# row order, so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_drop,
    group,
    test_size=0.2,
    shuffle=False,
)

# TODO: add a validation split. It has to be carved out of the training rows
# into new names (e.g. X_val / y_val); the earlier commented-out attempt
# assigned part of the training data to X_test.

# Standardise the features; the scaler is fitted on the training rows only and
# then applied unchanged to the test rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Project onto principal components, again fitted on the training rows only.
# PCA() is given no n_components, so no component is discarded here.
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# Targets as integers: y_train stays a one-column DataFrame, y_test becomes a
# flat 1-D array.
y_train = y_train.astype('int')
y_test = y_test.astype('int').to_numpy().ravel()

# A linear-kernel SVM is used as the classifier: we do not have a good idea of
# how the data is distributed, the dataset is not very large, and the number
# of dimensions is smaller than the number of data points.
classifier = svm.SVC(kernel='linear')

train_labels = y_train.to_numpy().ravel()
classifier.fit(X_train, train_labels)


SVC(kernel='linear')

In [5]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict the held-out rows with the fitted classifier.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, y_pred))

# Per-class precision / recall / F1; the text report is kept in `results`.
results = classification_report(y_test, y_pred)
print(results)

# Overall accuracy is the value displayed by this cell.
accuracy_score(y_test, y_pred)

[[38  0  0]
 [ 0 21  5]
 [ 4  1  2]]
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        38
           1       0.95      0.81      0.88        26
           2       0.29      0.29      0.29         7

    accuracy                           0.86        71
   macro avg       0.72      0.70      0.70        71
weighted avg       0.86      0.86      0.86        71



0.8591549295774648

In [6]:
#Sanity check of the ~86% accuracy reported above: compare the true labels (printed first) with the predictions (printed second)

# True labels are printed first, predicted labels directly after them.
print(y_test, y_pred, sep="\n")



[2 2 2 2 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0
 0 0 0 0 1 1 0 0 0 1 1 1 2 2 2 0 0 0 0 0 1 1 0 0 0 0 1 1 1 1 1 0 0 0]
[0 2 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 1 1 0
 0 0 0 0 2 2 0 0 0 1 1 1 0 0 2 0 0 0 0 0 1 1 0 0 0 0 1 1 2 2 2 0 0 0]
