In [6]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Load the raw dataset from the CSV file next to the notebook.
data = pd.read_csv("Alzheimer's.csv")

#categorical conversion
#NOTE: these are integer label codes (one number per category), not one-hot "dummy" variables.
group_codes = {'Nondemented': 0, 'Demented': 1, 'Converted': 2}
sex_codes = {'M': 0, 'F': 1}

for column, codes in (('Group', group_codes), ('Sex', sex_codes)):
    # Fail early on a label that has no code instead of letting a stray string
    # travel into scaling / model fitting further down.
    unknown = set(data[column].dropna().unique()) - set(codes)
    if unknown:
        raise ValueError(f"unexpected {column} labels: {sorted(unknown)}")
    # .map() builds a new numeric column in one pass (missing values stay missing).
    data[column] = data[column].map(codes)

#ToDo
#Fill in SES and Mini Mental State missing sample values
#Use the mean of that category for the value
# Get rid of SES data

# Preview the first rows instead of rendering the whole frame.
data.head(10)


Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,Sex,Hand,Age,EDUC,SES,Mini Mental State,Clinical Dementia Rating,Estimated total Intracranial Volume,Normalize Whole Brain Volume,Atlas Scaling Factor
0,OAS2_0001,OAS2_0001_MR1,0,1,0,0,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,0,2,457,0,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,1,1,0,0,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,1,2,560,0,R,76,12,,28.0,0.5,1738,0.713,1.010
4,OAS2_0002,OAS2_0002_MR3,1,3,1895,0,R,80,12,,22.0,0.5,1698,0.701,1.034
5,OAS2_0004,OAS2_0004_MR1,0,1,0,1,R,88,18,3.0,28.0,0.0,1215,0.710,1.444
6,OAS2_0004,OAS2_0004_MR2,0,2,538,1,R,90,18,3.0,27.0,0.0,1200,0.718,1.462
7,OAS2_0005,OAS2_0005_MR1,0,1,0,0,R,80,12,4.0,28.0,0.0,1689,0.712,1.039
8,OAS2_0005,OAS2_0005_MR2,0,2,1010,0,R,83,12,4.0,29.0,0.5,1701,0.711,1.032
9,OAS2_0005,OAS2_0005_MR3,0,3,1603,0,R,85,12,4.0,30.0,0.0,1699,0.705,1.033


In [7]:
#PCA might be a good technique to select predictors 

#note that PCA performs best when data is normalized (range b/w 0 and 1)

#It is possible to use categorical and continuous predictors 
#for a regression problem. My understanding is you need to make 
#dummy variables for the binary predictors. 

#Variables that we will need to deal with: 
# Hand, Visit, Subject ID, MRI ID

In [8]:
#Building the predictor frame and the target used for PCA / modelling
#Hand is completely useless as it is identical for all samples
data_drop = (
    data
    .drop(columns=['Hand', 'Visit', 'Subject ID', 'MRI ID'])
    #rows 360 and 359 are missing a lot of data (both SES and MMS)
    .drop(index=[360, 359])
    #remove every remaining sample that still contains a NaN (e.g. no SES value)
    .dropna()
)

#dementia status is what we want to predict - kept as a single-column target frame
group = data_drop[['Group']]

#the predictors are every remaining column except the target
data_drop = data_drop.drop(columns=['Group'])

In [9]:
#summarize the cleaned predictor frame: column names, non-null counts and dtypes
data_drop.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 354 entries, 0 to 372
Data columns (total 10 columns):
MR Delay                               354 non-null int64
Sex                                    354 non-null int64
Age                                    354 non-null int64
EDUC                                   354 non-null int64
SES                                    354 non-null float64
Mini Mental State                      354 non-null float64
Clinical Dementia Rating               354 non-null float64
Estimated total Intracranial Volume    354 non-null int64
Normalize Whole Brain Volume           354 non-null float64
Atlas Scaling Factor                   354 non-null float64
dtypes: float64(5), int64(5)
memory usage: 30.4 KB


In [12]:
#get a list of columns in pandas object
names_of_data = data_drop.columns.tolist()

#shuffle = False splits the rows in their existing order, so the split is identical on every run
X_train, X_test, y_train, y_test = train_test_split(data_drop, group, test_size=0.2, shuffle = False)

#split train into train and validation, again without shuffling so the data is always the same
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, shuffle = False)

#Standardizing the data: the scaler is fitted on the training split only and then
#applied unchanged to the test and validation splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
X_val = sc.transform(X_val)



#running the actual PCA (fitted on the training split only)
from sklearn.decomposition import PCA

pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
#the validation split has to be projected as well, otherwise it lives in a
#different feature space than X_train / X_test and cannot be scored by the models
X_val = pca.transform(X_val)


#y_train stays a single-column DataFrame; the evaluation targets become flat integer arrays
y_train = y_train.astype('int')
y_test = y_test.astype('int').to_numpy().ravel()
y_val = y_val.astype('int').to_numpy().ravel()


In [13]:
#inspect the predictor values of the row labelled 355 (label-based lookup via .loc)
data_drop.loc[355]

MR Delay                                652.000
Sex                                       0.000
Age                                      81.000
EDUC                                     20.000
SES                                       1.000
Mini Mental State                        26.000
Clinical Dementia Rating                  0.500
Estimated total Intracranial Volume    1556.000
Normalize Whole Brain Volume              0.691
Atlas Scaling Factor                      1.128
Name: 355, dtype: float64

In [None]:
#share of the total variance captured by each principal component of the fitted PCA
explained_variance = pca.explained_variance_ratio_
print(len(explained_variance))
print(explained_variance)

# Implementing Models

In [16]:
#importing libraries from sklearn for analysis 
from sklearn.metrics import accuracy_score
#from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Logistic Regression Classifier

In [17]:
#importing logistic regression classifier from sklearn 
from sklearn.linear_model import LogisticRegression

In [18]:
#initializing the logistic regression classifier with scikit-learn's default settings
clf = LogisticRegression()

In [26]:
#inspecting which class labels are present in the training targets
y_train["Group"].unique()

array([0, 1, 2])

In [98]:
#fitting the model on the training data; the "Group" column is passed as a 1-D Series because
#scikit-learn expects 1-D targets (a single-column DataFrame triggers a DataConversionWarning)
clf.fit(X_train, y_train["Group"])

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [99]:
#accuracy of the logistic regression model on the held-out test split
clf.score(X_test, y_test)

0.9014084507042254

In [100]:
#generating a confusion matrix and a classification report to analyze the trained model on the testing data.
#in the recorded run classes 0 and 1 both reach a recall of 1.00, while class 2 is never predicted (recall 0.00)
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
#build the report once and reuse it for both printing and the `results` variable
results = classification_report(y_test, y_pred)
print(results)
accuracy_score(y_test, y_pred)

[[38  0  0]
 [ 0 26  0]
 [ 4  3  0]]
              precision    recall  f1-score   support

           0       0.90      1.00      0.95        38
           1       0.90      1.00      0.95        26
           2       0.00      0.00      0.00         7

    accuracy                           0.90        71
   macro avg       0.60      0.67      0.63        71
weighted avg       0.81      0.90      0.85        71



  'precision', 'predicted', average, warn_for)


0.9014084507042254

# 

In [102]:
 #fitting a support vector classifier wrapped in a single-step pipeline.
#NOTE: the pipeline itself contains no StandardScaler - X_train was already standardized and
#PCA-projected in the earlier preprocessing cell, so the SVC is the only step here.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf2 = make_pipeline(SVC(gamma='auto'))
#pass the "Group" column as a 1-D Series; a single-column DataFrame triggers a DataConversionWarning
clf2.fit(X_train, y_train["Group"])


  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
         steps=[('svc',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma='auto',
                     kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [103]:
#getting the accuracy score on the TRAINING data (X_train, y_train), not on the held-out test split
clf2.score(X_train, y_train)

0.9557522123893806

In [104]:
#evaluating the SVC pipeline on the held-out test split
y_pred = clf2.predict(X_test)
print(confusion_matrix(y_test, y_pred))
#build the report once and reuse it for both printing and the `results` variable
results = classification_report(y_test, y_pred)
print(results)
accuracy_score(y_test, y_pred)

[[38  0  0]
 [ 2 23  1]
 [ 4  3  0]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.93        38
           1       0.88      0.88      0.88        26
           2       0.00      0.00      0.00         7

    accuracy                           0.86        71
   macro avg       0.58      0.63      0.60        71
weighted avg       0.79      0.86      0.82        71



0.8591549295774648

In [123]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [124]:

#initializing and fitting the MLP classifier.
#max_iter is raised to 1000 and a single hidden layer of 500 units is used, for improved performance and increased complexity
#the "Group" column is passed as a 1-D Series; a single-column DataFrame triggers a DataConversionWarning
clf1 = MLPClassifier(
    random_state=1,
    max_iter=1000,
    activation="logistic",
    hidden_layer_sizes=(500,),
).fit(X_train, y_train["Group"])

In [125]:
#getting the accuracy score for the MLP classifier on the test split
clf1.score(X_test, y_test)

0.92

In [126]:
#evaluating the MLP classifier on the held-out test split
#NOTE(review): the recorded output of this cell (25 samples, 2 classes) does not match the
#71-sample, 3-class test split scored by the earlier models - re-run on a fresh kernel to confirm
y_pred = clf1.predict(X_test)
print(confusion_matrix(y_test, y_pred))
#build the report once and reuse it for both printing and the `results` variable
results = classification_report(y_test, y_pred)
print(results)
accuracy_score(y_test, y_pred)

[[13  0]
 [ 2 10]]
              precision    recall  f1-score   support

           0       0.87      1.00      0.93        13
           1       1.00      0.83      0.91        12

    accuracy                           0.92        25
   macro avg       0.93      0.92      0.92        25
weighted avg       0.93      0.92      0.92        25



0.92