In [68]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score,learning_curve, GroupKFold,GridSearchCV,StratifiedKFold
from sklearn.metrics import confusion_matrix



In [69]:
df =  pd.read_csv('../2017_07_07/TADPOLE_D1_D2.csv',low_memory=False,header = 0)

In [70]:
#Name of the columns
df.head(0)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12741 entries, 0 to 12740
Columns: 1907 entries, RID to update_stamp_UPENNBIOMK9_04_19_17
dtypes: float64(72), int64(8), object(1827)
memory usage: 185.4+ MB


In [71]:
#Do some test on the data

print(df['DX'].head(1))
print(df['DX_bl'].unique())
print(df['EXAMDATE_BAIPETNMRC_09_12_16'].head(1))
print(df['PTETHCAT'].head(1))
print(df['ABETA_UPENNBIOMK9_04_19_17'].head(1))

0    NL
Name: DX, dtype: object
['CN' 'AD' 'LMCI' 'EMCI' 'SMC']
0    2005-09-22
Name: EXAMDATE_BAIPETNMRC_09_12_16, dtype: object
0    Not Hisp/Latino
Name: PTETHCAT, dtype: object
0     
Name: ABETA_UPENNBIOMK9_04_19_17, dtype: object


In [72]:
df['DX_bl'].unique()
df['DX'].unique()

array(['NL', 'Dementia', 'MCI', 'NL to MCI', 'MCI to Dementia',
       'MCI to NL', 'Dementia to MCI', nan, 'NL to Dementia'],
      dtype=object)

In [73]:
pd.set_option('display.max_columns', None)

#important columns to convert to int: EXAMDATE	DX_bl	DXCHANGE	AGE	PTGENDER	PTEDUCAT	PTETHCAT	PTRACCAT	PTMARRY	

In [74]:
print(df['DXCHANGE'].head(1))
'1=Stable:NL to NL, 2=Stable:MCI to MCI, 3=Stable:AD to AD, ' 
'4=Conv:NL to MCI, 5=Conv:MCI to AD, 6=Conv:NL to AD, ' 
'7=Rev:MCI to NL, 8=Rev:AD to MCI, 9=Rev:AD to NL, -1=Not available'
# 1 7 9 final NL / 2 4 8 final MCI / 3 5 6 final AD

0    1.0
Name: DXCHANGE, dtype: float64


'7=Rev:MCI to NL, 8=Rev:AD to MCI, 9=Rev:AD to NL, -1=Not available'

In [75]:
data = df
print(data.shape)

#Convert to numeric
data['PTGENDER']= data['PTGENDER'].replace(['Male','Female'],[1,0])
data['APOE4']= data['APOE4'].astype(float)
data['AGE']= data['AGE'].astype(float)
data['PTETHCAT']= data['PTETHCAT'].replace(['Hisp/Latino','Not Hisp/Latino','Unknown'],[2,1,0]).astype(int)
data['PTMARRY']= data['PTMARRY'].replace(['Married','Widowed','Divorced','Never married','Unknown'],[0,1,2,3,4]).astype(int)
data['PTRACCAT'] = data['PTRACCAT'].replace(['White','More than one','Black','Asian', 'Am Indian/Alaskan','Hawaiian/Other PI', 'Unknown'],[0,1,2,3,4,5,6]).astype(int)
data['DX_bl'] = data['DX_bl'].replace(['CN' ,'AD' ,'LMCI' ,'EMCI' ,'SMC'],['0','1','2','3','4']).astype(int)
data['DX'] = data['DX'].replace(['NL' ,'Dementia', 'MCI', 'NL to MCI', 'MCI to Dementia', 'MCI to NL','Dementia to MCI','nan' ,'NL to Dementia'],['1','3','4','5','6','7','8','9','10'])
data['DX']=data['DX'].fillna(0)

data['EXAMDATE_bl'] = pd.to_datetime(data['EXAMDATE_bl']).dt.strftime("%Y%m%d").astype(int)
print(data['EXAMDATE_bl'].head(1))

data['EXAMDATE'] = pd.to_datetime(data['EXAMDATE']).dt.strftime("%Y%m%d").astype(int)
print(data['EXAMDATE'].head(1))
#test = data['DXCHANGE'].replace([1,2,3,4,5,6,7,8,9,-1],[1,2,3,2,3,3,1,2,1,-1]).astype(int)
#test=test.fillna(0)
#data['DXCHANGE_TEST']= test

# 1 7 9 final NL / 2 4 8 final MCI / 3 5 6 final AD
#Print some data to check if it is correct
print(data['DX_bl'].head(1))
print(data['DX'].unique())

# drop the columns in data they ate objet type
data = data.select_dtypes(exclude=['object'])
print(data.shape)


(12741, 1907)
0    20050908
Name: EXAMDATE_bl, dtype: int32
0    20050908
Name: EXAMDATE, dtype: int32
0    0
Name: DX_bl, dtype: int32
['1' '3' '4' '5' '6' '7' '8' 0 '10']
(12741, 87)


The main measures to be predicted: DX, ADAS13, Ventricles
Cognitive tests: CDRSB, ADAS11, MMSE, RAVLT_immediate
MRI measures: Hippocampus, WholeBrain, Entorhinal, MidTemp
PET measures: FDG, AV45
CSF measures: ABETA_UPENNBIOMK9_04_19_17  (amyloid-beta level in CSF), TAU_UPENNBIOMK9_04_19_17 (tau level), PTAU_UPENNBIOMK9_04_19_17 (phosphorylated tau level)
Risk factors: APOE4, AGE

In [76]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12741 entries, 0 to 12740
Data columns (total 87 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   RID                       12741 non-null  int64  
 1   SITE                      12741 non-null  int64  
 2   D1                        12741 non-null  int64  
 3   D2                        12741 non-null  int64  
 4   EXAMDATE                  12741 non-null  int32  
 5   DX_bl                     12741 non-null  int32  
 6   DXCHANGE                  8892 non-null   float64
 7   AGE                       12741 non-null  float64
 8   PTGENDER                  12741 non-null  int64  
 9   PTEDUCAT                  12741 non-null  int64  
 10  PTETHCAT                  12741 non-null  int32  
 11  PTRACCAT                  12741 non-null  int32  
 12  PTMARRY                   12741 non-null  int32  
 13  APOE4                     12729 non-null  float64
 14  FDG   

In [77]:
data.drop(columns=['PIB','AV45'])
data.drop(data.iloc[:, 47:83], inplace=True, axis=1)
data.shape

(12741, 51)

In [80]:
data['DX'].unique

KeyError: 'DX'

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12741 entries, 0 to 12740
Data columns (total 51 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   RID                    12741 non-null  int64  
 1   SITE                   12741 non-null  int64  
 2   D1                     12741 non-null  int64  
 3   D2                     12741 non-null  int64  
 4   EXAMDATE               12741 non-null  int32  
 5   DX_bl                  12741 non-null  int32  
 6   DXCHANGE               8892 non-null   float64
 7   AGE                    12741 non-null  float64
 8   PTGENDER               12741 non-null  int64  
 9   PTEDUCAT               12741 non-null  int64  
 10  PTETHCAT               12741 non-null  int32  
 11  PTRACCAT               12741 non-null  int32  
 12  PTMARRY                12741 non-null  int32  
 13  APOE4                  12729 non-null  float64
 14  FDG                    3352 non-null   float64
 15  PI

In [79]:
# Algorithme des k plus proches voisins
model = KNeighborsClassifier()
y = data['DXCHANGE']
Y = data['DX']
#Y = data['DX']

X = data.drop('DXCHANGE', axis=1)
X = data.drop('DX', axis=1)
#delete bl
y = np.nan_to_num(y)
X= np.nan_to_num(X)
Y = np.nan_to_num(Y)

KeyError: 'DX'

In [None]:

#Split the data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=5)

model.fit(X_train,y_train)
print(model.score(X_train,y_train))
print(model.score(X_test,y_test))

model.fit(X_train,Y_train)
print(model.score(X_train,Y_train))
print(model.score(X_test,Y_test))

0.6646607360893075
0.5184313725490196
0.6623059480202337
0.48470588235294115


In [None]:
#Take the best parameters for the model
param_grid = {'n_neighbors': np.arange(1, 25),'metric': ['euclidean', 'manhattan']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
#values count 




KeyboardInterrupt: 

In [None]:

grid2 = grid
grid2.fit(X_train, Y_train)

In [None]:
# Print the tuned parameters and score
print(grid.best_params_)
print(grid.best_score_)
print(grid.best_estimator_)

print(model.score(X_train,y_train))
print(model.score(X_test,y_test))


In [None]:
# Print the tuned parameters and score of the current diagnostic
print(grid2.best_params_)
print(grid2.best_score_)
print(grid2.best_estimator_)

model2 = grid2.best_estimator_
print(model2.score(X_train,Y_train))
print(model2.score(X_test,Y_test))

In [None]:
#View the error of the model
confusion_matrix(y_test, model.predict(X_test))


In [None]:

confusion_matrix(Y_test, model2.predict(X_test))


In [None]:
#the learning curve
N, train_score,val_score = learning_curve(model,X_train,y_train,train_sizes = np.linspace(0.2,1.0,5),cv=5)

In [None]:
#the learning curve
N2, train_score2,val_score2 = learning_curve(model2,X_train,Y_train,train_sizes = np.linspace(0.2,1.0,5),cv=5)

In [None]:
#Plot the learning curve
print(N)
plt.plot(N,train_score.mean(axis=1), label='train')
plt.plot(N,val_score.mean(axis=1), label='validation')
plt.xlabel('train_sizes')
plt.legend()


In [None]:
#Plot the learning curve
print(N2)
plt.plot(N2,train_score.mean(axis=1), label='train')
plt.plot(N2,val_score2.mean(axis=1), label='validation')
plt.xlabel('train_sizes')
plt.legend()

In [None]:
#Chose the best split for the data
cv =  GroupKFold(5).get_n_splits(X,y,groups=X[:,5])
cross_val_score(model,X,y,cv=cv)

In [None]:
cv= StratifiedKFold(5)
cross_val_score(model,X,y,cv=cv)

In [None]:
cv2= StratifiedKFold(5)
cross_val_score(model2,X,Y,cv=cv2)