>**Load the libraries, mount drive and load data**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so the CSV
# read in the next cell is reachable. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load the source CSV from the mounted Drive into `df` (the raw, unmodified frame).
# NOTE(review): absolute Colab path — this cell depends on the Drive mount above.
df=pd.read_csv("/content/drive/MyDrive/d2k/tarrc_nongenomic.csv")

>**Data cleaning**

In [5]:
# Display the columns at positions 600-699 of the raw frame.
df.iloc[:,600:700]

Unnamed: 0,RBM_ANG_2,RBM_Angiotensinogen,RBM_APO_A1,RBM_APO_CIII,RBM_APO_H,RBM_ASP,RBM_AXL,RBM_BLC,RBM_B2M,RBM_BTC,...,RBM_SGOT,RBM_SHBG,RBM_SOD,RBM_Sortilin,RBM_sRAGE,RBM_SCF,RBM_Tenascin_C,RBM_Testosterone,RBM_TGF_alpha,RBM_THPO
0,,,,,,,,,,,...,,,,,,,,,,
1,3.4,31,.41,29,231,.14,1.3,18,2.1,35.5,...,14,77,18,7.4,3.4,248,2890,2,.155,2.5
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14650,,,,,,,,,,,...,,,,,,,,,,
14651,,,,,,,,,,,...,,,,,,,,,,
14652,,,,,,,,,,,...,,,,,,,,,,
14653,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# (rows, columns) of the raw frame.
df.shape

(14655, 787)

In [4]:
# Treat empty / whitespace-only cells as missing, then drop sparse columns.
# `thresh` is the minimum number of non-NA values a column needs to survive
# (just over 45% of the rows), so columns that are roughly 55% or more
# missing are removed.
min_count = int(0.45 * len(df) + 1)
mod_df = (
    df.replace(r'^\s*$', np.nan, regex=True)
      .dropna(axis="columns", thresh=min_count)
)

In [None]:
#list(mod_df.columns)

In [10]:
# (rows, columns) of `mod_df`, the frame left after the sparse-column filter.
mod_df.shape

(14655, 476)

In [5]:
def preprocessing(df):
  """Trim the NA-filtered frame down to the modelling columns and bin two scores.

  The frame passed in as ``df`` is the one that gets processed; it is not
  modified, a new frame is returned.

  Steps, in order:
    1. Drop several runs of columns by *position*. Each slice is relative to
       the frame as it stands after the preceding drops, so the order of the
       statements matters and the function assumes the column layout of the
       frame it was written against.
    2. Drop a fixed list of columns by name (including all ``I1_*`` informant
       variables).
    3. Convert ``C1_WAIS3_DIGIF`` to numeric and recode it to 0 (< 5) / 1 (>= 5).
    4. Recode ``C1_GDS30`` to 0 (<= 9), 0.5 (> 9 and <= 19) or 1 (>= 20).
       Values that match none of the ranges, including NaN, are left as they are.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to preprocess (in this notebook: ``mod_df``).

  Returns
  -------
  pandas.DataFrame
      New frame with the columns removed and the two scores recoded.

  Raises
  ------
  KeyError
      If any column dropped or recoded by name is absent from ``df``.
  ValueError
      If ``C1_WAIS3_DIGIF`` holds values ``pd.to_numeric`` cannot parse.
  """
  # --- positional and named drops (order-sensitive, see docstring) ---
  df2 = df.drop(columns=df.columns[0:7])
  df2 = df2.drop(columns=df2.columns[1:5])
  df2 = df2.drop(columns=df2.columns[5:16])
  df2 = df2.drop(columns=["A43_ADAF", "A43_ADBF", "A43_ADCF",
                          "A43_ADDF", "A43_ADEF", "A43_ADFF"])
  df2 = df2.drop(columns=df2.columns[8:10])
  df2 = df2.drop(columns=df2.columns[9:12])
  df2 = df2.drop(columns=['A43_ADBS', 'A43_ADBSU', 'A43_ADCS', 'A43_ADCSU',
                          'A43_ADDS', 'A43_ADDSU', 'A43_ADES', 'A43_ADESU',
                          'A43_ADFS', 'A43_ADFSU'])
  df2 = df2.drop(columns=df2.columns[25:54])
  df2 = df2.drop(columns=[
      'A5_CBOTHR', 'A5_CHRON_OTH', 'A5_CVOTHR', 'A5_DEPOTHR', 'A5_NCOTHR', 'A5_PDOTHR',
      'A5_PDOTHRYR', 'A5_PDYR', 'B1_HEARAID', 'B1_HEARING', 'B1_HEARWAID', 'B1_VISCORR',
      'B1_VISION', 'B1_VISWCORR', 'B5_NPIQINF', 'C1_DATEX', 'C1_CDRCA', 'C1_CDRGLOB', 'C1_CDRHOB',
      'C1_CDRJU', 'C1_CDRMEM', 'C1_CDROR', 'C1_CDRPER', 'C1_CERAD_LL_1', 'C1_CERAD_LL_2',
      'C1_CERAD_LL_3', 'C1_CERAD_LL_DELAY', 'C1_CERAD_WR_DISCRIM', 'C1_CERAD_WR_NO',
      'C1_CERAD_WR_YES', 'C1_FAS_A', 'C1_FAS_F', 'C1_FAS_S', 'C1_GDS15', 'P1_WHYNOTALL',
      'P1_WHOLEBLOOD', 'P1_SERUM', 'P1_SHARE_AGREE', 'P1_TIMEDRAWX', 'P1_TIMEFOODX', 'P1_BIOSERUM',
      'P1_BUFFY', 'P1_CSF', 'P1_DATEDRAWX', 'P1_DATEFOODX', 'P1_HRSAFTFOOT', 'P1_PLASMA',
      'E1_WDOTHREAS', 'E1_WITHDREW', 'RBM_Rule_Based_Medicine', 'RBM_Rule_Based_Medicine_tp',
      'X2_PHYEST', 'X2_IDURM', 'X2_IDURY', 'X2_MRDURM', 'X2_MRDURY', 'X1_HYPER_HAS', 'X1_HYPERLIP_HAS',
      'X1_HYPERLIP_SR', 'P1_INBIOMARKER', 'P1_INGENETICS', 'D1_WHODIDDX', 'P1_DNACOLLECTED',
      'P1_DNATYPE'])

  # Remove all informant variables; they are treated here as having no
  # influence on Alzheimer's risk.
  df2 = df2.drop(columns=[
      'I1_INBIRYR', 'I1_INCALLS', 'I1_INDATECONTX', 'I1_INEDUC', 'I1_INHISP', 'I1_INHOWCONTACT',
      'I1_INLIVWTH', 'I1_INRACE', 'I1_INRASEC', 'I1_INRATER', 'I1_INRELTO', 'I1_INRELY', 'I1_INSEX',
      'I1_INVISITS', 'I1_ISNEWINFORM'])

  # Disabled draft: bucket AGE into decade groups.
  # category=pd.cut(df2.AGE,bins=[49,60,70,80,90,100,110],labels=['Fifties', 'Sixties', 'Seventies','Eighties','Nineties','100+'])
  # df2.insert(7, 'AGE_GROUP', category)
  # df2=df2.drop(['AGE'])

  # --- C1_WAIS3_DIGIF: coerce to numeric, then 0 if < 5, 1 if >= 5 ---
  # Masks are taken from the original scores before any value is overwritten.
  digif = pd.to_numeric(df2["C1_WAIS3_DIGIF"])
  df2["C1_WAIS3_DIGIF"] = digif
  df2.loc[digif < 5, "C1_WAIS3_DIGIF"] = 0
  df2.loc[digif >= 5, "C1_WAIS3_DIGIF"] = 1

  # --- C1_GDS30: 0 if <= 9, 0.5 if in (9, 19], 1 if >= 20 ---
  gds30 = df2["C1_GDS30"]
  gds_low = gds30 <= 9
  gds_mid = (gds30 > 9) & (gds30 <= 19)
  gds_high = gds30 >= 20
  df2.loc[gds_low, "C1_GDS30"] = 0
  df2.loc[gds_mid, "C1_GDS30"] = 0.5
  df2.loc[gds_high, "C1_GDS30"] = 1

  return df2


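# TODO: check every X1 variable for its proportion of "9"/unknown responses and drop those that are over 55% unknown.
# NOTE: the disabled draft below treats 99 as the missing code and uses a 50% threshold; reconcile before enabling it.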

# df2=df2.replace(99, np.nan, regex=True)
# min_count =  int((50/100)*df2.shape[0] + 1) #50% is 99, aka missing
# df2 = df2.dropna(axis=1, thresh=min_count)

In [76]:
#df2.shape

(14655, 326)

In [6]:
# Run the preprocessing step on the NA-filtered frame and display the result;
# the returned frame is not stored here.
preprocessing(mod_df)

Unnamed: 0,AGE,A1_RESIDENC,A1_SEX,A3_DADDEM,A3_MOMDEM,A43_ADACONT,A43_ADAFU,A43_ADAPREV,A43_ADBCONT,A43_ADBFU,...,F2_IADL1,F2_IADL2,F2_IADL3,F2_IADL4,F2_IADL5,F2_IADL6,F2_IADL7,F2_IADL8,F2_IADLTOTSCR,P1_PTTYPEDESC
0,80,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AD
1,84,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AD
2,85,1,1,0,1,1,1,0,1,1,...,1,3,3,2,2,4,3,3,21,AD
3,86,1,1,0,1,1,0,1,1,0,...,1,3,4,5,3,4,3,3,26,AD
4,87,1,1,0,1,1,0,1,1,0,...,3,3,4,4,3,4,3,3,27,AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14650,73,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,8,MCI
14651,72,1,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,8,MCI
14652,81,1,1,0,0,0,0,0,0,0,...,1,1,0,0,0,1,1,1,5,MCI
14653,63,1,0,0,1,0,0,0,0,0,...,1,1,1,1,1,1,1,1,8,MCI


In [61]:
#df2.AGE_GROUP.unique()

['Seventies', 'Eighties', 'Sixties', 'Nineties', 'Fifties', '100+']
Categories (6, object): ['Fifties' < 'Sixties' < 'Seventies' < 'Eighties' < 'Nineties' < '100+']

In [77]:
#list(df2.columns)

['AGE',
 'A1_RESIDENC',
 'A1_SEX',
 'A3_DADDEM',
 'A3_MOMDEM',
 'A43_ADACONT',
 'A43_ADAFU',
 'AGE_GROUP',
 'A43_ADAPREV',
 'A43_ADBCONT',
 'A43_ADBFU',
 'A43_ADBPREV',
 'A43_ADCCONT',
 'A43_ADCFU',
 'A43_ADCPREV',
 'A43_ADDCONT',
 'A43_ADDFU',
 'A43_ADDPREV',
 'A43_ADECONT',
 'A43_ADEFU',
 'A43_ADEPREV',
 'A43_ADFCONT',
 'A43_ADFFU',
 'A43_ADFPREV',
 'A43_ADMEDEVER',
 'A44_DRG_TRIAL',
 'A5_ABUSOTHR',
 'A5_ALCOHOL',
 'A5_ARTHRITIC',
 'A5_AUTOIMM',
 'A5_B12DEF',
 'A5_CANCER',
 'A5_CBSTROKE',
 'A5_CBTIA',
 'A5_CVAFIB',
 'A5_CVANGIO',
 'A5_CVBYPASS',
 'A5_CVCHF',
 'A5_CVHATT',
 'A5_CVPACE',
 'A5_DEP2YRS',
 'A5_DIABETES',
 'A5_HYPERCHO',
 'A5_HYPERTEN',
 'A5_IBD',
 'A5_INCONTF',
 'A5_INCONTU',
 'A5_PACKSPER',
 'A5_PD',
 'A5_PSYCDIS',
 'A5_THYROID',
 'A5_TOBAC100',
 'A5_TOBAC30',
 'A5_TOBACLSTYR',
 'A5_TRAUMBRF',
 'A5_TRAUMCHR',
 'A5_TRAUMEXT',
 'B1_BMI',
 'B1_BPDIAS',
 'B1_BPSYS',
 'B1_HEIGHT',
 'B1_HRATE',
 'B1_WEIGHT',
 'B5_DEL',
 'B5_DELSEV',
 'B5_HALL',
 'B5_HALLSEV',
 'B5_AGIT',
 'B5_

> **One Hot Encoding**

In [89]:
# Preview the one-hot encoding of the preprocessed frame.
# The result is bound to its own name instead of `df`, so the raw CSV frame
# read earlier is not overwritten and the cleaning cell stays re-runnable.
preprocessed_df = preprocessing(mod_df)
# Disabled draft: split off the target column.
# y=preprocessed_df['P1_PT_TYPE']
# preprocessed_df.drop(['P1_PT_TYPE'],axis=1)

# Every column is cast to str first, so get_dummies encodes all of them
# (numeric columns included) as categorical indicators.
pd.get_dummies(preprocessed_df.astype(str))

Unnamed: 0,A1_RESIDENC_0,A1_RESIDENC_1,A1_RESIDENC_2,A1_RESIDENC_3,A1_RESIDENC_4,A1_RESIDENC_5,A1_SEX_0,A1_SEX_1,A3_DADDEM_0,A3_DADDEM_1,...,F2_IADLTOTSCR_4,F2_IADLTOTSCR_5,F2_IADLTOTSCR_6,F2_IADLTOTSCR_7,F2_IADLTOTSCR_8,F2_IADLTOTSCR_9,P1_PTTYPEDESC_AD,P1_PTTYPEDESC_MCI,P1_PTTYPEDESC_NC,P1_PTTYPEDESC_Other
0,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14650,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,0,0
14651,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,0,0
14652,0,1,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,1,0,0
14653,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,0,0


>> **Clustering Algorithms**

In [11]:
# Install into the kernel's own environment (%pip) and pin the version this
# notebook was run with, so a fresh runtime reproduces the same behaviour.
%pip install kmodes==0.12.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kmodes
  Downloading kmodes-0.12.2-py2.py3-none-any.whl (20 kB)
Installing collected packages: kmodes
Successfully installed kmodes-0.12.2


In [12]:
from kmodes.kmodes import KModes

In [13]:
# Build the clustering input in one pass: preprocess the NA-filtered frame,
# cast every value to str so that all columns are encoded, then one-hot encode.
df = pd.get_dummies(preprocessing(mod_df).astype(str))

In [17]:
# Small slice of the encoded frame for a quick k-modes trial run.
# `.loc[1:10]` slices by index label and includes both ends, so (assuming the
# default integer index) this selects labels 1 through 10; row 0 is not included.
df_subset=df.loc[1:10,] #to check how the clusters look in kmodes

In [18]:
# Trial k-modes run on the small subset: 3 clusters, random initialisation,
# best of 5 restarts. `random_state` is fixed so the restarts, and therefore
# the cluster labels shown below, are the same on every run.
kmode = KModes(n_clusters=3, init="random", n_init=5, verbose=1, random_state=42)
clusters = kmode.fit_predict(df_subset)
clusters

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 0, cost: 550.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 2, cost: 888.0
Run 2, iteration: 2/100, moves: 2, cost: 784.0
Run 2, iteration: 3/100, moves: 1, cost: 784.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 0, cost: 847.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 0, cost: 650.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 0, cost: 734.0
Best run was number 1


array([0, 0, 0, 0, 0, 0, 1, 2, 2, 2], dtype=uint16)

In [14]:
# kmode = KModes(n_clusters=3, init = "random", n_init = 5, verbose=1)
# clusters = kmode.fit_predict(df)
# clusters

KeyboardInterrupt: ignored