# Load ADNIMERGE

In [13]:
import functools

In [14]:
# Execute the companion exploration notebook inside this kernel.
# The cells below use names that are never defined in this notebook (e.g. pd, display,
# Markdown, dataframes, ADNIMERGE, pet_abeta, mri_hipp, csf_all, plasma_abeta, plasma_nfl,
# plasma_ptau) - presumably they are all created by this run; verify on a fresh kernel.
%run ./ADNIMERGE-Exploration.ipynb

Following locations can be used : 

`NOTEBOOKS_DIR`, `MANUSCRIPT_DIR`, `CODE_DIR`, `DATA_DIR`

Following locations can be used : 

`NOTEBOOKS_DIR`, `MANUSCRIPT_DIR`, `CODE_DIR`, `DATA_DIR`

Use `loadCSVtoDF` to load data into dataframes from folder: `SOME_DATA_DIR/raw/SOME_DATASET/SOME_MARKER/SOME_LAB` 

<IPython.core.display.Javascript object>

-----------------------
Dataframes are available in the object - dataframes - with keys as [dataset], [marker], [file], [df | dict]

E.g : dataframes["PLASMA"]["ABETA"]["FNIH_SHIMADZU_1_GO_2_2021-05-25"]["df"] and 
dataframes["PLASMA"]["ABETA"]["FNIH_SHIMADZU_1_GO_2_2021-05-25"]["dict"] and 

If the dataset is of not a specific measure, they are under the marker - ALL
E.g : dataframes["CSF"]["ALL"]["LOCALLAB_1_GO_2_3_2022-01-24"]["df"] 


df
dict
The mapping of Brain region to ADNI MRI FreeSurfer column :


`MRI_REGIONS_VOLUME`

*For future purposes, surface area of some regions is also available*

`MRI_REGIONS_SURFACE_AREA`

--
dataframe dictionary for PLASMA ABETA {}: plasma_abeta
with columns :  ['RID', 'VISCODE', 'PLASMA_AB42', 'PLASMA_AB40', 'PLASMA_AB4240', 'update_stamp', 'source']


# Data from biomarkers
* PET
* MRI
* CSF
* Blood Plasma

##  PET Data

In [15]:
# PET amyloid SUVR table. PET_ABETA is the same object as the dictionary entry (no copy is
# taken), so the two flag columns below are added to that shared frame.
PET_ABETA = pet_abeta["UCB_AV45_1_GO_2_3_2021-11-16"]
# ABP: 1 when SUVR >= 1.11, else 0 (cut-off: refer methods pdf).
PET_ABETA["ABP"] = (PET_ABETA["PET_SUVR_ABETA"] >= 1.11).astype(int)
# ABP_HIGH: 1 when SUVR >= 1.19, else 0 (cut-off: refer methods pdf, and
# "Comparison of Pittsburgh compound B and florbetapir in cross-sectional and
# longitudinal studies", Yi Su 2019).
PET_ABETA["ABP_HIGH"] = (PET_ABETA["PET_SUVR_ABETA"] >= 1.19).astype(int)

In [16]:
# Missing-value count per column of the PET table (sanity check after adding the flags).
PET_ABETA.isna().sum()

RID               0
VISCODE           0
PET_SUVR_ABETA    0
update_stamp      0
source            0
ABP               0
ABP_HIGH          0
dtype: int64

##  MRI Data

In [6]:
def getNulls(df) :
    """Report the columns of ``df`` that contain missing values.

    Returns a DataFrame with one row per affected column: the column label (in a column
    named ``index`` when the column axis is unnamed) and its missing-value count in
    ``nulls``. Columns without missing values are left out.
    """
    null_counts = df.isna().sum().reset_index(name='nulls')
    return null_counts.loc[lambda counts: counts['nulls'] > 0]

def getNormalisedMRIData(MRI_DATA, columns=None) :
    """Normalise MRI measures by intracranial volume (ICV).

    Every MRI field is replaced by ``baseline_mean(field) * field / VOL_ICV``, where the
    baseline mean is taken over the rows of ``MRI_DATA`` whose ``VISCODE`` is ``"bl"``.

    Parameters
    ----------
    MRI_DATA : pandas.DataFrame
        Must contain ``RID``, ``VISCODE``, ``VOL_ICV`` and the MRI fields. It is not modified.
    columns : list of str, optional
        Columns to keep in the result. ``RID``/``VISCODE`` are put in front when they are
        not listed and ``VOL_ICV`` is appended when it is not listed, so passing only the
        MRI fields of interest works. ``None`` (default) keeps every column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``RID``, ``VISCODE`` and ``VOL_ICV`` are unchanged and all
        other columns are normalised.

    Raises
    ------
    KeyError
        If a required or requested column is missing from ``MRI_DATA``.
    """
    id_fields = ['RID', 'VISCODE']
    icv_field = 'VOL_ICV'

    if columns is None :
        df = MRI_DATA.copy()
    else :
        selected = list(columns)
        # The identifiers are needed downstream (merge keys) and VOL_ICV is the divisor;
        # add whichever are missing instead of failing or duplicating a column.
        selected = [field for field in id_fields if field not in selected] + selected
        if icv_field not in selected :
            selected.append(icv_field)
        df = MRI_DATA[selected].copy()

    mri_fields = df.columns.drop(id_fields + [icv_field]).tolist()

    # Baseline population mean, computed only for the fields that get normalised.
    is_baseline = MRI_DATA["VISCODE"] == "bl"
    baseline_mean = MRI_DATA.loc[is_baseline, mri_fields].mean()

    for mri_field in mri_fields:
        df[mri_field] = baseline_mean[mri_field] * (df[mri_field] / df[icv_field])
    return df

# Stack the MRI tables of every source, keeping only rows without any missing value.
print('MRI Sources : ', ', '.join([source for source in mri_hipp]))
MRI_ALL = pd.concat([mri_hipp[source].dropna() for source in mri_hipp])
# Relabel the 'scmri' and 'sc' visit codes as baseline. The column is reassigned rather than
# replaced in place on a column selection, which does not update the frame under pandas
# Copy-on-Write.
MRI_ALL['VISCODE'] = MRI_ALL['VISCODE'].replace({'scmri': 'bl', 'sc': 'bl'})
# Average duplicates by RID, VISCODE. numeric_only=True keeps the mean working on
# pandas >= 2.0 when the tables carry non-numeric columns.
MRI_ALL_AVG = MRI_ALL.groupby(['RID', 'VISCODE'], as_index=False).mean(numeric_only=True)
MRI_ICV_NORMALISED = getNormalisedMRIData(MRI_ALL_AVG)

MRI Sources :  UCSF_FS5_1_GO_2_2021-12-13, UCSF_FS6_3_2021-12-13


##  CSF Data

In [34]:
# CSF sources left out of the analysis; every other entry of `csf_all` is used.
CSF_EXCLUDED_SOURCES = ['UPENN_DIAN_1_GO_2_2018-04-09', 'UPENN_MASTER_1_GO_2_2018-04-09']
print('CSF Sources considered: ', ', '.join(source for source in csf_all if source not in CSF_EXCLUDED_SOURCES))
# Stack every table first, then drop the rows whose `source` column names an excluded source.
# (Rows with missing values are intentionally kept at this stage.)
CSF_ALL = pd.concat([csf_all[source] for source in csf_all])
CSF_ALL = CSF_ALL[~CSF_ALL['source'].isin(CSF_EXCLUDED_SOURCES)]

CSF Sources considered:  UPENN_ELECSYS_3_Bt1_2019-07-29, UPENN_ELECSYS_1_GO_2_2017-04-19, UPENN_ELECSYS_1_GO_2_3_2021-01-04


In [38]:
# Number of CSF rows contributed by each remaining source.
CSF_ALL['source'].value_counts()

UPENN_ELECSYS_1_GO_2_2017-04-19      2398
UPENN_ELECSYS_3_Bt1_2019-07-29       498 
UPENN_ELECSYS_1_GO_2_3_2021-01-04    216 
Name: source, dtype: int64

In [31]:
# Keep the visit keys and the four CSF measures. An explicit copy is taken so the column
# assignments below write to an independent frame rather than to a slice of the stacked one.
CSF_ALL = CSF_ALL[['RID', 'VISCODE', 'CSF_AB42', 'CSF_TAU', 'CSF_PTAU', 'CSF_AB4240']].copy()

# Same clean-up for each measure, in the original order: read the values as text, drop the
# rows whose value starts with '>' or '<', then convert what is left to float.
for csf_field in ['CSF_AB42', 'CSF_TAU', 'CSF_PTAU'] :
    CSF_ALL[csf_field] = CSF_ALL[csf_field].astype(str)
    out_of_range = CSF_ALL[csf_field].str.startswith('>') | CSF_ALL[csf_field].str.startswith('<')
    CSF_ALL = CSF_ALL[~out_of_range].copy()
    CSF_ALL[csf_field] = CSF_ALL[csf_field].astype(float)

# Cap CSF_AB42 at 1700 (presumably the assay's upper measuring limit - TODO confirm).
CSF_ALL['CSF_AB42'] = CSF_ALL['CSF_AB42'].clip(upper=1700)

# Average repeated measurements of the same visit.
CSF_ALL_AVG = CSF_ALL.groupby(['RID', 'VISCODE'], as_index=False).mean()

In [33]:
# (rows, columns) of the per-visit CSF averages: one row per (RID, VISCODE) pair.
CSF_ALL_AVG.shape

(2663, 6)

In [41]:
# Overlap check: visits (RID, VISCODE) that have plasma Abeta, amyloid PET and CSF values.
# Two plasma tables are referenced, dated 2019-06-21 and 2022-11-18; the 2022-11-18 one is used.
# plasma_df = plasma_abeta['BATEMAN_2019-06-21']
plasma_df = plasma_abeta['BATEMAN_2022-11-18']
# Inner joins keep only the visits present in every table.
df = plasma_df.merge(PET_ABETA, on=['RID', 'VISCODE'], how='inner')
# NOTE(review): the output saved with this cell is a VISCODE value count, not the merged
# frame that this last expression displays - the cell was probably edited after it last ran;
# re-run to confirm.
df.merge(CSF_ALL_AVG, on=['RID', 'VISCODE'], how='inner')

bl      216
m24     56 
m48     38 
m60     21 
m72     19 
m132    6  
m108    4  
m84     4  
m96     4  
m120    3  
m36     1  
Name: VISCODE, dtype: int64

##  Plasma
* NFL
* PTAU

###  Plasma NFL Data

In [91]:
# Stack the plasma NFL tables of every source.
print('Plasma NFL Sources : ', ', '.join([source for source in plasma_nfl]))
PLASMA_NFL = pd.concat([plasma_nfl[f] for f in plasma_nfl])
# Average repeated measurements of the same visit. numeric_only=True keeps the mean working
# on pandas >= 2.0 when the tables carry non-numeric columns.
PLASMA_NFL_AVG = PLASMA_NFL.groupby(['RID', 'VISCODE'], as_index=False).mean(numeric_only=True)

Plasma NFL Sources :  BLENNOWLAB_ADNI_1_2018-10-03, BLENNOWLAB_1_GO_2_2018-10-03


###  Plasma PTAU Data

In [92]:
# Plasma PTAU comes from a single source table.
PLASMA_PTAU = plasma_ptau["GOTHENBURG_1_GO_2_2020-06-18"]
# Average repeated measurements of the same visit. numeric_only=True keeps the mean working
# on pandas >= 2.0 when the table carries non-numeric columns.
PLASMA_PTAU_AVG = PLASMA_PTAU.groupby(['RID', 'VISCODE'], as_index=False).mean(numeric_only=True)

## Prepare CDR and GDS

In [93]:
# GDS total score and CDR global score per visit. VISCODE2 is renamed to VISCODE so both
# tables can be merged with the others on (RID, VISCODE); rename() returns a new frame, so
# the source tables in `dataframes` are left untouched.
gds_df = (
    dataframes['NEUROPSYCH']['GDS']['ALL']['df'][['RID', 'VISCODE2', 'GDTOTAL']]
    .rename(columns={'VISCODE2': 'VISCODE'})
)
cdr_df = (
    dataframes['NEUROPSYCH']['CDR']['ALL']['df'][['RID', 'VISCODE2', 'CDGLOBAL']]
    .rename(columns={'VISCODE2': 'VISCODE'})
)
# Relabel the 'sc' visit code as baseline. The column is reassigned rather than replaced in
# place on a column selection, which does not update the frame under pandas Copy-on-Write.
# NOTE(review): if a subject has both an 'sc' and a 'bl' row, this creates duplicate
# (RID, VISCODE) keys, which would multiply rows in later left merges - confirm.
gds_df['VISCODE'] = gds_df['VISCODE'].replace('sc', 'bl')
cdr_df['VISCODE'] = cdr_df['VISCODE'].replace('sc', 'bl')

# ADNIMERGE Preprocessing and Preparation

 - Keep only subjects with at least one follow-up visit, restricted to the visits at months 0 (baseline), 6, 12, 24, 36, 48 and 60
 - Exclude the ones with NULL Diagnosis

## Utility methods

In [94]:
def getNulls(df) :
    """List the columns of ``df`` that have missing values, with their counts.

    The result has one row per affected column: the column label (in a column named
    ``index`` when the column axis is unnamed) and the count in ``nulls``.

    NOTE: an identical ``getNulls`` is already defined in the MRI section of this notebook;
    this later definition replaces it when the cells run top to bottom.
    """
    missing_per_column = df.isna().sum()
    report = missing_per_column.reset_index(name='nulls')
    return report[report['nulls'].gt(0)]

def cnchange(x) :
    """Progression flag for a visit row of a subject whose baseline diagnosis is CN or SMC.

    ``x`` is a row-like mapping with ``DX_bl`` and ``DX``.
    Returns -1 when ``DX_bl`` is neither 'CN' nor 'SMC', 0 when the visit diagnosis is
    still 'CN', and 1 for any other visit diagnosis.
    """
    if x['DX_bl'] not in ('CN', 'SMC') :
        return -1
    return 0 if x["DX"] == "CN" else 1

def adchange(x) :
    """Change flag for a visit row of a subject whose baseline diagnosis is AD.

    ``x`` is a row-like mapping with ``DX_bl`` and ``DX``.
    Returns -1 when ``DX_bl`` is not 'AD', 0 when the visit diagnosis is 'Dementia',
    and 1 for any other visit diagnosis.
    """
    baseline_is_ad = (x['DX_bl'] == 'AD')
    if not baseline_is_ad :
        return -1
    if x["DX"] == "Dementia" :
        return 0
    return 1

def mcitodemchange(x) :
    """Conversion flag for a visit row of a subject whose baseline diagnosis is EMCI or LMCI.

    ``x`` is a row-like mapping with ``DX_bl`` and ``DX``.
    Returns -1 when ``DX_bl`` is neither 'LMCI' nor 'EMCI', 1 when the visit diagnosis is
    'Dementia', and 0 for any other visit diagnosis.
    """
    started_as_mci = x['DX_bl'] in ('LMCI', 'EMCI')
    if not started_as_mci :
        return -1
    return 1 if x["DX"] == "Dementia" else 0

## Map Diagnosis

### Drop rows with DX Null
 - Open question: with this rule, if a participant has DX_bl = AD but a null DX at the first follow-up, that participant is not considered, even though DX_bl could have been used for training AD+. Is this what we want?

In [95]:
# Working copy of ADNIMERGE, restricted to the rows that have a diagnosis (DX not null).
ADM = ADNIMERGE.copy().dropna(subset=['DX'])
# Names of the columns derived by the cells below; getADNICombined adds them to the
# caller's list of base fields.
derived_fields = list()

In [96]:
# Collapse the diagnosis labels into three groups: CN, MCI, Dementia.
dx_bl_mapping = pd.DataFrame({
    'dx': ['CN', 'SMC', 'EMCI', 'MCI', 'LMCI', 'AD', 'Dementia'],
    'prog': ['CN', 'CN', 'MCI', 'MCI', 'MCI', 'Dementia', 'Dementia']
})

# Index the table by label so that its 'prog' column can be used as a lookup.
sort_mapping = dx_bl_mapping.reset_index().set_index('dx')
# Baseline labels that are not in the table map to NaN.
ADM['DX_bl_std'] = ADM['DX_bl'].map(sort_mapping['prog'])
# Register the field only once so that re-running this cell does not list it twice
# (a repeated name would select the column twice in getADNICombined).
if 'DX_bl_std' not in derived_fields :
    derived_fields.append('DX_bl_std')

### Keeping only those with at least one followup


In [97]:
# Keep only the subjects (RID) that appear on more than one row.
ADM = ADM[ADM.duplicated(['RID'], keep=False)]
# Keep the visits at month 6 and at multiples of 12 up to 60 (month 0 qualifies as well).
# NOTE(review): the RID check above runs before this month filter, so a subject whose other
# visits all fall outside these months is left with a single row - confirm this is intended.
ADM = ADM.loc[
    lambda visits: (visits['VS_MONTH'] == 6)
    | ((visits['VS_MONTH'] <= 60) & (visits['VS_MONTH'] % 12 == 0))
].copy()


## Prepare Disease Progression fields

In [98]:
# One progression flag per baseline group, computed row by row from DX_bl and DX.
# Each rule returns -1 for rows that belong to a different baseline group.
for change_field, change_rule in (
    ('CN_CHANGE', cnchange),
    ('AD_CHANGE', adchange),
    ('MCI_DEM_CHANGE', mcitodemchange),
) :
    ADM[change_field] = ADM.apply(change_rule, axis=1)
    # Register each field only once so that re-running this cell does not list it twice.
    if change_field not in derived_fields :
        derived_fields.append(change_field)

## Marital status

In [None]:
# Keep the original marital-status labels, then encode PTMARRY as 1 for 'Married' and 0 for
# everything else (missing values therefore become 0).
# The backup is taken only once: on a re-run PTMARRY already holds 0/1, and copying it again
# would overwrite the labels and turn every value into 0.
if 'PTMARRY_orig' not in ADM.columns :
    ADM['PTMARRY_orig'] = ADM['PTMARRY']
ADM['PTMARRY'] = (ADM['PTMARRY_orig'] == 'Married').astype(int)

## Add max follow up for each RID

In [99]:
# Latest retained visit month per subject, attached to every row of that subject as VS_MONTH_MAX.
max_month = ADM.groupby("RID")["VS_MONTH"].max().reset_index()
# Drop a VS_MONTH_MAX left over from an earlier run first; otherwise the suffixed column
# coming from the merge would collide with it.
ADM = ADM.drop(columns=['VS_MONTH_MAX'], errors='ignore').merge(
    max_month, how="left", on='RID', suffixes=(None, '_MAX')
)
# Register the field only once so that re-running this cell does not list it twice.
if 'VS_MONTH_MAX' not in derived_fields :
    derived_fields.append('VS_MONTH_MAX')

# Merge ADNIMERGE and Biomarkers

In [100]:
def getADNICombined(base_fields_of_interest) :
    """Build the analysis frame: selected ADNIMERGE fields plus every biomarker table.

    Parameters
    ----------
    base_fields_of_interest : list of str
        Columns of ``ADM`` to keep. Must include ``RID`` and ``VISCODE``, which are the
        merge keys. The names in ``derived_fields`` are added automatically.

    Returns
    -------
    pandas.DataFrame
        ``ADM`` restricted to the requested and derived fields, left-merged on
        (RID, VISCODE) with, in order: GDS, CDR, PET, ICV-normalised MRI, CSF,
        plasma NFL and plasma PTAU. The shape is printed before returning.

    Notes
    -----
    Left merges keep every ``ADM`` row; a biomarker table with repeated (RID, VISCODE)
    keys would repeat the matching ``ADM`` rows.
    """
    merge_keys = ["RID", "VISCODE"]

    # Drop repeated names while keeping their order, so a field that is both requested and
    # derived (or registered twice in derived_fields) is not selected twice.
    selected_fields = list(dict.fromkeys(list(base_fields_of_interest) + derived_fields))
    ADM_5YR_COMBINED = ADM[selected_fields]

    biomarker_tables = [
        gds_df,
        cdr_df,
        PET_ABETA[["RID", "VISCODE", "PET_SUVR_ABETA", "ABP", "ABP_HIGH"]],
        MRI_ICV_NORMALISED,
        CSF_ALL_AVG,
        PLASMA_NFL_AVG,
        PLASMA_PTAU_AVG,
    ]
    for biomarker_table in biomarker_tables :
        ADM_5YR_COMBINED = ADM_5YR_COMBINED.merge(biomarker_table, how='left', on=merge_keys)

    print('ADNIMERGE + Biomarkers, shape :', ADM_5YR_COMBINED.shape)
    return ADM_5YR_COMBINED

In [101]:
# Render an italic Markdown note in the cell output (usage hint for the reader).
display(Markdown('*Use the following dataframe*'))

*Use the following dataframe*

In [104]:
# Render, as inline code, the call that returns the combined ADNIMERGE + biomarker frame.
display(Markdown('`getADNICombined(base_fields_of_interest)`'))

`getADNICombined(base_fields_of_interest)`