In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

from scipy.stats import ranksums

In [None]:
import os

# When the kernel starts inside the `ADProgModel/notebooks` folder, move up one
# level so that the `src.*` import and the relative `dataset/...` paths used in
# later cells resolve (presumably against the project root -- confirm).
# The directory names are compared component-wise instead of matching a
# hard-coded '\\' separator, so the check also works on Linux/macOS.
working_dir = os.getcwd()
parent_dir, leaf_name = os.path.split(working_dir)
if leaf_name == 'notebooks' and os.path.basename(parent_dir) == 'ADProgModel':
    os.chdir(os.pardir)

In [None]:
import src.models.param_estimation_v1 as prestm

### Read data from the Excel workbook and create a data array for each patient

In [None]:
# Which dataset to analyse and which sheet (split) of its workbook to load.
datatype = 'adni' # 'adni', 'synthetic'
sheetname = 'train'

# Workbook path for every supported dataset type.
dataset_files = {
    'adni': 'dataset/processed/adni_split0.xls',
    'synthetic': 'dataset/processed/synthetic_split0.xls',
}

# Fail with a clear message instead of a NameError on an undefined `filename`.
if datatype not in dataset_files:
    raise ValueError(
        "Unknown datatype %r; expected one of %s" % (datatype, sorted(dataset_files))
    )
filename = dataset_files[datatype]

df = pd.read_excel(filename, sheet_name=sheetname)

### All patients synthetic data

#### Parameter estimation per subject

In [None]:
# Column names of the synthetic dataset, grouped by role.
# NOTE(review): these names presumably only exist in `df` when it was loaded
# with datatype = 'synthetic' -- confirm before running this section.
subname, tcname = 'RID', 'Years'
agename, apoestatus = 'demog1', 'demog2'
cogvar = 'cogsc'
reg1_mri, reg2_mri = 'reg1_mri', 'reg2_mri'
reg1_av45, reg2_av45 = 'reg1_av45', 'reg2_av45'

# Bundle the column names in the container expected by the estimation routines.
dfcolnms = prestm.ColumnNames(
    subname, tcname, agename, cogvar,
    reg1_mri, reg2_mri,
    reg1_av45, reg2_av45,
)

# 2x2 matrix with zeros on the diagonal and ones off it, wrapped in DTIMat
# (presumably the connectivity between the two regions -- TODO confirm).
admat = np.matrix([[0, 1],
                   [1, 0]])
dticlinfo = prestm.DTIMat(admat)


In [None]:
# Per-subject parameter estimation; the result is kept in `pmdf`.
pmdf = prestm.compute_all_params_woY_perpat(
    df,
    dfcolnms,
    dticlinfo,
)

#### Parameter estimation for groups of subjects

In [None]:
# Column names of the synthetic dataset, grouped by role (same values as in
# the per-subject section, repeated so this section can be run on its own).
subname, tcname = 'RID', 'Years'
agename, apoestatus = 'demog1', 'demog2'
cogvar = 'cogsc'
reg1_mri, reg2_mri = 'reg1_mri', 'reg2_mri'
reg1_av45, reg2_av45 = 'reg1_av45', 'reg2_av45'

# Bundle the column names in the container expected by the estimation routines.
dfcolnms = prestm.ColumnNames(
    subname, tcname, agename, cogvar,
    reg1_mri, reg2_mri,
    reg1_av45, reg2_av45,
)

# 2x2 matrix with zeros on the diagonal and ones off it, wrapped in DTIMat
# (presumably the connectivity between the two regions -- TODO confirm).
admat = np.matrix([[0, 1],
                   [1, 0]])
dticlinfo = prestm.DTIMat(admat)

In [None]:
# Demographic columns that define a subject group.
demog_feat_list = ['demog1', 'demog2']

# Distinct demographic combinations observed at baseline (Years == 0), in
# value_counts order; the count column itself is discarded.
baseline_demog = df.loc[df.Years == 0, demog_feat_list]
group_counts = baseline_demog.value_counts().reset_index()
grouptypedf = group_counts[demog_feat_list]

# Group-level parameter estimation; the call also returns an updated group table.
pmdf, grouptypedf = prestm.compute_all_params_woY_bygroup(
    df, dfcolnms, dticlinfo, grouptypedf
)

### All parameter estimation for ADNI data

#### Parameter estimation for a person

In [None]:
# Column names of the ADNI dataset, grouped by role.
# NOTE(review): these names presumably only exist in `df` when it was loaded
# with datatype = 'adni' -- confirm before running this section.
subname, tcname = 'RID', 'Years'
agename = 'CurAGE'
cogvar = 'MMSE_norm'
reg1_mri, reg2_mri = 'mri_HIPPO_norm', 'mri_FRONT_norm'
reg1_av45, reg2_av45 = 'HIPPOCAMPAL_SUVR', 'FRONTAL_SUVR'

# Bundle the column names in the container expected by the estimation routines.
dfcolnms = prestm.ColumnNames(
    subname, tcname, agename, cogvar,
    reg1_mri, reg2_mri,
    reg1_av45, reg2_av45,
)

# 2x2 matrix with zeros on the diagonal and ones off it, wrapped in DTIMat
# (presumably the connectivity between the two regions -- TODO confirm).
admat = np.matrix([[0, 1],
                   [1, 0]])
dticlinfo = prestm.DTIMat(admat)


In [None]:
# Per-subject parameter estimation on the ADNI columns configured above.
pmdf = prestm.compute_all_params_woY_perpat(
    df,
    dfcolnms,
    dticlinfo,
)

In [None]:
# Estimated parameters to plot and the LaTeX label used on each x-axis.
parname_list = ['beta_estm','alpha1_estm','alpha2_gamma_estm']
xlabel_list = [r'$\hat{\beta}$', r'$\hat{\alpha_1}$', r'$\hat{\alpha_{2}\gamma}$']
xlabel_dict = dict(zip(parname_list, xlabel_list))

# One histogram panel per parameter, drawn on explicit Axes objects so the
# cell does not depend on pyplot's implicit "current axes" state. The number
# of panels follows `parname_list` instead of being fixed at three.
fig, axes = plt.subplots(1, len(parname_list), figsize=(12, 4), squeeze=False)

for ii, (parname, axis) in enumerate(zip(parname_list, axes[0])):
    myseries = pmdf[parname]

    # Keep only values between the 5th and 95th percentiles so that extreme
    # estimates do not dominate the histogram range.
    lower, upper = myseries.quantile(.05), myseries.quantile(.95)
    myseries_adj = myseries[myseries.between(lower, upper)]

    myseries_adj.hist(ax=axis)
    axis.tick_params(axis='both', labelsize=14)
    axis.set_xlabel(xlabel_dict[parname], fontsize=15)

    # Only the left-most panel carries the shared y-axis label.
    if ii == 0:
        axis.set_ylabel('# individuals', fontsize=15)

fig.tight_layout()
plt.show()

##### Studying relationship between parameter estimates and demographic variables

In [None]:
# Demographic variables to attach to the per-subject parameter table.
demog_list = ['CurAGE','PTEDUCAT','PTGENDER_num','APOEPOS']

# Baseline ('bl') demographics, indexed by subject id.
baseline_demog = df.loc[df.VISCODE=='bl'].set_index('RID')[demog_list]

# Drop demographic columns left over from a previous run of this cell, so
# re-running it does not create duplicate column names (which would break the
# association tests that follow). On the first run nothing is dropped.
param_only = pmdf.drop(columns=demog_list, errors='ignore').set_index('RID')

# Align both tables on RID (outer alignment, as pd.concat does by default)
# and restore RID as a regular column.
pmdf = pd.concat((param_only, baseline_demog), axis=1).reset_index()

In [None]:
# Demographic variables by the kind of association test applied to them.
continuous_demogs = ['PTEDUCAT','CurAGE']      # OLS slope p-value
binary_demogs = ['PTGENDER_num', 'APOEPOS']    # Wilcoxon rank-sum between 0/1 groups

# One record per (parameter, demographic) pair.
assoc_records = []

for parname in parname_list:

    for demog_name in demog_list:

        # Use only subjects that have both the demographic value and the
        # parameter estimate; missing values would otherwise propagate into
        # the regression / rank-sum test and yield NaN p-values or errors.
        pair = pmdf[[demog_name, parname]].dropna()

        if demog_name in continuous_demogs:
            # Regress the parameter on the demographic variable (with an
            # intercept) and keep the p-value of the slope coefficient.
            X2 = sm.add_constant(pair[demog_name].values)
            est2 = sm.OLS(pair[parname].values, X2).fit()
            assoc_pvalue = est2.pvalues[1]

        elif demog_name in binary_demogs:
            # Compare parameter distributions between the two groups (0 vs 1).
            _, assoc_pvalue = ranksums(pair.loc[pair[demog_name]==0, parname],
                                       pair.loc[pair[demog_name]==1, parname])

        else:
            # Avoid silently recording the p-value of the previous iteration.
            raise ValueError("No association test defined for %r" % (demog_name,))

        assoc_records.append({'param': parname,
                              'demog': demog_name,
                              'pval': assoc_pvalue})

resdf = pd.DataFrame(assoc_records, columns=['param', 'demog', 'pval'])

#### Parameter estimation for a group

In [None]:
# Column names of the ADNI dataset, grouped by role.
subname, tcname = 'RID', 'Years'
agename = 'CurAGE'
cogvar = 'MMSE_norm'
reg1_mri, reg2_mri = 'mri_HIPPO_norm', 'mri_FRONT_norm'
reg1_av45, reg2_av45 = 'HIPPOCAMPAL_SUVR', 'FRONTAL_SUVR'
demog1, demog2 = 'PTGENDER', 'APOEPOS'

# Bundle the column names in the container expected by the estimation routines.
dfcolnms = prestm.ColumnNames(
    subname, tcname, agename, cogvar,
    reg1_mri, reg2_mri,
    reg1_av45, reg2_av45,
)

# 2x2 matrix with zeros on the diagonal and ones off it, wrapped in DTIMat
# (presumably the connectivity between the two regions -- TODO confirm).
admat = np.matrix([[0, 1],
                   [1, 0]])
dticlinfo = prestm.DTIMat(admat)

In [None]:
# Demographic columns that define a subject group.
demog_feat_list = ['APOEPOS', 'PTGENDER_num']

# Distinct demographic combinations over all rows of `df`, in value_counts
# order; the count column itself is discarded.
group_counts = df[demog_feat_list].value_counts().reset_index()
grouptypedf = group_counts[demog_feat_list]

# Group-level parameter estimation; the call also returns an updated group table.
pmdf, grouptypedf = prestm.compute_all_params_woY_bygroup(
    df, dfcolnms, dticlinfo, grouptypedf
)

#### Create files with parameter estimates of ADNI data

In [None]:
# Base name of the processed workbook; it is reused below to build the output
# file name. Only the training sheet is loaded here.
loadfname = 'adni_split0'
df = pd.read_excel(
    'dataset/processed/%s.xls'%(loadfname),
    sheet_name='train',
)

In [None]:
# Set to 1 to have the export cell below write each parameter table to a
# sheet of the output Excel workbook; any other value skips the to_excel call.
save_to_excel_flag = 1

In [None]:
# Column names of the ADNI dataset, grouped by role. The cognitive-score
# column is not set here; the export cell below iterates over it.
subname, tcname = 'RID', 'Years'
agename = 'CurAGE'
reg1_mri, reg2_mri = 'mri_HIPPO_norm', 'mri_FRONT_norm'
reg1_av45, reg2_av45 = 'HIPPOCAMPAL_SUVR', 'FRONTAL_SUVR'
demog1, demog2 = 'PTGENDER', 'APOEPOS'

# 2x2 matrix with zeros on the diagonal and ones off it, wrapped in DTIMat
# (presumably the connectivity between the two regions -- TODO confirm).
admat = np.matrix([[0, 1],
                   [1, 0]])
dticlinfo = prestm.DTIMat(admat)

In [None]:
# Demographic column sets used to group subjects. One parameter table (and one
# Excel sheet) is produced per (cognitive score, grouping) combination.
grouping_cases_list = [['PTGENDER','APOEPOS']]

# Columns kept for every subject in the exported parameter table.
pmdf_col_list = ['RID','beta_estm','alpha1_estm','alpha2_gamma_estm','tpo_estm']

# Age offset used for the tpo estimate of validation/test subjects.
TPO_AGE_OFFSET = 50 # based on 2017 Alz and Dem study; their number was 55

# NOTE(review): writing '.xls' needs an engine that recent pandas versions no
# longer ship; the name is kept because other code may read this exact file.
savefname = 'dataset/processed/%s_parameters.xls'%(loadfname)

# The validation and test sheets do not depend on the loop variables, so load
# them once. The loop variable is deliberately not called `sheetname`, to
# avoid overwriting the global of that name used by the data-loading cell.
heldout_dfs = [pd.read_excel('dataset/processed/%s.xls'%(loadfname), sheet_name=split_name)
               for split_name in ['valid','test']]

# Parameter table per case, keyed by '<cogvar>_<demog features>'.
train_param_dict = {}

for cogvar in ['MMSE_norm']:

    dfcolnms = prestm.ColumnNames(subname, tcname, agename, cogvar, reg1_mri, reg2_mri, reg1_av45, reg2_av45)

    for demog_feat_list in grouping_cases_list:

        # case number
        dictkey = cogvar + '_' + '_'.join(demog_feat_list)
        print(dictkey)

        # get dataframe of cases for given demog features and find their parameters
        grouptypedf = df[demog_feat_list].value_counts().reset_index()[demog_feat_list]
        pmdf, grouptypedf = prestm.compute_all_params_woY_bygroup(df, dfcolnms, dticlinfo, grouptypedf)

        # adjustment to grouptypedf for AD subjects
        if 'DX_bl' in demog_feat_list:
            adtypedf = grouptypedf.loc[grouptypedf.DX_bl=='LMCI'].copy() # use the parameter estimates from LMCI folks
            adtypedf['DX_bl'] = 'AD'
            grouptypedf = pd.concat((grouptypedf, adtypedf), axis=0, ignore_index=True)

        # include the patients from valid and test set with parameters assigned based on their grouping
        group_params = grouptypedf.set_index(demog_feat_list)
        param_frames = [pmdf]
        for vdf in heldout_dfs:
            # merge the dataframes based on the demog variables used for grouping
            # (set_index returns a new frame, so the cached sheet is not modified)
            nvdf = vdf.set_index(demog_feat_list).merge(
                group_params, how='left', left_index=True, right_index=True).reset_index()

            # add tpo estimation information
            nvdf['tpo_estm'] = np.floor(nvdf['CurAGE']) - TPO_AGE_OFFSET

            # keep one row per subject: the baseline visit
            param_frames.append(nvdf.loc[nvdf.Years==0, pmdf_col_list])

        # merge with parameter df
        pmdf = pd.concat(param_frames, axis=0, ignore_index=True)
        train_param_dict[dictkey] = pmdf

# Write all sheets at the end. The context manager saves and closes the
# workbook (ExcelWriter.save() no longer exists in pandas >= 2.0), and no file
# is created or overwritten when saving is disabled.
if save_to_excel_flag == 1:
    with pd.ExcelWriter(savefname) as writer:
        for sheet_key, param_df in train_param_dict.items():
            param_df.to_excel(writer, sheet_name=sheet_key, index=False)