# Model definitions

In [2]:
def ADNIdemographics(subset, continuous, categorical, category_field = "DX_bl_std", debug=False) :
    """Build a demographics table comparing the groups found in ``category_field``.

    The returned frame has one column for 'All Participants', one column per
    unique value of ``subset[category_field]``, and four statistic columns
    ("P-value", "F-Statistic", "Cramer's V", "Pearson"). Rows use a two-level
    index and are appended in this order:

    1. a 'Count' row with the total row count and, per group, ``count(percent)``;
    2. one row per name in ``continuous`` with ``mean(SD)`` overall and per
       group, plus the P-value and F statistic of an ANOVA (type 2) on an OLS
       fit of ``<name> ~ C(<category_field>)``;
    3. for each name in ``categorical``, a header row carrying the chi-square
       results, followed by one ``count(percent)`` row per level of that field.

    Parameters:
        subset: DataFrame to summarise. Besides the requested fields, the code
            reads the columns 'AGE' (group sizes in the 'Count' row) and 'RID'
            (counts for the categorical section), so both must be present.
        continuous: iterable of column names summarised as mean(SD).
        categorical: iterable of column names summarised as count(percent).
        category_field: grouping column; its unique values become columns.
        debug: not used anywhere in the body.

    Returns:
        The assembled DataFrame; cells are formatted strings (or '' when empty).

    Relies on names defined outside this function: ``np``, ``pd``, ``rp``,
    ``smf``, ``sm``, ``roundPValue`` and ``display``.
    """
    total = subset.describe()
    diag_cats = subset[category_field].unique()
    # Column layout: overall column, one column per group, then the statistics.
    header = np.array(['All Participants'])
    header = np.concatenate((header, np.array(diag_cats)))
    header = np.concatenate((header, np.array(["P-value", "F-Statistic", "Cramer's V", "Pearson"])))
    # Start from an empty frame with a two-level row index; rows are appended via pd.concat.
    mux = pd.MultiIndex.from_arrays([[],[]], 
                                      names=['', ''])
    table = pd.DataFrame(columns=header, index=mux)
    # Template row: blank in the overall column and in every group column.
    # Statistic columns are absent from the dict and get filled in by the
    # DataFrame constructor when `columns=header` is passed.
    emptyRow = {"All Participants" : ""}
    for cat in diag_cats :
        emptyRow[cat] = ''

    ### First counts row
    totalRow = emptyRow.copy()
    totalcnt = len(subset.index)
    totalRow["All Participants"] = totalcnt
    totalByAlcohol = subset.groupby([category_field]).describe()
    for cat in diag_cats :
        # Group size is taken from describe()'s "count" for the AGE column.
        # NOTE(review): describe() counts non-null values, so rows with a
        # missing AGE would presumably be left out of the group size - confirm.
        cnt = totalByAlcohol['AGE'].loc[(cat)]["count"]
        # Formatted as count(percent of all rows), percent without decimals.
        totalRow[cat] = '%d(%.f)'%(cnt, 100*cnt/totalcnt)
    table = pd.concat([table, pd.DataFrame(totalRow, columns=header, index = [[''], ['Count']])])
    
    ### Continuous Variables
    basic_characteristics = continuous 
    # Patsy-style categorical term; also the row label looked up in the ANOVA table.
    alcoholField = 'C(' +category_field  +')'
#     alcoholField = 'ALCOHOL'
    for characteristic in basic_characteristics :
        # Per-group summary; the "Mean" and "SD" columns are read below.
        a = rp.summary_cont(subset.groupby([category_field])[characteristic])
        totalRow = emptyRow.copy()
        totalRow["All Participants"] = '%.1f(%.1f)'%(total[characteristic].loc['mean'], total[characteristic].loc['std'])
        for cat in diag_cats :
            totalRow[cat] = '%.1f(%.1f)'%(a.loc[(cat)]["Mean"], a.loc[(cat)]["SD"])
        # ANOVA of the characteristic across the groups.
        model = smf.ols( characteristic + '~ ' +  alcoholField, data=subset).fit()
        aov_table = sm.stats.anova_lm(model, typ=2)
        pvalue = aov_table.loc[alcoholField]["PR(>F)"]
        totalRow["P-value"] = roundPValue(pvalue)#'%.3f'%(pvalue)
        totalRow["F-Statistic"] = '%.3f'%(aov_table.loc[alcoholField]["F"])
        # Row label goes through display(); the first index level stays blank.
        table = pd.concat([table, pd.DataFrame(totalRow, columns=header, index = [ [''], [display(characteristic)]])])
    
    ### Fields with Categories
    fieldsWithCategories = categorical #['PTGENDER', 'ETHNICRACE', 'PTMARRY', 'APOE4']
    # Denominator for the per-group percentages: non-null RID count per group.
    countByAlcohol = subset.groupby([category_field]).count()['RID']
    for field in fieldsWithCategories : 
        countByRace = subset.groupby([field]).count()['RID']
        countByRaceAndAlcohol = subset.groupby([category_field, field]).count()['RID']
        # Only test_results is used below; crosstab and expected are discarded.
        crosstab, test_results, expected = rp.crosstab(subset[field], subset[category_field],
                                                   test= "chi-square",
                                                   expected_freqs= True,
                                                   prop= "cell")
        # Results are read by position: row 0 -> "Pearson", row 1 -> p-value,
        # row 2 -> "Cramer's V".
        # NOTE(review): assumes researchpy returns the rows in this order - confirm.
        overallPvalue = test_results.loc[1]["results"]
        totalRow = emptyRow.copy()
        totalRow["P-value"] = roundPValue(overallPvalue)#'%.3f'%(overallPvalue)
        totalRow["Pearson"] = '%.3f'%(test_results.loc[0]["results"])
        totalRow["Cramer's V"] = '%.3f'%(test_results.loc[2]["results"])
        # Header row for the field: statistics only, both index levels hold the field label.
        table = pd.concat([table, pd.DataFrame(totalRow, columns=header, index = [[display(field)], [display(field)]])])
        # One row per level of the field (second level of the grouped index).
        for race in countByRaceAndAlcohol.index.levels[1] : 
            totalRow = emptyRow.copy()
            totalRow ["All Participants"] = '%d(%d)'%(countByRace.loc[(race)], 100*countByRace.loc[(race)]/len(subset.index))
            for alcoholcat in countByRaceAndAlcohol.index.levels[0] : 
                # Group/level combinations with no rows are skipped, so their cell stays ''.
                if race in countByRaceAndAlcohol.xs(alcoholcat, level=0, axis=0).index :
                    cnt = countByRaceAndAlcohol[(alcoholcat, race)]
                    # Percentage is relative to the group size, not to the whole sample.
                    pct = (cnt/countByAlcohol.loc[(alcoholcat)])*100
                    totalRow[alcoholcat] = '%d(%d)'%(cnt, pct)
#                 else :
#                     totalRow[alcoholcat] = '-'
            table = pd.concat([table, pd.DataFrame(totalRow, columns=header, index = [[display(field)], [race]])])
    return table

## Infographic Data with counts

In [3]:
def printStudyCharacteristics(df, FOLLOWUP_MONTH, ABP_field) :
    """Collect headline participant counts into a two-column summary frame.

    Besides ``df``, this function reads the frames ``ADM_CN_BL``, ``ADM_AD_BL``
    and ``ADM_MCI`` from the enclosing (notebook/module) scope; they must be
    defined before it is called.

    Parameters:
        df: visit-level frame; the columns RID, DX_bl, VISCODE and AD_CHANGE
            are read.
        FOLLOWUP_MONTH: follow-up horizon in months. Selects the columns
            'CN_S_<FOLLOWUP_MONTH>' in ADM_CN_BL and 'ABP_DEM_<FOLLOWUP_MONTH>'
            in ADM_MCI, and is shown in the labels as FOLLOWUP_MONTH/12 years.
        ABP_field: column of ADM_AD_BL; rows where it equals 1 are counted
            for the 'Dementia Remain, AB+' metric.

    Returns:
        pandas.DataFrame with the columns "metric" and "count", one row per
        metric in the order they are added below.
    """
    data = {"metric" : [], "count" : []}
    years = FOLLOWUP_MONTH/12
    cn_flag = 'CN_S_'+str(FOLLOWUP_MONTH)
    mci_flag = 'ABP_DEM_'+str(FOLLOWUP_MONTH)

    def baseline_count(dx_codes):
        # Rows recorded at the 'bl' visit whose baseline diagnosis is in dx_codes.
        return df[df.DX_bl.isin(dx_codes) & (df.VISCODE == 'bl')].shape[0]

    addAndPrint(data, 'Participants with at least one follow-up : ', df.groupby('RID').count().shape[0])
    addAndPrint(data, 'CN Total (Baseline) : ', baseline_count(['CN', 'SMC']))
    addAndPrint(data, 'AD Total (Baseline): ', baseline_count(['AD']))
    addAndPrint(data, 'MCI Total (Baseline) : ', baseline_count(['EMCI', 'LMCI']))

    # CN participants that have a (non-null) stability flag at the follow-up month.
    df_cn_fu = ADM_CN_BL.dropna(subset=[cn_flag])
    addAndPrint(data, 'CN Total ({} yr. followup) : '.format(years), df_cn_fu.shape[0])
    stableCN = df_cn_fu[df_cn_fu[cn_flag] == True]
    addAndPrint(data, 'CN Stable ({} yr. followup): '.format(years), stableCN.shape[0])
    addAndPrint(data, 'CN Decline : ', df_cn_fu.shape[0] - stableCN.shape[0])

    # Per-participant sum of AD_CHANGE over all visits.
    df_ad = df.groupby('RID')['AD_CHANGE'].sum().reset_index(name='count')
    addAndPrint(data, 'Dementia Remain : ', ADM_AD_BL.shape[0])
    addAndPrint(data, 'Dementia Remain, AB+ : ', ADM_AD_BL[ADM_AD_BL[ABP_field] == 1].shape[0])
    # Participants with a positive AD_CHANGE sum.
    df_1 = df_ad[['RID', 'count']][df_ad['count'] > 0]
    addAndPrint(data, 'AD Reverse : ', df_1.shape[0])

    # MCI participants that have a (non-null) dementia flag at the follow-up month.
    df_mci = ADM_MCI.dropna(subset=[mci_flag])
    addAndPrint(data, 'MCI Total ({} yr. followup) : '.format(years), df_mci.shape[0])
    mciDecline = df_mci[df_mci[mci_flag] == 1].shape[0]
    addAndPrint(data, 'MCI Remain/Improve ({} yr.) : '.format(years), df_mci.shape[0] - mciDecline)
    addAndPrint(data, 'MCI Decline : ', mciDecline)

    return pd.DataFrame(data)



# General Utils

In [4]:
def getNulls(df) :
    """Return the columns of ``df`` that contain missing values, with their counts.

    The result has one row per offending column: the column name is in
    'index' and the number of missing values in 'nulls'.
    """
    null_counts = df.isna().sum()
    summary = null_counts.reset_index(name='nulls')
    has_missing = summary['nulls'] > 0
    return summary[has_missing]

def addAndPrint(data, key, value):
    """Append one metric/value pair to the accumulator ``data``.

    ``data`` is a dict holding the lists "metric" and "count"; it is mutated
    in place and nothing is returned. Despite the name, nothing is printed.

    The label is normalised before being stored: surrounding whitespace and
    surrounding colons are removed, then whitespace once more, so that
    'Total : ' and 'Total: ' both become 'Total' (previously the first form
    kept a trailing space).
    """
    label = key.strip().strip(':').strip()
    data["metric"].append(label)
    data["count"].append(value)
    
def roundPValue(p):
    """Round a p-value for display: 3 decimals below 0.01, otherwise 2."""
    digits = 3 if p < 0.01 else 2
    return round(p, digits)



In [5]:
def display(c) :
    """Translate a column or category code into its human-readable label.

    Codes that have no entry in the lookup table are returned unchanged.
    """
    labels = {
        # demographics
        "AGE": "Age, mean(SD), y",
        "PTEDUCAT": "Years of Education, mean(SD), y",
        "PTGENDER": "Gender",
        "FEMALE": "Female",
        "MALE": "Male",
        "PTRACE": "Race",
        "PTMARRY": "Marital Status",
        "NEVERMARRIED": "Never Married",
        "PTETHNIC": "Ethnicity",
        "ETHNICRACE": "Race / Ethnicity",
        "PTNOTRT": "Retired",
        # genotype
        "APOEGN": "APOE Genotype",
        "APOE4": "APOE ε4 allele(s), > 0",
        # scores and questionnaires
        "DIGITTOTAL": "DSST",
        "FreeRecall": "FCSRT Free Recall",
        "TotalRecall": "FCSRT Total Recall",
        "TotalRecall_48": "Total Recall = 48",
        "FCSRT96": "FCSRT96",
        "MMSCORE": "MMSE",
        "MMSCORE_29": "MMSE Score = 29",
        "LDELTOTAL": "Logical Memory Delayed Recall",
        "PACC_MMSE": "PACC MMSE",
        "PACC_total": "PACC Score",
        "CFIPTTOTAL": "CFI Score, Participant",
        "CFISPTOTAL": "CFI Score, Study Partner",
        "GDTOTAL": "GDS Score",
        "SLEEPDAY": "Day time nap, minutes",
        "RAVLT_immediate": "RAVLT Immediate",
        "RAVLT_learning": "RAVLT Learning",
        "RAVLT_forgetting": "RAVLT Forgetting",
        "ADAS11": "ADAS(cog, 11)",
        "TRABSCOR": "Trail (B)",
        # imaging, plasma and CSF measures
        "VOL_HIPPOCAMPUS": "HVa",
        "PLASMA_NFL": "Plasma NFL",
        "PLASMA_PTAU": "Plasma pTau",
        "CSF_TAU": "CSF Tau",
        "CSF_PTAU": "CSF pTau",
        "CSF_AB42": "CSF Aβ-42",
    }
    return labels.get(c, c)


# Prepare Datasets

## Fix the numbers for non-CS stable, MCI 0-class
 
 - Rows where CS-stable = True are not a problem.
 - But for an RID with MX_VS_MONTH = 12, the 24-month flag (CN_S_24) should be NaN, not False.

## Add CN-stable by followup fields

In [1]:
def getCNBLData(adm_):
    """Return the baseline rows of CN/SMC participants with per-month stability flags.

    For every follow-up month (VS_MONTH > 0) present anywhere in ``adm_`` a
    column 'CN_S_<month>' is added to the baseline (VS_MONTH == 0) rows:

    * 1   - the participant has a visit at that month with CN_CHANGE == 0;
    * 0   - the participant has a visit at that month with CN_CHANGE != 0;
    * NaN - the participant has no visit at that month (left merge).

    Parameters:
        adm_: visit-level frame; the columns RID, DX_bl, VS_MONTH and
            CN_CHANGE are read. It is not modified.

    Returns:
        A new DataFrame: the baseline CN/SMC rows plus the flag columns.
    """
    cn_rows = adm_[adm_.DX_bl.isin(['CN', 'SMC'])]
    baseline = cn_rows[cn_rows.VS_MONTH == 0]
    followups = cn_rows[cn_rows.VS_MONTH > 0]
    # Months come from the whole data set, not only from CN rows, so a flag
    # column exists for each of them even when no CN participant was seen then.
    followup_months = [month for month in adm_["VS_MONTH"].unique() if month > 0]
    for month in followup_months :
        flag_col = "CN_S_"+str(month)
        at_month = followups[followups["VS_MONTH"] == month]
        # Build the flag frame with assign() rather than writing into the
        # filtered slice, which triggers SettingWithCopyWarning on pandas < 3.
        flags = at_month[['RID']].assign(
            **{flag_col: (at_month["CN_CHANGE"] == 0).astype(int)}
        )
        baseline = baseline.merge(flags, how='left', on=['RID'])
    return baseline

## MCI data with follow-up columns

In [2]:
def getMCIData(adm_, ABP_field, month = None) :
    """Return the baseline MCI rows with an amyloid label and per-month dementia flags.

    Rows are kept when MCI_DEM_CHANGE >= 0 (rows with a missing value are
    dropped by that comparison). The baseline (VS_MONTH == 0) rows receive:

    * 'ABP_cat' - categorical label mapped from ``ABP_field``
      (0 -> "Aβ-", 1 -> "Aβ+"; other values become NaN);
    * 'ABP_DEM_<month>' for every follow-up month seen in the kept rows:
      1 if MCI_DEM_CHANGE == 1 at that visit, 0 otherwise, NaN when the
      participant has no visit at that month (left merge).

    Parameters:
        adm_: visit-level frame; the columns RID, VS_MONTH, MCI_DEM_CHANGE
            and ``ABP_field`` are read. It is not modified.
        ABP_field: name of the 0/1 column mapped to the 'ABP_cat' label.
        month: accepted for backward compatibility but not used; flags are
            always built for every follow-up month.

    Returns:
        A new DataFrame with the baseline rows and the added columns.
    """
    mci_rows = adm_[adm_['MCI_DEM_CHANGE'] >= 0]
    followups = mci_rows[mci_rows.VS_MONTH > 0]
    abp = {0: "Aβ-", 1: "Aβ+"}
    # Explicit copy: columns are added below, and writing into a filtered
    # slice triggers SettingWithCopyWarning on pandas < 3.
    baseline = mci_rows[mci_rows.VS_MONTH == 0].copy()
    baseline['ABP_cat'] = baseline[ABP_field].map(abp).astype('category')
    # A distinct loop name keeps the `month` parameter from being overwritten.
    for visit_month in followups["VS_MONTH"].unique() :
        flag_col = "ABP_DEM_"+str(visit_month)
        at_month = followups[followups["VS_MONTH"] == visit_month]
        flags = at_month[['RID']].assign(
            **{flag_col: (at_month["MCI_DEM_CHANGE"] == 1).astype(int)}
        )
        baseline = baseline.merge(flags, how='left', on=['RID'])
    return baseline

## Train and Test Datasets

In [25]:
def getAD_BL(adm_) :
    """Return baseline rows of AD participants whose AD_CHANGE sums to zero.

    A participant qualifies when DX_bl == "AD" and the sum of AD_CHANGE over
    all of their rows is 0; only their VS_MONTH == 0 rows are returned. The
    number of returned rows is printed with the label 'AD Remain : '.
    """
    change_totals = adm_.groupby('RID')['AD_CHANGE'].sum().reset_index(name='count')
    unchanged = change_totals[change_totals['count'] == 0]
    is_ad_baseline = (adm_.DX_bl == "AD") & (adm_.VS_MONTH == 0)
    # The inner merge keeps only unchanged participants; the helper column is then dropped.
    ad_baseline = adm_[is_ad_baseline].merge(unchanged, on=['RID']).drop('count', axis=1)
    print('AD Remain : ', ad_baseline.shape[0])
    return ad_baseline

def getCNandABP_ADTrain(cn_bl, ad_bl, ABP_field, follow_up_month=60) :
    """Stack AD and CN baseline rows into one frame with a binary 'ABP_DEM' target.

    * ABP_DEM = 1: rows of ``ad_bl`` where ``ABP_field`` equals 1.
    * ABP_DEM = 0: rows of ``cn_bl`` whose 'CN_S_<follow_up_month>' value is
      not null; every column whose name contains 'CN_S_' is then dropped.

    NOTE(review): the CN class keeps every row with a non-null flag, whatever
    the flag's value - confirm that only the presence of the follow-up, not
    stability, is meant to be required.

    Parameters:
        cn_bl: CN baseline frame holding the 'CN_S_<month>' flag columns.
        ad_bl: AD baseline frame holding ``ABP_field``.
        ABP_field: name of the 0/1 column used to select the AD rows.
        follow_up_month: month whose CN flag must be present (default 60).

    Returns:
        A new DataFrame, AD rows first, with a fresh 0..n-1 index. Neither
        input frame is modified.
    """
    # assign() returns a new frame; writing the column into the filtered slice
    # triggers SettingWithCopyWarning on pandas < 3.
    positives = ad_bl[ad_bl[ABP_field] == 1].assign(ABP_DEM=1)
    flag_columns = cn_bl.columns[cn_bl.columns.str.contains('CN_S_')]
    negatives = (
        cn_bl.dropna(subset=['CN_S_'+str(follow_up_month)])
        .drop(columns=flag_columns)
        .assign(ABP_DEM=0)
    )
    return pd.concat([positives, negatives], ignore_index=True)
