In [1]:
import pandas as pd

from preprocessing.data_cleaning import get_cleaned_data, split_csv, get_features_label
from preprocessing.encoding import encode_data

from exploration.data_correlations import plot_correlations
from exploration.data_statistics import plot_labels_pie_chart, plot_blood_draw_statistics
from exploration.midterm_exploration import plot_feature_against_diagnosis

from modeling.mrmr import plot_accuracy_with_features, perform_mrmr
from modeling.logistic import run_elastic_net, evaluate_results

In [2]:
# Build the encoded master frame first, then split it into the three
# frames (combined / blood-only / clinical-only) used by the cells below.
cleaned = get_cleaned_data()
df = encode_data(cleaned)
combined, blood_only, clinical_only = split_csv(df)

In [3]:
# Show the clinical-only frame as a quick visual check; the rendered table
# below elides the middle rows and columns with "...".
clinical_only

Unnamed: 0,STUDYID,PATID,VISIT,AGE,A42_VEAS,A5_QUITSMOK,A5_SMOKYRS,B1_BMI,B1_BPDIAS,B1_BPSYS,...,D1_PPAPHIF_nan,D1_PRIONIF_nan,D1_PSPIF_nan,D1_STROKEIF_1.0,D1_STROKEIF_2.0,D1_STROKEIF_3.0,D1_STROKEIF_nan,D1_VASCIF_1.0,D1_VASCIF_2.0,D1_VASCIF_nan
0,1119,11001119,1,80,0.0,,,27.7,60,110,...,0,0,0,0,0,0,0,0,0,0
1,1221,11001221,1,84,0.0,74.0,,24.0,40,130,...,0,0,0,0,0,0,0,0,0,0
2,1221,11001221,2,85,450.0,74.0,,21.2,60,120,...,0,0,0,0,0,0,0,0,0,0
3,1221,11001221,3,86,0.0,74.0,,19.6,60,136,...,0,0,0,0,0,0,0,0,0,0
4,1221,11001221,4,87,0.0,74.0,,19.1,71,150,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14650,200231,81200231,1,73,0.0,,,32.3,81,131,...,0,0,0,0,0,0,0,0,0,0
14651,200232,81200232,1,72,0.0,,,25.9,54,127,...,0,0,0,0,0,0,0,0,0,0
14652,200233,81200233,1,81,0.0,,,32.2,87,144,...,0,0,0,0,0,0,0,0,0,0
14653,200234,81200234,1,63,0.0,,,30.6,77,124,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Number of features requested from each mRMR run.
MRMR_NUM_FEATURES = 100


def _select_mrmr_features(frame):
    """Run perform_mrmr on one subset frame and return its result.

    The "PATID" column is split off and passed as the second argument; every
    other column is passed as the candidate-feature frame.

    NOTE(review): "PATID" reads like a patient identifier rather than a
    diagnosis label -- confirm it is really the intended mRMR target.
    """
    candidates = frame.drop(columns="PATID")
    # Selecting a single column already yields a Series, so no extra
    # pd.Series(...) wrap is needed here.
    target = frame["PATID"]
    # The trailing True is passed positionally exactly as before; its meaning
    # is defined by perform_mrmr, which is not visible in this notebook.
    return perform_mrmr(candidates, target, MRMR_NUM_FEATURES, True)


# Same order as the three progress bars recorded in the output below.
blood_only_features = _select_mrmr_features(blood_only)
clinical_only_features = _select_mrmr_features(clinical_only)
combined_features = _select_mrmr_features(combined)

100%|██████████| 100/100 [00:01<00:00, 89.87it/s]
100%|██████████| 100/100 [00:13<00:00,  7.21it/s]
100%|██████████| 100/100 [00:01<00:00, 60.43it/s]


In [None]:
# Run the elastic-net routine on the blood-only frame for 10 iterations.
# The `pickle` name is the same string later handed to evaluate_results.
run_elastic_net(
    blood_only,
    num_iters=10,
    pickle="Blood Only Elastic Net",
)

In [4]:
# Report the "Blood Only Elastic Net" run. The output below lists, per
# iteration, the best C, best l1 ratio, micro-F1 score, feature importances
# and confusion matrix.
# NOTE(review): presumably reads what run_elastic_net saved under this name -- confirm.
evaluate_results("Blood Only Elastic Net")

Iteration 0
Best C: 0.1
Best l1 ratio: 0.8
Micro-F1 score: 0.7983193277310925
Feature importances: Index(['APOE_GENOTYPE_nan', 'RBM_Tenascin_C', 'RBM_TNF_beta', 'Q1_GFAP',
       'PROTEO_FACTOR_VII', 'RBM_B2M', 'RBM_IGF_BP_2', 'PROTEO_TIE_2',
       'RBM_PAI_1', 'APOE_GENOTYPE_e3/e3',
       ...
       'APOE_GENOTYPE_e4/e4', 'RBM_Ferritin', 'RBM_SCF', 'RBM_CEA',
       'RBM_MMP_3', 'RBM_HGF', 'PROTEO_PYY', 'RBM_PARC', 'PROTEO_FLT_1',
       'PROTEO_EOTAXIN_HUMAN'],
      dtype='object', length=196)
Confusion matrix:
[[44 17]
 [ 7 51]]

Iteration 1
Best C: 0.1
Best l1 ratio: 0.6
Micro-F1 score: 0.8151260504201681
Feature importances: Index(['APOE_GENOTYPE_nan', 'APOE_GENOTYPE_e2/e3', 'PROTEO_SVCAM_1',
       'RBM_ANG_2', 'PROTEO_TIE_2', 'RBM_MMP_3', 'RBM_VWF',
       'APOE_GENOTYPE_e4/e4', 'RBM_THPO', 'PROTEO_FACTOR_VII',
       ...
       'RBM_LH', 'RBM_MCP_1', 'PROTEO_SAA', 'RBM_G_CSF', 'RBM_IgE',
       'PROTEO_PYY', 'RBM_IGF_BP_2', 'PROTEO_MPO', 'RBM_Prolactin',
       'RBM_TECK'],


In [None]:
# Run the elastic-net routine on the clinical-only frame for 10 iterations.
# The `pickle` name is the same string later handed to evaluate_results.
run_elastic_net(
    clinical_only,
    num_iters=10,
    pickle="Clinical Only Elastic Net",
)

In [4]:
# Report the "Clinical Only Elastic Net" run. The output below lists, per
# iteration, the best C, best l1 ratio, micro-F1 score, feature importances
# and confusion matrix.
# NOTE(review): presumably reads what run_elastic_net saved under this name -- confirm.
evaluate_results("Clinical Only Elastic Net")

Iteration 0
Best C: 10.0
Best l1 ratio: 1
Micro-F1 score: 0.852311939268461
Feature importances: Index(['F2_IADLTOTSCR', 'C1_WMSR_VRI', 'C1_MMSE', 'C1_WMSR_LMEM2',
       'C1_WMSR_VRII', 'C1_WMS3_LMEM2', 'C1_WMS3_VRI', 'D1_DEPIF_2.0',
       'A1_HISPANIC_1.0', 'C1_SS_TRAILB',
       ...
       'D1_BRNINJIF_3.0', 'A5_CBSTROKE_2.0', 'A1_MARISTAT_5.0', 'B1_BPDIAS',
       'B5_DEPDSEV_1.0', 'A5_PACKSPER_8.0', 'A5_CVANGIO_1.0',
       'A5_SEIZURES_2.0', 'C1_SS_TRAILA', 'B1_VISWCORR_1.0'],
      dtype='object', length=266)
Confusion matrix:
[[ 897   23   55]
 [  10 1292   76]
 [  59  205  281]]

Iteration 1
Best C: 100.0
Best l1 ratio: 0.2
Micro-F1 score: 0.8409247757073844
Feature importances: Index(['F2_IADLTOTSCR', 'C1_MMSE', 'C1_WMSR_VRI', 'C1_WMSR_LMEM2',
       'C1_WMSR_VRII', 'C1_WMS3_LMEM2', 'C1_WMSR_LMEM1', 'C1_WMS3_VRI',
       'D1_DEPIF_2.0', 'D1_DEMUNIF_1.0',
       ...
       'B5_DELSEV_1.0', 'B1_HEARING_1.0', 'A3_PROP_PARENTS_DEM_4.5',
       'C1_SS_TRAILA', 'C1_WMSR_DIGTOT', '

In [None]:
# Run the elastic-net routine on the combined frame for 10 iterations.
# The `pickle` name is the same string later handed to evaluate_results.
run_elastic_net(
    combined,
    num_iters=10,
    pickle="Combined Elastic Net",
)

In [6]:
# Report the "Combined Elastic Net" run. The output below lists, per
# iteration, the best C, best l1 ratio, micro-F1 score, feature importances
# and confusion matrix.
# NOTE(review): presumably reads what run_elastic_net saved under this name -- confirm.
evaluate_results("Combined Elastic Net")

Iteration 0
Best C: 0.1
Best l1 ratio: 0.4
Micro-F1 score: 0.9831932773109243
Feature importances: Index(['C1_MMSE', 'APOE_GENOTYPE_nan', 'C1_WMSR_LMEM2', 'C1_SS_TRAILB',
       'C1_WMS3_LMEM2', 'APOE_GENOTYPE_e3/e3', 'B5_NPIQINF', 'C1_WAISR_DIGTOT',
       'C1_WMS3_LMEM1', 'RBM_MIP_1a',
       ...
       'A5_INCONTF_2.0', 'C1_WMS3_VRI', 'B5_ANXSEV_2.0', 'C1_WMSR_VRI',
       'B5_NITESEV_1.0', 'RBM_TRAIL_R3', 'AGE', 'C1_SS_TRAILA', 'RBM_FSH',
       'F2_IADLTOTSCR'],
      dtype='object', length=462)
Confusion matrix:
[[57  2]
 [ 0 60]]

Iteration 1
Best C: 1.0
Best l1 ratio: 0.8
Micro-F1 score: 0.957983193277311
Feature importances: Index(['C1_MMSE', 'C1_SS_TRAILB', 'C1_WMS3_LMEM2', 'C1_WMS3_LMEM1',
       'APOE_GENOTYPE_nan', 'RBM_CTGF', 'C1_WMSR_LMEM2', 'B5_NPIQINF',
       'APOE_GENOTYPE_e2/e3', 'RBM_MIP_1a',
       ...
       'PROTEO_IL_8_HUMAN', 'F2_IADLTOTSCR', 'B1_BPSYS', 'RBM_HGF',
       'PROTEO_CLUSTERIN', 'A5_CVANGIO_2.0', 'RBM_TECK', 'B1_HRATE',
       'RBM_AgRP', 'APOE_GE

In [None]:
# Pie chart of the diagnosis labels, drawn from the full encoded frame.
# NOTE(review): `png` presumably names the saved image file -- confirm.
plot_labels_pie_chart(
    df,
    png="Pie Chart of Diagnoses",
)

In [None]:
# Blood-draw statistics plot, drawn from the full encoded frame.
# NOTE(review): `png` presumably names the saved image file -- confirm.
plot_blood_draw_statistics(
    df,
    png="Blood Draw Proportions",
)

In [None]:
# Correlation plot over the full encoded frame `df`.
plot_correlations(df)

In [None]:
# Plot Q1_Total_tau against diagnosis.
# Fix: the original passed `blood`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `blood_only` from split_csv is the
# corresponding frame.
# NOTE(review): confirm blood_only still carries Q1_Total_tau and the
# diagnosis column that plot_feature_against_diagnosis expects.
plot_feature_against_diagnosis(
    blood_only,
    "Q1_Total_tau",
    title="Total Tau Concentration by Diagnosis",
    ylabel="Total Tau Concentration (pg/mL)",
    png="Total Tau Concentration by Diagnosis"
)

In [None]:
# Plot C1_MMSE against diagnosis.
# Fix: the original passed `clinical`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `clinical_only` from split_csv is
# the corresponding frame.
# NOTE(review): confirm clinical_only carries the diagnosis column that
# plot_feature_against_diagnosis expects.
plot_feature_against_diagnosis(
    clinical_only,
    "C1_MMSE",
    title="MMSE Score by Diagnosis",
    ylabel="MMSE Score",
    png="MMSE Score by Diagnosis"
)

In [None]:
# Plot C1_WMSR_DIGTOT against diagnosis.
# Fix: the original passed `clinical`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `clinical_only` from split_csv is
# the corresponding frame.
# NOTE(review): confirm clinical_only carries the diagnosis column that
# plot_feature_against_diagnosis expects.
plot_feature_against_diagnosis(
    clinical_only,
    "C1_WMSR_DIGTOT",
    title="WMSR Digit Span Score by Diagnosis",
    ylabel="WMSR Digit Span Score",
    png="WMSR Digit Span Score by Diagnosis"
)

In [None]:
# Plot C1_CDRSUM against diagnosis.
# Fix: the original passed `clinical`, a name that is never defined in this
# notebook (NameError on a fresh kernel); `clinical_only` from split_csv is
# the corresponding frame.
# NOTE(review): C1_CDRSUM does not appear in the visible feature listings, so
# it may have been dropped during cleaning -- confirm the column (and the
# diagnosis column) exist in clinical_only, or load the pre-cleaning frame here.
plot_feature_against_diagnosis(
    clinical_only,
    "C1_CDRSUM",
    title="CDR Score by Diagnosis",
    ylabel="CDR Score",
    png="CDR Score by Diagnosis"
)

In [None]:
# Accuracy as a function of the features used, over the full encoded frame.
# pandas (`pd`) comes from the notebook's top import cell rather than being
# re-imported here.
# NOTE(review): the helper is named get_features_label, yet its result is
# unpacked as (y, X) -- confirm the return order is (label, features).
y, X = get_features_label(df)
# Wrap the label in a Series before handing it to the plotting helper.
y = pd.Series(y)

plot_accuracy_with_features(X, y)