# Further analysis

Here we would like to analyze the combined training data, and set up a recurrent framework.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt


# Apply seaborn's "whitegrid" style globally so that plots drawn later share the same look.
sns.set_style("whitegrid")

Notice that to combine the three tables (train_clinical_data, train_peptides and train_proteins), the most convenient way is to pivot the latter two tables so that each has one row per visit.

In [None]:
# Directory that holds the three training CSVs. Keeping it in one constant means the
# notebook can be pointed at another location by editing a single line.
# NOTE(review): "/content" looks like a hosted-notebook working directory - adjust when running elsewhere.
DATA_DIR = "/content"

# Long-format source tables; the peptide and protein tables are pivoted to wide format below.
clinical = pd.read_csv(f"{DATA_DIR}/train_clinical_data.csv")
peptides = pd.read_csv(f"{DATA_DIR}/train_peptides.csv")
proteins = pd.read_csv(f"{DATA_DIR}/train_proteins.csv")

In [None]:
# visit_id already encodes both patient_id and visit_month (e.g. "10053_0"), so it is
# sufficient on its own as the row key of the wide peptide table.
# Result: one row per visit_id, one column per Peptide, PeptideAbundance as cell values;
# reset_index() turns visit_id back into an ordinary column so it can be used as a merge key.
peptides_pivotted = (
    peptides
    .pivot(index="visit_id", columns="Peptide", values="PeptideAbundance")
    .reset_index()
)
peptides_pivotted.head(5)

Peptide,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Wide protein table: one row per visit_id, one column per UniProt identifier, with the
# NPX column supplying the cell values.
# reset_index() is chained directly onto the protein pivot so that the result really is the
# protein table (resetting peptides_pivotted here instead would overwrite it with a copy of
# the peptide table plus a stray "index" column, and the later merge would then produce
# duplicated peptide columns with _x/_y suffixes and no protein columns at all).
proteins_pivotted = (
    proteins
    .pivot(index="visit_id", columns="UniProt", values="NPX")
    .reset_index()
)
proteins_pivotted.head(5)

Peptide,index,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Join the two wide tables on visit_id. The outer join keeps every visit that appears in
# either table, leaving NaN where one side has no measurement for that visit.
pivotted_peptides_and_protein = pd.merge(
    peptides_pivotted,
    proteins_pivotted,
    how="outer",
    on="visit_id",
)
pivotted_peptides_and_protein.head(5)

Peptide,visit_id,AADDTWEPFASGK_x,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K_x,AAFTEC(UniMod_4)C(UniMod_4)QAADK_x,AANEVSSADVK_x,AATGEC(UniMod_4)TATVGKR_x,AATVGSLAGQPLQER_x,AAVYHHFISDGVR_x,ADDKETC(UniMod_4)FAEEGK_x,ADDKETC(UniMod_4)FAEEGKK_x,...,YSLTYIYTGLSK_y,YTTEIIK_y,YVGGQEHFAHLLILR_y,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR_y,YVMLPVADQDQC(UniMod_4)IR_y,YVNKEIQNAVNGVK_y,YWGVASFLQK_y,YYC(UniMod_4)FQGNQFLR_y,YYTYLIMNK_y,YYWGGQYTWDMAK_y
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Attach the clinical columns to the combined peptide/protein table, again keyed on visit_id.
# The outer join keeps visits that have clinical records but no peptide/protein
# measurements (and vice versa); the missing side is filled with NaN.
clinical_and_pivotted_peptides_and_protein = pd.merge(
    pivotted_peptides_and_protein,
    clinical,
    how="outer",
    on="visit_id",
)
clinical_and_pivotted_peptides_and_protein.head(5)

Unnamed: 0,visit_id,AADDTWEPFASGK_x,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K_x,AAFTEC(UniMod_4)C(UniMod_4)QAADK_x,AANEVSSADVK_x,AATGEC(UniMod_4)TATVGKR_x,AATVGSLAGQPLQER_x,AAVYHHFISDGVR_x,ADDKETC(UniMod_4)FAEEGK_x,ADDKETC(UniMod_4)FAEEGKK_x,...,YYC(UniMod_4)FQGNQFLR_y,YYTYLIMNK_y,YYWGGQYTWDMAK_y,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,530223.0,,7207.3,10053.0,0.0,3.0,0.0,13.0,0.0,
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,453883.0,49281.9,25332.8,10053.0,12.0,4.0,2.0,8.0,0.0,
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,447505.0,52389.1,21235.7,10053.0,18.0,2.0,2.0,0.0,0.0,
3,10053_24,,,,,,,,,,...,,,,10053.0,24.0,4.0,3.0,0.0,0.0,
4,10053_6,,,,,,,,,,...,,,,10053.0,6.0,2.0,4.0,5.0,0.0,


Because this is a time-series dataset, we can study the effect of protein and peptide abundance on the UPDRS scores at a single time point, but we can also study how changes in protein or peptide levels relate to changes in the UPDRS scores over a time period.

To that end, we first split the combined table into several subtables by visit month.

The univariate analysis without imputation was conducted earlier; to decide which imputation method is appropriate, we need to do some bivariate or multivariate analysis.