# Further analysis

Here we would like to analyze the combined training data, and set up a recurrent framework.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt


# Select seaborn's "whitegrid" plot style.
sns.set_style("whitegrid")

Notice that the most convenient way to combine the three tables (`train_clinical_data`, `train_peptides` and `train_proteins`) is to pivot the latter two tables first, so that each has one row per `visit_id`.

In [None]:
# Load the three raw training tables, in order: clinical, peptides, proteins.
clinical, peptides, proteins = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/train_clinical_data.csv",
        "../Data/train_peptides.csv",
        "../Data/train_proteins.csv",
    )
)

In [None]:
# visit_id is a concatenation of patient_id and visit_month, so it is the only key we need to keep.
# Reshape from long to wide: one row per visit_id, one column per Peptide holding its PeptideAbundance.
peptides_pivotted = peptides.pivot(
    index="visit_id",
    columns="Peptide",
    values="PeptideAbundance",
).reset_index()
peptides_pivotted.head(5)

Peptide,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,ADDKETC(UniMod_4)FAEEGKK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Reshape proteins from long to wide: one row per visit_id, one column per UniProt id holding its NPX value.
proteins_pivotted = proteins.pivot(index="visit_id", columns="UniProt", values="NPX")
# Reset the index of the protein pivot itself. Calling reset_index() on
# peptides_pivotted here would silently replace the protein table with a
# re-indexed copy of the peptide table, so no protein column would reach
# the merged frame built from proteins_pivotted.
proteins_pivotted = proteins_pivotted.reset_index()
proteins_pivotted.head(5)

Peptide,index,visit_id,AADDTWEPFASGK,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,AAFTEC(UniMod_4)C(UniMod_4)QAADK,AANEVSSADVK,AATGEC(UniMod_4)TATVGKR,AATVGSLAGQPLQER,AAVYHHFISDGVR,ADDKETC(UniMod_4)FAEEGK,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Outer join on visit_id: keep every visit that appears in either pivoted table.
pivotted_peptides_and_protein = pd.merge(
    peptides_pivotted,
    proteins_pivotted,
    how="outer",
    on="visit_id",
)
pivotted_peptides_and_protein.head(5)

Peptide,visit_id,AADDTWEPFASGK_x,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K_x,AAFTEC(UniMod_4)C(UniMod_4)QAADK_x,AANEVSSADVK_x,AATGEC(UniMod_4)TATVGKR_x,AATVGSLAGQPLQER_x,AAVYHHFISDGVR_x,ADDKETC(UniMod_4)FAEEGK_x,ADDKETC(UniMod_4)FAEEGKK_x,...,YSLTYIYTGLSK_y,YTTEIIK_y,YVGGQEHFAHLLILR_y,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR_y,YVMLPVADQDQC(UniMod_4)IR_y,YVNKEIQNAVNGVK_y,YWGVASFLQK_y,YYC(UniMod_4)FQGNQFLR_y,YYTYLIMNK_y,YYWGGQYTWDMAK_y
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,10138_12,7404780.0,46107.2,10610900.0,,20910.2,66662.3,55253.9,79575.5,6201210.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,13788300.0,56910.3,6906160.0,13785.5,11004.2,63672.7,36819.8,34160.9,2117430.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [None]:
# Outer join with the clinical table on visit_id, so visits present in only one
# of the two sources are kept (with NaN in the columns of the other source).
clinical_and_pivotted_peptides_and_protein = pd.merge(
    pivotted_peptides_and_protein,
    clinical,
    how="outer",
    on="visit_id",
)
clinical_and_pivotted_peptides_and_protein.head(80)

Unnamed: 0,visit_id,AADDTWEPFASGK_x,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K_x,AAFTEC(UniMod_4)C(UniMod_4)QAADK_x,AANEVSSADVK_x,AATGEC(UniMod_4)TATVGKR_x,AATVGSLAGQPLQER_x,AAVYHHFISDGVR_x,ADDKETC(UniMod_4)FAEEGK_x,ADDKETC(UniMod_4)FAEEGKK_x,...,YYC(UniMod_4)FQGNQFLR_y,YYTYLIMNK_y,YYWGGQYTWDMAK_y,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,10053_0,6580710.0,31204.4,7735070.0,,,,46620.3,236144.0,,...,530223.0,,7207.3,10053.0,0.0,3.0,0.0,13.0,0.0,
1,10053_12,6333510.0,52277.6,5394390.0,,,,57554.5,108298.0,45885.4,...,453883.0,49281.9,25332.8,10053.0,12.0,4.0,2.0,8.0,0.0,
2,10053_18,7129640.0,61522.0,7011920.0,35984.7,17188.0,19787.3,36029.4,708729.0,5067790.0,...,447505.0,52389.1,21235.7,10053.0,18.0,2.0,2.0,0.0,0.0,
3,10053_24,,,,,,,,,,...,,,,10053.0,24.0,4.0,3.0,0.0,0.0,
4,10053_6,,,,,,,,,,...,,,,10053.0,6.0,2.0,4.0,5.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,11686_60,,,,,,,,,,...,,,,11686.0,60.0,2.0,0.0,1.0,,
76,11686_84,9812280.0,69997.9,9128540.0,32673.0,18331.4,,37451.3,436173.0,4836650.0,...,477134.0,41560.0,36394.0,11686.0,84.0,3.0,0.0,2.0,,
77,11686_96,,,,,,,,,,...,,,,11686.0,96.0,5.0,1.0,0.0,,
78,11928_0,5113420.0,41585.3,2303730.0,,,85583.3,69269.6,67441.2,9465620.0,...,,,46134.2,11928.0,0.0,2.0,2.0,14.0,,


Observe that there are some visits at which a patient took the peptide and protein test but did not take the UPDRS score test. We drop these visits for now.

In [None]:
# visit_month is a column of the clinical table, so a missing value marks a row
# of the outer merge without a matching clinical record
# (assumes the clinical table itself has no missing visit_month - TODO confirm).
missing_clinical = clinical_and_pivotted_peptides_and_protein["visit_month"].isna()
n_missing_clinical = missing_clinical.sum()
print(f"there are {n_missing_clinical} patient who took the peptide and protein test but didn't take the updrs score test")
display(clinical_and_pivotted_peptides_and_protein[missing_clinical])
# Keep only the rows that do have a visit_month.
clinical_and_pivotted_peptides_and_protein_drop_untested = (
    clinical_and_pivotted_peptides_and_protein.dropna(subset=["visit_month"])
)

there are 45 patient who took the peptide and protein test but didn't take the updrs score test


Unnamed: 0,visit_id,AADDTWEPFASGK_x,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K_x,AAFTEC(UniMod_4)C(UniMod_4)QAADK_x,AANEVSSADVK_x,AATGEC(UniMod_4)TATVGKR_x,AATVGSLAGQPLQER_x,AAVYHHFISDGVR_x,ADDKETC(UniMod_4)FAEEGK_x,ADDKETC(UniMod_4)FAEEGKK_x,...,YYC(UniMod_4)FQGNQFLR_y,YYTYLIMNK_y,YYWGGQYTWDMAK_y,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
74,11686_6,11325500.0,63127.9,7556900.0,35169.0,16050.2,,36280.0,436531.0,4123890.0,...,488955.0,44630.0,16124.1,,,,,,,
109,12636_6,7053890.0,7923.89,4928420.0,12736.7,,57734.9,,295976.0,2619950.0,...,436178.0,28616.2,4033.01,,,,,,,
112,12703_108,5060310.0,57069.5,4403600.0,26237.5,2017.37,108172.0,,275486.0,2301210.0,...,476673.0,39807.5,30967.8,,,,,,,
223,13968_6,5048160.0,69232.8,5773080.0,18715.2,627.006,38577.9,35505.0,68332.0,4252570.0,...,450303.0,33200.8,27553.1,,,,,,,
229,14035_6,4493520.0,42678.2,6872460.0,,,35965.6,41708.1,342511.0,5117410.0,...,495339.0,39108.0,27860.0,,,,,,,
279,14450_6,3444250.0,71386.1,2192220.0,45383.2,18042.5,110577.0,,521732.0,3461160.0,...,73098.6,33680.7,21024.1,,,,,,,
360,16347_6,4991850.0,60041.4,6100080.0,23726.2,3984.46,56594.9,33204.7,363078.0,5092880.0,...,392832.0,38910.2,29917.8,,,,,,,
533,20212_6,7997880.0,48308.8,6508820.0,60152.7,14103.0,113208.0,54063.3,30217.8,5357820.0,...,350231.0,61235.1,13797.5,,,,,,,
542,20216_6,7362740.0,54680.2,9865080.0,24689.6,6902.66,66164.4,44277.4,491551.0,5725490.0,...,488018.0,53923.0,24475.9,,,,,,,
593,20581_6,4364600.0,51033.5,11655800.0,25431.5,15084.5,89538.2,49365.6,504598.0,4909790.0,...,154505.0,48365.6,,,,,,,,


Because this is a time-series dataset, we can study the effect of protein and peptide abundance on the UPDRS scores at a single time point, but we can also study how changes in protein or peptide levels relate to changes in the UPDRS scores over a time period. The latter will be useful for the later construction of a recurrent network.

To that end, we first split the combined table into several subtables by the month.

In [None]:
# Observe that the smallest gap for any adjacent visit month is 3
def divisible_by_three(x) -> bool:
    """Return whether ``x`` is a multiple of 3, i.e. ``x % 3`` equals 0."""
    remainder = x % 3
    return remainder == 0
def divisible_by_six(x) -> bool:
    """Return whether ``x`` is a multiple of 6, i.e. ``x % 6`` equals 0."""
    remainder = x % 6
    return remainder == 0
# Count the visits whose visit_month is NOT a multiple of 3, then of 6.
visit_months = clinical_and_pivotted_peptides_and_protein_drop_untested["visit_month"]
for divisibility_check in (divisible_by_three, divisible_by_six):
    not_divisible = ~visit_months.apply(divisibility_check)
    display(not_divisible.sum())

np.int64(0)

np.int64(214)

Observe that every visit month is divisible by 3, but not all of them are divisible by 6. The visits therefore lie on a 3-month grid, and the smallest possible gap between two visits is 3 months.

The univariate analysis without imputation was conducted earlier. To decide which imputation method is appropriate, we need to do some bivariate or multivariate analysis.