In [49]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [50]:
# Read the four competition CSV files from ./input, one DataFrame per file
(train_proteins, train_clinical, train_peptides, supplemental_clinical) = map(
    pd.read_csv,
    (
        "./input/train_proteins.csv",
        "./input/train_clinical_data.csv",
        "./input/train_peptides.csv",
        "./input/supplemental_clinical_data.csv",
    ),
)

In [51]:
# Reshape the long protein / peptide tables to wide form: one row per visit_id,
# one column per UniProt id (NPX values) or per Peptide (PeptideAbundance values).
# pivot_table averages the values when a (visit, column) pair occurs more than once.
train_proteins = pd.pivot_table(
    train_proteins,
    index="visit_id",
    columns="UniProt",
    values="NPX",
)
train_peptides = pd.pivot_table(
    train_peptides,
    index="visit_id",
    columns="Peptide",
    values="PeptideAbundance",
)

In [52]:
# Attach the wide protein and peptide features to the clinical records.
# Left joins keep every clinical visit, including visits without any measurements.
train = (
    train_clinical
    .merge(train_proteins, how="left", on="visit_id")
    .merge(train_peptides, how="left", on="visit_id")
)

In [53]:
# Index the rows by patient so that all visits of one patient share a label
train = train.set_index("patient_id")

In [54]:
# Preview the first rows of the merged table
train.head(n=20)

Unnamed: 0_level_0,visit_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,O00584,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55,55_0,0,10.0,6.0,15.0,,,11254.3,732430.0,39585.8,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
55,55_3,3,10.0,7.0,25.0,,,,,,...,,,,,,,,,,
55,55_6,6,8.0,10.0,34.0,,,13163.6,630465.0,35220.8,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
55,55_9,9,8.0,9.0,30.0,0.0,On,,,,...,,,,,,,,,,
55,55_12,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,41650.9,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
55,55_18,18,7.0,13.0,38.0,0.0,On,,,,...,,,,,,,,,,
55,55_24,24,16.0,9.0,49.0,0.0,On,,,,...,,,,,,,,,,
55,55_30,30,14.0,13.0,49.0,0.0,On,,,,...,,,,,,,,,,
55,55_36,36,17.0,18.0,51.0,0.0,On,13530.8,753832.0,43048.9,...,185290.0,18580.5,2659660.0,90936.9,679163.0,128593.0,203680.0,498621.0,52792.7,13973.7
55,55_42,42,12.0,20.0,41.0,0.0,On,,,,...,,,,,,,,,,


In [55]:
# Summarise the merged frame: index range, column count, dtypes and memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615 entries, 55 to 65043
Columns: 1202 entries, visit_id to YYWGGQYTWDMAK
dtypes: float64(1199), int64(1), object(2)
memory usage: 24.0+ MB


In [56]:
# Remove the visit_id column from the feature table
train = train.drop(columns="visit_id")

In [57]:
# Collect each distinct patient id once, in order of first appearance in the index
patient_id = train.index.unique().tolist()

In [58]:
# Count every missing cell in the whole table
print(train.isna().to_numpy().sum())

1962306


In [59]:
# Impute each patient's missing measurements from that patient's own visits:
#   1. linear interpolation fills the gaps that follow an observed value;
#   2. back-fill covers the gaps that precede the patient's first observed value.
# Columns that are missing on every visit of a patient stay NaN here.
# NOTE: method="linear" treats consecutive rows as equally spaced, so the real
# distance between visit months is not taken into account.
for patient in patient_id:
    # Double brackets always select a DataFrame. The scalar form
    # train.loc[patient] collapses to a Series when a patient has exactly one
    # visit, and the fills would then run across that visit's columns instead
    # of down the patient's visits.
    patient_visits = train.loc[[patient]]
    # .bfill() replaces fillna(method="bfill"), which newer pandas deprecates.
    train.loc[[patient]] = patient_visits.interpolate(method="linear").bfill()

In [60]:
# Defaults for the cells still empty in these two columns:
# a updrs_4 score of 0 and an "Off" medication state
train = train.fillna(
    {
        "updrs_4": 0,
        "upd23b_clinical_state_on_medication": "Off",
    }
)

In [61]:
# Round updrs_4 to whole numbers (the earlier fills can leave fractional values)
train["updrs_4"] = train["updrs_4"].round()

In [77]:
# Re-count the gaps after the imputation steps: overall total, then per column
missing_per_column = train.isna().sum()
print(f"Total missing values left: {missing_per_column.sum()}")
print(missing_per_column.sort_values(ascending=False))

Total missing values left: 0
visit_month                                                              0
MATLYSR                                                                  0
MADEAGSEADHEGTHSTKR                                                      0
M(UniMod_35)YLGYEYVTAIR                                                  0
M(UniMod_35)VQEQC(UniMod_4)C(UniMod_4)HSQLEELHC(UniMod_4)ATGISLANEQDR    0
                                                                        ..
DTDTGALLFIGK                                                             0
DSGVPDRFSGSGSGTDFTLK                                                     0
DSGRDYVSQFEGSALGK                                                        0
DSGFQMNQLR                                                               0
YYWGGQYTWDMAK                                                            0
Length: 1201, dtype: int64


In [78]:
# Fill whatever is still missing with the mean of its column.
# numeric_only=True restricts the mean to the numeric columns: the frame also
# holds the string-valued upd23b_clinical_state_on_medication column, for which
# a mean is undefined (older pandas warns and skips it, newer pandas raises).
train = train.fillna(train.mean(numeric_only=True))

  train = train.fillna(train.mean())


Total missing values: 0


In [79]:
# Preview the first five rows of the cleaned table
train.head(n=5)

Unnamed: 0_level_0,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,O00391,O00533,O00584,O14498,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55,0,10.0,6.0,15.0,0.0,On,11254.3,732430.0,39585.8,41526.9,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
55,3,10.0,7.0,25.0,0.0,On,12208.95,681447.5,37403.3,41410.95,...,186118.5,14845.55,3964895.0,110139.5,547764.0,117333.5,155229.0,447598.0,43168.45,17801.15
55,6,8.0,10.0,34.0,0.0,On,13163.6,630465.0,35220.8,41295.0,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
55,9,8.0,9.0,30.0,0.0,On,14210.6,722774.0,38435.85,40529.15,...,201425.5,15536.3,4796830.0,114835.5,613338.0,120227.5,163185.0,455072.0,47386.4,21272.5
55,12,10.0,10.0,41.0,0.0,On,15257.6,815083.0,41650.9,39763.3,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1


In [81]:
# Re-inspect the frame's structure (column count, dtypes, memory) after cleaning
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2615 entries, 55 to 65043
Columns: 1201 entries, visit_month to YYWGGQYTWDMAK
dtypes: float64(1199), int64(1), object(1)
memory usage: 24.0+ MB
