In [100]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load train datasets
train_proteins = pd.read_csv("./input/train_proteins.csv")
train_clinical = pd.read_csv("./input/train_clinical_data.csv")
train_peptides = pd.read_csv("./input/train_peptides.csv")
supplemental_clinical = pd.read_csv("./input/supplemental_clinical_data.csv")

# Load test datasets
test_proteins = pd.read_csv("./example_test_files/test_proteins.csv")
test_clinical = pd.read_csv("./example_test_files/test.csv")
test_peptides = pd.read_csv("./example_test_files/test_peptides.csv")

In [101]:
# Pivot the proteins and peptides tables so each protein and peptide is a feature
train_proteins = train_proteins.pivot_table(values="NPX", index="visit_id", columns="UniProt")
train_peptides = train_peptides.pivot_table(values="PeptideAbundance", index="visit_id", columns="Peptide")

# Merge the three tables
train = train_clinical.merge(train_proteins, on="visit_id", how="left")  \
                      .merge(train_peptides, on="visit_id", how="left")
                      
# Set the patient id as index
train = train.set_index(["patient_id"])

# Drop the visit id column
train = train.drop("visit_id", axis=1)

In [102]:
# Load all datasets
test_proteins = pd.read_csv("./example_test_files/test_proteins.csv")
test_clinical = pd.read_csv("./example_test_files/test.csv")
test_peptides = pd.read_csv("./example_test_files/test_peptides.csv")

# Pivot the proteins and peptides tables so each protein and peptide is a feature
test_proteins = test_proteins.pivot_table(values="NPX", index="visit_id", columns="UniProt")
test_peptides = test_peptides.pivot_table(values="PeptideAbundance", index="visit_id", columns="Peptide")

# Merge the three tables
test = test_clinical.merge(test_proteins, on="visit_id", how="left")  \
                    .merge(test_peptides, on="visit_id", how="left")

# Set the patient id as index
test = test.set_index(["patient_id"])

test = test.drop("visit_id", axis=1)

In [103]:
# Create a list of common columns
train_cols = train.columns.values[6:]
test_cols = test.columns.values[4:]
cols = list(set(train_cols) & set(test_cols))

# Use only common columns
train = train[list(train.columns.values[:6]) + cols]
test = test[list(test.columns.values[:4]) + cols]

In [None]:
# Fill updrs and state of medication values
train.updrs_4 = train.updrs_4.fillna(0).round()
train.upd23b_clinical_state_on_medication = train.upd23b_clinical_state_on_medication.fillna("Off").replace({"On": 1, "Off": 0})

# Create a list of the ids of the patients
patient_id = list(train.index.unique())

# Interpolate the missing data of every patient
for patient in patient_id:
    train.loc[patient] = train.loc[patient].interpolate(method="linear").fillna(method="bfill")
    
# Fill the remaining na values with the mean of the columns
train = train.fillna(train.mean())

In [None]:
# Create a list of the ids of the patients
patient_id_test = list(test.index.unique())

# Interpolate the missing data of every patient
for patient in patient_id_test:
    test.loc[patient] = test.loc[patient].interpolate(method="linear").fillna(method="bfill")

# Fill the remaining na values with the mean of the columns
test = test.fillna(test.mean())

In [83]:
# Now we shall normalize all the columns values with the exception of two columns
updrs_cols = ['updrs_1','updrs_2','updrs_3','updrs_4']
e = ['upd23b_clinical_state_on_medication', 'visit_month']
norm_columns = set(train.columns.values)-set(e)-set(updrs_cols)

for col in norm_columns:
    train[col] = (train[col] - train[col].min()) / (train[col].max() - train[col].min())  

In [None]:
from xgboost import XGBRegressor
import numpy as np

# Define the SMAPE metric
def smape(actual, predicted):
    return np.mean((np.abs(actual - predicted) * 200) / (np.abs(actual) + np.abs(predicted)))

# Create a dictionary to store the models
updrs_predictors = {}

# Define X
X = train.iloc[:, 5:]

# Train a model per each symptom
for updrs in updrs_cols:
    updrs_predictors[updrs] = XGBRegressor().fit(X, train.loc[:,updrs])

In [105]:
train.head()

Unnamed: 0_level_0,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,ATLGPAVRPLPWQR,QKWEAEPVYVQR,O60888,P01876,...,ITYGETGGNSPVQEFTVPGSK,IYLYTLNDNAR,QKPDGVFQEDAPVIHQEMIGGLR,SC(UniMod_4)DNPYIPNGDYSPLR,ASGSPEPAISWFR,ATWSGAVLAGR,SPELQAEAK,DTSC(UniMod_4)VNPPTVQNAYIVSR,KPALEDLR,NLAVSQVVHK
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55,0,10.0,6.0,15.0,0.0,0,41765.2,292395.0,166850.0,1829650.0,...,7750.27,225990.0,55618.0,24821.3,39688.8,274430.0,195249.0,126571.0,25300.7,286731.0
55,3,10.0,7.0,25.0,0.0,0,,,,,...,,,,,,,,,,
55,6,8.0,10.0,34.0,0.0,0,37862.9,330678.0,170345.0,1930810.0,...,6320.56,163090.0,51706.9,20204.7,38838.0,260187.0,170797.0,122299.0,20306.9,291595.0
55,9,8.0,9.0,30.0,0.0,1,,,,,...,,,,,,,,,,
55,12,10.0,10.0,41.0,0.0,1,52589.8,259537.0,151194.0,1878400.0,...,9854.38,269061.0,64000.9,29320.3,43177.5,305360.0,146704.0,133972.0,21073.9,266769.0
