In [191]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load train datasets
train_proteins = pd.read_csv("./input/train_proteins.csv")
train_clinical = pd.read_csv("./input/train_clinical_data.csv")
train_peptides = pd.read_csv("./input/train_peptides.csv")
supplemental_clinical = pd.read_csv("./input/supplemental_clinical_data.csv")

# Load test datasets
test_proteins = pd.read_csv("./example_test_files/test_proteins.csv")
test_clinical = pd.read_csv("./example_test_files/test.csv")
test_peptides = pd.read_csv("./example_test_files/test_peptides.csv")

In [192]:
# Pivot the proteins and peptides tables so each protein and peptide is a feature
train_proteins = train_proteins.pivot_table(values="NPX", index="visit_id", columns="UniProt")
train_peptides = train_peptides.pivot_table(values="PeptideAbundance", index="visit_id", columns="Peptide")

# Merge the three tables
train = train_clinical.merge(train_proteins, on="visit_id", how="left")  \
                      .merge(train_peptides, on="visit_id", how="left")
                      
# Set the patient id as index
train = train.set_index(["patient_id"])

# Drop the visit id column
train = train.drop("visit_id", axis=1)

In [193]:
# Load all datasets
test_clinical = pd.read_csv("./example_test_files/test.csv")
test_proteins = pd.read_csv("./example_test_files/test_proteins.csv")
test_peptides = pd.read_csv("./example_test_files/test_peptides.csv")

# Pivot the proteins and peptides tables so each protein and peptide is a feature
test_proteins = test_proteins.pivot_table(values="NPX", index="patient_id", columns="UniProt")
test_peptides = test_peptides.pivot_table(values="PeptideAbundance", index="patient_id", columns="Peptide")

# Set the patient id as index
test_clinical = test_clinical.set_index(["patient_id"])

# Merge the three tables
test = test_proteins.merge(test_peptides, on="patient_id", how="left")

In [194]:
# Create a list of common columns
train_cols = train.columns.values[6:]
test_cols = test.columns.values
cols = list(set(train_cols) & set(test_cols))

# Use only common columns
train = train[list(train.columns.values[:5]) + cols]
test = test[cols]

In [196]:
# Fill updrs and state of medication values
train.updrs_4 = train.updrs_4.fillna(0).round()

# Create a list of the ids of the patients
patient_id = list(train.index.unique())

# Interpolate the missing data of every patient
for patient in patient_id:
    train.loc[patient] = train.loc[patient].interpolate(method="linear").fillna(method="bfill")
    
# Fill the remaining na values with the mean of the columns
train = train.fillna(train.mean())

In [197]:
# Create a list of the ids of the patients
patient_id_test = list(test.index.unique())

# Interpolate the missing data of every patient
for patient in patient_id_test:
    test.loc[patient] = test.loc[patient].interpolate(method="linear").fillna(method="bfill")

# Fill the remaining na values with the mean of the columns
test = test.fillna(test.mean())

In [198]:
# Normalize all the columns values from the train and the test dataset
for col in cols:
    values_list = np.array(list(train[col]) + list(test[col]))
    train[col] = (train[col] - values_list.min()) / (values_list.max() - values_list.min())
    test[col] = (test[col] - values_list.min()) / (values_list.max() - values_list.min())

In [199]:
# Define a list with symptoms
updrs_cols = ['updrs_1','updrs_2','updrs_3','updrs_4']

In [200]:
# Empty dict for slopes and intercepts
updrs_inter = {}
updrs_slopes = {}

# Slope and intercept for every patient and updrs
for patient in patient_id:
    
    # Create empty dicts of lists
    updrs_inter[patient] = []
    updrs_slopes[patient] = []
    
    for updrs in updrs_cols:
        
        X = train.loc[patient]['visit_month'].values.reshape(-1, 1)
        y = train.loc[patient][updrs].values.reshape(-1, 1)
        lr = LinearRegression()
        lr.fit(X, y)
        updrs_inter[patient].append(float(lr.intercept_))
        updrs_slopes[patient].append(float(lr.coef_))

# Create df for slopes and intercepts
intercepts_df = pd.DataFrame.from_dict(updrs_inter, orient="index", columns=[s + "_inter" for s in updrs_cols])
slopes_df = pd.DataFrame.from_dict(updrs_slopes, orient="index", columns=[s + "_slope" for s in updrs_cols])

# Merge all the data in the train data frame
train = train.merge(intercepts_df, how="left", left_index=True, right_index=True)  \
             .merge(slopes_df, how="left", left_index=True, right_index=True)

In [208]:
from xgboost import XGBRegressor

# Create dictionaries to store the models
updrs_predictors = {}
updrs_intercept_predictors = {}
updrs_slopes_predictors = {}

# Define X
X = train.iloc[:, 5:-8]

# Train a model per each symptom
for updrs in updrs_cols:
    updrs_predictors[updrs] = XGBRegressor()
    updrs_predictors[updrs].fit(X, train.loc[:,updrs])

# Train a model for each intercept
for updrs in updrs_cols:
    updrs_intercept_predictors[updrs] = XGBRegressor()
    updrs_intercept_predictors[updrs].fit(X, train.loc[:,(updrs + "_inter")])
    
# Train a model for each slope
for updrs in updrs_cols:
    updrs_slopes_predictors[updrs] = XGBRegressor()
    updrs_slopes_predictors[updrs].fit(X, train.loc[:,(updrs + "_slope")])

In [212]:
print(X.columns.values)

['MYLGYEYVTAIR' 'DSGEGDFLAEGGGVR' 'HVEPGEPLAPSPQEPQAVGR' ...
 'QRQEELC(UniMod_4)LAR' 'VDSGNDVTDIADDGC(UniMod_4)PKPPEIAHGYVEHSVR'
 'SSQGGSLPSEEK']


In [None]:
print

In [246]:
print(updrs_predictors["updrs_1"].predict(test.iloc[0].values.reshape(1,-1)))

[4.0844693]


In [242]:
print(X.iloc[0].values.reshape(-1,1))

[[0.35778089]
 [0.034282  ]
 [0.49604043]
 ...
 [0.53680141]
 [0.20110066]
 [0.56711566]]


In [213]:
print(test.columns.values)

['MYLGYEYVTAIR' 'DSGEGDFLAEGGGVR' 'HVEPGEPLAPSPQEPQAVGR' ...
 'QRQEELC(UniMod_4)LAR' 'VDSGNDVTDIADDGC(UniMod_4)PKPPEIAHGYVEHSVR'
 'SSQGGSLPSEEK']


In [255]:
# Load sample submission fie
output_doc = pd.read_csv("./example_test_files/sample_submission.csv")
output_doc = output_doc.drop(index=output_doc.index)

#TODO: 
for patient in patient_id_test:
    
    visit_id = test_clinical.loc[patient].visit_id.values[0]
    group_key = test_clinical.loc[patient].group_key.values[0]
    
    for updrs in updrs_cols:
        
        # Updrs values predicted with protein and peptides
        updrs_prediction = updrs_predictors[updrs].predict(test.loc[patient].values.reshape(1,-1))
        
        prediction_id = f'{visit_id}_{updrs}_plus_0_months'
        new_row = {'prediction_id': prediction_id, 'rating': float(updrs_prediction), 'group_key': group_key}
        output_doc = output_doc.append(new_row, ignore_index=True)
        
        # To get the slope
        slope = updrs_slopes_predictors[updrs].predict(test.loc[patient].values.reshape(1,-1))
        
        # To get the intercept
        intercept = updrs_intercept_predictors[updrs].predict(test.loc[patient].values.reshape(1,-1))
        
        for month in range(6, 25, 6):
            
            #Compute the evolution at a given month
            updrs_prediction =  intercept + slope * month
            prediction_id = f'{visit_id}_{updrs}_plus_{month}_months'
            new_row = {'prediction_id': prediction_id, 'rating': float(updrs_prediction), 'group_key': group_key}
            output_doc = output_doc.append(new_row, ignore_index=True)

output_doc.to_csv('submission.csv',  index = False)


  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_doc.append(new_row, ignore_index=True)
  output_doc = output_do

In [256]:
output_doc.head(30)

Unnamed: 0,prediction_id,rating,group_key
0,3342_0_updrs_1_plus_0_months,4.084469,0
1,3342_0_updrs_1_plus_6_months,4.741521,0
2,3342_0_updrs_1_plus_12_months,4.875772,0
3,3342_0_updrs_1_plus_18_months,5.010024,0
4,3342_0_updrs_1_plus_24_months,5.144275,0
5,3342_0_updrs_2_plus_0_months,1.876029,0
6,3342_0_updrs_2_plus_6_months,3.377566,0
7,3342_0_updrs_2_plus_12_months,3.403335,0
8,3342_0_updrs_2_plus_18_months,3.429103,0
9,3342_0_updrs_2_plus_24_months,3.454871,0
