In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

In [2]:
def smape_plus_1(y_true, y_pred):
    """Return the SMAPE (in percent) of ``y_pred`` against ``y_true`` after adding 1 to both.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values (list, tuple, NumPy array or pandas
        Series). They are converted to float NumPy arrays, so elements are
        compared by position rather than by pandas index label.

    Returns
    -------
    float
        ``100 * mean(|t - p| / ((|t| + |p|) / 2))`` with ``t = y_true + 1`` and
        ``p = y_pred + 1``. Positions where both shifted values are zero
        contribute 0; positions that evaluate to NaN are ignored by the mean.
    """
    # Coerce first so that plain Python sequences work ("list + 1" raises).
    y_true_plus_1 = np.asarray(y_true, dtype=float) + 1
    y_pred_plus_1 = np.asarray(y_pred, dtype=float) + 1

    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = (np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2

    # Leave the term at 0 where the denominator would be 0 (both shifted values are 0).
    metric = np.zeros(len(y_true_plus_1))
    mask_not_zeros = (y_true_plus_1 != 0) | (y_pred_plus_1 != 0)
    metric[mask_not_zeros] = numerator[mask_not_zeros] / denominator[mask_not_zeros]

    return 100 * np.nanmean(metric)

In [3]:
# Read the four CSV files (paths relative to the working directory) into DataFrames.
train_proteins, train_peptides, train_clinical, train_sup = [
    pd.read_csv(csv_name)
    for csv_name in (
        "train_proteins.csv",
        "train_peptides.csv",
        "train_clinical_data.csv",
        "supplemental_clinical_data.csv",
    )
]

# Feature Engineering

In [4]:
# Keep only the visit_month == 0 rows, reduced to the visit key and the updrs_1 score.
is_month_0 = train_clinical.visit_month == 0
df = train_clinical.loc[is_month_0, ['visit_id', 'updrs_1']]
df.head()

Unnamed: 0,visit_id,updrs_1
0,55_0,10.0
13,942_0,3.0
28,1517_0,11.0
38,1923_0,2.0
45,2660_0,2.0


In [5]:
# Attach the updrs_1 score in `df` to every protein row of the same visit, then
# average that score per protein. The result column keeps the name `updrs_1_sum`
# even though it holds a mean, because later cells look it up under that name.
df_proteins = train_proteins.merge(df, how='inner', on='visit_id').reset_index()
proteins_Uniprot_updrs = (
    df_proteins
    .groupby('UniProt')['updrs_1']
    .mean()
    .rename('updrs_1_sum')
    .reset_index()
)
proteins_Uniprot_updrs.head()

Unnamed: 0,UniProt,updrs_1_sum
0,O00391,4.971014
1,O00533,5.319588
2,O00584,5.286458
3,O14498,5.217877
4,O14773,5.371585


In [6]:
# Attach the updrs_1 score in `df` to every peptide row of the same visit, then
# average that score per peptide. The result column keeps the name `updrs_1_sum`
# even though it holds a mean, because later cells look it up under that name.
df_peptides = train_peptides.merge(df, how='inner', on='visit_id').reset_index()
peptides_PeptideAbundance_updrs = (
    df_peptides
    .groupby('Peptide')['updrs_1']
    .mean()
    .rename('updrs_1_sum')
    .reset_index()
)
peptides_PeptideAbundance_updrs.head()

Unnamed: 0,Peptide,updrs_1_sum
0,AADDTWEPFASGK,5.357143
1,AAFGQGSGPIMLDEVQC(UniMod_4)TGTEASLADC(UniMod_4)K,5.296703
2,AAFTEC(UniMod_4)C(UniMod_4)QAADK,5.305699
3,AANEVSSADVK,5.36478
4,AATGEC(UniMod_4)TATVGKR,5.146497


In [7]:
# One (visit_id, updrs_k) target frame per score column updrs_1 .. updrs_4,
# restricted to the visit_month == 3 rows.
# NOTE(review): the `df_0_*` names suggest month 0, but the filter selects month 3 —
# confirm which month is intended.
month_3_rows = train_clinical[train_clinical.visit_month == 3]
df_0_1, df_0_2, df_0_3, df_0_4 = [
    month_3_rows[['visit_id', f'updrs_{part}']] for part in range(1, 5)
]
df_lst = [df_0_1, df_0_2, df_0_3, df_0_4]

# Per-protein mean of each score. Every table names its value column `updrs_1_sum`,
# whichever score it was computed from.
df_proteins_fts = []
for part, targets in enumerate(df_lst, start=1):
    df_proteins = train_proteins.merge(targets, how='inner', on='visit_id').reset_index()
    df_proteins_fts.append(
        df_proteins.groupby('UniProt').agg(updrs_1_sum=(f'updrs_{part}', 'mean')).reset_index()
    )
(proteins_Uniprot_updrs1, proteins_Uniprot_updrs2,
 proteins_Uniprot_updrs3, proteins_Uniprot_updrs4) = df_proteins_fts

# Per-peptide mean of each score, built the same way.
df_peptides_fts = []
for part, targets in enumerate(df_lst, start=1):
    df_peptides = train_peptides.merge(targets, how='inner', on='visit_id').reset_index()
    df_peptides_fts.append(
        df_peptides.groupby('Peptide').agg(updrs_1_sum=(f'updrs_{part}', 'mean')).reset_index()
    )
(peptides_PeptideAbundance_updrs1, peptides_PeptideAbundance_updrs2,
 peptides_PeptideAbundance_updrs3, peptides_PeptideAbundance_updrs4) = df_peptides_fts

In [8]:
def features(df, train_proteins, train_peptides, classes,
             protein_target_means=None, peptide_target_means=None):
    """Left-join per-visit summary statistics onto ``df``.

    Four groups of eleven statistics (min, max, mean, std, median, var, sum,
    skew, sem, first, last) are computed per ``visit_id``:

    * ``NPX_*``               - over ``train_proteins['NPX']``
    * ``Abe_*``               - over ``train_peptides['PeptideAbundance']``
    * ``proteins_updrs_1_*``  - over the ``updrs_1_sum`` value looked up for each
      protein row via its ``UniProt`` key
    * ``peptides_updrs_1__*`` - over the ``updrs_1_sum`` value looked up for each
      peptide row via its ``Peptide`` key

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``visit_id``; any other columns are carried through.
    train_proteins : pandas.DataFrame
        Needs the columns ``visit_id``, ``UniProt`` and ``NPX``.
    train_peptides : pandas.DataFrame
        Needs the columns ``visit_id``, ``Peptide`` and ``PeptideAbundance``.
    classes : Any
        Not used by this function; kept so existing call sites keep working.
    protein_target_means : pandas.DataFrame, optional
        Lookup table with ``UniProt`` and ``updrs_1_sum`` columns. Defaults to
        the notebook-level ``proteins_Uniprot_updrs``.
    peptide_target_means : pandas.DataFrame, optional
        Lookup table with ``Peptide`` and ``updrs_1_sum`` columns. Defaults to
        the notebook-level ``peptides_PeptideAbundance_updrs``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus 44 feature columns. Missing numeric values are replaced by
        the column median; a column with no values at all stays NaN.
    """
    # Order matters: it fixes the column order the fitted models are trained on.
    stats = ('min', 'max', 'mean', 'std', 'median', 'var', 'sum', 'skew', 'sem', 'first', 'last')

    def per_visit_stats(frame, column, prefix):
        # One row per visit_id, one "<prefix><stat>" column per statistic.
        named_aggs = {f'{prefix}{stat}': (column, stat) for stat in stats}
        return frame.groupby('visit_id').agg(**named_aggs).reset_index()

    # Fall back to the notebook-level tables only when the caller supplied none.
    if protein_target_means is None:
        protein_target_means = proteins_Uniprot_updrs
    if peptide_target_means is None:
        peptide_target_means = peptides_PeptideAbundance_updrs

    proteins_npx_ft = per_visit_stats(train_proteins, 'NPX', 'NPX_')
    peptides_PeptideAbundance_ft = per_visit_stats(train_peptides, 'PeptideAbundance', 'Abe_')

    df_proteins = pd.merge(train_proteins, protein_target_means, on='UniProt', how='left')
    proteins_UniProt_ft = per_visit_stats(df_proteins, 'updrs_1_sum', 'proteins_updrs_1_')

    df_peptides = pd.merge(train_peptides, peptide_target_means, on='Peptide', how='left')
    # The double underscore in this prefix is intentional: it matches the existing column names.
    peptides_ft = per_visit_stats(df_peptides, 'updrs_1_sum', 'peptides_updrs_1__')

    for feature_table in (proteins_npx_ft, peptides_PeptideAbundance_ft,
                          proteins_UniProt_ft, peptides_ft):
        df = pd.merge(df, feature_table, on='visit_id', how='left')

    # numeric_only=True: `visit_id` is a string column, and DataFrame.median()
    # raises TypeError on non-numeric columns in pandas >= 2.0.
    df = df.fillna(df.median(numeric_only=True))
    return df

In [9]:
# Build the feature table for the updrs_1 target frame and display it.
train_0 = features(
    df_0_1,
    train_proteins,
    train_peptides,
    classes=0,
)
train_0

Unnamed: 0,visit_id,updrs_1,NPX_min,NPX_max,NPX_mean,NPX_std,NPX_median,NPX_var,NPX_sum,NPX_skew,...,peptides_updrs_1__max,peptides_updrs_1__mean,peptides_updrs_1__std,peptides_updrs_1__median,peptides_updrs_1__var,peptides_updrs_1__sum,peptides_updrs_1__skew,peptides_updrs_1__sem,peptides_updrs_1__first,peptides_updrs_1__last
0,55_3,10.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
1,942_3,7.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
2,3636_3,4.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
3,4161_3,1.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
4,5645_3,5.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,62723_3,7.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
111,62792_3,0.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
112,64669_3,15.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767
113,64674_3,5.0,507.771,201446000.0,2.123204e+06,1.448782e+07,103937.0,2.098968e+14,4.458729e+08,12.757831,...,5.620438,5.257241,0.119543,5.286458,0.014291,4678.944093,-1.407811,0.004007,4.971014,5.069767


In [17]:
# NOTE(review): `df_train` is not assigned in any cell visible in this notebook, so this
# cell presumably relies on state from a deleted or out-of-order cell — confirm it still
# runs after "Restart Kernel -> Run All".
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,proteins_updrs_1_mean,proteins_updrs_1_median,proteins_updrs_1_min,proteins_updrs_1_sem,proteins_updrs_1_skew,proteins_updrs_1_std,proteins_updrs_1_sum,proteins_updrs_1_var,updrs_1,visit_id
0,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [47]:
from sklearn.preprocessing import MinMaxScaler

In [48]:
# Ensemble
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor

In [72]:
from sklearn.linear_model import HuberRegressor,RANSACRegressor,Lasso
from sklearn.linear_model import QuantileRegressor,LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import NuSVR,SVC,SVR

In [86]:
# For each target updrs_1 .. updrs_4: build the visit-level features, min-max scale the
# NPX_* / Abe_* columns, fit four regressors and print their in-sample SMAPE+1.
SEED = 42  # fixed seed so the random forest gives the same result on every run

# Columns rescaled to [0, 1]; the proteins_updrs_1_* / peptides_updrs_1__* columns are left as is.
scale_col = ['NPX_min','NPX_max','NPX_mean','NPX_std','NPX_median','NPX_var','NPX_sum','NPX_skew','NPX_sem','NPX_first','NPX_last',
             'Abe_min','Abe_max','Abe_mean','Abe_std','Abe_median','Abe_var','Abe_sum','Abe_skew','Abe_sem','Abe_first','Abe_last']

model = {}
# `mms` is re-fitted on every iteration; after the loop it holds the fit from the last target.
mms = MinMaxScaler()
for i in range(1, 5):
    print('--------------------------------------------------------')
    print(f'Model {i}')
    train_0 = features(df_lst[i-1], train_proteins, train_peptides, i-1)
    train_0[scale_col] = mms.fit_transform(train_0[scale_col])

    # Build the design matrix once and reuse it for fitting and for the train-set score.
    X = train_0.drop(columns=['visit_id', f'updrs_{i}'])
    y = train_0[f'updrs_{i}'].astype(np.float32)

    rf = RandomForestRegressor(n_estimators=2938, min_samples_split=36, min_samples_leaf=23,
                               max_features='sqrt', max_depth=125, bootstrap=True,
                               random_state=SEED)
    ls = Lasso(alpha=0.1)
    svr = SVR(kernel='poly')
    # Stored under the 'dt_' key and printed as "tree", but this is a Huber regressor;
    # both names are kept because the inference cell looks the model up by that key.
    dt = HuberRegressor()

    # (model-key prefix, label used in the printed line, estimator)
    for key, label, estimator in (('rf', 'rf', rf), ('svr', 'svr', svr),
                                  ('ls', 'lasso', ls), ('dt', 'tree', dt)):
        estimator.fit(X, y)
        print(f'Train smape {label}:', smape_plus_1(train_0[f'updrs_{i}'], estimator.predict(X)))
        model[f'{key}_{i}'] = estimator

--------------------------------------------------------
Model 1
Train smape rf: 56.73335925155448
Train smape svr: 55.23293344750865
Train smape lasso: 56.72919323237675
Train smape tree: 55.41713252785117
--------------------------------------------------------
Model 2
Train smape rf: 54.19519254568609
Train smape svr: 51.921934858744244
Train smape lasso: 54.16614637373719
Train smape tree: 52.426459901130194
--------------------------------------------------------
Model 3
Train smape rf: 38.69594603013132
Train smape svr: 38.41737669797976
Train smape lasso: 38.69425188648785
Train smape tree: 38.57106749805318
--------------------------------------------------------
Model 4
Train smape rf: 1.189125263445383
Train smape svr: 4.590836754565849
Train smape lasso: 1.1699609901794517
Train smape tree: 1.1699610228803055


In [12]:
# Map visit_month -> updrs_3 from the distinct (visit_month, updrs_3) pairs.
# A month that appears with several scores keeps the one from its last row,
# because later dict entries overwrite earlier ones.
up3 = train_clinical[['visit_month', 'updrs_3']].drop_duplicates()
updrs_3_pred = dict(zip(up3['visit_month'], up3['updrs_3']))

In [None]:
import amp_pd_peptide
# Create the environment object; `env.predict(...)` is called on it in the inference loop.
env = amp_pd_peptide.make_env()   
# Iterator consumed by the inference loop; each item unpacks there into four frames.
iter_test = env.iter_test()

In [None]:
def map_test(x, preds=None):
    """Return the rating to submit for one ``prediction_id``.

    Parameters
    ----------
    x : str
        Underscore-separated id. The first two fields form the ``visit_id`` and
        the next two form the target name, as in ``"55_0_updrs_1_plus_0_months"``.
    preds : pandas.DataFrame, optional
        Frame with the columns ``visit_id``, ``pred1``, ``pred2`` and ``pred3``.
        Defaults to the notebook-level ``df`` that the inference loop rebuilds
        for every batch.

    Returns
    -------
    The value of ``pred1`` / ``pred3`` for ``updrs_1`` / ``updrs_3``, the constant
    0 for ``updrs_4``, and ``pred2`` for any other target name.

    Raises
    ------
    IndexError
        If ``x`` has fewer than four fields, or no row of ``preds`` matches the visit.
    """
    parts = x.split('_')
    visit_id = parts[0] + '_' + parts[1]
    updrs = parts[2] + '_' + parts[3]

    # updrs_4 is always submitted as 0; the model's pred4 column is not used here.
    if updrs == 'updrs_4':
        return 0

    # Resolve the default only now, so the updrs_4 shortcut does not need `df`.
    if preds is None:
        preds = df

    # Every target name other than updrs_1 / updrs_3 falls back to pred2.
    column = {'updrs_1': 'pred1', 'updrs_3': 'pred3'}.get(updrs, 'pred2')
    matches = preds.loc[preds.visit_id == visit_id, column].values
    if len(matches) == 0:
        raise IndexError(f'no prediction row for visit_id {visit_id!r} (prediction_id {x!r})')
    return matches[0]

# Columns rescaled with the scaler fitted in the training cell.
scale_col = ['NPX_min','NPX_max','NPX_mean','NPX_std','NPX_median','NPX_var',
             'NPX_sum','NPX_skew','NPX_sem','NPX_first','NPX_last',
             'Abe_min','Abe_max','Abe_mean','Abe_std','Abe_median','Abe_var','Abe_sum',
             'Abe_skew','Abe_sem','Abe_first','Abe_last']
# Constant added to the averaged prediction of each target before rounding up.
ceil_offset = {1: 0, 2: 0.5, 3: 1.5, 4: 3.5}
# Model-key prefixes of the four regressors averaged per target, in the original order.
ensemble_keys = ('rf', 'svr', 'ls', 'dt')

counter = 0
# The API will deliver four dataframes in this specific order:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    df = test[['visit_id']].drop_duplicates('visit_id')

    for target in range(1, 5):
        feats = features(df[['visit_id']], test_proteins, test_peptides, target)
        # Use transform(), not fit_transform(): re-fitting on a test batch would scale it by
        # that batch's own min/max instead of the range the models were trained on.
        # `mms` holds the fit from the last training iteration. features() ignores its
        # `classes` argument and all training frames come from the same visit filter, so
        # that fit is expected to apply to every target — NOTE(review): confirm.
        feats[scale_col] = mms.transform(feats[scale_col])
        X_test = feats.drop(columns=['visit_id'])

        # Unweighted mean of the four regressors fitted for this target.
        blended = sum(model[f'{key}_{target}'].predict(X_test) for key in ensemble_keys) / 4
        df[f'pred{target}'] = np.ceil(blended + ceil_offset[target])

    # map_test reads the pred* columns from the notebook-level `df` assigned above.
    sample_submission['rating'] = sample_submission['prediction_id'].apply(map_test)
    env.predict(sample_submission)

    # Show the first batch only, as a sanity check.
    if counter == 0:
        display(test)
        display(sample_submission)

    counter += 1