# Import modules

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

In [2]:
def smape_plus_1(y_true, y_pred):
    y_true_plus_1 = y_true + 1
    y_pred_plus_1 = y_pred + 1
    metric = np.zeros(len(y_true_plus_1))
    
    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = ((np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2)
    
    mask_not_zeros = (y_true_plus_1 != 0) | (y_pred_plus_1 != 0)
    metric[mask_not_zeros] = numerator[mask_not_zeros] / denominator[mask_not_zeros]
    
    return 100 * np.nanmean(metric)

# Reading Data

In [3]:
# # Load a dataset into a Pandas DataFrame
# train_proteins = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
# train_peptides = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
# train_clinical = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")

In [4]:
# Load a dataset into a Pandas DataFrame
train_proteins = pd.read_csv("train_proteins.csv")
train_peptides = pd.read_csv("train_peptides.csv")
train_clinical = pd.read_csv("train_clinical_data.csv")
train_sub = pd.read_csv("supplemental_clinical_data.csv")

In [5]:
train_proteins.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX
0,55_0,0,55,O00391,11254.3
1,55_0,0,55,O00533,732430.0
2,55_0,0,55,O00584,39585.8
3,55_0,0,55,O14498,41526.9
4,55_0,0,55,O14773,31238.0


In [6]:
train_peptides.head()

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [7]:
train_clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


In [8]:
train_clinical.head()

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,55_0,55,0,10.0,6.0,15.0,,
1,55_3,55,3,10.0,7.0,25.0,,
2,55_6,55,6,8.0,10.0,34.0,,
3,55_9,55,9,8.0,9.0,30.0,0.0,On
4,55_12,55,12,10.0,10.0,41.0,0.0,On


# Feature Engineering

In [9]:
# Function to prepare dataset with all the steps mentioned above:
def prepare_dataset(train_proteins, train_peptides):
    # Step 1: Grouping 
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].mean().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].median().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].var().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].std().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].min().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].max().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].first().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].last().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].sum().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].sem().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].prod().reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].quantile(0.10).reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].quantile(0.25).reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].quantile(0.75).reset_index()
    df_protein_grouped = train_proteins.groupby(['visit_id','UniProt'])['NPX'].quantile(0.90).reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].mean().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].median().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].std().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].var().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].min().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].max().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].prod().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].first().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].last().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].sum().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].sem().reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].quantile(0.10).reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].quantile(0.25).reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].quantile(0.75).reset_index()
    df_peptide_grouped = train_peptides.groupby(['visit_id','Peptide'])['PeptideAbundance'].quantile(0.90).reset_index()
    
    # Step 2: Pivoting
    df_protein = df_protein_grouped.pivot(index='visit_id',columns = 'UniProt', values = 'NPX').rename_axis(columns=None).reset_index()
    df_peptide = df_peptide_grouped.pivot(index='visit_id',columns = 'Peptide', values = 'PeptideAbundance').rename_axis(columns=None).reset_index()
    
    # Step 3: Merging
    pro_pep_df = df_protein.merge(df_peptide, on = ['visit_id'], how = 'left')
    
    return pro_pep_df
    

In [10]:
pro_pep_df = prepare_dataset(train_proteins, train_peptides)

In [11]:
pro_pep_df.shape

(1113, 1196)

In [12]:
# Display the first 5 examples
pro_pep_df.head(5)

Unnamed: 0,visit_id,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,O43505,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVM(UniMod_35)LPVADQDQC(UniMod_4)IR,YVMLPVADQDQC(UniMod_4)IR,YVNKEIQNAVNGVK,YWGVASFLQK,YYC(UniMod_4)FQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,...,202274.0,,4401830.0,77482.6,583075.0,76705.7,104260.0,530223.0,,7207.3
1,10053_12,10464.2,435586.0,,,,,197117.0,15099.1,164268.0,...,201009.0,,5001750.0,36745.3,355643.0,92078.1,123254.0,453883.0,49281.9,25332.8
2,10053_18,13235.7,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,...,220728.0,,5424380.0,39016.0,496021.0,63203.6,128336.0,447505.0,52389.1,21235.7
3,10138_12,12600.2,494581.0,9165.06,27193.5,22506.1,6015.9,156313.0,54546.4,204013.0,...,188362.0,9433.71,3900280.0,48210.3,328482.0,89822.1,129964.0,552232.0,65657.8,9876.98
4,10138_24,12003.2,522138.0,4498.51,17189.8,29112.4,2665.15,151169.0,52338.1,240892.0,...,206187.0,6365.15,3521800.0,69984.6,496737.0,80919.3,111799.0,,56977.6,4903.09


In [13]:
feats=['upd23b_clinical_state_on_medication']
le=LabelEncoder()
df=train_clinical
for f in feats :
    le.fit(df[f])
    train_clinical[f]=le.transform(train_clinical[f])

# Training

In [14]:
import warnings
warnings.simplefilter(action='ignore')
from sklearn.model_selection import train_test_split,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import mean_squared_error

In [15]:
# models= {}
# target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
# features = [
#     'visit_month',"upd23b_clinical_state_on_medication","patient_id","visit_id"
# ]
# scores=[]
# for u in target:
#     # temp= pro_pep_df.merge(train_clinical[['visit_id', 'patient_id', 'visit_month', u]], on = ['visit_id'], how = 'left')
#     # Drop NAs
#     temp=train_clinical.fillna(0)
#     temp = temp.dropna(subset=[u]) 

#     if u == 'updrs_3':
#         temp = temp[temp[u] != 0]
        
#     # Train data
#     X = temp.copy()
#     y = temp[["updrs_1", "updrs_2", "updrs_3", "updrs_4"]]
#     train_oof_preds = np.zeros((temp.shape[0], 4))
#     scores=[]
#     kf=KFold(n_splits=5)
#     for train_index,test_index in kf.split(X,y):
        
#         X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
#         y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]
#         x_train_features = pd.DataFrame(X_Train[features])
#         x_test_features = pd.DataFrame(X_Test[features])
    
#         model=Ridge()
#         model.fit(x_train_features[features],y_Train)
#         oof_preds = model.predict(x_test_features[features])
#         oof_preds[:, 3] = 0
#         train_oof_preds[test_index] = np.rint(oof_preds)

#         reshaped_truth = y_Test.to_numpy().reshape(-1, 1)
#         new_preds = np.rint(oof_preds)
#         reshaped_preds = new_preds.reshape(-1, 1)
#         local_smape = smape_plus_1(reshaped_truth.flatten(), reshaped_preds.flatten())
#         scores.append(local_smape)
#         print(": SMAPE+1 = {}".format(local_smape))

#         models[u] = model

# smape_baseline = np.mean(scores)
# print("--> Overall results for out of fold predictions")
# print(": SMAPE+1 = {}".format(smape_baseline))

In [16]:
from sklearn.linear_model import HuberRegressor,RANSACRegressor,Lasso
from sklearn.linear_model import QuantileRegressor,LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import NuSVR,SVC,SVR

In [17]:
data=pro_pep_df.merge(train_clinical, on = ['visit_id'], how = 'left')

In [20]:
models= {}
target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]
scores=[]
for u in target:
    temp= data
    

    temp=temp.fillna(temp.median())
    temp = temp.dropna(subset=[u]) 

    if u == 'updrs_3':
        temp = temp[temp[u] != 0]
        
    # Train data
    X = temp.copy()
    y = temp[["updrs_1", "updrs_2", "updrs_3", "updrs_4"]]
    train_oof_preds = np.zeros((temp.shape[0], 4))
    features=['visit_id']
    scores=[]
    kf=GroupKFold(n_splits=5)
    for train_index,test_index in kf.split(X,y, groups=temp["patient_id"]):
        
        X_Train, X_Test = X.iloc[train_index], X.iloc[test_index]
        y_Train, y_Test = y.iloc[train_index], y.iloc[test_index]
        x_train_features = pd.DataFrame(X_Train[features])
        x_test_features = pd.DataFrame(X_Test[features])
    
        model=LinearRegression()
        model.fit(x_train_features,y_Train)
        oof_preds = model.predict(x_test_features)
        oof_preds[:, 3] = 0
        train_oof_preds[test_index] = np.rint(oof_preds)

        reshaped_truth = y_Test.to_numpy().reshape(-1, 1)
        new_preds = np.rint(oof_preds)
        reshaped_preds = new_preds.reshape(-1, 1)
        local_smape = smape_plus_1(reshaped_truth.flatten(), reshaped_preds.flatten())
        scores.append(local_smape)
        print(": SMAPE+1 = {}".format(local_smape))

        models[u] = model

smape_baseline = np.mean(scores)
print("--> Overall results for out of fold predictions")
print(": SMAPE+1 = {}".format(smape_baseline))

: SMAPE+1 = 51.53338453602051
: SMAPE+1 = 63.577808600831275
: SMAPE+1 = 61.471238366320854
: SMAPE+1 = 55.08489178271875
: SMAPE+1 = 62.50509208421189
: SMAPE+1 = 51.53338453602051
: SMAPE+1 = 63.577808600831275
: SMAPE+1 = 61.471238366320854
: SMAPE+1 = 55.08489178271875
: SMAPE+1 = 62.50509208421189
: SMAPE+1 = 46.89391431135682
: SMAPE+1 = 50.49040657706707
: SMAPE+1 = 51.656711144178026
: SMAPE+1 = 55.03829144913341
: SMAPE+1 = 52.9599315896048
: SMAPE+1 = 51.53338453602051
: SMAPE+1 = 63.577808600831275
: SMAPE+1 = 61.471238366320854
: SMAPE+1 = 55.08489178271875
: SMAPE+1 = 62.50509208421189
--> Overall results for out of fold predictions
: SMAPE+1 = 58.83448307402066


In [120]:
def get_predictions(my_train,pro, models):

    # Forecast
    my_train = my_train.fillna(my_train.median())
    
    for u in target:
        
        # Here is where we will save the final results
        my_train['result_' + str(u)] = 0
  
        # Predict    
        X = my_train[["visit_month","upd23b_clinical_state_on_medication","patient_id","visit_id"]]
        
        # updrs_4 will have only 0's, so we update for others
        if u != 'updrs_4':
            my_train['result_' + str(u)] = models[u].predict(X)
                   
    # Format for final submission
    result = pd.DataFrame()

    for m in [0, 6, 12, 24]:
        for u in [1, 2, 3, 4]:

            temp = my_train[["visit_id", "result_updrs_" + str(u)]]
            temp["prediction_id"] = temp["visit_id"] + "_updrs_" + str(u) + "_plus_" + str(m) + "_months"
            temp["rating"] = temp["result_updrs_" + str(u)]
            temp = temp [['prediction_id', 'rating']]

            result = result.append(temp)            
    result = result.drop_duplicates(subset=['prediction_id', 'rating'])

    return result

# Run once to check results
get_predictions(train_clinical,None, models)


Unnamed: 0,prediction_id,rating
0,55_0_updrs_1_plus_0_months,4.969711
1,55_3_updrs_1_plus_0_months,4.999091
2,55_6_updrs_1_plus_0_months,5.028470
3,55_9_updrs_1_plus_0_months,7.557414
4,55_12_updrs_1_plus_0_months,7.586295
...,...,...
2610,65043_48_updrs_4_plus_24_months,0.000000
2611,65043_54_updrs_4_plus_24_months,0.000000
2612,65043_60_updrs_4_plus_24_months,0.000000
2613,65043_72_updrs_4_plus_24_months,0.000000


# submission

In [None]:
import amp_pd_peptide_310
env = amp_pd_peptide_310.make_env()   # initialize the environment
iter_test = env.iter_test()  

In [None]:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
        
    result =get_predictions(test, test_proteins, models)

    env.predict(result)   # register your predictions