In [15]:
import time
import gym
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets
import pandas as pd

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.logger import Logger, configure
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import dqn


import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import model_selection
from sklearn import metrics

In [2]:
!tree "./kaggle/input/"

[1;36m./kaggle/input/[0m
└── [1;36mamp-parkinsons-disease-progression-prediction[0m
    ├── [1;36mamp_pd_peptide[0m
    │   ├── __init__.py
    │   └── competition.cpython-37m-x86_64-linux-gnu.so
    ├── [1;36mexample_test_files[0m
    │   ├── sample_submission.csv
    │   ├── test.csv
    │   ├── test_peptides.csv
    │   └── test_proteins.csv
    ├── public_timeseries_testing_util.py
    ├── supplemental_clinical_data.csv
    ├── train_clinical_data.csv
    ├── train_peptides.csv
    └── train_proteins.csv

4 directories, 11 files


In [3]:
df_train_clin = pd.read_csv("./kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
df_train_pept = pd.read_csv("./kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
df_train_prot = pd.read_csv("./kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")

In [4]:
patients = {}
for e in range(1,5):
    for m in [0,6,12,24]:
        df_train_clin[f'updrs_{e}_plus_{m}_months'] = 0

for patient in df_train_clin.patient_id.unique():
    temp = df_train_clin[df_train_clin.patient_id == patient]
    month_list = []
    month_windows = [0,6,12,24]
    for month in temp.visit_month.values:
        month_list.append([month, month + 6, month + 12, month + 24])
    for month in range(len(month_list)):
        for x in range(1,5):
            arr = temp[temp.visit_month.isin(month_list[month])][f'updrs_{x}'].fillna(0).to_list()
            if len(arr) == 4:
                for e, i in enumerate(arr):
                    m = month_list[month][0]
                    temp.loc[temp.visit_month == m,[f'updrs_{x}_plus_{month_windows[e]}_months']] = i
            else:
                temp = temp[~temp.visit_month.isin(month_list[month])]
    patients[patient] = temp

In [5]:
formatted_clin = pd.concat(patients.values(), ignore_index=True).set_index('visit_id').iloc[:,7:]
formatted_clin

Unnamed: 0_level_0,updrs_1_plus_0_months,updrs_1_plus_6_months,updrs_1_plus_12_months,updrs_1_plus_24_months,updrs_2_plus_0_months,updrs_2_plus_6_months,updrs_2_plus_12_months,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
55_0,10,8,10,16,6,10,10,9,15,34,41,49,0,0,0,0
55_6,8,10,7,14,10,10,13,13,34,41,38,49,0,0,0,0
55_12,10,7,16,17,10,13,9,18,41,38,49,51,0,0,0,0
55_18,7,16,14,12,13,9,13,20,38,49,49,41,0,0,0,0
55_24,16,14,17,17,9,13,18,16,49,49,51,52,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65043_12,4,6,4,2,7,7,8,7,14,13,0,5,0,0,0,0
65043_18,6,4,3,9,7,8,4,10,13,0,4,15,0,0,0,0
65043_24,4,3,2,7,8,4,7,6,0,4,5,13,0,0,0,0
65043_30,3,2,9,4,4,7,10,8,4,5,15,11,0,0,0,1


In [6]:
protfeatures = df_train_prot.pivot(index='visit_id', columns='UniProt', values='NPX')
protfeatures.columns = [x+'_prot' for x in protfeatures.columns]
protfeatures.head()

Unnamed: 0_level_0,O00391_prot,O00533_prot,O00584_prot,O14498_prot,O14773_prot,O14791_prot,O15240_prot,O15394_prot,O43505_prot,O60888_prot,...,Q9HDC9_prot,Q9NQ79_prot,Q9NYU2_prot,Q9UBR2_prot,Q9UBX5_prot,Q9UHG2_prot,Q9UKV8_prot,Q9UNU6_prot,Q9Y646_prot,Q9Y6R7_prot
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,402321.0,,,7150.57,2497.84,83002.9,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,177983.0,65900.0,15382.0,,19017.4
10053_12,10464.2,435586.0,,,,,197117.0,15099.1,164268.0,108114.0,...,,14408.4,,,28537.0,171733.0,65668.1,,9295.65,25697.8
10053_18,13235.7,507386.0,7126.96,24525.7,,2372.71,126506.0,16289.6,168107.0,163776.0,...,317477.0,38667.2,111107.0,,37932.6,245188.0,59986.1,10813.3,,29102.7
10138_12,12600.2,494581.0,9165.06,27193.5,22506.1,6015.9,156313.0,54546.4,204013.0,56725.0,...,557904.0,44556.9,155619.0,14647.9,36927.7,229232.0,106564.0,26077.7,21441.8,7642.42
10138_24,12003.2,522138.0,4498.51,17189.8,29112.4,2665.15,151169.0,52338.1,240892.0,85767.1,...,,47836.7,177619.0,17061.1,25510.4,176722.0,59471.4,12639.2,15091.4,6168.55


In [7]:
peptfeatures = df_train_pept.pivot_table(index='visit_id', 
                                   columns='UniProt', 
                                   values='PeptideAbundance', 
                                   aggfunc='mean')
peptfeatures.columns = [x+'_pept' for x in peptfeatures.columns]
peptfeatures.head()

Unnamed: 0_level_0,O00391_pept,O00533_pept,O00584_pept,O14498_pept,O14773_pept,O14791_pept,O15240_pept,O15394_pept,O43505_pept,O60888_pept,...,Q9HDC9_pept,Q9NQ79_pept,Q9NYU2_pept,Q9UBR2_pept,Q9UBX5_pept,Q9UHG2_pept,Q9UKV8_pept,Q9UNU6_pept,Q9Y646_pept,Q9Y6R7_pept
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10053_0,9104.27,57474.485714,,,7150.57,2497.84,41501.45,15113.6,167327.0,129048.0,...,,9469.45,94237.6,,23016.0,88991.65,65900.0,15382.0,,19017.4
10053_12,10464.2,62226.528571,,,,,98558.3,15099.1,164268.0,108114.0,...,,14408.4,,,28537.0,85866.7,65668.1,,9295.65,25697.8
10053_18,13235.7,72483.685714,7126.96,24525.7,,2372.71,42168.533333,16289.6,168107.0,163776.0,...,317477.0,19333.6,111107.0,,18966.33,61296.99,59986.1,10813.3,,14551.35
10138_12,12600.2,61822.5625,9165.06,27193.5,11253.045,6015.9,52104.496667,27273.2,204013.0,56725.0,...,557904.0,22278.45,155619.0,14647.9,18463.845,57307.8275,106564.0,26077.7,21441.8,7642.42
10138_24,12003.2,65267.1625,4498.51,17189.8,14556.2,2665.15,50389.850333,26169.05,240892.0,85767.1,...,,23918.4,177619.0,17061.1,12755.2,44180.5975,59471.4,12639.2,15091.4,6168.55


In [8]:
df = protfeatures.merge(formatted_clin, left_index=True,right_index=True,how='right')
print(f'\nNA values: {df[protfeatures.columns].isna().sum().sum()/(len(df)*len(protfeatures.columns)):.2%}')
df['visit_month'] = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).values
df.head()


NA values: 53.64%


Unnamed: 0_level_0,O00391_prot,O00533_prot,O00584_prot,O14498_prot,O14773_prot,O14791_prot,O15240_prot,O15394_prot,O43505_prot,O60888_prot,...,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months,visit_month
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55_0,11254.3,732430.0,39585.8,41526.9,31238.0,4202.71,177775.0,62898.2,333376.0,166850.0,...,9,15,34,41,49,0,0,0,0,0
55_6,13163.6,630465.0,35220.8,41295.0,26219.9,4416.42,165638.0,62567.5,277833.0,170345.0,...,13,34,41,38,49,0,0,0,0,6
55_12,15257.6,815083.0,41650.9,39763.3,30703.6,4343.6,151073.0,66963.1,332401.0,151194.0,...,18,41,38,49,51,0,0,0,0,12
55_18,,,,,,,,,,,...,20,38,49,49,41,0,0,0,0,18
55_24,,,,,,,,,,,...,16,49,49,51,52,0,0,0,0,24


In [9]:
df = peptfeatures.merge(df, left_index=True,right_index=True,how='right')
print(f'\nNA values: {df[peptfeatures.columns].isna().sum().sum()/(len(df)*len(peptfeatures.columns)):.2%}')
df['visit_month'] = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).values
df.head()


NA values: 53.64%


Unnamed: 0_level_0,O00391_pept,O00533_pept,O00584_pept,O14498_pept,O14773_pept,O14791_pept,O15240_pept,O15394_pept,O43505_pept,O60888_pept,...,updrs_2_plus_24_months,updrs_3_plus_0_months,updrs_3_plus_6_months,updrs_3_plus_12_months,updrs_3_plus_24_months,updrs_4_plus_0_months,updrs_4_plus_6_months,updrs_4_plus_12_months,updrs_4_plus_24_months,visit_month
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
55_0,11254.3,91553.775,19792.9,41526.9,15619.025,4202.71,59258.35,31449.1,333376.0,166850.0,...,9,15,34,41,49,0,0,0,0,0
55_6,13163.6,78808.175,17610.4,41295.0,13109.945,4416.42,55212.92,31283.75,277833.0,170345.0,...,13,34,41,38,49,0,0,0,0,6
55_12,15257.6,101885.55,20825.5,39763.3,15351.82,4343.6,50357.543333,33481.55,332401.0,151194.0,...,18,41,38,49,51,0,0,0,0,12
55_18,,,,,,,,,,,...,20,38,49,49,41,0,0,0,0,18
55_24,,,,,,,,,,,...,16,49,49,51,52,0,0,0,0,24


In [10]:
visit_month_list = df.reset_index().visit_id.str.split('_').apply(lambda x: int(x[1])).unique().tolist()
# protein_list = protfeatures.columns.to_list()

In [11]:
X = df.drop(formatted_clin.columns, axis=1)
y = df[formatted_clin.columns]
print('\nX and y shapes:')
X.shape, y.shape


X and y shapes:


((954, 455), (954, 16))

In [12]:
def random_sample_imputation(df, var):
    random_sample = df[var].dropna().sample(df[var].isna().sum(), random_state=42, replace=True)
    random_sample.index = df[df[var].isna()].index
    df.loc[df[var].isna(), var] = random_sample
    return df

In [13]:
for col in X.columns:
    X = random_sample_imputation(X, col)

In [14]:
X.reset_index(drop=True).values

array([[1.12543000e+04, 9.15537750e+04, 1.97929000e+04, ...,
        2.38337000e+04, 1.89535000e+04, 0.00000000e+00],
       [1.31636000e+04, 7.88081750e+04, 1.76104000e+04, ...,
        1.77225000e+04, 1.66427000e+04, 6.00000000e+00],
       [1.52576000e+04, 1.01885550e+05, 2.08255000e+04, ...,
        2.85361000e+04, 1.92909000e+04, 1.20000000e+01],
       ...,
       [1.46595000e+04, 1.32752688e+05, 2.32202000e+04, ...,
        2.12863000e+04, 3.95879000e+04, 2.40000000e+01],
       [9.70133000e+03, 3.73184250e+04, 1.22750450e+04, ...,
        1.35426000e+04, 1.15170000e+04, 3.00000000e+01],
       [1.12543000e+04, 7.22122250e+04, 1.44731800e+04, ...,
        2.79938000e+04, 2.22152000e+04, 3.60000000e+01]])

In [17]:
model = SAC.load('./artifacts/best_models/SAC/best_model.zip')

In [18]:
model

<stable_baselines3.sac.sac.SAC at 0x13bc23610>

In [22]:
from sklearn.preprocessing import MinMaxScaler
scaler_X = MinMaxScaler()
scaler_Y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_Y.fit_transform(y)

In [27]:
preds = model.predict(X_scaled)

In [30]:
preds = preds[0]

In [31]:
def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))
    
    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)
    
    pos_ind = dem != 0
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    
    return 100 * np.mean(smap)

In [33]:
y_scaled.shape

(954, 16)

In [34]:
preds.shape

(954, 1)

In [38]:
y_scaled[:, 0].shape

(954,)

In [39]:
smape(y_true=y_scaled[:, 0], y_pred=preds.reshape(-1))

52.09086709489229