In [1]:
# This file is a modified version of the original:
# https://www.kaggle.com/code/dott1718/1st-place-solution/notebook?scriptVersionId=129798049

In [2]:
# %load_ext autoreload
# %autoreload 2

In [3]:
#import inspect
# import pandas as pd
# print(pd.__version__)

In [4]:
import sys # my_
sys.path.append('/kaggle/input/amp-pd')
sys.path.append('/kaggle/input/parkinson-progression-utils')# my_
#sys.path.append('amp_pd_peptide') # my_

In [5]:
import amp_pd_peptide # my_

In [6]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm

from copy import copy
import os

from types import SimpleNamespace
import torch
from torch import nn

import lightgbm as lgb

torch.set_num_threads(1)
# Sets the number of threads PyTorch uses for intra-op parallelism on CPU.
# Intra-op parallelism means splitting the work of a single operation (or task)
# across several threads.

In [7]:
from sklearn.model_selection import KFold

In [8]:
from joblib import Parallel, delayed

In [9]:
import utils as utils

In [10]:
def load_data(base_path = "data"):
    """Read the four competition CSV files located directly under ``base_path``.

    Args:
        base_path: Directory containing ``train_proteins.csv``,
            ``train_peptides.csv``, ``train_clinical_data.csv`` and
            ``supplemental_clinical_data.csv``.

    Returns:
        A 4-tuple of DataFrames: ``(proteins, peptides, clinical, supplement)``.
    """
    # Order matters: callers unpack the result positionally.
    file_names = (
        "train_proteins.csv",
        "train_peptides.csv",
        "train_clinical_data.csv",
        "supplemental_clinical_data.csv",
    )
    return tuple(pd.read_csv(f"{base_path}/{file_name}") for file_name in file_names)

# Kaggle input directory of the competition data
# (for a local run use "amp-parkinsons-disease-progression-prediction" instead).
base_path = "/kaggle/input/amp-parkinsons-disease-progression-prediction"
proteins, peptides, clinical, supplement = load_data(base_path)

# The supplemental data contains visits recorded at month 5; relabel them as
# month 6 so they fall on a visit month that the feature pipeline handles.
supplement["visit_month"] = supplement["visit_month"].replace(5, 6)


In [11]:
# Quick look at the first rows of the supplemental clinical table.
supplement.head(3)

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
0,35_0,35,0,5.0,3.0,16.0,0.0,
1,35_36,35,36,6.0,4.0,20.0,0.0,
2,75_0,75,0,4.0,6.0,26.0,0.0,


In [12]:


def max_dif(val, lst):
    """Distance from ``val`` down to the largest element of ``lst`` strictly below it.

    Returns ``-1`` when ``lst`` has no element smaller than ``val``.
    """
    closest_below = max((item for item in lst if item < val), default=None)
    return -1 if closest_below is None else val - closest_below

def count_prev_visits(val, lst):
    """Count the elements of ``lst`` that are strictly smaller than ``val``."""
    return sum(1 for item in lst if item < val)

class DataPrep:
    """Builds the per-(visit, target) feature table consumed by the models.

    ``transform_train`` pairs every clinical visit of a patient (the "target"
    visit) with every visit of the same patient, keeps the pairs whose month gap
    is one of ``target_horizons`` and whose source visit month is one of
    ``test_vmonths``, and stacks the four UPDRS scores into long format.
    ``transform_test`` builds the same features from the ``prediction_id``
    strings of a submission frame.
    """

    # Visit months for which ``fe`` emits the per-month indicator columns
    # (visit_*m, btest_*m, t_month_eq_*, v_month_eq_*).
    FEATURE_VMONTHS = (0, 6, 12, 18, 24, 36, 48, 60, 72, 84)

    # UPDRS parts: columns updrs_1 .. updrs_4, encoded as target_i 1 .. 4.
    UPDRS_PARTS = (1, 2, 3, 4)

    def __init__(self, target_horizons=[0, 6, 12, 24], test_vmonths = [0, 6, 12, 18, 24, 36, 48, 60, 72, 84]):
        """Store the horizons (months ahead) and visit months to keep.

        Args:
            target_horizons: Month gaps ``target_month - visit_month`` kept in training.
            test_vmonths: Visit months kept in training.
        """
        # Copy the arguments: the default lists are created once at definition time,
        # so storing them by reference would share one list between all instances.
        self.target_horizons = list(target_horizons)
        self.test_vmonths = list(test_vmonths)

    def fit(self, proteins_df, peptides_df, clinical_df):
        """No-op kept for interface symmetry; nothing is learned from the data."""
        pass

    def fe(self, sample, proteins_df, peptides_df, clinical_df):
        """Add the feature columns to ``sample`` in place and return it.

        Args:
            sample: Frame with ``patient_id``, ``visit_month``, ``visit_id``,
                ``target_month`` and ``horizon`` columns. It is modified in place.
            proteins_df: Protein measurements (``patient_id``, ``visit_month``, ``visit_id``).
            peptides_df: Unused; kept so the signature matches the other methods.
            clinical_df: Visits (``patient_id``, ``visit_month``) defining the visit history.

        Returns:
            The same ``sample`` object with the added columns.
        """
        patient_ids = sample["patient_id"]
        visit_months = sample["visit_month"]

        for v_month in self.FEATURE_VMONTHS:
            # A month-specific flag can only be set once the current visit has reached that month.
            reached = visit_months >= v_month

            # visit_{m}m: patient has a clinical visit at month m.
            clinical_patients = clinical_df.loc[clinical_df["visit_month"] == v_month, "patient_id"].unique()
            sample[f"visit_{v_month}m"] = (patient_ids.isin(clinical_patients) & reached).astype(int)

            # btest_{m}m: patient has protein measurements at month m.
            protein_patients = proteins_df.loc[proteins_df["visit_month"] == v_month, "patient_id"].unique()
            sample[f"btest_{v_month}m"] = (patient_ids.isin(protein_patients) & reached).astype(int)

            # One-hot encodings of the target month and of the current visit month.
            sample[f"t_month_eq_{v_month}"] = (sample["target_month"] == v_month).astype(int)
            sample[f"v_month_eq_{v_month}"] = (visit_months == v_month).astype(int)

        for hor in self.target_horizons:
            sample[f"hor_eq_{hor}"] = (sample["horizon"] == hor).astype(int)

        sample["horizon_scaled"] = sample["horizon"] / 24.0

        # blood_taken: protein measurements exist for this exact visit.
        sample["blood_taken"] = sample["visit_id"].isin(proteins_df["visit_id"].unique()).astype(int)

        # Distinct visit months per patient that are not multiples of 12.
        off_cycle_visits = {}
        for pid, month in zip(clinical_df["patient_id"], clinical_df["visit_month"]):
            if month % 12 != 0:
                off_cycle_visits.setdefault(pid, set()).add(month)

        # count_non12_visits: number of such visits up to and including the current
        # visit month; patients without any recorded visit get 0.
        sample["count_non12_visits"] = np.asarray(
            [
                sum(1 for month in off_cycle_visits.get(pid, ()) if month <= visit_month)
                for pid, visit_month in zip(patient_ids, visit_months)
            ],
            dtype="int64",
        )

        return sample

    def transform_train(self, proteins_df, peptides_df, clinical_df):
        """Build the long-format training table.

        Args:
            proteins_df: Protein measurements.
            peptides_df: Peptide measurements (only filtered and forwarded to ``fe``).
            clinical_df: Clinical visits with ``visit_id``, ``patient_id``,
                ``visit_month`` and, usually, ``updrs_1`` .. ``updrs_4``.

        Returns:
            One row per (target visit, source visit, UPDRS part) with the features
            from ``fe``, ``target`` / ``target_norm`` (= target / 100) when the
            UPDRS column exists, ``target_i`` and its one-hot ``target_n_*``.
        """
        # Self-join on patient: each target visit is paired with every visit of the same patient.
        sample = clinical_df.rename({"visit_month": "target_month", "visit_id": "visit_id_target"}, axis=1).merge(
            clinical_df[["patient_id", "visit_month", "visit_id"]], how="left", on="patient_id"
        )

        sample["horizon"] = sample["target_month"] - sample["visit_month"]

        keep = sample["horizon"].isin(self.target_horizons) & sample["visit_month"].isin(self.test_vmonths)
        # Explicit copy: ``fe`` adds columns in place, which must not target a slice.
        sample = sample[keep].copy()

        # features
        sample = self.fe(
            sample,
            proteins_df[proteins_df["visit_month"].isin(self.test_vmonths)],
            peptides_df[peptides_df["visit_month"].isin(self.test_vmonths)],
            clinical_df[clinical_df["visit_month"].isin(self.test_vmonths)],
        )

        # targets reshape: one copy of the table per UPDRS part, stacked vertically
        res = []
        for tgt_i in self.UPDRS_PARTS:
            delta_df = sample.copy()
            if f"updrs_{tgt_i}" in delta_df.columns:
                delta_df["target"] = delta_df[f"updrs_{tgt_i}"]
                delta_df["target_norm"] = delta_df["target"] / 100
            delta_df["target_i"] = tgt_i
            res.append(delta_df)

        sample = pd.concat(res, axis=0).reset_index(drop=True)

        # Drop whichever raw score columns are present (no KeyError on partial input).
        updrs_cols = [f"updrs_{tgt_i}" for tgt_i in self.UPDRS_PARTS if f"updrs_{tgt_i}" in sample.columns]
        if updrs_cols:
            sample = sample.drop(updrs_cols, axis=1)

        for tgt_i in self.UPDRS_PARTS:
            sample[f"target_n_{tgt_i}"] = (sample["target_i"] == tgt_i).astype(int)

        return sample

    def transform_test(self, proteins_df, peptides_df, test_df, sub_df):
        """Build the feature table for the rows of a submission frame.

        ``prediction_id`` is expected to look like
        ``<patient>_<visit_month>_updrs_<i>_plus_<horizon>_months``.

        Args:
            proteins_df: Protein measurements seen so far.
            peptides_df: Peptide measurements seen so far (forwarded to ``fe``).
            test_df: Test visits seen so far (``patient_id``, ``visit_month``).
            sub_df: Submission frame with a ``prediction_id`` column; not modified.

        Returns:
            A frame indexed like ``sub_df`` with the same feature columns as training.
        """
        sub = sub_df.copy()
        sub["patient_id"] = sub["prediction_id"].apply(lambda x: int(x.split("_")[0]))
        sub["visit_month"] = sub["prediction_id"].apply(lambda x: int(x.split("_")[1]))
        sub["visit_id"] = sub["patient_id"].astype(str) + "_" + sub["visit_month"].astype(str)

        # Explicit copy so the column assignments below do not write into a slice of ``sub``.
        sample = sub[["patient_id", "visit_month", "visit_id", "prediction_id"]].copy()

        sample["horizon"] = sample["prediction_id"].apply(lambda x: int(x.split("_")[5]))
        sample["target_i"] = sample["prediction_id"].apply(lambda x: int(x.split("_")[3]))
        sample["target_month"] = sample["visit_month"] + sample["horizon"]
        del sample["prediction_id"]

        # Features
        sample = self.fe(sample, proteins_df, peptides_df, test_df)

        for tgt_i in self.UPDRS_PARTS:
            sample[f"target_n_{tgt_i}"] = (sample["target_i"] == tgt_i).astype(int)

        return sample

dp3 = DataPrep()
dp3.fit(proteins, peptides, clinical)

# Main clinical data: keep only rows with a known target and flag them as
# non-supplemental. dropna/assign return a new frame, so no column is ever
# assigned onto a filtered slice (which triggers SettingWithCopyWarning).
sample3 = (
    dp3.transform_train(proteins, peptides, clinical)
    .dropna(subset=["target"])
    .assign(is_suppl=0)
)
# sample3["target"].nunique() => 72

# Supplemental clinical data: same pipeline, flagged with is_suppl = 1.
sup_sample3 = (
    dp3.transform_train(proteins, peptides, supplement)
    .dropna(subset=["target"])
    .assign(is_suppl=1)
)

print(sample3.shape)
print(sup_sample3.shape)

(22216, 62)
(14728, 62)


In [13]:

class LGBClassModel1(utils.BaseModel):
    """LightGBM model that treats the target score as a class label.

    ``predict_proba`` returns the booster's per-class output and ``predict``
    reduces it to a single value per row with ``utils.opt_smape1p``.
    """

    def __init__(self, params, features) -> None:
        # params: LightGBM parameters plus an "n_estimators" entry (number of boosting rounds).
        # features: list of column names, or None to pick them from the training frame in fit().
        self.features = features
        self.params = params

    def fit(self, df_train):
        """Train the booster on ``df_train[self.features]`` against ``df_train["target"]``."""
        if self.features is None:
            # Fallback: use every column whose name starts with "v_".
            self.features = [name for name in df_train.columns if name.startswith("v_")]

        train_set = lgb.Dataset(df_train[self.features], df_train["target"])

        # "n_estimators" is passed to lgb.train as num_boost_round instead of
        # being left inside the parameter dict.
        booster_params = dict(self.params)
        n_rounds = booster_params.pop("n_estimators")
        self.m_gbm = lgb.train(booster_params, train_set, num_boost_round=n_rounds)
        return self

    def predict_proba(self, df_valid):
        """Return the booster output for ``df_valid``.

        With the multiclass objective configured in this notebook the output has
        one row per sample and one column per class, e.g. shape (64, 87).
        """
        return self.m_gbm.predict(df_valid[self.features])

    def predict(self, df_valid):
        """Return one prediction per row, derived from the class probabilities.

        NOTE(review): ``utils.opt_smape1p`` is defined outside this file; it
        presumably picks the SMAPE+1-optimal value per row — confirm in utils.
        """
        class_probs = self.predict_proba(df_valid)
        return utils.opt_smape1p(class_probs)

# LightGBM configuration. "n_estimators" is consumed by LGBClassModel1.fit as the
# number of boosting rounds; everything else is handed to LightGBM unchanged.
params = dict(
    boosting_type='gbdt',
    objective='multiclass',
    # Targets in the combined training data range from 0 to 86 => 87 classes.
    num_class=87,
    n_estimators=350,  # 300 # 1 my_

    learning_rate=0.019673004699536346,
    num_leaves=208,
    max_depth=14,
    min_data_in_leaf=850,
    feature_fraction=0.5190632906197453,
    lambda_l1=7.405660751699475e-08,
    lambda_l2=0.14583961675675494,
    max_bin=240,

    verbose=-1,
    force_col_wise=True,
    n_jobs=-1,
)

# The 10 input columns of the LightGBM model (order is kept as tuned).
features = [
    "target_i", "target_month", "horizon", "visit_month", "visit_6m", "blood_taken",
    "visit_18m", "is_suppl",
    "count_non12_visits",
    "visit_48m",
]

# Combined training data facts:
#   pd.concat([sample3, sup_sample3], axis=0)["target"].nunique() => 75
#   min target => 0.0, max target => 86.0

# Train on main + supplemental samples together.
model_lgb = LGBClassModel1(params, features).fit(pd.concat([sample3, sup_sample3], axis=0))
    

In [14]:
class NNRegModel1(utils.BaseModel):
    """Bag of neural-network regressors trained through ``utils.run_train``.

    ``cfg.bag_size`` models are trained on the same frame and their predictions
    are combined according to ``cfg.bag_agg_function``.
    """

    def __init__(self, cfg, features=None) -> None:
        # ``features`` is accepted but not stored; the feature list is read from ``cfg``.
        self.cfg = cfg

    def fit(self, df_train):
        """Train ``cfg.bag_size`` models on ``df_train`` and keep them in ``self.models``."""
        trained = []
        for _ in range(self.cfg.bag_size):
            trained.append(utils.run_train(self.cfg, df_train, None, None, verbose=False))
        self.models = trained
        return self

    def predict(self, df_valid):
        """Predict with every bagged model and aggregate across models.

        "max" and "median" select the matching reduction; any other value of
        ``cfg.bag_agg_function`` falls back to the mean.
        """
        stacked = np.vstack([utils.run_test(member, self.cfg, df_valid) for member in self.models])

        how = self.cfg.bag_agg_function
        reducer = np.max if how == "max" else np.median if how == "median" else np.mean
        return reducer(stacked, axis=0)


# Configuration namespace consumed by utils.run_train / utils.run_test.
cfg = SimpleNamespace(**{})

cfg.tr_collate_fn = None
cfg.val_collate_fn = None

cfg.target_column = "target_norm"
cfg.output_dir = "results/nn_temp"
cfg.seed = -1  # NOTE(review): presumably "no fixed seed" in utils — confirm
cfg.eval_epochs = 1
cfg.mixed_precision = True # True my_
# Use the GPU when the kernel has one and fall back to CPU otherwise, instead of
# failing on a CPU-only session.
# NOTE(review): confirm utils handles mixed_precision=True on CPU.
cfg.device = "cuda" if torch.cuda.is_available() else "cpu"

cfg.n_classes = 1
cfg.batch_size = 64 # 128, 64 my_
cfg.batch_size_val = 256
cfg.n_hidden = 64
cfg.n_layers = 2 #3
cfg.num_workers = 2
cfg.drop_last = False
cfg.gradient_clip = 1.0

cfg.bag_size = 1
cfg.bag_agg_function = "mean"
cfg.lr = 0.02196 # .002196
cfg.warmup = 0
cfg.epochs = 200 # 10 # 200 my_

# Feature list, in order (33 columns):
#   visit_6m, visit_18m, visit_48m,
#   t_month_eq_{0,6,12,18,24,36,48,60,72,84},
#   v_month_eq_{0,6,12,18,24,36,48,60,72,84},
#   hor_eq_{0,6,12,24},
#   target_n_{1,2,3,4},
#   is_suppl, horizon_scaled
# 'count_non12_visits' and 'blood_taken' are deliberately not included.
cfg.features = ["visit_6m", "visit_18m", "visit_48m"]
for prefix in ("t_month_eq_", "v_month_eq_", "hor_eq_", "target_n_"):
    cfg.features += [c for c in sample3.columns if c.startswith(prefix)]
cfg.features += ["is_suppl"]
cfg.features += ["horizon_scaled"]

# Training frame: main + supplemental samples, with count_non12_visits min-max
# scaled (the 1e-3 keeps the denominator non-zero).
concat_ = pd.concat([sample3, sup_sample3], axis=0)
x, y = concat_['count_non12_visits'].min(), concat_['count_non12_visits'].max()
concat_['count_non12_visits'] = (concat_['count_non12_visits'] - x) / ((y - x) + 1e-3)

model_nn = NNRegModel1(cfg)
model_nn = model_nn.fit(concat_)




epoch 1 and total loss 1.3019481373360087
epoch 2 and total loss 1.2662018811010611
epoch 3 and total loss 1.2653088657188665
epoch 4 and total loss 1.2646043762642134
epoch 5 and total loss 1.267188469500021
epoch 6 and total loss 1.2659629824824723
epoch 7 and total loss 1.2613689629570863
epoch 8 and total loss 1.2620420490021929
epoch 9 and total loss 1.2619297771716242
epoch 10 and total loss 1.261253144805105
epoch 11 and total loss 1.2606869500317648
epoch 12 and total loss 1.260913731268828
epoch 13 and total loss 1.2633359317039905
epoch 14 and total loss 1.2631099529344867
epoch 15 and total loss 1.2627506563423825
epoch 16 and total loss 1.2617471587668259
epoch 17 and total loss 1.262698964756531
epoch 18 and total loss 1.2609749957789995
epoch 19 and total loss 1.2589691263757503
epoch 20 and total loss 1.2603133627870466
epoch 21 and total loss 1.2619670758627524
epoch 22 and total loss 1.2605813382971431
epoch 23 and total loss 1.2613128848515929
epoch 24 and total loss 

In [15]:
# Create the competition API environment and the iterator over test batches.
# The inference loop below unpacks each batch as
# (test_df, test_peptides, test_proteins, sample_submission) and answers it
# with env.predict(...).
env = amp_pd_peptide.make_env() # my_
iter_test = env.iter_test() # my_
# tmp = list(iter_test) # my_

In [16]:
def repl(x1, x2, cond):
    """Return a copy of ``x1`` in which the entries selected by ``cond`` come from ``x2``.

    ``x1`` itself is left untouched.
    """
    merged = x1.copy()
    merged[cond] = x2[cond]
    return merged

# Everything the API has served so far; pd.concat silently drops the initial None.
all_test_peptides = None
all_test_proteins = None
all_test_df = None

# Min/max of the raw training counts. Test batches are scaled with these training
# statistics (not with each batch's own min/max), so the scaled feature means the
# same thing at train and test time and does not collapse to 0 for a batch whose
# counts are all equal.
train_non12_counts = pd.concat(
    [sample3["count_non12_visits"], sup_sample3["count_non12_visits"]], axis=0
)
non12_min, non12_max = train_non12_counts.min(), train_non12_counts.max()

for (test_df, test_peptides, test_proteins, sample_submission) in iter_test:
    # Accumulate every batch: transform_test derives the visit-history features
    # from the accumulated frames.
    # test_df columns (example file): visit_id, visit_month, patient_id, updrs_test, row_id, group_key
    all_test_df = pd.concat([all_test_df, test_df], axis=0)
    all_test_proteins = pd.concat([all_test_proteins, test_proteins], axis=0)
    all_test_peptides = pd.concat([all_test_peptides, test_peptides], axis=0)

    # sample_submission rows look like "3342_0_updrs_1_plus_0_months";
    # sample_test has one row per submission row, e.g. shape (64, 57).
    sample_test = dp3.transform_test(all_test_proteins, all_test_peptides, all_test_df, sample_submission)
    sample_test["is_suppl"] = 0

    # LightGBM was trained on the raw count_non12_visits, so predict before rescaling it.
    sample_test["preds_lgb"] = model_lgb.predict(sample_test)

    sample_test['count_non12_visits'] = (
        (sample_test['count_non12_visits'] - non12_min) / ((non12_max - non12_min) + 1e-3)
    )

    # NN predictions are clipped at 0 and rounded before blending.
    sample_test["preds_nn"] = np.round(np.clip(model_nn.predict(sample_test), 0, None))

    # Weighted blend of the two models (0.52 LightGBM / 0.48 NN), rounded to integers.
    sample_submission["rating"] = np.round( (sample_test["preds_lgb"]*0.52 + sample_test["preds_nn"]*0.48) )

    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [17]:
# 1.2540

In [18]:
# Read back /kaggle/working/submission.csv (presumably written by the competition
# API during env.predict — confirm) and display it as a sanity check.
sub = pd.read_csv('/kaggle/working/submission.csv')
sub

Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,5.0
1,3342_0_updrs_1_plus_6_months,6.0
2,3342_0_updrs_1_plus_12_months,5.0
3,3342_0_updrs_1_plus_24_months,6.0
4,3342_0_updrs_2_plus_0_months,4.0
...,...,...
59,50423_6_updrs_3_plus_24_months,22.0
60,50423_6_updrs_4_plus_0_months,0.0
61,50423_6_updrs_4_plus_6_months,0.0
62,50423_6_updrs_4_plus_12_months,0.0
