In [None]:
# Mount Google Drive at /content/drive; the data directory used by the
# later cells (ROOT) lives underneath this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
pip install pydicom

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pydicom
  Downloading pydicom-2.3.1-py3-none-any.whl (2.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 20.7 MB/s eta 0:00:00
Installing collected packages: pydicom
Successfully installed pydicom-2.3.1


In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, GroupKFold

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

In [None]:
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

In [None]:
def seed_everything(seed=2020):
    """Apply one seed to every random source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds Python's ``random`` module, NumPy's global generator and
    TensorFlow's global generator with the same value.

    Args:
        seed: Integer seed handed to each generator (default 2020).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)
    
# Seed all random sources with 42 before any data handling or model fitting.
seed_everything(42)

In [None]:
# Directory on the mounted Drive that the CSV inputs are read from and the
# submission file is written to. Note the trailing slash: paths are built
# by plain string concatenation.
ROOT = "/content/drive/My Drive/FYP/BayesianRidge/"
# NOTE(review): BATCH_SIZE is not referenced in any of the visible cells —
# confirm whether it is still needed.
BATCH_SIZE= 128

In [None]:
# Load the training rows, the test rows and the sample submission from ROOT.
df_train = pd.read_csv(ROOT+'train.csv')
df_test = pd.read_csv(ROOT+'test.csv')
sub = pd.read_csv(ROOT+'sample_submission.csv')

# Size check. The training file holds several rows per patient, so the
# number of distinct patients is reported alongside the raw row count.
print('Train shape: ', df_train.shape)
print('Number of unique patients in train: {}'.format(df_train['Patient'].nunique()))
print('Test shape:', df_test.shape)

Train shape:  (1549, 7)
Number of unique customers in train: 176
Test shape: (5, 7)


In [None]:
# Baseline table: the first row encountered for each patient, reduced to the
# measurement columns and renamed with a `base_` prefix so it can be joined
# back onto every visit of that patient.
df_base = (
    df_train
    .drop_duplicates(subset='Patient', keep='first')
    .loc[:, ['Patient', 'Weeks', 'FVC', 'Percent', 'Age']]
    .rename(columns={
        'Weeks': 'base_week',
        'Percent': 'base_percent',
        'Age': 'base_age',
        'FVC': 'base_FVC',
    })
)
df_base.head(3)

Unnamed: 0,Patient,base_week,base_FVC,base_percent,base_age
0,ID00007637202177411956430,-4,2315,58.253649,79
9,ID00009637202177434476278,8,3660,85.282878,69
18,ID00010637202177584971671,0,3523,94.724672,60


In [None]:
# Running visit counter: start every row at 1, then accumulate within each
# patient so a patient's rows are numbered 1, 2, 3, ... in their current order.
df_train['visit'] = 1
df_train['visit'] = df_train.groupby('Patient')['visit'].cumsum()
# Keep only rows whose counter is positive.
df_train = df_train[df_train['visit'] > 0]

In [None]:
# Attach each patient's baseline columns (base_week, base_FVC, ...) to every
# one of that patient's rows; the left join keeps all training rows.
df_train = df_train.merge(df_base, how='left', on='Patient')
print(df_train.shape)
df_train.head(3)

(1549, 12)


Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,visit,base_week,base_FVC,base_percent,base_age
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,1,-4,2315,58.253649,79
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,2,-4,2315,58.253649,79
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,3,-4,2315,58.253649,79


In [None]:
# Weeks elapsed between this measurement and the patient's baseline row.
df_train['weeks_passed'] = df_train['Weeks'].sub(df_train['base_week'])
# One-hot encode the two categorical columns.
df_train = pd.get_dummies(df_train, columns=['Sex', 'SmokingStatus'])

# Patient_Week is "<patient id>_<week>": piece 0 is the patient id and
# piece 1 is the (possibly negative) week number.
sub['Patient'] = sub['Patient_Week'].str.split('_').str[0]
sub['Weeks'] = sub['Patient_Week'].str.split('_').str[1].astype(int)
sub.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12
1,ID00421637202311550012437_-12,2000,100,ID00421637202311550012437,-12
2,ID00422637202311677017371_-12,2000,100,ID00422637202311677017371,-12
3,ID00423637202312137826377_-12,2000,100,ID00423637202312137826377,-12
4,ID00426637202313170790466_-12,2000,100,ID00426637202313170790466,-12


In [None]:
# Each test row describes one patient's baseline measurement, so its columns
# take the same `base_` names used on the training side.
test_baseline_names = {
    'Weeks': 'base_week',
    'Percent': 'base_percent',
    'Age': 'base_age',
    'FVC': 'base_FVC',
}
df_test = df_test.rename(columns=test_baseline_names)

# Expand to one row per requested Patient_Week by joining the parsed
# submission template onto the baseline rows.
df_test = sub.merge(df_test, how='right', on='Patient')
df_test = pd.get_dummies(df_test, columns=['Sex', 'SmokingStatus'])
df_test['weeks_passed'] = df_test['Weeks'].sub(df_test['base_week'])
df_test.head()

Unnamed: 0,Patient_Week,FVC,Confidence,Patient,Weeks,base_week,base_FVC,base_percent,base_age,Sex_Male,SmokingStatus_Ex-smoker,SmokingStatus_Never smoked,weeks_passed
0,ID00419637202311204720264_-12,2000,100,ID00419637202311204720264,-12,6,3020,70.186855,73,1,1,0,-18
1,ID00419637202311204720264_-11,2000,100,ID00419637202311204720264,-11,6,3020,70.186855,73,1,1,0,-17
2,ID00419637202311204720264_-10,2000,100,ID00419637202311204720264,-10,6,3020,70.186855,73,1,1,0,-16
3,ID00419637202311204720264_-9,2000,100,ID00419637202311204720264,-9,6,3020,70.186855,73,1,1,0,-15
4,ID00419637202311204720264_-8,2000,100,ID00419637202311204720264,-8,6,3020,70.186855,73,1,1,0,-14


In [None]:
# Feature columns present in train but absent from test. One-hot encoding
# only creates columns for categories that occur in the frame, so categories
# that never appear among the test patients have no column there.
train_feature_columns = df_train.drop(['Patient', 'FVC', 'Percent', 'Age', 'visit'], axis = 1).columns
missing_columns = np.setdiff1d(train_feature_columns, df_test.columns)
if len(missing_columns) > 0:
    # The backslash is escaped explicitly: a bare "\ " is an invalid escape
    # sequence that newer Python versions warn about. Printed text is unchanged.
    print('/!\\ Missing columns in test: ', missing_columns)
    # Add each absent column as all zeros so test has the same columns as train.
    for col in missing_columns:
        df_test[col] = 0

/!\ Missing columns in test:  ['Sex_Female' 'SmokingStatus_Currently smokes']


In [None]:
def OSIC_metric(y_true, y_pred, y_pred_std):
    """Mean clipped Laplace log-likelihood score (higher is better).

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.
        y_pred_std: Predicted uncertainty for each prediction.

    Returns:
        The mean over samples of
        ``-(sqrt(2) * error / sigma) - log(sqrt(2) * sigma)``, where
        ``error`` is the absolute error capped at 1000 and ``sigma`` is the
        predicted uncertainty floored at 70.
    """
    root_two = np.sqrt(2)
    sigma = np.clip(y_pred_std, 70, np.inf)
    error = np.clip(abs(y_true - y_pred), 0, 1000)
    per_sample = -(root_two * error / sigma) - np.log(root_two * sigma)
    return np.mean(per_sample)

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import BayesianRidge

class Model():
    """Patient-grouped cross-validation around a regressor with predictive std.

    The wrapped regressor must support ``fit(X, y)`` and
    ``predict(X, return_std=True)`` returning ``(mean, std)``.
    """

    def __init__(self, model=None, n_splits=2):
        """Store the regressor, the fold splitter and the feature column list.

        Args:
            model: Regressor to wrap. When ``None`` a new
                ``BayesianRidge(alpha_1=0.1, alpha_2=0.1, lambda_1=0.03,
                lambda_2=0.01)`` is created for this instance.
            n_splits: Number of ``GroupKFold`` folds.
        """
        # Build the default estimator here rather than in the signature: a
        # default evaluated in the signature is created once and would be
        # shared (and refit) by every Model() instance.
        if model is None:
            model = BayesianRidge(alpha_1=0.1, alpha_2=0.1, lambda_1=0.03, lambda_2=0.01)
        self.regressor = model
        self.n_splits = n_splits
        self.gkf = GroupKFold(n_splits=n_splits)
        self.train_cols = ['Weeks', 'base_week', 'base_FVC', 
                           'base_percent', 'base_age', 'weeks_passed', 'Sex_Female',
                           'Sex_Male', 'SmokingStatus_Currently smokes', 
                           'SmokingStatus_Ex-smoker', 'SmokingStatus_Never smoked']
    
    def fit(self, X, y):
        """Fit the wrapped regressor on features ``X`` and targets ``y``."""
        self.regressor.fit(X, y)
            
    def predict(self, X):
        """Return the regressor's ``(mean, std)`` prediction pair for ``X``."""
        pred = self.regressor.predict(X, return_std=True)        
        return pred
    
    def fit_predict_cv(self, df, df_test=None):
        """Run grouped cross-validation and optionally predict a test frame.

        Folds are split by the ``Patient`` column, so all rows of a patient
        fall in the same fold. The regressor is refit on each fold's training
        part; per-fold and out-of-fold ``OSIC_metric`` scores are printed.

        Args:
            df: Training frame containing ``self.train_cols``, ``'Patient'``
                and the target column ``'FVC'``.
            df_test: Optional frame containing ``self.train_cols``. ``None``
                or an empty frame skips test prediction.

        Returns:
            dict with ``'oof'`` and ``'oof_std'`` (out-of-fold mean and std,
            one entry per row of ``df``) and, when test rows were given,
            ``'pred_sub'`` and ``'pred_sub_std'`` (fold-averaged mean and std).
        """
        has_test = df_test is not None and len(df_test) > 0
        
        oof = np.zeros((len(df), ))
        oof_std = np.zeros_like(oof)
        
        if has_test:
            # The test features do not change between folds; select them once.
            X_test = df_test[self.train_cols]
            pred_sub = np.zeros((len(df_test), self.n_splits))
            pred_sub_std = np.zeros_like(pred_sub)
        
        target = 'FVC'
        
        for i, (train_idx, val_idx) in enumerate(self.gkf.split(df, groups=df['Patient'])):
            # split() yields positional indices, so rows are taken by position.
            # Label-based lookup would only be correct for a 0..n-1 index.
            train_rows = df.iloc[train_idx]
            val_rows = df.iloc[val_idx]
            X_train, y_train = train_rows[self.train_cols], train_rows[target]
            X_val, y_val = val_rows[self.train_cols], val_rows[target]
            
            self.fit(X_train, y_train)
            
            pred_train, pred_train_std = self.predict(X_train)
            pred_val, pred_val_std = self.predict(X_val)
            
            if has_test:
                pred_sub[:, i], pred_sub_std[:, i] = self.predict(X_test)
            
            oof[val_idx] = pred_val
            oof_std[val_idx] = pred_val_std
            print('Train score: {0:.2f} | Test score: {1:.2f}'.format(OSIC_metric(y_train, pred_train, pred_train_std),
                                                                    OSIC_metric(y_val, pred_val, pred_val_std)))
        print('OOF score: {0:.4f}'.format(OSIC_metric(df[target], oof, oof_std)))
        res = dict()
        res['oof'] = oof
        res['oof_std'] = oof_std
        
        if has_test:
            # Average the per-fold predictions (and their stds) across folds.
            res['pred_sub'] = pred_sub.mean(axis=1)
            res['pred_sub_std'] = pred_sub_std.mean(axis=1)
        
        return res
# Cross-validate the default model on the training frame; because the test
# frame is passed as well, `res` also holds the fold-averaged test predictions.
fvc_model = Model()
res = fvc_model.fit_predict_cv(df_train, df_test)

Train score: -6.67 | Test score: -6.73
Train score: -6.71 | Test score: -6.68
OOF score: -6.7055


In [None]:
# Overwrite the placeholder FVC / Confidence values carried over from the
# sample submission with the fold-averaged prediction and its predicted std.
df_test = df_test.assign(FVC=res['pred_sub'], Confidence=res['pred_sub_std'])

# Rebuild the submission in the sample file's row order by joining the
# predictions onto its Patient_Week keys.
predicted_columns = df_test[['Patient_Week', 'FVC', 'Confidence']]
submission = pd.merge(sub[['Patient_Week']], predicted_columns, how='left', on='Patient_Week')
submission.head()

Unnamed: 0,Patient_Week,FVC,Confidence
0,ID00419637202311204720264_-12,3032.728801,228.1754
1,ID00421637202311550012437_-12,2800.319445,226.908005
2,ID00422637202311677017371_-12,1978.540662,227.09469
3,ID00423637202312137826377_-12,3345.307117,228.793297
4,ID00426637202313170790466_-12,2913.295962,227.879964


In [None]:
# ROOT already ends with "/", so the file name is appended directly (matching
# how the input CSV paths are built) instead of producing a "//" in the path.
submission.to_csv(ROOT+"submission_BayesianRidge.csv", index=False)