In [20]:
import pandas as pd
import numpy as np
import random
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import yaml

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import statsmodels.regression.linear_model as sm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, CompoundKernel
import sklearn_relief as sr
from skrebate import ReliefF
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import lightgbm as ltb
from sklearn.svm import SVR
from scipy.stats import ks_2samp
from tabulate import tabulate

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.multioutput import RegressorChain, MultiOutputRegressor

import torch
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(42)

from helper import preprocess, countUsers, get_features_relieff, get_features_ref_single, get_features_ref,\
    get_features_kbest, outlier_detect, cross_val, get_scores

%matplotlib inline


In [21]:
df = pd.read_csv('data/X_train.csv', sep = ',',decimal = '.', encoding = 'utf-8', engine ='python', index_col=0)

In [22]:
# Read common variables from a YAML file
with open('../common_variables.yaml', 'r') as file:
    common_data = yaml.safe_load(file)

In [23]:
target_variable = 'hdl_12m'
df, X_train, X_test, Y_train, Y_test, X, Y, scaler, df_missing_val, df_missing_val_original, df_original = preprocess(df, 0.25, target_variable)
df = df.drop(['ldl_12m', 'hba1c_12m', 'hdl_12m', 'days_hdl', 'days_ldl', 'days_hba1c', 'days_bmi'], axis=1)
X_train = X_train.drop(['ldl_12m', 'hba1c_12m', 'bmi_12m', 'days_hdl', 'days_ldl', 'days_hba1c', 'days_bmi'], axis=1)
X_test = X_test.drop(['ldl_12m', 'hba1c_12m', 'bmi_12m', 'days_hdl', 'days_ldl', 'days_hba1c', 'days_bmi'], axis=1)

Shape of data : (2660, 127)
Shape of data after excluding missing response: (702, 127)
Shape of full data after selecting date range dates > 21 days (475, 118)


In [24]:
df_missing_val

Unnamed: 0,id,init_year,drug_class,Lower_MD_mmol_mol,Upper_MD_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,...,dg132,dg133,n_of_dis,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,bmi_12m
4868,0.003788,0.750,1.0,0.881795,0.665044,0.171717,0.238095,1.0,0.782609,0.472222,...,0.0,0.0,0.181818,0.723404,0.212682,0.102099,0.102564,0.184211,0.520833,0.147700
6963,0.005355,0.000,0.0,1.000000,0.387942,0.101010,0.000000,1.0,0.608696,0.083333,...,0.0,0.0,0.181818,0.996960,0.262219,0.102099,0.136890,0.315789,0.270833,0.227964
13607,0.061374,0.750,0.0,1.000000,0.387942,0.050505,0.095238,1.0,0.550725,0.166667,...,0.0,0.0,0.909091,0.954407,0.208058,0.102099,0.133995,0.184211,0.333333,0.391503
1550,0.001214,0.750,1.0,0.881795,0.665044,0.191919,0.396825,1.0,0.753623,0.388889,...,0.0,0.0,0.181818,0.209726,0.188243,0.102099,0.136890,0.355263,0.270833,0.212937
12953,0.009929,0.500,0.0,1.000000,0.387942,0.252525,0.238095,0.0,0.666667,0.166667,...,0.0,0.0,0.090909,0.805471,0.212682,0.102099,0.113730,0.328947,0.916667,0.147700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12024,0.009194,0.500,0.0,0.845424,0.443363,0.070707,0.253968,1.0,0.826087,0.166667,...,0.0,0.0,0.363636,0.844985,0.350066,0.102099,0.007444,0.697368,0.458333,0.271400
4446,0.003433,0.625,0.0,1.000000,0.387942,0.151515,0.031746,0.0,0.956522,0.027778,...,0.0,0.0,0.363636,0.376900,0.212682,0.102099,0.136890,0.223684,0.270833,0.147700
5306,0.004093,0.875,1.0,0.881795,0.665044,0.131313,0.015873,1.0,0.536232,0.166667,...,1.0,0.0,0.272727,1.057751,0.215984,0.102099,0.148056,0.276316,0.312500,0.296841
13507,0.058918,0.125,0.0,1.000000,0.387942,0.181818,0.047619,1.0,0.420290,0.083333,...,0.0,0.0,0.090909,1.000000,0.212682,0.102099,0.184036,0.315789,0.604167,0.147700


In [25]:
# # train with whole dataset and test with drug class 2,3 and 4 data
is_train_with_all=False
if(is_train_with_all):
    combined_df = pd.concat([X_test, Y_test], axis=1)
    testdf = combined_df[(combined_df['drug_class'] == 0.25) | 
                         (combined_df['drug_class'] == 0.375) ]
    X_test = testdf.drop([response_variable], axis = 1)
    Y_test = testdf[response_variable]
    
X_test_original = X_test.copy()

In [26]:
""""
Use drug_class

2=GLP-1 analogues (A10BJ)
3=DPP-4 inhibitors (A10BH)
4=SGLT2 inhibitors (A10BK)
"""

if(is_train_with_all):
    sglt_val = 0.375
    dpp_val = 0.25
else:
    sglt_val = 1
    dpp_val = 0


X_test_ = pd.DataFrame(X_test)
X_train_ = pd.DataFrame(X_train)

X_train = X_train.drop(['init_year'], axis = 1)
X_test = X_test.drop(['init_year'], axis = 1)

print('==== sample count in preprocessed data =======')
print(' number of dpp4 : ', countUsers(3, df))
print(' number of sglt2 : ', countUsers(4, df))

print('==== sample count in training data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_train_))
print(' number of sglt2 : ', countUsers(sglt_val, X_train_))

print('==== sample count in testing data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_test_))
print(' number of sglt2 : ', countUsers(sglt_val, X_test_))

 number of dpp4 :  232
 number of sglt2 :  242
 number of dpp4 :  168
 number of sglt2 :  187
 number of dpp4 :  64
 number of sglt2 :  55


In [27]:
# TODO FROM HERE

# feature selection
items = [
#     'sp',
#     'smoking',
    ]
k = 10 # Select top 25 features
    
random.seed(42)

#feats = get_features_ref_single(X_train, Y_train, 8)
#feats = get_features_relieff(X_train, Y_train['hdl_12m'] ,6)
#feats = get_features_kbest(X_train, Y_train,8)
feats = ['drug_class', 'MD_RCT_mmol_mol', 'hba1c_bl_18m', 'ldl', 'hdl', 'gluk', 'met_oad0'] # k best 8

print(feats)
selected_features=feats
        
X_train = X_train[selected_features]
X_test = X_test[selected_features]
number_of_features = len(selected_features)

['drug_class', 'Lower_MD_mmol_mol', 'Upper_MD_mmol_mol', 'hba1c_bl_18m', 'ldl', 'hdl', 'gluk', 'met_oad0']


  y = column_or_1d(y, warn=True)


In [28]:
################# OUTLIER CODE ################
print('Shape of training data before removing outliers:', np.shape(X_train))
print('Shape of test data before removing outliers:', np.shape(X_test))
    
out_train, out_test = outlier_detect(X_train, Y_train, X_test, Y_test)
response_variable_list = [target_variable]

train_ = X_train.copy()
train_[response_variable_list] = Y_train.values
    
test_ = X_test.copy()
test_[response_variable_list] = Y_test.values
    
train_ = pd.DataFrame(train_.drop(out_train, axis = 0))
test_ = pd.DataFrame(test_.drop(out_test, axis = 0))
    
Y_train = train_[response_variable_list]
X_train = train_.drop(response_variable_list, axis=1)
    
Y_test = test_[response_variable_list]
X_test = test_.drop(response_variable_list, axis=1)
    
print('Shape of training data after removing outliers:', np.shape(X_train))
print('Shape of test data after removing outliers:', np.shape(X_test))

################

Shape of training data before removing outliers: (355, 8)
Shape of test data before removing outliers: (119, 8)
Training set outliers: []
Testing set outliers: [7132]
Shape of training data after removing outliers: (355, 8)
Shape of test data after removing outliers: (118, 8)


In [29]:
train = X_train.copy()
train[response_variable_list] = Y_train[response_variable_list].copy()


# Models

In [30]:

model = XGBRegressor(
    n_estimators=40, 
    eta=0.05, 
    subsample=0.9, 
    colsample_bytree=1,
    alpha=0.1,
    max_depth = 10,
    max_leaves = 8,
    learning_rate =0.1
)

#model = CatBoostRegressor(iterations=20,learning_rate=0.1, depth=6)

#model = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=123)
#model = MLPRegressor(random_state=123, max_iter=2000,hidden_layer_sizes = 250,learning_rate= 'adaptive')

model = cross_val(model, train , X_train, Y_train, response_variable_list)
model.fit(X_train, Y_train)
# make a prediction

yhat = model.predict(X_test)
# summarize prediction
print(yhat[1])
original_data_pred, model_results, model_results_drugs_ori, score_ori = get_scores(model, X_test, Y_test, X_train, Y_train)


Cross validation variance 0.04007642643740578
Cross validation mean score 0.4867083740654893
1.3833172
R2 score Training : 0.7337099762450296
R2 score Testing : 0.6741254946598847
RMSE (Target): 0.20075276485102797


In [12]:
df_missing_val = df_missing_val[selected_features]
mv_pred_test_numpy = model.predict(df_missing_val)


In [13]:
len(mv_pred_test_numpy)

1958

In [14]:
df_missing_val_original['hdl_12m'] = mv_pred_test_numpy

In [15]:
df_missing_val_original['hdl_12m']

4868     1.469387
6963     1.035665
13607    1.058428
1550     1.355858
12953    1.121261
           ...   
12024    1.047101
4446     1.113280
5306     1.352804
13507    1.022585
13427    1.109180
Name: hdl_12m, Length: 1958, dtype: float32

In [16]:
df_original

Unnamed: 0,id,init_year,drug_class,Lower_MD_mmol_mol,Upper_MD_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,...,date_bmi_12m,date_hdl_12m,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,hdl_12m,bmi_12m
8087,106358,2018,3,-6.3394,-5.1371,50.0,55.0,1,83.0,4,...,,2019-10-23,380.0,,380.0,380.0,50.0,2.2,1.06,
6448,84221,2019,4,-7.7603,-4.5906,60.0,58.0,1,56.0,13,...,2020-11-05,2020-04-07,313.0,357.0,229.0,313.0,59.0,1.7,1.47,35.154137
7758,101936,2019,4,-12.8974,-4.9185,50.0,55.0,2,56.0,9,...,2020-08-18,2019-10-31,264.0,301.0,2628.0,,37.0,,1.47,31.770000
5968,78066,2017,4,-7.7603,-4.5906,49.0,75.0,2,70.0,4,...,2018-10-08,2018-10-04,402.0,404.0,402.0,402.0,83.0,3.7,1.02,33.692711
13659,1058511,2019,4,-12.8974,-4.9185,,58.0,1,67.0,3,...,,2020-10-21,490.0,,380.0,380.0,66.0,4.1,1.54,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13625,1052922,2016,4,-7.7603,-4.5906,68.0,69.0,1,42.0,5,...,2016-11-24,2017-06-30,361.0,106.0,402.0,402.0,49.0,3.6,1.34,34.651993
180,2496,2020,4,-6.8859,-4.3720,76.0,82.0,1,79.0,28,...,,2021-01-29,331.0,,331.0,331.0,97.0,2.4,0.92,
10528,136698,2013,3,-6.3394,-5.1371,71.0,61.0,2,65.0,4,...,,2014-02-05,99.0,,99.0,99.0,92.0,2.1,0.89,
4548,59911,2014,3,-6.3394,-5.1371,85.0,,2,77.0,6,...,2015-06-11,2014-11-11,,339.0,306.0,306.0,74.0,1.9,0.81,41.290001


In [17]:
result_df = pd.concat([df_original, df_missing_val_original])

In [18]:
result_df.to_csv('data/mvhdl.csv', index=True)

In [19]:
result_df[['hdl_12m']]

Unnamed: 0,hdl_12m
8087,1.060000
6448,1.470000
7758,1.470000
5968,1.020000
13659,1.540000
...,...
12024,1.047101
4446,1.113280
5306,1.352804
13507,1.022585
