In [1]:
import pandas as pd
import numpy as np
import random
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import yaml

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import statsmodels.regression.linear_model as sm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, CompoundKernel
import sklearn_relief as sr
from skrebate import ReliefF
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import lightgbm as ltb
from sklearn.svm import SVR
from scipy.stats import ks_2samp
from tabulate import tabulate

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.multioutput import RegressorChain, MultiOutputRegressor

import torch
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(42)

from helper import preprocess, countUsers, get_features_relieff, get_features_ref_single, get_features_ref,\
    get_features_kbest, outlier_detect, cross_val, get_scores

%matplotlib inline


In [2]:
df = pd.read_csv('data/X_train.csv', sep = ',',decimal = '.', encoding = 'utf-8', engine ='python', index_col=0)

In [3]:
# Read common variables from a YAML file
with open('../common_variables.yaml', 'r') as file:
    common_data = yaml.safe_load(file)

In [4]:
target_variable = 'ldl_12m'
df, X_train, X_test, Y_train, Y_test, X, Y, scaler, df_missing_val, df_missing_val_original, df_original = preprocess(df, 0.25, target_variable)
df = df.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)
X_train = X_train.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)
X_test = X_test.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)

Shape of data : (2660, 126)
Shape of data after excluding missing response: (1566, 126)
Shape of full data after selecting date range dates > 21 days (1084, 117)


In [5]:
df_missing_val

Unnamed: 0,id,init_year,drug_class,Lower_MD_mmol_mol,Upper_MD_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,...,dg132,dg133,n_of_dis,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,hdl_12m,bmi_12m
6963,0.005358,0.000,0.0,1.000000,0.387942,0.152381,0.000000,1.0,0.616438,0.083333,...,0.0,0.0,0.153846,0.996951,0.176597,0.102099,0.14585,0.247312,0.199153,0.000281
1550,0.001216,0.750,1.0,0.881795,0.665044,0.238095,0.390625,1.0,0.753425,0.388889,...,0.0,0.0,0.153846,0.207317,0.128590,0.102099,0.14585,0.279570,0.199153,0.000262
7758,0.005990,0.750,1.0,0.454437,0.498783,0.142857,0.031250,1.0,0.506849,0.250000,...,0.0,0.0,0.538462,0.692073,0.123446,0.827290,0.14585,0.010753,0.364407,0.000350
3104,0.002381,0.375,1.0,0.881795,0.665044,0.152381,0.125000,0.0,0.671233,0.527778,...,0.0,0.0,0.230769,0.868902,0.130304,0.102099,0.14585,0.225806,0.199153,0.000370
6673,0.005122,0.000,0.0,1.000000,0.387942,0.152381,0.171875,1.0,0.534247,0.111111,...,0.0,0.0,0.076923,1.000000,0.150450,0.102099,0.14585,0.193548,0.199153,0.000274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.000210,0.625,0.0,1.000000,0.387942,0.152381,0.234375,0.0,0.452055,0.083333,...,0.0,0.0,0.230769,1.000000,0.150450,0.102099,0.14585,0.193548,0.199153,0.000274
12764,0.009767,0.250,1.0,0.454437,0.498783,0.285714,0.109375,1.0,0.684932,0.138889,...,0.0,0.0,0.384615,0.835366,0.123018,0.102099,0.14585,0.129032,0.199153,0.000294
6266,0.004819,0.500,1.0,0.881795,0.665044,0.152381,0.078125,0.0,0.534247,0.222222,...,0.0,0.0,0.153846,0.375000,0.147878,0.102099,0.14585,0.182796,0.199153,0.000615
8510,0.006535,1.000,1.0,0.881795,0.665044,0.190476,0.093750,1.0,0.739726,0.361111,...,1.0,0.0,0.615385,0.192073,0.093442,0.102099,0.14585,0.193548,0.199153,0.000245


In [6]:
# # train with whole dataset and test with drug class 2,3 and 4 data
is_train_with_all=False
if(is_train_with_all):
    combined_df = pd.concat([X_test, Y_test], axis=1)
    testdf = combined_df[(combined_df['drug_class'] == 0.25) | 
                         (combined_df['drug_class'] == 0.375) ]
    X_test = testdf.drop([response_variable], axis = 1)
    Y_test = testdf[response_variable]
    
X_test_original = X_test.copy()

In [7]:
""""
Use drug_class

2=GLP-1 analogues (A10BJ)
3=DPP-4 inhibitors (A10BH)
4=SGLT2 inhibitors (A10BK)
"""

if(is_train_with_all):
    sglt_val = 0.375
    dpp_val = 0.25
else:
    sglt_val = 1
    dpp_val = 0


X_test_ = pd.DataFrame(X_test)
X_train_ = pd.DataFrame(X_train)

X_train = X_train.drop(['init_year'], axis = 1)
X_test = X_test.drop(['init_year'], axis = 1)

print('==== sample count in preprocessed data =======')
print(' number of dpp4 : ', countUsers(3, df))
print(' number of sglt2 : ', countUsers(4, df))

print('==== sample count in training data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_train_))
print(' number of sglt2 : ', countUsers(sglt_val, X_train_))

print('==== sample count in testing data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_test_))
print(' number of sglt2 : ', countUsers(sglt_val, X_test_))

 number of dpp4 :  441
 number of sglt2 :  643
 number of dpp4 :  336
 number of sglt2 :  477
 number of dpp4 :  105
 number of sglt2 :  166


In [8]:
# TODO FROM HERE

# feature selection
items = ['C10A']
k = 10 # Select top 25 features
  
random.seed(42)


#feats = get_features_ref_single(X_train, Y_train,5)
#feats = get_features_relieff(X_train, Y_train['ldl_12m'] ,5)
feats = get_features_kbest(X_train, Y_train,10)
selected_features=feats

for item in items:
    if item not in selected_features:
        selected_features = np.append(selected_features, item)

selected_features = ['hba1c_bl_6m', 'ika', 'ldl', 'insulin', 'sum_diab_drugs', 'hyperten', 'chd',
 'cvd_comp', 'obese', 'C02A', 'C10A']  # kbest 10     
print(selected_features)
  
X_train = X_train[selected_features]
X_test = X_test[selected_features]
number_of_features = len(selected_features)

  y = column_or_1d(y, warn=True)


['hba1c_bl_6m', 'ika', 'ldl', 'insulin', 'sum_diab_drugs', 'hyperten', 'chd', 'cvd_comp', 'obese', 'C02A', 'C10A']


In [9]:
################# OUTLIER CODE ################
print('Shape of training data before removing outliers:', np.shape(X_train))
print('Shape of test data before removing outliers:', np.shape(X_test))
    
out_train, out_test = outlier_detect(X_train, Y_train, X_test, Y_test)
response_variable_list = [target_variable]

train_ = X_train.copy()
train_[response_variable_list] = Y_train.values
    
test_ = X_test.copy()
test_[response_variable_list] = Y_test.values
    
train_ = pd.DataFrame(train_.drop(out_train, axis = 0))
test_ = pd.DataFrame(test_.drop(out_test, axis = 0))
    
Y_train = train_[response_variable_list]
X_train = train_.drop(response_variable_list, axis=1)
    
Y_test = test_[response_variable_list]
X_test = test_.drop(response_variable_list, axis=1)
    
print('Shape of training data after removing outliers:', np.shape(X_train))
print('Shape of test data after removing outliers:', np.shape(X_test))

################

Shape of training data before removing outliers: (813, 11)
Shape of test data before removing outliers: (271, 11)
Training set outliers: [4795, 5599]
Testing set outliers: []
Shape of training data after removing outliers: (811, 11)
Shape of test data after removing outliers: (271, 11)


In [10]:
train = X_train.copy()
train[response_variable_list] = Y_train[response_variable_list].copy()


# Models

In [14]:
model1 = XGBRegressor(
    n_estimators=20, 
    eta=0.04, 
    subsample=0.6, 
    colsample_bytree=0.9,
    alpha=0.4,
    max_depth = 12,
    max_leaves = 10,
    learning_rate =0.15)

model3 = CatBoostRegressor(iterations=50,learning_rate=0.1, depth=6)

model2 = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=123)
#model = MLPRegressor(random_state=123, max_iter=200,hidden_layer_sizes = 32,learning_rate= 'adaptive')
model = VotingRegressor([('xgb', model1), ('rfr', model2), ('catboost', model3)])
model = cross_val(model, train , X_train, Y_train, response_variable_list)
model.fit(X_train, Y_train)
# make a prediction

yhat = model.predict(X_test)
# summarize prediction
print(yhat[1])
original_data_pred, model_results, model_results_drugs_ori, score_ori = get_scores(model, X_test, Y_test, X_train, Y_train)


0:	learn: 0.8133700	total: 1.8ms	remaining: 88.1ms
1:	learn: 0.7950515	total: 2.7ms	remaining: 64.8ms
2:	learn: 0.7770583	total: 3.33ms	remaining: 52.2ms
3:	learn: 0.7589603	total: 3.89ms	remaining: 44.8ms
4:	learn: 0.7470547	total: 4.16ms	remaining: 37.4ms
5:	learn: 0.7346679	total: 4.77ms	remaining: 35ms
6:	learn: 0.7232546	total: 5.34ms	remaining: 32.8ms
7:	learn: 0.7142888	total: 5.9ms	remaining: 31ms
8:	learn: 0.7063227	total: 6.47ms	remaining: 29.5ms
9:	learn: 0.6961583	total: 7.25ms	remaining: 29ms
10:	learn: 0.6896614	total: 8.2ms	remaining: 29.1ms
11:	learn: 0.6828735	total: 8.8ms	remaining: 27.9ms
12:	learn: 0.6771193	total: 9.49ms	remaining: 27ms
13:	learn: 0.6713167	total: 10.2ms	remaining: 26.1ms
14:	learn: 0.6664809	total: 10.9ms	remaining: 25.3ms
15:	learn: 0.6614395	total: 11.6ms	remaining: 24.8ms
16:	learn: 0.6565035	total: 12.5ms	remaining: 24.4ms
17:	learn: 0.6525123	total: 13.2ms	remaining: 23.4ms
18:	learn: 0.6484894	total: 14ms	remaining: 22.8ms
19:	learn: 0.64476

22:	learn: 0.6458553	total: 12.8ms	remaining: 15.1ms
23:	learn: 0.6447216	total: 13.3ms	remaining: 14.4ms
24:	learn: 0.6431210	total: 14ms	remaining: 14ms
25:	learn: 0.6396710	total: 14.6ms	remaining: 13.4ms
26:	learn: 0.6367701	total: 15.1ms	remaining: 12.9ms
27:	learn: 0.6347022	total: 15.7ms	remaining: 12.4ms
28:	learn: 0.6318380	total: 16.3ms	remaining: 11.8ms
29:	learn: 0.6297918	total: 16.9ms	remaining: 11.2ms
30:	learn: 0.6281290	total: 17.5ms	remaining: 10.7ms
31:	learn: 0.6245604	total: 18ms	remaining: 10.1ms
32:	learn: 0.6232089	total: 18.6ms	remaining: 9.59ms
33:	learn: 0.6199941	total: 19.2ms	remaining: 9.02ms
34:	learn: 0.6183168	total: 19.7ms	remaining: 8.46ms
35:	learn: 0.6149365	total: 20.3ms	remaining: 7.91ms
36:	learn: 0.6135272	total: 20.9ms	remaining: 7.34ms
37:	learn: 0.6122517	total: 21.2ms	remaining: 6.71ms
38:	learn: 0.6095546	total: 21.8ms	remaining: 6.15ms
39:	learn: 0.6061802	total: 22.4ms	remaining: 5.6ms
40:	learn: 0.6039539	total: 22.9ms	remaining: 5.04ms


44:	learn: 0.6034452	total: 27.6ms	remaining: 3.07ms
45:	learn: 0.6012740	total: 28.4ms	remaining: 2.47ms
46:	learn: 0.6002770	total: 29ms	remaining: 1.85ms
47:	learn: 0.5979820	total: 29.5ms	remaining: 1.23ms
48:	learn: 0.5946357	total: 30.1ms	remaining: 614us
49:	learn: 0.5929343	total: 30.7ms	remaining: 0us
2.1843605151316257
R2 score Training : 0.4890170377333418
R2 score Testing : 0.2141432750805683
RMSE (Target): 0.7334032723732129


In [12]:
df_missing_val = df_missing_val[selected_features]
mv_pred_test_numpy = model.predict(df_missing_val)


In [13]:
len(mv_pred_test_numpy)

1094

In [14]:
df_missing_val_original['ldl_12m'] = mv_pred_test_numpy

In [15]:
df_missing_val_original['ldl_12m']

6963     2.567392
1550     2.868762
7758     2.379622
3104     2.227180
6673     3.008403
           ...   
279      2.779563
12764    2.090636
6266     2.333322
8510     1.316129
4446     2.483881
Name: ldl_12m, Length: 1094, dtype: float32

In [16]:
df_original

Unnamed: 0,id,init_year,drug_class,Lower_MD_mmol_mol,Upper_MD_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,...,date_bmi_12m,date_hdl_12m,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,hdl_12m,bmi_12m
8087,106358,2018,3,-6.3394,-5.1371,50.0,55.0,1,83.0,4,...,,2019-10-23,380.0,,380.0,380.0,50.0,2.2,1.06,
4868,64542,2019,4,-7.7603,-4.5906,58.0,68.0,2,76.0,17,...,,,274.0,,,274.0,49.0,2.9,,
6448,84221,2019,4,-7.7603,-4.5906,60.0,58.0,1,56.0,13,...,2020-11-05,2020-04-07,313.0,357.0,229.0,313.0,59.0,1.7,1.47,35.154137
13607,1043704,2019,3,-6.3394,-5.1371,46.0,59.0,2,60.0,6,...,2019-07-09,,350.0,343.0,,350.0,49.0,2.0,,37.654320
12953,168968,2017,3,-6.3394,-5.1371,66.0,68.0,1,68.0,6,...,,,301.0,,,301.0,60.0,4.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12024,156466,2017,3,-8.1975,-5.0278,48.0,69.0,2,79.0,6,...,2018-01-22,,314.0,558.0,,44.0,88.0,2.6,,31.100306
7001,91628,2016,4,-6.8859,-4.3720,65.0,74.0,2,70.0,7,...,2017-02-07,2017-04-04,385.0,307.0,385.0,385.0,65.0,2.4,1.45,31.673470
5306,69736,2020,4,-7.7603,-4.5906,54.0,54.0,2,59.0,6,...,2021-11-17,,384.0,355.0,,384.0,56.0,1.9,,32.488628
13507,1001933,2014,3,-6.3394,-5.1371,59.0,,2,51.0,3,...,,,,,,471.0,59.0,3.3,,


In [17]:
result_df = pd.concat([df_original, df_missing_val_original])

In [18]:
result_df.to_csv('data/mvldl.csv', index=True)

In [19]:
result_df[['ldl_12m']]

Unnamed: 0,ldl_12m
8087,2.200000
4868,2.900000
6448,1.700000
13607,2.000000
12953,4.800000
...,...
279,2.779563
12764,2.090636
6266,2.333322
8510,1.316129
