In [1]:
import pandas as pd
import numpy as np
import random
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import yaml

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
import statsmodels.regression.linear_model as sm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, CompoundKernel
import sklearn_relief as sr
from skrebate import ReliefF
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import lightgbm as ltb
from sklearn.svm import SVR
from scipy.stats import ks_2samp
from tabulate import tabulate

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.multioutput import RegressorChain, MultiOutputRegressor

import torch
from torch.utils.data import Dataset, DataLoader
# Seed PyTorch's global RNG.
# NOTE(review): torch is not used by any of the cells visible in this notebook — confirm it is still needed.
torch.manual_seed(42)

from helper import preprocess, countUsers, get_features_relieff, get_features_ref_single, get_features_ref,\
    get_features_kbest, outlier_detect, cross_val, get_scores
%matplotlib inline


In [2]:
# Load the training table; the first CSV column is used as the DataFrame index.
df = pd.read_csv(
    '../data/X_train.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
    engine='python',
    index_col=0,
)

In [3]:
# Read the shared project settings from a YAML file two directories above the notebook.
# safe_load is used, so only plain YAML types are constructed.
# NOTE(review): common_data is not referenced in the cells visible here — confirm it is still needed.
with open('../../common_variables.yaml', 'r') as file:
    common_data = yaml.safe_load(file)

In [4]:
# Name of the outcome column that is modelled and later imputed.
target_variable = 'bmi_12m'

# preprocess (from helper) returns the processed frame, the train/test split, the scaler and
# the frames used later for imputation.
# NOTE(review): 0.25 is presumably the test fraction (891 train / 297 test rows) — confirm in helper.
(df, X_train, X_test, Y_train, Y_test, X, Y, scaler,
 df_missing_val, df_missing_val_original, df_original) = preprocess(df, 0.25, target_variable)

# The same columns are removed from the full frame and from both feature splits.
excluded_columns = ['ldl_12m', 'hba1c_12m', 'hdl_12m', 'days_ldl']
df = df.drop(columns=excluded_columns)
X_train = X_train.drop(columns=excluded_columns)
X_test = X_test.drop(columns=excluded_columns)


Shape of data : (2660, 125)
Shape of data after excluding missing response: (1719, 125)
Shape of full data after selecting date range dates > 21 days (1202, 116)


In [5]:
# Display df_missing_val as returned by preprocess (941 rows, the ones later passed to model.predict).
df_missing_val

Unnamed: 0,id,init_year,drug_class,MD_RCT_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,P_Krea,...,dg132,dg133,n_of_dis,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,hdl_12m
8087,0.008856,0.625,0.0,0.960794,0.142857,0.031250,0.0,0.876712,0.111111,0.094828,...,0.0,0.0,0.571429,1.043732,0.175237,0.112277,0.154727,0.152174,0.36,0.287234
4868,0.005371,0.750,1.0,0.882382,0.219048,0.234375,1.0,0.780822,0.472222,0.219828,...,0.0,0.0,0.142857,0.734694,0.175237,0.104962,0.109792,0.141304,0.50,0.313830
12953,0.014073,0.500,0.0,0.960794,0.295238,0.234375,0.0,0.671233,0.166667,0.081897,...,0.0,0.0,0.071429,0.813411,0.175237,0.104962,0.121238,0.260870,0.88,0.313830
13659,0.088201,0.750,1.0,0.411912,0.161905,0.078125,0.0,0.657534,0.083333,0.125000,...,0.0,0.0,0.357143,1.364431,0.175237,0.112277,0.154727,0.326087,0.74,0.542553
7498,0.008208,0.125,0.0,0.960794,0.123810,0.000000,0.0,0.726027,0.305556,0.176724,...,0.0,0.0,0.142857,0.775510,0.175237,0.080789,0.112760,0.173913,0.74,0.356383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.000298,0.625,0.0,0.960794,0.161905,0.234375,0.0,0.452055,0.083333,0.392241,...,0.0,0.0,0.214286,1.000000,0.175237,0.104962,0.148368,0.184783,0.32,0.313830
180,0.000200,0.875,1.0,1.000000,0.390476,0.453125,0.0,0.821918,0.777778,0.125000,...,0.0,0.0,0.285714,0.900875,0.175237,0.096692,0.133955,0.663043,0.40,0.212766
10528,0.011384,0.000,0.0,0.960794,0.342857,0.125000,1.0,0.630137,0.111111,0.275862,...,0.0,0.0,0.214286,0.224490,0.175237,0.022901,0.035608,0.608696,0.34,0.196809
4446,0.004868,0.625,0.0,0.960794,0.200000,0.031250,0.0,0.945205,0.027778,0.133621,...,0.0,0.0,0.285714,0.402332,0.175237,0.104962,0.148368,0.173913,0.32,0.313830


In [6]:
# Optional evaluation mode: keep the training data as is and restrict the test split
# to rows whose encoded drug_class is 0.25 or 0.375.
# NOTE(review): the original comment spoke of drug classes 2, 3 and 4, but only two
# encoded values are kept by the filter — confirm which classes are intended.
is_train_with_all=False
if(is_train_with_all):
    combined_df = pd.concat([X_test, Y_test], axis=1)
    testdf = combined_df[combined_df['drug_class'].isin([0.25, 0.375])]
    # The outcome column is named by target_variable; the previously used
    # `response_variable` was never defined, so this branch raised NameError.
    X_test = testdf.drop([target_variable], axis = 1)
    Y_test = testdf[target_variable]

# Keep a copy of the test features before later cells drop columns and rows.
X_test_original = X_test.copy()

In [7]:
""""
Use drug_class

2=GLP-1 analogues (A10BJ)
3=DPP-4 inhibitors (A10BH)
4=SGLT2 inhibitors (A10BK)
"""

# Encoded drug_class values handed to countUsers for the train/test frames;
# they depend on whether the "train with all" mode is active.
sglt_val, dpp_val = (0.375, 0.25) if is_train_with_all else (1, 0)

# Keep handles on the frames that still contain init_year for the counts below.
X_test_ = pd.DataFrame(X_test)
X_train_ = pd.DataFrame(X_train)

X_train = X_train.drop(columns=['init_year'])
X_test = X_test.drop(columns=['init_year'])

# One row per report: heading, dpp4 code, sglt2 code, frame to count in.
# The preprocessed frame is queried with the codes 3 and 4.
count_reports = (
    ('==== sample count in preprocessed data =======', 3, 4, df),
    ('==== sample count in training data =======', dpp_val, sglt_val, X_train_),
    ('==== sample count in testing data =======', dpp_val, sglt_val, X_test_),
)
for heading, dpp_code, sglt_code, frame in count_reports:
    print(heading)
    print(' number of dpp4 : ', countUsers(dpp_code, frame))
    print(' number of sglt2 : ', countUsers(sglt_code, frame))

 number of dpp4 :  416
 number of sglt2 :  772
 number of dpp4 :  304
 number of sglt2 :  587
 number of dpp4 :  112
 number of sglt2 :  185


In [8]:
# TODO FROM HERE

# Feature selection: restrict X_train / X_test to a fixed list of columns.
# `items` is an empty list (both candidate entries are commented out) and is not used in this cell.
items = [
#     'sp',
#     'smoking',
    ]
# NOTE(review): the original comment here said "Select top 25 features", which contradicts k = 10;
# k is not used anywhere in this cell.
k = 10

random.seed(42)

# Alternative selectors from helper, kept for reference; the active list below is hard-coded.
#feats = get_features_ref_single(X_train, Y_train,3)
#feats = get_features_relieff(X_train, Y_train['bmi_12m'] ,15)
#feats = get_features_kbest(X_train, Y_train,10)
# Originally labelled "kbest 10"; NOTE(review): the list contains 9 names, not 10 — confirm.
feats = ['sp', 'ika', 't2d_dur_y', 'pkrea_luo', 'bmi', 'dpp4', 'hdl', 'trigly', 'obese']
print(feats)
selected_features=feats

# Overwrites X_train / X_test with the selected columns only.
X_train = X_train[selected_features]
X_test = X_test[selected_features]
number_of_features = len(selected_features)

['sp', 'ika', 't2d_dur_y', 'pkrea_luo', 'bmi', 'dpp4', 'hdl', 'trigly', 'obese']


In [9]:
################# OUTLIER CODE ################
def _split_without_outliers(features, target, outlier_labels, target_columns):
    """Attach the target to the features, drop the flagged rows, then split again.

    Returns the combined frame, its feature part and its target part, in that order.
    """
    combined = features.copy()
    combined[target_columns] = target.values
    combined = pd.DataFrame(combined.drop(outlier_labels, axis = 0))
    return combined, combined.drop(target_columns, axis=1), combined[target_columns]


print('Shape of training data before removing outliers:', np.shape(X_train))
print('Shape of test data before removing outliers:', np.shape(X_test))

# outlier_detect (from helper) yields the row labels to remove from each split.
out_train, out_test = outlier_detect(X_train, Y_train, X_test, Y_test)
response_variable_list = [target_variable]

train_, X_train, Y_train = _split_without_outliers(
    X_train, Y_train, out_train, response_variable_list)
test_, X_test, Y_test = _split_without_outliers(
    X_test, Y_test, out_test, response_variable_list)

print('Shape of training data after removing outliers:', np.shape(X_train))
print('Shape of test data after removing outliers:', np.shape(X_test))

################

Shape of training data before removing outliers: (891, 9)
Shape of test data before removing outliers: (297, 9)
Training set outliers: [6014, 682, 9516, 9844, 9933]
Testing set outliers: [1075]
Shape of training data after removing outliers: (886, 9)
Shape of test data after removing outliers: (296, 9)


In [10]:
# Combined training frame (selected features plus the target column) that is passed to cross_val.
train = X_train.copy()
train[response_variable_list] = Y_train[response_variable_list].copy()




# Models

In [11]:
# Gradient-boosted regressor used to predict the target from the selected features.
# NOTE(review): `eta` is XGBoost's alias for `learning_rate`, and the two are given different
# values here (0.06 vs 0.1) — confirm which one takes effect and remove the other.
# NOTE(review): max_leaves=5 is combined with max_depth=15 — confirm both limits are intended.
model = XGBRegressor(
    n_estimators=70, 
    eta=0.06, 
    subsample=0.9, 
    colsample_bytree=0.8,
    alpha=0.04,
    max_depth = 15,
    max_leaves = 5,
    learning_rate =0.1
)

# Alternative models tried earlier, kept for reference.
#model = CatBoostRegressor(iterations=20,learning_rate=0.1, depth=6)

#model = RandomForestRegressor(n_estimators=150, max_depth=10, random_state=123)
#model = MLPRegressor(random_state=123, max_iter=1000,hidden_layer_sizes = 375,learning_rate= 'adaptive')

# cross_val (from helper) returns the model object; judging by the recorded output it also
# prints the cross-validation variance and mean score — confirm in helper.
model = cross_val(model, train , X_train, Y_train, response_variable_list)
# Fit on the complete (outlier-free) training split.
model.fit(X_train, Y_train)
# make a prediction

yhat = model.predict(X_test)
# summarize prediction: prints only the prediction at position 1 (the second test row)
print(yhat[1])
# get_scores (from helper) returns four result objects; the recorded output suggests it also
# prints the train/test R2 and the RMSE — confirm in helper.
original_data_pred, model_results, model_results_drugs_ori, score_ori = get_scores(model, X_test, Y_test, X_train, Y_train)



Cross validation variance 0.0014695235700876717
Cross validation mean score 0.8809306812614756
32.060623
R2 score Training : 0.9121742902905198
R2 score Testing : 0.8640322175887674
RMSE (Target): 2.020606765575703


In [12]:
# Reduce df_missing_val to the columns the model was trained on, then predict the target for it.
# Note: this overwrites df_missing_val, so the other columns are no longer available afterwards.
df_missing_val = df_missing_val[selected_features]
mv_pred_test_numpy = model.predict(df_missing_val)


In [13]:
# Number of predictions produced; should equal the number of rows in df_missing_val.
len(mv_pred_test_numpy)

941

In [14]:
# Store the predicted target in df_missing_val_original.
# The predictions are a plain array, so the assignment is purely positional: make sure the
# frame that was predicted on and the frame being filled have the same rows in the same order,
# otherwise values would silently land on the wrong rows.
assert len(mv_pred_test_numpy) == len(df_missing_val_original), \
    'number of predictions does not match number of rows to fill'
assert df_missing_val.index.equals(df_missing_val_original.index), \
    'df_missing_val and df_missing_val_original are not row-aligned'
# Use target_variable rather than repeating the column name as a literal.
df_missing_val_original[target_variable] = mv_pred_test_numpy

In [15]:
# Inspect the imputed target column ('bmi_12m' is the value of target_variable).
df_missing_val_original['bmi_12m']

8087     29.747171
4868     26.634987
12953    30.746731
13659    32.101566
7498     30.160347
           ...    
279      30.758030
180      28.822630
10528    29.913218
4446     29.679741
13507    31.029821
Name: bmi_12m, Length: 941, dtype: float32

In [16]:
# Display df_original as returned by preprocess; it is concatenated with the imputed rows below.
df_original

Unnamed: 0,id,init_year,drug_class,MD_RCT_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,P_Krea,...,date_bmi_12m,date_hdl_12m,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,hdl_12m,bmi_12m
6448,84221,2019,4,-6.2301,60.0,58.0,1,56.0,13,62.0,...,2020-11-05,2020-04-07,313.0,357.0,229.0,313.0,59.0,1.7,1.47,35.154137
6963,91189,2013,3,-5.7929,51.0,53.0,2,64.0,3,70.0,...,2014-01-21,,364.0,425.0,,,59.0,,,28.730000
13607,1043704,2019,3,-5.7929,46.0,59.0,2,60.0,6,94.0,...,2019-07-09,,350.0,343.0,,350.0,49.0,2.0,,37.654320
1550,20774,2019,4,-6.2301,60.0,78.0,2,74.0,14,72.0,...,2020-05-04,,105.0,313.0,,,62.0,,,27.910000
7758,101936,2019,4,-8.8533,50.0,55.0,2,56.0,9,72.0,...,2020-08-18,2019-10-31,264.0,301.0,2628.0,,37.0,,1.47,31.770000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8510,111202,2021,4,-6.2301,55.0,59.0,2,73.0,13,81.0,...,2021-10-04,,100.0,231.0,,,54.0,,,27.129999
12024,156466,2017,3,-6.5580,48.0,69.0,2,79.0,6,128.0,...,2018-01-22,,314.0,558.0,,44.0,88.0,2.6,,31.100306
7001,91628,2016,4,-5.5743,65.0,74.0,2,70.0,7,58.0,...,2017-02-07,2017-04-04,385.0,307.0,385.0,385.0,65.0,2.4,1.45,31.673470
5306,69736,2020,4,-6.2301,54.0,54.0,2,59.0,6,65.0,...,2021-11-17,,384.0,355.0,,384.0,56.0,1.9,,32.488628


In [17]:
# Stack df_original and the rows with imputed targets row-wise.
# pd.concat takes the union of columns, so any column missing from one frame is filled with NaN.
result_df = pd.concat([df_original, df_missing_val_original])

In [18]:
# Persist the combined table, keeping the row index as the first CSV column.
# An existing file at this path is overwritten on every run.
result_df.to_csv('../data/mvbmi.csv', index=True)

In [19]:
# Show the target column of the combined table as a one-column DataFrame.
result_df[['bmi_12m']]

Unnamed: 0,bmi_12m
6448,35.154137
6963,28.730000
13607,37.654320
1550,27.910000
7758,31.770000
...,...
279,30.758030
180,28.822630
10528,29.913218
4446,29.679741
