In [1]:
import pandas as pd
import numpy as np
import random
from matplotlib.pyplot import pie, axis, show
import seaborn as sns
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import yaml

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import RFE, SelectKBest, f_regression, mutual_info_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
import statsmodels.regression.linear_model as sm
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, CompoundKernel
import sklearn_relief as sr
from skrebate import ReliefF
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet
import lightgbm as ltb
from sklearn.svm import SVR
from scipy.stats import ks_2samp
from tabulate import tabulate

from xgboost.sklearn import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import BorderlineSMOTE

from sklearn.multioutput import RegressorChain, MultiOutputRegressor

import torch
from torch.utils.data import Dataset, DataLoader
torch.manual_seed(42)

from helper import preprocess, countUsers, get_features_relieff, get_features_ref_single, get_features_ref,\
    get_features_kbest, outlier_detect, cross_val, get_scores

%matplotlib inline


In [2]:
df = pd.read_csv('../data/X_train.csv', sep = ',',decimal = '.', encoding = 'utf-8', engine ='python', index_col=0)

In [3]:
# Read common variables from a YAML file
with open('../../common_variables.yaml', 'r') as file:
    common_data = yaml.safe_load(file)

In [4]:
target_variable = 'ldl_12m'
df, X_train, X_test, Y_train, Y_test, X, Y, scaler, df_missing_val, df_missing_val_original, df_original = preprocess(df, 0.25, target_variable)
df = df.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)
X_train = X_train.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)
X_test = X_test.drop(['bmi_12m', 'hba1c_12m', 'hdl_12m', 'days_bmi', 'days_hdl', 'days_ldl', 'days_hba1c'], axis=1)

Shape of data : (2660, 125)
Shape of data after excluding missing response: (1566, 125)
Shape of full data after selecting date range dates > 21 days (1084, 116)


In [5]:
df_missing_val

Unnamed: 0,id,init_year,drug_class,MD_RCT_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,P_Krea,...,dg132,dg133,n_of_dis,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,hdl_12m,bmi_12m
6963,0.005358,0.000,0.0,0.960794,0.152381,0.000000,1.0,0.616438,0.083333,0.151515,...,0.0,0.0,0.153846,0.996951,0.176597,0.102099,0.14585,0.247312,0.199153,0.000281
1550,0.001216,0.750,1.0,0.882382,0.238095,0.390625,1.0,0.753425,0.388889,0.160173,...,0.0,0.0,0.153846,0.207317,0.128590,0.102099,0.14585,0.279570,0.199153,0.000262
7758,0.005990,0.750,1.0,0.411912,0.142857,0.031250,1.0,0.506849,0.250000,0.160173,...,0.0,0.0,0.538462,0.692073,0.123446,0.827290,0.14585,0.010753,0.364407,0.000350
3104,0.002381,0.375,1.0,0.882382,0.152381,0.125000,0.0,0.671233,0.527778,0.212121,...,0.0,0.0,0.230769,0.868902,0.130304,0.102099,0.14585,0.225806,0.199153,0.000370
6673,0.005122,0.000,0.0,0.960794,0.152381,0.171875,1.0,0.534247,0.111111,0.108225,...,0.0,0.0,0.076923,1.000000,0.150450,0.102099,0.14585,0.193548,0.199153,0.000274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,0.000210,0.625,0.0,0.960794,0.152381,0.234375,0.0,0.452055,0.083333,0.389610,...,0.0,0.0,0.230769,1.000000,0.150450,0.102099,0.14585,0.193548,0.199153,0.000274
12764,0.009767,0.250,1.0,0.411912,0.285714,0.109375,1.0,0.684932,0.138889,0.242424,...,0.0,0.0,0.384615,0.835366,0.123018,0.102099,0.14585,0.129032,0.199153,0.000294
6266,0.004819,0.500,1.0,0.882382,0.152381,0.078125,0.0,0.534247,0.222222,0.116883,...,0.0,0.0,0.153846,0.375000,0.147878,0.102099,0.14585,0.182796,0.199153,0.000615
8510,0.006535,1.000,1.0,0.882382,0.190476,0.093750,1.0,0.739726,0.361111,0.199134,...,1.0,0.0,0.615385,0.192073,0.093442,0.102099,0.14585,0.193548,0.199153,0.000245


In [6]:
# # train with whole dataset and test with drug class 2,3 and 4 data
is_train_with_all=False
if(is_train_with_all):
    combined_df = pd.concat([X_test, Y_test], axis=1)
    testdf = combined_df[(combined_df['drug_class'] == 0.25) | 
                         (combined_df['drug_class'] == 0.375) ]
    X_test = testdf.drop([response_variable], axis = 1)
    Y_test = testdf[response_variable]
    
X_test_original = X_test.copy()

In [7]:
""""
Use drug_class

2=GLP-1 analogues (A10BJ)
3=DPP-4 inhibitors (A10BH)
4=SGLT2 inhibitors (A10BK)
"""

if(is_train_with_all):
    sglt_val = 0.375
    dpp_val = 0.25
else:
    sglt_val = 1
    dpp_val = 0


X_test_ = pd.DataFrame(X_test)
X_train_ = pd.DataFrame(X_train)

X_train = X_train.drop(['init_year'], axis = 1)
X_test = X_test.drop(['init_year'], axis = 1)

print('==== sample count in preprocessed data =======')
print(' number of dpp4 : ', countUsers(3, df))
print(' number of sglt2 : ', countUsers(4, df))

print('==== sample count in training data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_train_))
print(' number of sglt2 : ', countUsers(sglt_val, X_train_))

print('==== sample count in testing data =======')
print(' number of dpp4 : ', countUsers(dpp_val, X_test_))
print(' number of sglt2 : ', countUsers(sglt_val, X_test_))

 number of dpp4 :  441
 number of sglt2 :  643
 number of dpp4 :  336
 number of sglt2 :  477
 number of dpp4 :  105
 number of sglt2 :  166


In [8]:
# TODO FROM HERE

# feature selection
items = ['C10A']
k = 10 # Select top 25 features
  
random.seed(42)


#feats = get_features_ref_single(X_train, Y_train,5)
#feats = get_features_relieff(X_train, Y_train['ldl_12m'] ,5)
feats = get_features_kbest(X_train, Y_train,10)
selected_features=feats

for item in items:
    if item not in selected_features:
        selected_features = np.append(selected_features, item)

selected_features = ['hba1c_bl_6m', 'ika', 'ldl', 'insulin', 'sum_diab_drugs', 'hyperten', 'chd',
 'cvd_comp', 'obese', 'C02A', 'C10A']  # kbest 10     
print(selected_features)
  
X_train = X_train[selected_features]
X_test = X_test[selected_features]
number_of_features = len(selected_features)

  y = column_or_1d(y, warn=True)


['hba1c_bl_6m', 'ika', 'ldl', 'insulin', 'sum_diab_drugs', 'hyperten', 'chd', 'cvd_comp', 'obese', 'C02A', 'C10A']


In [9]:
################# OUTLIER CODE ################
print('Shape of training data before removing outliers:', np.shape(X_train))
print('Shape of test data before removing outliers:', np.shape(X_test))
    
out_train, out_test = outlier_detect(X_train, Y_train, X_test, Y_test)
response_variable_list = [target_variable]

train_ = X_train.copy()
train_[response_variable_list] = Y_train.values
    
test_ = X_test.copy()
test_[response_variable_list] = Y_test.values
    
train_ = pd.DataFrame(train_.drop(out_train, axis = 0))
test_ = pd.DataFrame(test_.drop(out_test, axis = 0))
    
Y_train = train_[response_variable_list]
X_train = train_.drop(response_variable_list, axis=1)
    
Y_test = test_[response_variable_list]
X_test = test_.drop(response_variable_list, axis=1)
    
print('Shape of training data after removing outliers:', np.shape(X_train))
print('Shape of test data after removing outliers:', np.shape(X_test))

################

Shape of training data before removing outliers: (813, 11)
Shape of test data before removing outliers: (271, 11)
Training set outliers: [4795, 5599]
Testing set outliers: []
Shape of training data after removing outliers: (811, 11)
Shape of test data after removing outliers: (271, 11)


In [10]:
train = X_train.copy()
train[response_variable_list] = Y_train[response_variable_list].copy()


# Models

In [11]:
model1 = XGBRegressor(
    n_estimators=20, 
    eta=0.04, 
    subsample=0.6, 
    colsample_bytree=0.9,
    alpha=0.4,
    max_depth = 12,
    max_leaves = 10,
    learning_rate =0.15)

model3 = CatBoostRegressor(iterations=50,learning_rate=0.1, depth=6)

model2 = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=123)
#model = MLPRegressor(random_state=123, max_iter=200,hidden_layer_sizes = 32,learning_rate= 'adaptive')
model = VotingRegressor([('xgb', model1), ('rfr', model2), ('catboost', model3)])
model = cross_val(model, train , X_train, Y_train, response_variable_list)
model.fit(X_train, Y_train)
# make a prediction

yhat = model.predict(X_test)
# summarize prediction
print(yhat[1])
original_data_pred, model_results, model_results_drugs_ori, score_ori = get_scores(model, X_test, Y_test, X_train, Y_train)


  y = column_or_1d(y, warn=True)


0:	learn: 0.8133700	total: 60.5ms	remaining: 2.96s
1:	learn: 0.7950515	total: 61.6ms	remaining: 1.48s
2:	learn: 0.7770583	total: 62.2ms	remaining: 975ms
3:	learn: 0.7589603	total: 62.9ms	remaining: 723ms
4:	learn: 0.7470547	total: 63.1ms	remaining: 568ms
5:	learn: 0.7346679	total: 63.8ms	remaining: 468ms
6:	learn: 0.7232546	total: 64.5ms	remaining: 396ms
7:	learn: 0.7142888	total: 65.1ms	remaining: 342ms
8:	learn: 0.7063227	total: 65.9ms	remaining: 300ms
9:	learn: 0.6961583	total: 66.6ms	remaining: 266ms
10:	learn: 0.6896614	total: 67.2ms	remaining: 238ms
11:	learn: 0.6828735	total: 67.8ms	remaining: 215ms
12:	learn: 0.6771193	total: 68.4ms	remaining: 195ms
13:	learn: 0.6713167	total: 69ms	remaining: 177ms
14:	learn: 0.6664809	total: 69.6ms	remaining: 162ms
15:	learn: 0.6614395	total: 70.1ms	remaining: 149ms
16:	learn: 0.6565035	total: 70.7ms	remaining: 137ms
17:	learn: 0.6525123	total: 72ms	remaining: 128ms
18:	learn: 0.6484894	total: 72.5ms	remaining: 118ms
19:	learn: 0.6447688	total

  y = column_or_1d(y, warn=True)


0:	learn: 0.7993213	total: 1.46ms	remaining: 71.8ms
1:	learn: 0.7794518	total: 2.27ms	remaining: 54.6ms
2:	learn: 0.7631886	total: 2.91ms	remaining: 45.6ms
3:	learn: 0.7459147	total: 3.56ms	remaining: 40.9ms
4:	learn: 0.7319151	total: 4.16ms	remaining: 37.4ms
5:	learn: 0.7208600	total: 4.84ms	remaining: 35.5ms
6:	learn: 0.7102832	total: 5.4ms	remaining: 33.2ms
7:	learn: 0.7006568	total: 6.02ms	remaining: 31.6ms
8:	learn: 0.6913838	total: 6.6ms	remaining: 30.1ms
9:	learn: 0.6866620	total: 6.79ms	remaining: 27.2ms
10:	learn: 0.6800675	total: 7.43ms	remaining: 26.3ms
11:	learn: 0.6740410	total: 8.02ms	remaining: 25.4ms
12:	learn: 0.6701491	total: 8.6ms	remaining: 24.5ms
13:	learn: 0.6655530	total: 9.21ms	remaining: 23.7ms
14:	learn: 0.6605336	total: 9.8ms	remaining: 22.9ms
15:	learn: 0.6558666	total: 10.4ms	remaining: 22.1ms
16:	learn: 0.6520966	total: 11ms	remaining: 21.4ms
17:	learn: 0.6486971	total: 11.6ms	remaining: 20.6ms
18:	learn: 0.6436188	total: 12.2ms	remaining: 20ms
19:	learn: 

  y = column_or_1d(y, warn=True)


0:	learn: 0.7995048	total: 1.17ms	remaining: 57.1ms
1:	learn: 0.7810947	total: 1.92ms	remaining: 46ms
2:	learn: 0.7646068	total: 2.61ms	remaining: 40.9ms
3:	learn: 0.7495526	total: 3.25ms	remaining: 37.3ms
4:	learn: 0.7391034	total: 3.5ms	remaining: 31.5ms
5:	learn: 0.7285023	total: 4.09ms	remaining: 30ms
6:	learn: 0.7186889	total: 4.69ms	remaining: 28.8ms
7:	learn: 0.7109556	total: 5.32ms	remaining: 27.9ms
8:	learn: 0.7039596	total: 5.93ms	remaining: 27ms
9:	learn: 0.6957411	total: 6.49ms	remaining: 26ms
10:	learn: 0.6900352	total: 7.09ms	remaining: 25.1ms
11:	learn: 0.6844688	total: 7.67ms	remaining: 24.3ms
12:	learn: 0.6792982	total: 8.28ms	remaining: 23.6ms
13:	learn: 0.6735617	total: 8.94ms	remaining: 23ms
14:	learn: 0.6681493	total: 9.54ms	remaining: 22.3ms
15:	learn: 0.6642274	total: 10.1ms	remaining: 21.5ms
16:	learn: 0.6597504	total: 10.7ms	remaining: 20.8ms
17:	learn: 0.6559138	total: 11.3ms	remaining: 20.1ms
18:	learn: 0.6506988	total: 11.9ms	remaining: 19.4ms
19:	learn: 0.6

  y = column_or_1d(y, warn=True)


0:	learn: 0.8086833	total: 716us	remaining: 35.1ms
1:	learn: 0.7918730	total: 1.48ms	remaining: 35.6ms
2:	learn: 0.7752444	total: 2.06ms	remaining: 32.4ms
3:	learn: 0.7589900	total: 2.67ms	remaining: 30.7ms
4:	learn: 0.7481636	total: 2.9ms	remaining: 26.1ms
5:	learn: 0.7369752	total: 3.55ms	remaining: 26ms
6:	learn: 0.7270223	total: 4.13ms	remaining: 25.4ms
7:	learn: 0.7193245	total: 4.72ms	remaining: 24.8ms
8:	learn: 0.7121979	total: 5.32ms	remaining: 24.2ms
9:	learn: 0.7031671	total: 5.89ms	remaining: 23.6ms
10:	learn: 0.6974567	total: 6.5ms	remaining: 23ms
11:	learn: 0.6912330	total: 7.1ms	remaining: 22.5ms
12:	learn: 0.6861317	total: 7.69ms	remaining: 21.9ms
13:	learn: 0.6803021	total: 8.33ms	remaining: 21.4ms
14:	learn: 0.6761532	total: 8.89ms	remaining: 20.8ms
15:	learn: 0.6719595	total: 9.48ms	remaining: 20.1ms
16:	learn: 0.6684587	total: 10.1ms	remaining: 19.5ms
17:	learn: 0.6639390	total: 10.7ms	remaining: 18.9ms
18:	learn: 0.6594149	total: 11.2ms	remaining: 18.3ms
19:	learn: 

  y = column_or_1d(y, warn=True)


0:	learn: 0.8070982	total: 716us	remaining: 35.1ms
1:	learn: 0.7867748	total: 1.43ms	remaining: 34.4ms
2:	learn: 0.7710852	total: 2.04ms	remaining: 31.9ms
3:	learn: 0.7539396	total: 2.66ms	remaining: 30.6ms
4:	learn: 0.7427565	total: 2.91ms	remaining: 26.2ms
5:	learn: 0.7298689	total: 3.5ms	remaining: 25.6ms
6:	learn: 0.7195742	total: 4.13ms	remaining: 25.4ms
7:	learn: 0.7105741	total: 4.77ms	remaining: 25ms
8:	learn: 0.7028071	total: 5.4ms	remaining: 24.6ms
9:	learn: 0.6936624	total: 6ms	remaining: 24ms
10:	learn: 0.6870465	total: 6.59ms	remaining: 23.4ms
11:	learn: 0.6812810	total: 7.24ms	remaining: 22.9ms
12:	learn: 0.6751851	total: 7.83ms	remaining: 22.3ms
13:	learn: 0.6689003	total: 8.48ms	remaining: 21.8ms
14:	learn: 0.6642087	total: 9.02ms	remaining: 21ms
15:	learn: 0.6600772	total: 9.6ms	remaining: 20.4ms
16:	learn: 0.6567672	total: 10.2ms	remaining: 19.8ms
17:	learn: 0.6539972	total: 10.8ms	remaining: 19.1ms
18:	learn: 0.6481000	total: 11.3ms	remaining: 18.5ms
19:	learn: 0.646

  y = column_or_1d(y, warn=True)


0:	learn: 0.8165229	total: 1.44ms	remaining: 70.4ms
1:	learn: 0.7961747	total: 2.24ms	remaining: 53.8ms
2:	learn: 0.7805374	total: 2.88ms	remaining: 45.1ms
3:	learn: 0.7670513	total: 3.55ms	remaining: 40.8ms
4:	learn: 0.7555006	total: 3.78ms	remaining: 34ms
5:	learn: 0.7422100	total: 4.4ms	remaining: 32.3ms
6:	learn: 0.7310590	total: 5.06ms	remaining: 31.1ms
7:	learn: 0.7232205	total: 5.74ms	remaining: 30.2ms
8:	learn: 0.7155448	total: 6.54ms	remaining: 29.8ms
9:	learn: 0.7075916	total: 7.14ms	remaining: 28.6ms
10:	learn: 0.7022246	total: 7.73ms	remaining: 27.4ms
11:	learn: 0.6932459	total: 8.29ms	remaining: 26.3ms
12:	learn: 0.6879282	total: 8.86ms	remaining: 25.2ms
13:	learn: 0.6809454	total: 9.43ms	remaining: 24.3ms
14:	learn: 0.6762887	total: 9.98ms	remaining: 23.3ms
15:	learn: 0.6720024	total: 10.6ms	remaining: 22.5ms
16:	learn: 0.6685208	total: 11.1ms	remaining: 21.6ms
17:	learn: 0.6645813	total: 11.7ms	remaining: 20.8ms
18:	learn: 0.6585797	total: 12.3ms	remaining: 20.1ms
19:	le

  y = column_or_1d(y, warn=True)


0:	learn: 0.8141555	total: 666us	remaining: 32.7ms
1:	learn: 0.7967145	total: 1.41ms	remaining: 33.9ms
2:	learn: 0.7791867	total: 1.99ms	remaining: 31.2ms
3:	learn: 0.7625376	total: 2.6ms	remaining: 29.9ms
4:	learn: 0.7513866	total: 2.93ms	remaining: 26.4ms
5:	learn: 0.7400299	total: 3.56ms	remaining: 26.1ms
6:	learn: 0.7296159	total: 4.13ms	remaining: 25.4ms
7:	learn: 0.7212729	total: 4.66ms	remaining: 24.4ms
8:	learn: 0.7135578	total: 5.22ms	remaining: 23.8ms
9:	learn: 0.7043726	total: 5.78ms	remaining: 23.1ms
10:	learn: 0.6987526	total: 6.35ms	remaining: 22.5ms
11:	learn: 0.6924693	total: 6.96ms	remaining: 22ms
12:	learn: 0.6871751	total: 7.6ms	remaining: 21.6ms
13:	learn: 0.6819565	total: 8.3ms	remaining: 21.3ms
14:	learn: 0.6774394	total: 8.88ms	remaining: 20.7ms
15:	learn: 0.6736880	total: 9.33ms	remaining: 19.8ms
16:	learn: 0.6683116	total: 9.99ms	remaining: 19.4ms
17:	learn: 0.6645144	total: 10.7ms	remaining: 19ms
18:	learn: 0.6616722	total: 11.3ms	remaining: 18.5ms
19:	learn: 

  y = column_or_1d(y, warn=True)


0:	learn: 0.7998657	total: 697us	remaining: 34.2ms
1:	learn: 0.7838033	total: 1.48ms	remaining: 35.6ms
2:	learn: 0.7675119	total: 2.1ms	remaining: 33ms
3:	learn: 0.7519539	total: 2.68ms	remaining: 30.9ms
4:	learn: 0.7416577	total: 2.95ms	remaining: 26.6ms
5:	learn: 0.7312162	total: 3.55ms	remaining: 26ms
6:	learn: 0.7216289	total: 4.17ms	remaining: 25.6ms
7:	learn: 0.7145558	total: 4.78ms	remaining: 25.1ms
8:	learn: 0.7071726	total: 5.34ms	remaining: 24.3ms
9:	learn: 0.6979072	total: 5.91ms	remaining: 23.6ms
10:	learn: 0.6920009	total: 6.51ms	remaining: 23.1ms
11:	learn: 0.6850932	total: 7.09ms	remaining: 22.5ms
12:	learn: 0.6793527	total: 7.65ms	remaining: 21.8ms
13:	learn: 0.6740412	total: 8.26ms	remaining: 21.2ms
14:	learn: 0.6699309	total: 8.82ms	remaining: 20.6ms
15:	learn: 0.6654636	total: 9.16ms	remaining: 19.5ms
16:	learn: 0.6619202	total: 9.73ms	remaining: 18.9ms
17:	learn: 0.6583671	total: 10.3ms	remaining: 18.3ms
18:	learn: 0.6549428	total: 10.9ms	remaining: 17.7ms
19:	learn

  y = column_or_1d(y, warn=True)


0:	learn: 0.8073869	total: 739us	remaining: 36.2ms
1:	learn: 0.7888421	total: 1.48ms	remaining: 35.6ms
2:	learn: 0.7723297	total: 2.12ms	remaining: 33.1ms
3:	learn: 0.7586699	total: 2.74ms	remaining: 31.6ms
4:	learn: 0.7454130	total: 3.31ms	remaining: 29.8ms
5:	learn: 0.7356888	total: 3.91ms	remaining: 28.7ms
6:	learn: 0.7255911	total: 4.47ms	remaining: 27.5ms
7:	learn: 0.7159757	total: 5.11ms	remaining: 26.8ms
8:	learn: 0.7069031	total: 5.7ms	remaining: 26ms
9:	learn: 0.6993118	total: 6.28ms	remaining: 25.1ms
10:	learn: 0.6934756	total: 6.88ms	remaining: 24.4ms
11:	learn: 0.6857667	total: 7.49ms	remaining: 23.7ms
12:	learn: 0.6806747	total: 8.07ms	remaining: 23ms
13:	learn: 0.6755579	total: 8.68ms	remaining: 22.3ms
14:	learn: 0.6707391	total: 9.26ms	remaining: 21.6ms
15:	learn: 0.6661668	total: 9.93ms	remaining: 21.1ms
16:	learn: 0.6610993	total: 10.6ms	remaining: 20.7ms
17:	learn: 0.6577261	total: 11.2ms	remaining: 19.9ms
18:	learn: 0.6548281	total: 11.8ms	remaining: 19.2ms
19:	learn

  y = column_or_1d(y, warn=True)


0:	learn: 0.8055408	total: 1.39ms	remaining: 68ms
1:	learn: 0.7878484	total: 2.09ms	remaining: 50.1ms
2:	learn: 0.7718444	total: 2.67ms	remaining: 41.8ms
3:	learn: 0.7588759	total: 3.28ms	remaining: 37.8ms
4:	learn: 0.7481817	total: 3.59ms	remaining: 32.3ms
5:	learn: 0.7363504	total: 4.22ms	remaining: 31ms
6:	learn: 0.7253979	total: 4.84ms	remaining: 29.7ms
7:	learn: 0.7183816	total: 5.42ms	remaining: 28.5ms
8:	learn: 0.7122518	total: 6ms	remaining: 27.3ms
9:	learn: 0.7040320	total: 6.62ms	remaining: 26.5ms
10:	learn: 0.6978958	total: 7.21ms	remaining: 25.6ms
11:	learn: 0.6905428	total: 7.81ms	remaining: 24.7ms
12:	learn: 0.6829345	total: 8.4ms	remaining: 23.9ms
13:	learn: 0.6763944	total: 9ms	remaining: 23.1ms
14:	learn: 0.6715588	total: 9.55ms	remaining: 22.3ms
15:	learn: 0.6667047	total: 10.2ms	remaining: 21.6ms
16:	learn: 0.6630824	total: 10.8ms	remaining: 20.9ms
17:	learn: 0.6595619	total: 11.4ms	remaining: 20.2ms
18:	learn: 0.6548834	total: 11.9ms	remaining: 19.5ms
19:	learn: 0.6

  y = column_or_1d(y, warn=True)


0:	learn: 0.8064669	total: 1.43ms	remaining: 70.2ms
1:	learn: 0.7889689	total: 2.22ms	remaining: 53.3ms
2:	learn: 0.7719964	total: 2.89ms	remaining: 45.3ms
3:	learn: 0.7555199	total: 3.53ms	remaining: 40.6ms
4:	learn: 0.7445777	total: 3.78ms	remaining: 34ms
5:	learn: 0.7333111	total: 4.45ms	remaining: 32.6ms
6:	learn: 0.7233200	total: 5.08ms	remaining: 31.2ms
7:	learn: 0.7158872	total: 5.69ms	remaining: 29.9ms
8:	learn: 0.7087192	total: 6.37ms	remaining: 29ms
9:	learn: 0.7003106	total: 7.03ms	remaining: 28.1ms
10:	learn: 0.6944448	total: 7.66ms	remaining: 27.1ms
11:	learn: 0.6869355	total: 8.24ms	remaining: 26.1ms
12:	learn: 0.6818191	total: 8.85ms	remaining: 25.2ms
13:	learn: 0.6757595	total: 9.43ms	remaining: 24.2ms
14:	learn: 0.6712115	total: 10ms	remaining: 23.3ms
15:	learn: 0.6676887	total: 10.4ms	remaining: 22ms
16:	learn: 0.6636343	total: 10.9ms	remaining: 21.2ms
17:	learn: 0.6602928	total: 11.5ms	remaining: 20.4ms
18:	learn: 0.6564350	total: 12.1ms	remaining: 19.8ms
19:	learn: 

In [12]:
df_missing_val = df_missing_val[selected_features]
mv_pred_test_numpy = model.predict(df_missing_val)


In [13]:
len(mv_pred_test_numpy)

1094

In [14]:
df_missing_val_original['ldl_12m'] = mv_pred_test_numpy

In [15]:
df_missing_val_original['ldl_12m']

6963     2.640102
1550     2.689150
7758     2.335955
3104     2.243880
6673     2.962870
           ...   
279      2.531712
12764    2.048049
6266     2.563203
8510     1.340796
4446     2.473878
Name: ldl_12m, Length: 1094, dtype: float64

In [16]:
df_original

Unnamed: 0,id,init_year,drug_class,MD_RCT_mmol_mol,hba1c_bl_18m,hba1c_bl_6m,sp,ika,t2d_dur_y,P_Krea,...,date_bmi_12m,date_hdl_12m,days_hba1c,days_bmi,days_hdl,days_ldl,hba1c_12m,ldl_12m,hdl_12m,bmi_12m
8087,106358,2018,3,-5.7929,50.0,55.0,1,83.0,4,56.0,...,,2019-10-23,380.0,,380.0,380.0,50.0,2.2,1.06,
4868,64542,2019,4,-6.2301,58.0,68.0,2,76.0,17,85.0,...,,,274.0,,,274.0,49.0,2.9,,
6448,84221,2019,4,-6.2301,60.0,58.0,1,56.0,13,62.0,...,2020-11-05,2020-04-07,313.0,357.0,229.0,313.0,59.0,1.7,1.47,35.154137
13607,1043704,2019,3,-5.7929,46.0,59.0,2,60.0,6,94.0,...,2019-07-09,,350.0,343.0,,350.0,49.0,2.0,,37.654320
12953,168968,2017,3,-5.7929,66.0,68.0,1,68.0,6,53.0,...,,,301.0,,,301.0,60.0,4.8,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12024,156466,2017,3,-6.5580,48.0,69.0,2,79.0,6,128.0,...,2018-01-22,,314.0,558.0,,44.0,88.0,2.6,,31.100306
7001,91628,2016,4,-5.5743,65.0,74.0,2,70.0,7,58.0,...,2017-02-07,2017-04-04,385.0,307.0,385.0,385.0,65.0,2.4,1.45,31.673470
5306,69736,2020,4,-6.2301,54.0,54.0,2,59.0,6,65.0,...,2021-11-17,,384.0,355.0,,384.0,56.0,1.9,,32.488628
13507,1001933,2014,3,-5.7929,59.0,,2,51.0,3,57.0,...,,,,,,471.0,59.0,3.3,,


In [17]:
result_df = pd.concat([df_original, df_missing_val_original])

In [18]:
result_df.to_csv('../data/mvldl.csv', index=True)

In [19]:
result_df[['ldl_12m']]

Unnamed: 0,ldl_12m
8087,2.200000
4868,2.900000
6448,1.700000
13607,2.000000
12953,4.800000
...,...
279,2.531712
12764,2.048049
6266,2.563203
8510,1.340796
