In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import pickle
%matplotlib inline

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder

# Explore and understand the data

In [4]:
# Directory that holds the WiDS Datathon 2021 CSV files.
# Change this single constant to point the notebook at another copy of the data.
DATA_DIR = '/Users/princessiria/Desktop/BigDataPrincess/widsdatathon2021'

# Read the data dictionary, the labelled training set and the unlabelled set.
data_dict = pd.read_csv(f'{DATA_DIR}/DataDictionaryWiDS2021.csv')
train_df = pd.read_csv(f'{DATA_DIR}/TrainingWiDS2021.csv')
unlabelled_df = pd.read_csv(f'{DATA_DIR}/UnlabeledWiDS2021.csv')

In [5]:
# Peek at the first rows of the data dictionary (category, variable name,
# unit, data type and description of each variable).
data_dict.head()

Unnamed: 0,Category,Variable Name,Unit of Measure,Data Type,Description,Example
0,identifier,encounter_id,,integer,Unique identifier associated with a patient un...,
1,identifier,hospital_id,,integer,Unique identifier associated with a hospital,
2,demographic,age,Years,numeric,The age of the patient on unit admission,
3,demographic,bmi,kilograms/metres^2,string,The body mass index of the person on unit admi...,21.5
4,demographic,elective_surgery,,binary,Whether the patient was admitted to the hospit...,0.0


In [6]:
# Per-column missing-value summary of train_df: absolute NaN count and the
# share of rows it represents, restricted to columns with at least one NaN.
missing_count = train_df.isna().sum()
missing_ratio = missing_count / len(train_df)
has_missing = missing_count != 0
missing_df = pd.DataFrame({
    'Missing count': missing_count,
    'Missing ratio': missing_ratio,
}).loc[has_missing]
missing_df

Unnamed: 0,Missing count,Missing ratio
age,4988,0.038323
bmi,4490,0.034497
ethnicity,1587,0.012193
gender,66,0.000507
height,2077,0.015958
...,...,...
h1_arterial_ph_min,107849,0.828607
h1_arterial_po2_max,107445,0.825503
h1_arterial_po2_min,107445,0.825503
h1_pao2fio2ratio_max,113397,0.871232


In [7]:
# List the column labels of the training frame.
train_df.columns

Index(['Unnamed: 0', 'encounter_id', 'hospital_id', 'age', 'bmi',
       'elective_surgery', 'ethnicity', 'gender', 'height',
       'hospital_admit_source',
       ...
       'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min', 'aids', 'cirrhosis',
       'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'diabetes_mellitus'],
      dtype='object', length=181)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the training frame.
train_df.describe()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
count,130157.0,130157.0,130157.0,125169.0,125667.0,130157.0,128080.0,130157.0,130157.0,130157.0,...,16760.0,16760.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0
mean,65079.0,213000.856519,106.102131,61.995103,29.11026,0.18984,169.607219,662.428344,0.839933,0.0,...,247.525419,239.617358,0.00103,0.016081,0.013599,0.025669,0.007307,0.004187,0.020852,0.216285
std,37573.233831,38109.828146,63.482277,16.82288,8.262776,0.392176,10.833085,304.259843,2.485337,0.0,...,131.440167,128.562211,0.03207,0.125786,0.115819,0.158146,0.085166,0.064574,0.142888,0.411712
min,1.0,147000.0,1.0,0.0,14.844926,0.0,137.2,82.0,-0.25,0.0,...,42.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32540.0,180001.0,49.0,52.0,23.598006,0.0,162.5,427.0,0.045833,0.0,...,144.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,65079.0,213014.0,112.0,64.0,27.564749,0.0,170.1,653.0,0.155556,0.0,...,228.125,218.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,97618.0,246002.0,165.0,75.0,32.803127,0.0,177.8,969.0,0.423611,0.0,...,333.0,324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,130157.0,279000.0,204.0,89.0,67.81499,1.0,195.59,1111.0,175.627778,0.0,...,720.0,654.813793,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Visualize the data and explore relationships

In [None]:
# Horizontal count plot: number of training rows per ethnicity value.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('ethnicity', size=20)
sns.countplot(y='ethnicity', data=train_df, ax=ax);

In [None]:
# Horizontal count plot: number of training rows per hospital admit source.
fig, ax = plt.subplots(figsize=(16, 8))
ax.set_title('Hospital Admit Source', size=20)
sns.countplot(y='hospital_admit_source', data=train_df, ax=ax);

# Data cleaning: handling null values

In [9]:
# Drop the 'Unnamed: 0' column (a plain 1..N row counter according to the
# describe() output above), which is not used as a feature.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
train_df = train_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Bucket the columns of train_df by dtype so each group can be imputed
# separately later on:
#   dt_i  - int64 columns
#   dt_fl - float64 columns
#   dt_o  - everything else
dt_i, dt_fl, dt_o = [], [], []
for column_name, column_dtype in train_df.dtypes.items():
    if column_dtype == 'int64':
        bucket = dt_i
    elif column_dtype == 'float64':
        bucket = dt_fl
    else:
        bucket = dt_o
    bucket.append(column_name)

In [11]:
# Correlation heat-map of the float columns together with the target.
# The target is temporarily added to dt_fl (the following cell removes it
# again). The membership check stops a re-run of this cell from appending a
# second copy, which that single remove() call would not clean up.
if "diabetes_mellitus" not in dt_fl:
    dt_fl.append("diabetes_mellitus")
train_df[np.intersect1d(train_df.columns, dt_fl)].corr().style.background_gradient(cmap='Oranges')

Unnamed: 0,age,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,bilirubin_apache,bmi,bun_apache,creatinine_apache,d1_albumin_max,d1_albumin_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_heartrate_max,d1_heartrate_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_resprate_max,d1_resprate_min,d1_sodium_max,d1_sodium_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,d1_wbc_max,d1_wbc_min,diabetes_mellitus,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,h1_albumin_max,h1_albumin_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_heartrate_max,h1_heartrate_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasiv
e_max,h1_mbp_noninvasive_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_resprate_max,h1_resprate_min,h1_sodium_max,h1_sodium_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,h1_wbc_max,h1_wbc_min,heart_rate_apache,height,hematocrit_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2_apache,ph_apache,pre_icu_los_days,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,wbc_apache,weight
age,1.0,-0.113363,0.021873,-0.062258,-0.050134,-0.083176,0.237548,0.05898,-0.122881,-0.110585,0.039286,0.055056,0.041229,0.049871,-0.030793,-0.050375,-0.05001,-0.046362,0.239831,0.267634,0.021334,0.04796,0.055498,0.073693,-0.188209,-0.255793,-0.057838,-0.213874,-0.054958,-0.211937,0.014772,0.062673,0.065761,0.100122,-0.140996,-0.143476,-0.152139,-0.142643,-0.122769,-0.116992,0.10967,0.122369,0.023785,0.042114,-0.017743,-0.120473,0.004112,-0.138138,0.003651,-0.137298,-0.133895,-0.117401,-0.078175,-0.076408,0.061542,0.111289,0.026421,0.037319,0.001621,0.035196,-0.041987,-0.085582,0.036304,-0.001816,0.10442,-0.064804,0.104295,-0.065574,-0.083762,-0.066022,-0.007548,-0.005192,0.081019,0.027926,0.030701,0.027903,-0.002532,-0.009352,0.009594,-0.108709,-0.108347,0.043347,0.037953,0.094864,0.087166,-0.00141,-0.023397,-0.038151,-0.037981,0.186225,0.186563,0.070397,0.073299,0.038351,0.038961,-0.250053,-0.277217,-0.144056,-0.202101,-0.134044,-0.194955,-0.066088,-0.051381,0.163897,0.164032,-0.16496,-0.164375,-0.131076,-0.131654,-0.109433,-0.109977,0.10967,0.122369,0.007132,0.008994,-0.052725,-0.130426,-0.062476,-0.11423,-0.060667,-0.108855,-0.100749,-0.104265,-0.09448,-0.094163,0.044009,0.04277,0.023625,0.029852,0.020575,0.019362,-0.067119,-0.077895,0.035321,-0.017662,0.046467,-0.010698,0.046991,-0.007999,-0.082243,-0.085671,-0.054975,-0.054082,-0.150945,-0.113424,-0.1159,-0.020169,0.032291,0.032291,-0.031214,0.044625,0.049,0.032616,0.031858,-0.077945,-0.181133,-0.012766,-0.125448
albumin_apache,-0.113363,1.0,0.038056,-0.093456,-0.134835,0.057065,-0.207615,-0.113871,0.962575,0.934477,0.061791,0.144925,0.06946,0.140231,0.00271,0.06358,-0.136182,-0.125748,-0.206462,-0.20984,0.473723,0.490837,-0.11425,-0.113402,0.067292,0.203476,0.088812,0.261525,0.08906,0.262435,-0.052585,0.036452,0.167194,0.182062,-0.207561,-0.132275,0.387081,0.419667,0.381573,0.406646,-0.177222,-0.178037,-0.157233,-0.151001,0.030706,0.218346,0.12695,0.292513,0.128742,0.294404,0.075227,0.120461,0.015967,0.048856,-0.043274,0.024082,-0.102539,0.00965,-0.037457,0.005574,-0.075788,0.146829,0.091401,0.236702,0.13513,0.299187,0.135977,0.299889,-0.110547,0.099564,-0.170839,-0.139882,-0.024742,-0.11009,0.090287,0.086599,-0.014851,0.113289,-0.028858,0.87753,0.877428,0.075521,0.077214,0.109816,0.106882,0.063411,0.044244,-0.142493,-0.142428,-0.183443,-0.183226,0.413455,0.409923,-0.112844,-0.112694,0.095407,0.144195,0.176026,0.284365,0.179827,0.289604,0.005855,0.012986,0.171612,0.171573,-0.188759,-0.177182,0.35601,0.35576,0.346204,0.332906,-0.177222,-0.178037,-0.158953,-0.159905,0.057289,0.164059,0.223012,0.310546,0.226911,0.316278,0.086029,0.087406,0.014557,0.015416,-0.028837,-0.028888,-0.116605,-0.075925,0.003328,0.006287,0.012502,0.10075,0.111519,0.158317,0.219249,0.300429,0.22226,0.305274,-0.020778,0.014295,-0.104457,-0.102699,-0.167257,0.070031,0.419918,0.148418,0.109495,0.109495,0.023285,0.139615,-0.142386,-0.040062,-0.011621,0.056733,0.137936,-0.158166,0.086274
apache_2_diagnosis,0.021873,0.038056,1.0,0.396156,0.047123,0.027004,-0.059767,0.004909,0.03823,0.034245,-0.056577,-0.044197,0.058307,0.06319,0.09775,0.042088,0.046673,0.044608,-0.06085,-0.063841,0.023011,0.011465,0.003138,-0.000757,0.007579,0.022482,-0.057781,0.063931,-0.055262,0.066926,-0.047491,-0.037831,0.004557,0.018967,-0.120093,-0.042746,-0.014164,-0.036186,-0.030901,-0.053336,-0.100779,-0.111252,-0.059486,-0.060151,0.012017,0.055872,-0.026556,0.081337,-0.028221,0.084112,0.090987,0.075898,0.000837,-0.011816,0.051424,0.043526,-0.075485,-0.098918,-0.026131,-0.035091,0.009397,0.062794,0.053081,0.057689,0.012672,0.100837,0.010573,0.10301,-0.005247,0.027763,-0.015564,-0.041047,0.003797,-0.069734,0.048761,0.059459,0.004894,0.023887,-0.056332,-0.01161,-0.011503,-0.092201,-0.112649,0.158054,0.131196,0.152328,0.094424,0.058686,0.058656,-0.095937,-0.095437,-0.013905,-0.01405,-0.035463,-0.034791,-0.018975,-0.066938,-0.017491,0.025081,-0.0025,0.044784,-0.100596,-0.110686,0.090818,0.090741,-0.149031,-0.139873,-0.058601,-0.074873,-0.085435,-0.107689,-0.100779,-0.111252,-0.070637,-0.072845,-0.003068,-0.031605,0.029453,0.050577,0.032525,0.065882,0.065908,0.05897,-0.100465,-0.103026,-0.005803,-0.023769,-0.148325,-0.164946,0.011638,2.4e-05,0.047274,0.058151,0.039704,-0.021228,0.061126,0.079244,0.060283,0.088669,-0.052669,-0.060618,0.002897,0.001065,-0.1003,0.001781,-0.046831,0.040412,-0.057763,-0.057763,0.03848,0.074793,0.087497,-0.102286,-0.029028,0.008853,0.007766,-0.023872,0.02721
apache_3j_diagnosis,-0.062258,-0.093456,0.396156,1.0,-0.0066,-0.011806,-0.119273,-0.057478,-0.080902,-0.097029,-0.107429,-0.125958,0.031802,0.007797,0.153221,0.05948,-0.005869,-0.013782,-0.11824,-0.134367,-0.135881,-0.170789,-0.055182,-0.068982,0.02669,-0.018075,-0.163909,-0.000654,-0.159106,0.004322,0.006423,-0.02735,-0.081934,-0.09414,-0.020134,0.035555,-0.079726,-0.124493,-0.104456,-0.150942,-0.110717,-0.125961,-0.03001,-0.049497,0.035914,0.018577,-0.139943,4.5e-05,-0.145157,0.003529,0.156962,0.105098,-0.045882,-0.072621,0.072471,-0.000961,-0.087924,-0.157989,0.064246,0.017181,0.065724,0.061196,0.064503,0.019782,-0.100976,0.021707,-0.106475,0.022999,0.110397,-0.003243,0.110417,0.062587,-0.009536,-0.06325,-0.011953,0.016374,0.028149,-0.029991,-0.009355,-0.134405,-0.13442,-0.156181,-0.18619,0.147016,0.108213,0.239472,0.163847,-0.0007,-0.000756,-0.176263,-0.175856,-0.154136,-0.152658,-0.116628,-0.116274,-0.047188,-0.103164,-0.118544,-0.092154,-0.094214,-0.063976,-0.035068,-0.0581,-0.022303,-0.025442,-0.054703,-0.061463,-0.103982,-0.128639,-0.139775,-0.171903,-0.110717,-0.125961,-0.054948,-0.057096,-0.005576,-0.055466,-0.075614,-0.077461,-0.080306,-0.061932,0.122253,0.109676,-0.183328,-0.187579,-0.039754,-0.066977,-0.189729,-0.224843,0.139458,0.118424,0.103988,0.078942,0.045102,-0.019929,-0.042197,-0.042673,-0.054134,-0.036133,-0.061525,-0.100899,0.091759,0.088146,-0.006571,0.015047,-0.143344,-0.011194,-0.124286,-0.124286,0.075947,0.035648,0.085723,-0.153588,0.035131,0.000341,0.054646,0.091665,-0.004145
bilirubin_apache,-0.050134,-0.134835,0.047123,-0.0066,1.0,0.004116,0.115324,0.077907,-0.113367,-0.138332,-0.097938,-0.13589,-0.005158,-0.040054,0.017913,-0.019353,0.996746,0.987449,0.115538,0.111035,-0.046475,-0.078805,0.077989,0.074871,-0.080854,-0.082747,-0.050626,-0.074894,-0.050105,-0.074883,-0.032259,-0.065886,-0.114842,-0.129546,0.053244,0.05916,-0.088227,-0.121151,-0.116565,-0.142309,0.205276,0.200411,0.176654,0.186941,-0.046079,-0.08827,-0.068252,-0.082436,-0.068369,-0.082176,-0.007331,-0.038452,-0.17751,-0.193106,0.024037,-0.018979,0.030462,0.015707,-0.08728,-0.116261,0.002386,-0.031624,-0.088123,-0.090417,-0.083258,-0.082603,-0.083236,-0.082174,0.005483,-0.041212,0.058584,0.03745,-0.041216,0.034079,-0.031667,-0.025043,0.012059,-0.028746,-0.046612,-0.163129,-0.162931,-0.110342,-0.114869,0.0141,0.012244,0.017915,0.014439,0.983557,0.983539,0.106126,0.105976,-0.042655,-0.042442,0.076718,0.076632,-0.070853,-0.055226,-0.060782,-0.077564,-0.060571,-0.079107,-0.070971,-0.076956,-0.109805,-0.109763,0.048964,0.058932,-0.115767,-0.115835,-0.148441,-0.145844,0.205276,0.200411,0.149042,0.151227,-0.058651,-0.073368,-0.078435,-0.087526,-0.07957,-0.08778,0.020422,0.012841,-0.174466,-0.174751,-0.004513,0.002337,0.031174,0.03054,-0.107525,-0.109042,-0.010986,-0.006517,-0.084822,-0.065038,-0.089765,-0.091388,-0.09043,-0.09164,-0.004313,-0.012587,0.030559,0.030416,0.052214,0.047297,-0.138607,-0.060952,-0.113811,-0.113811,-0.003025,-0.032307,0.081105,0.028761,-0.106201,-0.029867,-0.07904,0.04978,0.024878
bmi,-0.083176,0.057065,0.027004,-0.011806,0.004116,1.0,0.049204,0.070648,0.054968,0.057658,0.187087,0.182755,-0.051881,-0.060981,-0.096864,-0.108751,0.004482,0.004529,0.049047,0.054541,0.064478,0.069972,0.068879,0.070698,0.057008,0.052404,0.050049,-0.005661,0.050995,-0.006683,0.099016,0.131502,0.095474,0.094126,-0.027299,0.020804,0.056648,0.062691,0.089869,0.092264,0.021402,0.026366,-0.016903,-0.012564,0.028351,0.005885,0.059872,0.015296,0.061125,0.015394,-0.170284,-0.166024,0.020174,0.021416,0.085362,0.093573,0.012264,-0.00623,-0.02731,-0.017909,-0.070854,-0.021425,0.013986,-0.004318,0.074783,0.049923,0.07572,0.050587,0.024732,0.03552,0.024974,0.026483,0.169043,0.041219,0.008824,0.018009,0.006069,0.025739,0.101687,0.028434,0.028317,0.177257,0.18028,-0.076117,-0.074406,-0.117038,-0.122166,0.012596,0.012476,0.046452,0.046509,0.045124,0.044164,0.063845,0.063925,0.043252,0.042455,0.030541,0.00828,0.030398,0.006065,0.067146,0.076737,0.082922,0.081688,-0.013505,-0.000138,0.06296,0.061215,0.085242,0.079215,0.021402,0.026366,-0.024237,-0.025197,0.011609,-0.003871,0.038521,0.020653,0.03954,0.02181,-0.180801,-0.178718,0.021493,0.021998,0.091163,0.089442,0.014499,0.003099,-0.013354,-0.014701,-0.060669,-0.034126,-0.00874,-0.01261,0.053448,0.044037,0.055714,0.045989,0.040435,0.045625,0.060432,0.06063,-0.019621,-0.056617,0.096054,0.057138,0.18294,0.18294,-0.103279,-0.055671,0.000595,0.007669,-0.023592,0.038775,0.059055,0.03124,0.878656
bun_apache,0.237548,-0.207615,-0.059767,-0.119273,0.115324,0.049204,1.0,0.687038,-0.193257,-0.210229,-0.021904,-0.066225,-0.148144,-0.179836,-0.063529,-0.072026,0.114466,0.110599,0.986554,0.96596,-0.024149,-0.089698,0.682957,0.663078,-0.134483,-0.189991,-0.031797,-0.206201,-0.03178,-0.207668,0.177838,0.002571,-0.157273,-0.228258,0.031103,0.033706,-0.237312,-0.241611,-0.216572,-0.2208,0.217819,0.219964,0.091589,0.090954,-0.056167,-0.193156,-0.029998,-0.184018,-0.029221,-0.185441,-0.079441,-0.090277,-0.066042,-0.080155,0.338375,0.236042,0.063543,0.056707,0.039246,-0.037229,0.013324,-0.109241,-0.080293,-0.148681,-0.015243,-0.156918,-0.014198,-0.157879,-0.065933,-0.102571,0.111796,0.108267,0.145241,0.047532,-0.048257,-0.049171,0.000373,-0.066154,0.149821,-0.193952,-0.194086,-0.000698,0.005807,-0.208631,-0.191268,-0.114052,-0.091244,0.121866,0.121848,0.97325,0.973405,0.017132,0.018011,0.703113,0.703598,-0.164926,-0.148713,-0.12285,-0.185823,-0.129835,-0.19648,0.121777,0.122376,-0.218792,-0.217833,0.020688,0.038867,-0.194952,-0.183147,-0.167223,-0.155019,0.217819,0.219964,0.083452,0.087547,-0.086926,-0.1385,-0.125693,-0.166915,-0.125834,-0.173357,-0.075607,-0.075463,-0.005481,-0.003358,0.365577,0.377875,0.09749,0.110507,-0.06378,-0.05859,-0.050977,-0.086837,-0.099644,-0.090248,-0.099062,-0.131509,-0.098318,-0.136352,-0.068269,-0.06467,0.069176,0.070232,0.019658,0.016097,-0.227035,-0.072813,-0.029558,-0.029558,-0.055287,-0.175709,0.040499,0.076158,-0.011219,-0.094514,-0.152693,0.110641,0.051537
creatinine_apache,0.05898,-0.113871,0.004909,-0.057478,0.077907,0.070648,0.687038,1.0,-0.096595,-0.117955,-0.049948,-0.103874,-0.147258,-0.199743,-0.0212,-0.047297,0.077933,0.073671,0.686441,0.649916,-0.019277,-0.080565,0.993402,0.968595,-0.079848,-0.141663,0.011798,-0.116078,0.012471,-0.116552,0.106403,-0.056753,-0.156108,-0.218847,0.015586,0.015482,-0.191905,-0.190658,-0.179102,-0.176923,0.143986,0.139969,0.129608,0.121711,-0.037054,-0.149371,0.026228,-0.099923,0.028333,-0.100002,-0.025242,-0.056404,-0.055887,-0.069097,0.326836,0.208658,0.044111,0.002113,-0.03497,-0.092912,0.025033,-0.093851,-0.037013,-0.120218,0.047791,-0.084692,0.048562,-0.08501,-0.029519,-0.084554,0.059093,0.047999,0.124891,0.052706,-0.041328,-0.044638,-0.001417,-0.036635,0.074288,-0.11432,-0.114514,-0.035956,-0.034544,-0.213884,-0.204279,-0.06464,-0.050499,0.098567,0.098541,0.696656,0.696656,-0.001142,-0.001201,0.978902,0.979137,-0.08815,-0.088961,-0.044949,-0.093061,-0.048074,-0.099056,0.046473,0.041775,-0.21162,-0.210911,0.00677,0.016713,-0.154174,-0.148426,-0.136338,-0.126326,0.143986,0.139969,0.1306,0.132984,-0.050761,-0.093616,-0.0378,-0.073544,-0.035826,-0.076137,-0.042855,-0.044982,-0.012007,-0.010771,0.343549,0.35024,0.05619,0.049062,-0.087795,-0.086572,-0.019063,-0.058492,-0.050044,-0.061372,-0.016485,-0.051047,-0.014542,-0.052181,-0.047841,-0.046445,0.037889,0.038223,0.007314,0.052856,-0.180025,-0.008816,-0.072662,-0.072662,-0.024463,-0.188631,0.023029,0.042222,-0.07687,-0.068428,-0.155146,0.056271,0.087927
d1_albumin_max,-0.122881,0.962575,0.03823,-0.080902,-0.113367,0.054968,-0.193257,-0.096595,1.0,0.910948,0.063587,0.112674,0.060415,0.097308,0.042213,0.064999,-0.111904,-0.114235,-0.193439,-0.208605,0.489287,0.459241,-0.096105,-0.106484,0.063425,0.195222,0.090662,0.248443,0.091255,0.249665,-0.023213,0.028007,0.156159,0.140083,-0.187773,-0.125486,0.407679,0.410233,0.400403,0.395597,-0.162161,-0.17925,-0.098198,-0.110903,0.027163,0.203785,0.129985,0.277847,0.130894,0.280149,0.101836,0.114247,0.016854,0.032074,-0.009273,0.007484,-0.093714,0.0033,-0.015295,-0.005749,-0.067459,0.136672,0.084315,0.218806,0.134806,0.280383,0.135539,0.281724,-0.100612,0.076015,-0.149533,-0.141963,-0.022031,-0.091957,0.05959,0.056456,-0.014139,0.086592,-0.004055,0.884402,0.883735,0.067304,0.066035,0.07673,0.071172,0.091183,0.069641,-0.119214,-0.119268,-0.188303,-0.188229,0.425439,0.419597,-0.10628,-0.10628,0.094097,0.135219,0.177782,0.275015,0.181686,0.280729,0.022779,0.024475,0.148318,0.148154,-0.173691,-0.164143,0.390829,0.386091,0.380353,0.364357,-0.162161,-0.17925,-0.099177,-0.101989,0.062787,0.146977,0.223643,0.300143,0.227304,0.306583,0.102712,0.101307,0.018996,0.018845,-0.021191,-0.022565,-0.108692,-0.071654,0.017542,0.017822,0.017296,0.099635,0.101553,0.137741,0.21526,0.286419,0.218169,0.291733,-0.03457,-0.002331,-0.100849,-0.100809,-0.148562,0.069664,0.396282,0.154038,0.087371,0.087371,0.037471,0.113383,-0.14951,-0.034245,-0.014915,0.035191,0.134166,-0.144067,0.08335
d1_albumin_min,-0.110585,0.934477,0.034245,-0.097029,-0.138332,0.057658,-0.210229,-0.117955,0.910948,1.0,0.066456,0.160233,0.073524,0.155573,0.002616,0.071238,-0.138091,-0.124578,-0.212283,-0.211299,0.475053,0.506101,-0.12005,-0.115047,0.05909,0.214206,0.091903,0.265788,0.092329,0.267207,-0.060032,0.041636,0.172144,0.196784,-0.213077,-0.131758,0.388032,0.429106,0.382877,0.41727,-0.189393,-0.184534,-0.175059,-0.165103,0.023302,0.230068,0.129146,0.295103,0.130753,0.297567,0.078057,0.127291,0.017533,0.05691,-0.051135,0.030613,-0.10567,0.014518,-0.047402,0.007613,-0.079688,0.150109,0.084909,0.252094,0.137048,0.304197,0.137884,0.305625,-0.110758,0.105313,-0.179864,-0.142552,-0.023808,-0.119223,0.092835,0.089551,-0.016165,0.118585,-0.030689,0.888114,0.888971,0.083049,0.08518,0.121164,0.118159,0.070424,0.048393,-0.153986,-0.153829,-0.193916,-0.193695,0.427807,0.425028,-0.119894,-0.119752,0.094839,0.142859,0.179301,0.284666,0.183604,0.290611,0.00232,0.010388,0.182051,0.182095,-0.191946,-0.179176,0.373338,0.370676,0.365265,0.351947,-0.189393,-0.184534,-0.172909,-0.175096,0.064003,0.166137,0.225447,0.309947,0.229612,0.316384,0.098066,0.096456,0.024517,0.024544,-0.036394,-0.035416,-0.117251,-0.075973,-0.000416,0.003654,0.013366,0.103485,0.119241,0.16596,0.221023,0.301216,0.224514,0.305874,-0.018584,0.017782,-0.125092,-0.124265,-0.168302,0.067283,0.409782,0.151784,0.117263,0.117263,0.028034,0.149286,-0.152519,-0.040516,-0.014424,0.058155,0.131641,-0.165323,0.084955


In [12]:
# Take the target out of both column lists so it is not touched by the
# imputation below. Filtering (instead of list.remove) makes the cell
# idempotent: it removes every copy and does not raise ValueError when the
# name is already absent, e.g. on a re-run.
dt_fl = [name for name in dt_fl if name != "diabetes_mellitus"]
dt_i = [name for name in dt_i if name != "diabetes_mellitus"]

In [13]:
import dask.dataframe as dd
from dask.dataframe.utils import make_meta

In [14]:
# Pass both frames through the pandas DataFrame constructor.
# NOTE(review): both already come from pd.read_csv, so this looks like a
# leftover from the dask experiment imported in the previous cell - confirm
# and remove if so.
train_df = pd.DataFrame(train_df)
unlabelled_df = pd.DataFrame(unlabelled_df)

In [15]:
# NaN count per non-numeric (dt_o) column of the training frame.
train_df[dt_o].isna().sum()

ethnicity                 1587
gender                      66
hospital_admit_source    33198
icu_admit_source           240
icu_stay_type                0
icu_type                     0
dtype: int64

In [16]:
# Replace missing values in the dt_o columns with an empty string, then
# re-count NaNs per column to confirm none are left.
train_df[dt_o] = train_df[dt_o].fillna("")
train_df[dt_o].isna().sum()

ethnicity                0
gender                   0
hospital_admit_source    0
icu_admit_source         0
icu_stay_type            0
icu_type                 0
dtype: int64

In [17]:
# NaN count per non-numeric (dt_o) column of the unlabelled frame.
unlabelled_df[dt_o].isna().sum()

ethnicity                 204
gender                      5
hospital_admit_source    2733
icu_admit_source           25
icu_stay_type               0
icu_type                    0
dtype: int64

In [18]:
# Same treatment as the training frame: empty string for missing values in
# the dt_o columns, followed by a NaN re-count as a sanity check.
unlabelled_df[dt_o] = unlabelled_df[dt_o].fillna("")
unlabelled_df[dt_o].isna().sum()

ethnicity                0
gender                   0
hospital_admit_source    0
icu_admit_source         0
icu_stay_type            0
icu_type                 0
dtype: int64

In [19]:
# Impute every remaining NaN in the int64 (dt_i) and float64 (dt_fl) columns
# with 0, for the training frame and the unlabelled frame alike.
for frame in (train_df, unlabelled_df):
    for numeric_columns in (dt_i, dt_fl):
        frame[numeric_columns] = frame[numeric_columns].fillna(value=0)

# Model Section

In [20]:
# Feature frame: every column of train_df except the target.
train_df_without = train_df.drop(columns='diabetes_mellitus')

In [21]:
# train_df_without = train_df.drop(columns=['encounter_id'], axis=1)

In [22]:
# (rows, columns) of the feature frame.
train_df_without.shape

(130157, 179)

In [23]:
# Integer-encode the six categorical columns of the feature frame.
#
# One LabelEncoder is kept per column in `label_encoders`, so the exact
# string -> integer mapping learned on the training data stays available
# (a single shared encoder would be overwritten by every fit).
#
# Each encoder is fitted on the untouched string column in train_df rather
# than on train_df_without. The values are identical on the first run, and a
# re-run of this cell no longer re-encodes already-encoded integers as
# strings (where "10" sorts before "2" and scrambles the codes).
categorical_columns = ['ethnicity', 'gender', 'hospital_admit_source',
                       'icu_admit_source', 'icu_stay_type', 'icu_type']

le = LabelEncoder()  # kept because later cells still refer to `le`
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    train_df_without[column] = encoder.fit_transform(train_df[column].astype(str))
    label_encoders[column] = encoder


In [24]:
# Split into training (75%) and held-out test (25%) sets, stratified on the
# target so both parts keep the same class balance.
# random_state pins the shuffle: without it every kernel restart produces a
# different split and the reported metrics are not reproducible.
X = train_df_without
Y = train_df['diabetes_mellitus']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, stratify=Y, random_state=0
)

# Testing RandomForest starts

In [None]:
# try the randomforest model
# from sklearn.feature_extraction.text import TfidfVectorizer
# vect = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1,2))
# final_features = vect.fit_transform(train_df_without).toarray()
# final_features.shape


In [25]:
# Random-forest baseline, wrapped in a one-step Pipeline.
# random_state makes the fitted forest (and therefore the metrics and the
# submission file) reproducible between runs. The commented-out text
# vectorizer / chi2 steps were dropped: the features here are tabular.
pipeline2 = Pipeline([('clf', RandomForestClassifier(random_state=0))])

In [27]:
# Fit the random-forest pipeline on the training split; model2 holds the
# value returned by fit() and is what the later cells call predict() on.
model2 = pipeline2.fit(X_train, y_train)

In [None]:
# Held-out labels as a plain NumPy array, used by the report cells below.
ytest = np.array(y_test)

In [None]:
# Evaluate the random forest on the held-out split.
# Predictions are computed once and reused for both reports (the forest was
# previously asked to predict the full test set twice). y_test is passed
# directly so this cell does not depend on the separate `ytest` helper cell
# having been run.
rf_test_pred = model2.predict(X_test)
print(classification_report(y_test, rf_test_pred))
print(confusion_matrix(y_test, rf_test_pred))

In [28]:
# Encode the categorical columns of unlabelled_df with the SAME integer codes
# the training features use.
#
# Re-fitting an encoder on the unlabelled data assigns codes from whatever
# categories happen to occur there; if that set differs from the training
# set, the same integer means different categories in the two frames. The
# category order is therefore learned from the raw string column in train_df
# and only applied here. Values never seen in training get the code -1.
for column in ('ethnicity', 'gender', 'hospital_admit_source',
               'icu_admit_source', 'icu_stay_type', 'icu_type'):
    # Already numeric => the column was encoded by an earlier run; encoding
    # it again would corrupt the codes, so leave it alone (idempotent cell).
    if pd.api.types.is_numeric_dtype(unlabelled_df[column]):
        continue
    train_classes = LabelEncoder().fit(train_df[column].astype(str)).classes_
    codes = pd.Categorical(unlabelled_df[column].astype(str),
                           categories=train_classes).codes
    unlabelled_df[column] = codes.astype('int64')

In [29]:
# Drop the 'Unnamed: 0' column, mirroring what was done to train_df.
# errors='ignore' keeps the cell re-runnable once the column is gone.
unlabelled_df = unlabelled_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# unlabelled_df = unlabelled_df.drop(columns=['encounter_id'])

In [None]:
# Predict labels for the unlabelled rows with the random forest.
# Only the columns the model was fitted on (X_train.columns, in that order)
# are passed in. Feeding the whole frame breaks as soon as a
# 'diabetes_mellitus' column exists on it - i.e. after any prediction cell
# in this notebook has run once - because the feature count no longer matches.
unlabelled_df['diabetes_mellitus'] = model2.predict(unlabelled_df[X_train.columns])

In [None]:
# Submission file for the random forest: one row per encounter with its
# predicted label, written without the index column.
submission_columns = ['encounter_id', 'diabetes_mellitus']
sub_df = unlabelled_df[submission_columns]
sub_df.to_csv('initiaRandomTest2.csv', index=False)

# Testing RandomForest ends

# Testing LogisticRegression starts

In [None]:
# Logistic-regression baseline using the liblinear solver with a fixed seed.
model = LogisticRegression(solver='liblinear', random_state=0)

# Fit on the training split; the value returned by fit() is kept as
# draft_model and used by the cells below.
draft_model = model.fit(X_train, y_train)

In [None]:
# Persist the fitted logistic-regression model to disk with pickle.
# NOTE: only unpickle this file from a trusted location - loading a pickle
# executes arbitrary code.
with open('LogisticRegression.pickle', 'wb') as f:
    pickle.dump(draft_model, f)
# Held-out labels as a NumPy array for the report cell below.
ytest = np.array(y_test)

In [None]:
# Evaluate the logistic regression on the held-out split.
# Predict once and reuse the result for both reports; y_test is passed
# directly so the cell does not rely on `ytest` from another cell.
lr_test_pred = draft_model.predict(X_test)
print(classification_report(y_test, lr_test_pred))
print(confusion_matrix(y_test, lr_test_pred))

# Test the model against the unlabelled_df

In [None]:
# Clean the unlabelled_df just like the train_df

In [None]:
# Make sure the categorical columns of unlabelled_df carry the integer codes
# used by the training features.
#
# The same columns are also encoded in the random-forest section. Running a
# fresh fit_transform on them again would treat the integers as strings
# ("10" sorts before "2") and silently reshuffle the codes, so columns that
# are already numeric are skipped. For columns still holding strings, the
# category order is taken from the raw string column in train_df so both
# frames share one mapping; values unseen in training get the code -1.
for column in ('ethnicity', 'gender', 'hospital_admit_source',
               'icu_admit_source', 'icu_stay_type', 'icu_type'):
    if pd.api.types.is_numeric_dtype(unlabelled_df[column]):
        continue
    train_classes = LabelEncoder().fit(train_df[column].astype(str)).classes_
    codes = pd.Categorical(unlabelled_df[column].astype(str),
                           categories=train_classes).codes
    unlabelled_df[column] = codes.astype('int64')

In [None]:
# Drop 'Unnamed: 0' if it is still present. The random-forest section
# already drops it, so without errors='ignore' this cell raises KeyError
# when the notebook is run top to bottom.
unlabelled_df = unlabelled_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Predict labels for the unlabelled rows with the logistic regression.
# Restrict the input to the training feature columns (X_train.columns): the
# frame may already carry a 'diabetes_mellitus' column written by another
# prediction cell, which must not be fed back in as a feature.
unlabelled_df['diabetes_mellitus'] = draft_model.predict(unlabelled_df[X_train.columns])

In [None]:
# Rows of the unlabelled set that were predicted as class 1.
see_1 = unlabelled_df[unlabelled_df['diabetes_mellitus'] == 1]

In [None]:
# Write the whole unlabelled_df - every column plus the predicted
# diabetes_mellitus column added above - to a CSV file.
# NOTE(review): unlike the submission cells, index=False is not passed here;
# confirm whether the extra index column is wanted in this file.
unlabelled_df.to_csv("draft_model_first_try.csv")

# Testing LogisticRegression ends

# KNeighborsClassifier begins

In [30]:
# k-nearest-neighbours classifier configured with n_neighbors=3.
neigh = KNeighborsClassifier(n_neighbors=3)

In [31]:
# Fit the KNN classifier on the training split; model3 holds the value
# returned by fit() and is used for the predictions below.
model3 = neigh.fit(X_train, y_train)

In [None]:
# Evaluate the KNN classifier on the held-out split.
# y_test is used directly: `ytest` is only defined in cells of the other
# model sections, so relying on it fails when those cells were not run.
# Predictions are computed once and shared by both reports.
knn_test_pred = model3.predict(X_test)
print(classification_report(y_test, knn_test_pred))
print(confusion_matrix(y_test, knn_test_pred))

In [None]:
# Print what predict_proba returns for the held-out split.
print(model3.predict_proba(X_test))

In [33]:
# Score of the fitted KNN classifier.
# NOTE: this is evaluated on X_train / y_train, i.e. the same data the model
# was fitted on, so it is not a held-out estimate - see the report cell on
# X_test for that.
print(neigh.score(X_train, y_train))

0.8506100371861458


In [None]:
# unlabelled_df.shape

# X_train.columns

In [32]:
# Predict labels for the unlabelled rows with the KNN classifier.
# Pass only the training feature columns (X_train.columns): once any
# prediction cell has added 'diabetes_mellitus' to unlabelled_df, predicting
# on the whole frame hands the model one column more than it was fitted on.
unlabelled_df['diabetes_mellitus'] = model3.predict(unlabelled_df[X_train.columns])

In [34]:
# Submission file for the KNN model: encounter id plus predicted label,
# written without the index column.
submission_columns = ['encounter_id', 'diabetes_mellitus']
sub_df = unlabelled_df[submission_columns]
sub_df.to_csv('initiaKNN_1.csv', index=False)