# **Disclaimer:** Minimal feature-engineering notebook. Neural networks with blending have probably never been abused like this before.

## Get all imports

In [1]:
# Use one seed value for every random source seeded below (hash seed, `random`, NumPy, TensorFlow)
seed_value= 12321
import os
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here
# presumably only affects child processes - confirm this is the intended effect
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
import tensorflow
tensorflow.random.set_seed(seed_value)
# Build a TF1-compat session restricted to one intra-op and one inter-op thread and register it
# with the Keras backend (presumably to reduce run-to-run nondeterminism - TODO confirm it is
# still honoured by the installed TensorFlow version)
session_conf = tensorflow.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tensorflow.compat.v1.Session(graph=tensorflow.compat.v1.get_default_graph(), config=session_conf)
tensorflow.compat.v1.keras.backend.set_session(sess)

import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, concatenate, Reshape, Flatten, BatchNormalization, Embedding, Dropout
from tensorflow.keras import initializers
from tensorflow.keras.optimizers import Adam

## Define all functions here

In [2]:
# This function adds a co-occurrence matrix with respect to patient_id
def add_coocur_mat(dataframe, feature1, feature2):
  """Append, for every row, how often its `feature1` key co-occurs with each `feature2` value.

  For each distinct value of `feature2` (taken in sorted order) a new float column named
  `<feature1>_<feature2>_<i>` is added, where `i` is the position of that value in the
  sorted list. The cell holds the number of rows of `dataframe` that share the row's
  `feature1` key and carry that `feature2` value.

  Args:
    dataframe: pandas DataFrame containing both columns. It is modified in place.
    feature1: name of the grouping column (e.g. the patient id).
    feature2: name of the column whose values are counted (e.g. the hospital code).

  Returns:
    The same `dataframe` object, with the count columns appended.

  Raises:
    KeyError: if either column contains missing values, which cannot be counted.
  """
  # Integer-code both columns; `sort=True` reproduces the "sorted unique values" column order.
  cat_codes, cat_values = pd.factorize(dataframe[feature2], sort=True)
  grp_codes, grp_values = pd.factorize(dataframe[feature1])
  # factorize marks missing values with -1, which would silently index the last row/column below.
  if (cat_codes < 0).any() or (grp_codes < 0).any():
    raise KeyError("missing values in '%s' or '%s' cannot be counted" % (feature1, feature2))

  # counts[g, c] = number of rows with group g and category c (float64, like np.zeros).
  counts = np.zeros((len(grp_values), len(cat_values)))
  np.add.at(counts, (grp_codes, cat_codes), 1)

  # Broadcast each group's count vector back onto every row belonging to that group.
  per_row = counts[grp_codes]
  for i in range(len(cat_values)):
    dataframe[feature1+"_"+feature2+"_"+str(i)] = per_row[:,i]

  return dataframe

## Load the data and perform pre-processing
### Minimal feature engineering

In [3]:
# Stack a basic lightgbm's probabilities
DATA_DIR = '/content/drive/My Drive/JanataHack_Healthcare/'
df_train_probas = pd.read_csv(DATA_DIR + 'submit_proba_train_lgbm.csv')
df_test_probas = pd.read_csv(DATA_DIR + 'submit_proba_test_lgbm.csv')

df_train = pd.merge(pd.read_csv(DATA_DIR + 'train.csv'), df_train_probas, on='case_id')
df_test = pd.merge(pd.read_csv(DATA_DIR + 'test.csv'), df_test_probas, on='case_id')

# Bed-grade has nulls. Bed-grade is different for each hospital, so fill with the per-hospital mean.
# `transform` returns a Series aligned to the original row index; `groupby.apply` adds the group key
# to the index on recent pandas versions, which breaks the column assignment.
df_train["Bed Grade"] = df_train["Bed Grade"].fillna(df_train.groupby("Hospital_code", sort=False)["Bed Grade"].transform("mean"))
df_test["Bed Grade"] = df_test["Bed Grade"].fillna(df_test.groupby("Hospital_code", sort=False)["Bed Grade"].transform("mean"))

# City_Code_Patient has nulls. Treat them as a separate code
df_train["City_Code_Patient"] = df_train["City_Code_Patient"].fillna(39)
df_test["City_Code_Patient"] = df_test["City_Code_Patient"].fillna(39)

# Super Feature: How many times each hospital has been visited by patients having the same patient_id
df_train = add_coocur_mat(df_train.copy(deep=True), "patientid", "Hospital_code")
df_test = add_coocur_mat(df_test.copy(deep=True), "patientid", "Hospital_code")

# Drop useless features (the case ids are kept aside for writing the submission files)
train_case_ids, case_ids = df_train["case_id"].values, df_test["case_id"].values
df_train = df_train.drop(['case_id', 'patientid'], axis=1)
df_test = df_test.drop(['case_id', 'patientid'], axis=1)

# Categorical Encoding. Each categorical variable will be given its own embedding vector.
# The encoder is re-fitted on the train column each time and then applied to the test column.
le = LabelEncoder()
for col in ["Hospital_type_code", "Hospital_region_code", "Department", "Ward_Type",
            "Ward_Facility_Code", "Type of Admission", "Severity of Illness"]:
  df_train[col] = le.fit_transform(df_train[col].values)
  df_test[col] = le.transform(df_test[col].values)

df_train["City_Code_Patient"] = df_train['City_Code_Patient'].astype('int')
df_test["City_Code_Patient"] = df_test['City_Code_Patient'].astype('int')

# Generate target class maps (the same ten-wide buckets are also used to decode the Age column)
class_map = {"0-10": 0, "11-20": 1, "21-30": 2, "31-40": 3, "41-50": 4, "51-60": 5, "61-70": 6, "71-80": 7, "81-90": 8, "91-100": 9, "More than 100 Days": 10}
class_map_rev = {index: label for label, index in class_map.items()}

# Correctly encode age feature: replace each bucket label by the lower bound of the bucket (1, 11, 21, ...)
df_train["Age"] = [(class_map[i]*10)+1 for i in df_train["Age"].values]
df_test["Age"] = [(class_map[i]*10)+1 for i in df_test["Age"].values]

# Mark 1 if hospital and patient are in the same city
df_train['hosp_patient_same'] = (df_train["City_Code_Hospital"].values == df_train['City_Code_Patient'].values).astype('int64')
df_test['hosp_patient_same'] = (df_test["City_Code_Hospital"].values == df_test['City_Code_Patient'].values).astype('int64')

# Generate a list of categorical and ordinal features; everything else except the target is continuous
cols_cat = ['Hospital_code', 'City_Code_Hospital', "Hospital_type_code", "Hospital_region_code", "Department", "Ward_Type", "Ward_Facility_Code", 'City_Code_Patient']
cols_cont = [col for col in df_train.columns if col not in cols_cat and col!="Stay"]

# Standardize everything (statistics are fitted on train and re-used for test)
scaler = StandardScaler()
df_train[cols_cont] = scaler.fit_transform(df_train[cols_cont].values)
df_test[cols_cont] = scaler.transform(df_test[cols_cont].values)

# Get the target variable. The dtype is pinned because newer pandas returns boolean dummies by default.
df_train["Stay"] = [class_map[i] for i in df_train["Stay"].values]
Y = pd.get_dummies(df_train['Stay'], dtype=np.uint8).values

df_train.head()

HBox(children=(FloatProgress(value=0.0, max=92017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=318438.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39607.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=137057.0), HTML(value='')))




Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days,patientid_Hospital_code_0,patientid_Hospital_code_1,patientid_Hospital_code_2,patientid_Hospital_code_3,patientid_Hospital_code_4,patientid_Hospital_code_5,patientid_Hospital_code_6,patientid_Hospital_code_7,patientid_Hospital_code_8,patientid_Hospital_code_9,patientid_Hospital_code_10,patientid_Hospital_code_11,patientid_Hospital_code_12,patientid_Hospital_code_13,patientid_Hospital_code_14,patientid_Hospital_code_15,patientid_Hospital_code_16,patientid_Hospital_code_17,patientid_Hospital_code_18,patientid_Hospital_code_19,patientid_Hospital_code_20,patientid_Hospital_code_21,patientid_Hospital_code_22,patientid_Hospital_code_23,patientid_Hospital_code_24,patientid_Hospital_code_25,patientid_Hospital_code_26,patientid_Hospital_code_27,patientid_Hospital_code_28,patientid_Hospital_code_29,patientid_Hospital_code_30,patientid_Hospital_code_31,hosp_patient_same
0,8,2,3,2,-0.169177,3,2,5,-0.716877,7,-1.136165,-1.788287,-0.727923,0.4616,0.027835,0,0.641455,0.482583,0.768666,-0.78309,-0.506233,-0.638952,-0.470721,-0.498094,-0.277966,-0.363342,-0.228884,2.632413,2.507122,-0.291394,-0.123615,-0.262626,-0.368183,-0.134136,3.799102,1.54606,4.72357,-0.456309,-0.389182,-0.261849,-0.501645,-0.318583,3.793229,-0.28776,-0.191968,-0.45973,-0.142393,-0.329718,3.460052,1.904389,-0.349397,-0.375527,2.745021,-0.356953,-0.372072,-0.329696,-0.261619,-0.204559,1.827081,-0.282292
1,2,2,5,2,-1.025217,3,3,5,-0.716877,7,0.315306,-1.788287,-0.727923,0.4616,0.987556,4,-0.468194,-0.392544,0.722772,0.091759,-0.348183,-0.02048,-0.261265,-0.002643,-0.132399,-0.10913,-0.051592,2.632413,2.507122,-0.291394,-0.123615,-0.262626,-0.368183,-0.134136,3.799102,1.54606,4.72357,-0.456309,-0.389182,-0.261849,-0.501645,-0.318583,3.793229,-0.28776,-0.191968,-0.45973,-0.142393,-0.329718,3.460052,1.904389,-0.349397,-0.375527,2.745021,-0.356953,-0.372072,-0.329696,-0.261619,-0.204559,1.827081,-0.282292
2,10,4,1,0,-1.025217,1,3,4,-0.716877,7,0.315306,-1.788287,-0.727923,0.4616,-0.12491,3,-0.375756,0.595764,0.783675,-0.684221,-0.508881,-0.328298,-0.002865,-0.403406,-0.207189,-0.343291,-0.168559,2.632413,2.507122,-0.291394,-0.123615,-0.262626,-0.368183,-0.134136,3.799102,1.54606,4.72357,-0.456309,-0.389182,-0.261849,-0.501645,-0.318583,3.793229,-0.28776,-0.191968,-0.45973,-0.142393,-0.329718,3.460052,1.904389,-0.349397,-0.375527,2.745021,-0.356953,-0.372072,-0.329696,-0.261619,-0.204559,1.827081,-0.282292
3,26,1,2,1,-1.025217,3,2,3,-0.716877,7,0.315306,-1.788287,-0.727923,0.4616,2.200319,4,-0.252593,-0.8045,0.636428,0.607279,2.119277,-0.458124,1.918151,-0.316125,-0.249032,-0.35316,-0.107848,2.632413,2.507122,-0.291394,-0.123615,-0.262626,-0.368183,-0.134136,3.799102,1.54606,4.72357,-0.456309,-0.389182,-0.261849,-0.501645,-0.318583,3.793229,-0.28776,-0.191968,-0.45973,-0.142393,-0.329718,3.460052,1.904389,-0.349397,-0.375527,2.745021,-0.356953,-0.372072,-0.329696,-0.261619,-0.204559,1.827081,-0.282292
4,26,1,2,1,-1.025217,3,3,3,-0.716877,7,0.315306,-1.788287,-0.727923,0.4616,0.623175,4,-0.27361,-0.796418,0.539261,0.170317,1.206203,-0.078461,0.188402,-0.026373,-0.055731,0.365593,0.093622,2.632413,2.507122,-0.291394,-0.123615,-0.262626,-0.368183,-0.134136,3.799102,1.54606,4.72357,-0.456309,-0.389182,-0.261849,-0.501645,-0.318583,3.793229,-0.28776,-0.191968,-0.45973,-0.142393,-0.329718,3.460052,1.904389,-0.349397,-0.375527,2.745021,-0.356953,-0.372072,-0.329696,-0.261619,-0.204559,1.827081,-0.282292


In [4]:
# Sanity check: report, column by column, whether any missing value survived pre-processing
for column_name in df_train:
  has_missing = df_train[column_name].isna().to_numpy().any()
  print(column_name, has_missing)

Hospital_code False
Hospital_type_code False
City_Code_Hospital False
Hospital_region_code False
Available Extra Rooms in Hospital False
Department False
Ward_Type False
Ward_Facility_Code False
Bed Grade False
City_Code_Patient False
Type of Admission False
Severity of Illness False
Visitors with Patient False
Age False
Admission_Deposit False
Stay False
0-10 False
11-20 False
21-30 False
31-40 False
41-50 False
51-60 False
61-70 False
71-80 False
81-90 False
91-100 False
More Than 100 Days False
patientid_Hospital_code_0 False
patientid_Hospital_code_1 False
patientid_Hospital_code_2 False
patientid_Hospital_code_3 False
patientid_Hospital_code_4 False
patientid_Hospital_code_5 False
patientid_Hospital_code_6 False
patientid_Hospital_code_7 False
patientid_Hospital_code_8 False
patientid_Hospital_code_9 False
patientid_Hospital_code_10 False
patientid_Hospital_code_11 False
patientid_Hospital_code_12 False
patientid_Hospital_code_13 False
patientid_Hospital_code_14 False
patientid_Ho

## Function where the architecture is defined

In [5]:
def get_model(x_train, y_train):
  """Build and compile the embedding + dense classifier.

  Each of the eight categorical columns gets its own scalar input followed by an
  Embedding layer. The embedding table has `max(max_code + 1, n_unique)` rows and
  `2 * n_unique` columns, both computed from the values present in `x_train`.
  The flattened embeddings are concatenated with the continuous features and fed
  through five ReLU Dense layers and a softmax output.

  Args:
    x_train: DataFrame holding the eight categorical columns listed below; only the
      unique values of those columns are read, to size the embeddings.
    y_train: 2-D one-hot target array; its second dimension sets the output width.

  Returns:
    A compiled Keras `Model` taking nine inputs (eight categorical, then the
    continuous block) in the order of `cat_features` followed by `cols_cont`.

  Note:
    The width of the continuous input is read from the module-level `cols_cont`.
  """
  lr = 0.0005

  # Order matters: callers pass their input arrays in exactly this order.
  cat_features = ("Hospital_code", "City_Code_Hospital", "Hospital_type_code", "Hospital_region_code",
                  "Department", "Ward_Type", "Ward_Facility_Code", "City_Code_Patient")
  vals = [x_train[name].unique() for name in cat_features]

  # categorical: one integer code per sample and feature
  cat_inputs = [Input(shape=(1,)) for _ in cat_features]
  # numerical
  cont_input = Input(shape=(len(cols_cont),))

  # One embedding per categorical feature, flattened from (1, dim) to (dim,)
  embedded = []
  for codes, cat_input in zip(vals, cat_inputs):
    # Table must be large enough for the largest code seen, even if codes are not contiguous
    vocab_size = max(max(codes)+1, len(codes))
    embedded.append(Flatten()(Embedding(vocab_size, 2*len(codes), trainable=True)(cat_input)))

  x1 = concatenate(embedded + [cont_input])

  # (units, initializer seed) per hidden layer; fixed seeds keep the initial weights repeatable
  for units, init_seed in ((256, 0), (512, 2), (256, 3), (128, 4), (32, 5)):
    x1 = Dense(units, activation='relu', kernel_initializer=initializers.GlorotUniform(seed=init_seed))(x1)
  output = Dense(y_train.shape[1], activation='softmax', kernel_initializer=initializers.GlorotUniform(seed=6))(x1)

  model = Model(inputs=cat_inputs + [cont_input], outputs=output)
  model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics = ['accuracy'])
  # model.summary()

  return model

# Saves the predictions of provided model
def save_preds(model, epoch):
  """Predict on the test set and write one probability file and one label file.

  Reads the module-level `df_test`, `cols_cont`, `case_ids` and `class_map_rev`.

  Args:
    model: Keras model accepting the eight categorical inputs followed by the
      continuous block, in the order used below.
    epoch: value embedded in the output file names (`submit_proba_<epoch>.csv`
      and `submit_<epoch>.csv`).
  """
  # Create input for the model (same column order as the model's inputs)
  cat_features = ["Hospital_code", "City_Code_Hospital", "Hospital_type_code", "Hospital_region_code",
                  "Department", "Ward_Type", "Ward_Facility_Code", "City_Code_Patient"]
  model_inputs = [np.reshape(df_test[name].values, (-1,1)) for name in cat_features]
  model_inputs.append(df_test[cols_cont].values)
  preds = model.predict(model_inputs, verbose=1)

  # Write the class probabilities to csv. The header names one column per class so that it
  # matches the number of fields in each row (it previously declared only `case_id,Stay`).
  with open("/content/drive/My Drive/JanataHack_Healthcare/submit_proba_"+str(epoch)+".csv", "w") as fp:
    fp.write("case_id,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days\n")
    for id_, pred in zip(case_ids, preds):
      fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")

  # Write the most likely class label per case to csv
  labels = np.argmax(preds, axis=1)
  with open("/content/drive/My Drive/JanataHack_Healthcare/submit_"+str(epoch)+".csv", "w") as fp:
    fp.write("case_id,Stay\n")
    for id_, label in zip(case_ids, labels):
      fp.write(str(id_)+","+class_map_rev[label]+"\n")

# Callback to save model's predictions after each epoch
class CustomCallback(tensorflow.keras.callbacks.Callback):
    """Keras callback that writes test-set predictions once every epoch has finished.

    Files are numbered by the count of completed epochs (1, 2, ...). Saving in
    `on_epoch_end` also covers the final epoch, which a hook on the beginning of
    the next epoch never reaches.
    """
    def on_epoch_end(self, epoch, logs=None):
      # Keras passes a 0-based epoch index; +1 turns it into "epochs completed"
      save_preds(self.model, epoch+1)

## Time for good ol' cross-validation (CV)

In [6]:
# kfold = KFold(n_splits=10, random_state=27, shuffle=True)
# scores, train_preds = list(), np.zeros(Y.shape, dtype=np.float32)
# for train, test in kfold.split(df_train):
#   x_train, x_test = df_train.iloc[train], df_train.iloc[test]
#   y_train, y_test = Y[train], Y[test]

#   batch_size, epochs = 32, 4
#   model = get_model(x_train, y_train)
#   model.fit(x=[np.reshape(x_train["Hospital_code"].values, (-1,1)), np.reshape(x_train["City_Code_Hospital"].values, (-1,1)), np.reshape(x_train["Hospital_type_code"].values, (-1,1)), np.reshape(x_train["Hospital_region_code"].values, (-1,1)), np.reshape(x_train["Department"].values, (-1,1)), np.reshape(x_train["Ward_Type"].values, (-1,1)), np.reshape(x_train["Ward_Facility_Code"].values, (-1,1)), np.reshape(x_train["City_Code_Patient"].values, (-1,1)), x_train[cols_cont].values], y=y_train, batch_size=batch_size, epochs=epochs, verbose=1)
#   preds = model.predict([np.reshape(x_test["Hospital_code"].values, (-1,1)), np.reshape(x_test["City_Code_Hospital"].values, (-1,1)), np.reshape(x_test["Hospital_type_code"].values, (-1,1)), np.reshape(x_test["Hospital_region_code"].values, (-1,1)), np.reshape(x_test["Department"].values, (-1,1)), np.reshape(x_test["Ward_Type"].values, (-1,1)), np.reshape(x_test["Ward_Facility_Code"].values, (-1,1)), np.reshape(x_test["City_Code_Patient"].values, (-1,1)), x_test[cols_cont].values], verbose=1)
#   score = accuracy_score(np.argmax(y_test, axis=1), np.argmax(preds, axis=1))
#   train_preds[test] = preds
#   scores.append(score)
#   print("\n", score, "\n")
# print("\n\nAverage: ", sum(scores)/len(scores))

# # Write out the train predictions into a csv
# fp = open("/content/drive/My Drive/JanataHack_Healthcare/submit_proba_train.csv", "w")
# fp.write("case_id,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days\n")
# for id_, pred in zip(train_case_ids, train_preds):
#   fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")
# fp.close()


## The final model training and then submission

In [27]:
# Train on the full training set, then predict the class index for every test case
batch_size, epochs = 32, 10
model = get_model(df_train, Y)

# The eight categorical columns, in the order of the model's categorical inputs
embedding_cols = ["Hospital_code", "City_Code_Hospital", "Hospital_type_code", "Hospital_region_code",
                  "Department", "Ward_Type", "Ward_Facility_Code", "City_Code_Patient"]

# Each categorical column becomes an (n, 1) array; the continuous block goes last
train_inputs = [np.reshape(df_train[name].values, (-1,1)) for name in embedding_cols]
train_inputs.append(df_train[cols_cont].values)
model.fit(x=train_inputs, y=Y, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[CustomCallback()])

test_inputs = [np.reshape(df_test[name].values, (-1,1)) for name in embedding_cols]
test_inputs.append(df_test[cols_cont].values)
preds = model.predict(test_inputs, verbose=1)
preds = np.argmax(preds, axis=1)

# Write out the test predictions into a csv
# fp = open("/content/drive/My Drive/JanataHack_Healthcare/submit_proba_test.csv", "w")
# fp.write("case_id,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days\n")
# for id_, pred in zip(case_ids, preds):
#   fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")
# fp.close()

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
