In [None]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

In [None]:
# This function adds a co-occurrence (count) matrix of feature2 values per feature1 group
def add_coocur_mat(dataframe, feature1, feature2):
  """Append, to every row, how often each ``feature2`` value occurs in the row's ``feature1`` group.

  The distinct values of ``feature2`` are sorted; the i-th sorted value gets a new
  column named ``<feature1>_<feature2>_<i>``. For a given row, that column holds the
  number of rows in ``dataframe`` that share the row's ``feature1`` value and carry
  the i-th ``feature2`` value. All rows of one ``feature1`` group therefore receive
  the same count vector.

  Args:
    dataframe: pandas DataFrame containing both columns. It is modified in place.
    feature1: name of the grouping column (e.g. a patient id).
    feature2: name of the column whose values are counted within each group.

  Returns:
    The same ``dataframe`` object, with one float column added per distinct
    ``feature2`` value.

  Raises:
    ValueError: if either column contains missing values, since a missing key
      cannot be assigned to a group or to a count column.
  """
  if dataframe[feature1].isna().any() or dataframe[feature2].isna().any():
    raise ValueError(
        "add_coocur_mat: columns %r and %r must not contain missing values" % (feature1, feature2))

  # Integer code per row for its group (order of first appearance is irrelevant here)
  # and for its feature2 value (sorted, so code i <-> i-th smallest value).
  group_codes, group_levels = pd.factorize(dataframe[feature1])
  level_codes, levels = pd.factorize(dataframe[feature2], sort=True)

  # counts[g, l] = number of rows belonging to group g that have feature2 level l.
  # np.add.at accumulates correctly when the same (g, l) pair appears repeatedly.
  counts = np.zeros((len(group_levels), len(levels)))
  np.add.at(counts, (group_codes, level_codes), 1)

  # Broadcast every group's count vector back onto each of its rows.
  per_row = counts[group_codes]

  prefix = feature1 + "_" + feature2 + "_"
  for i in range(len(levels)):
    dataframe[prefix + str(i)] = per_row[:, i]

  return dataframe

In [None]:
# Folder holding the competition CSVs and the probability files of the earlier model.
DATA_DIR = '/content/drive/My Drive/JanataHack_Healthcare/'

# Per-class probability columns (presumably from a neural-net model, judging by the
# file names) are joined onto the raw rows by case_id and used as extra features.
df_train_probas = pd.read_csv(DATA_DIR + 'submit_proba_train_nn.csv')
df_test_probas = pd.read_csv(DATA_DIR + 'submit_proba_test_nn.csv')

df_train = pd.merge(pd.read_csv(DATA_DIR + 'train.csv'), df_train_probas, on='case_id')
df_test = pd.merge(pd.read_csv(DATA_DIR + 'test.csv'), df_test_probas, on='case_id')

# Bed-grade has nulls. Bed-grade is different for each hospital, so fill with the
# hospital's own mean. transform() returns a Series aligned with the original rows,
# which avoids relying on the index that groupby.apply happens to return.
for frame in (df_train, df_test):
  hospital_mean = frame.groupby("Hospital_code", sort=False)["Bed Grade"].transform("mean")
  frame["Bed Grade"] = frame["Bed Grade"].fillna(hospital_mean)

# City_Code_Patient has nulls. Treat them as a separate code
df_train["City_Code_Patient"] = df_train["City_Code_Patient"].fillna(39)
df_test["City_Code_Patient"] = df_test["City_Code_Patient"].fillna(39)

# Per-patient counts of visits to each hospital
df_train = add_coocur_mat(df_train.copy(deep=True), "patientid", "Hospital_code")
df_test = add_coocur_mat(df_test.copy(deep=True), "patientid", "Hospital_code")

# Keep the ids for writing predictions later; they are not model features.
train_case_ids, case_ids = df_train["case_id"].values, df_test["case_id"].values
df_train = df_train.drop(['case_id', 'patientid'], axis=1)
df_test = df_test.drop(['case_id', 'patientid'], axis=1)

# The Gender proxy needs the *raw* department names, so capture it before Department
# is label-encoded below (after encoding, comparing against "gynecology" is always False).
is_gynecology_train = (df_train["Department"] == "gynecology").astype(int).values
is_gynecology_test = (df_test["Department"] == "gynecology").astype(int).values

# Label-encode the string-valued columns: fit on train, apply the same mapping to test.
label_encoded_cols = [
    "Hospital_type_code",
    "Hospital_region_code",
    "Department",
    "Ward_Type",
    "Ward_Facility_Code",
    "Type of Admission",
    "Severity of Illness",
]
for col in label_encoded_cols:
  le = LabelEncoder()
  df_train[col] = le.fit_transform(df_train[col].values)
  df_test[col] = le.transform(df_test[col].values)

df_train["City_Code_Patient"] = df_train['City_Code_Patient'].astype('int')
df_test["City_Code_Patient"] = df_test['City_Code_Patient'].astype('int')

# Bucket label -> ordinal index (used for both the Age buckets and the Stay target),
# plus the inverse mapping for turning predicted indices back into labels.
class_map = {"0-10": 0, "11-20": 1, "21-30": 2, "31-40": 3, "41-50": 4, "51-60": 5, "61-70": 6, "71-80": 7, "81-90": 8, "91-100": 9, "More than 100 Days": 10}
class_map_rev = {index: label for label, index in class_map.items()}

# Age bucket -> lower bound of the bucket + 1 (e.g. "51-60" -> 51)
df_train["Age"] = [(class_map[i]*10)+1 for i in df_train["Age"].values]
df_test["Age"] = [(class_map[i]*10)+1 for i in df_test["Age"].values]

# If hosp and patient are in same city
for frame in (df_train, df_test):
  frame['hosp_patient_same'] = (frame["City_Code_Hospital"].values == frame['City_Code_Patient'].values).astype(int)

# Gender. 1 -> Female, 0 -> Male (proxy: 1 only for gynecology admissions)
df_train["Gender"] = is_gynecology_train
df_test["Gender"] = is_gynecology_test

# Number of hospitals in a city. transform("nunique") broadcasts the per-city count
# back to every row; assigning the plain groupby result would align on the row index
# instead of the city code and leave almost every row at the fill value.
for frame in (df_train, df_test):
  frame["Num_hospitals"] = frame.groupby("City_Code_Hospital")["Hospital_code"].transform("nunique").fillna(0)

# Some weird features by Sayantan (disabled experiments)
# df_train['ward_score'] = df_train['Ward_Facility_Code'] * df_train['Ward_Type']
# df_test['ward_score'] = df_test['Ward_Facility_Code'] * df_test['Ward_Type']

# df_train['ward_bed_score'] = df_train['Ward_Facility_Code'] * df_train['Bed Grade']
# df_test['ward_bed_score'] = df_test['Ward_Facility_Code'] * df_test['Bed Grade']

# df_train['add_age'] = df_train['Age'] * df_train['Type of Admission']
# df_test['add_age'] =df_test['Age'] * df_test['Type of Admission']

# Convert target variable to usable format
df_train["Stay"] = [class_map[i] for i in df_train["Stay"].values]
Y = df_train["Stay"].values
# Get X and Y for train and test
df_train = df_train.drop(["Stay"], axis=1)
# The model is fed positional arrays, so train and test must have identical column order.
assert list(df_train.columns) == list(df_test.columns), "train/test feature columns differ"
X_train, X_test = df_train.values, df_test.values
print(X_train.shape, X_test.shape, Y.shape)

# Columns LightGBM should treat as categorical (passed by position), and the rest.
cols_cat = ['Hospital_code', "Hospital_type_code", 'City_Code_Hospital', "Hospital_region_code", "Department", "Ward_Type", "Ward_Facility_Code", 'City_Code_Patient', "Gender", "hosp_patient_same"]
cols_cat_inds = [list(df_train.columns).index(i) for i in cols_cat]
cols_cont = [col for col in df_train.columns if col not in cols_cat]

df_train.head()

HBox(children=(FloatProgress(value=0.0, max=92017.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=318438.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39607.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=137057.0), HTML(value='')))


(318438, 61) (137057, 61) (318438,)


Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days,patientid_Hospital_code_0,patientid_Hospital_code_1,patientid_Hospital_code_2,patientid_Hospital_code_3,patientid_Hospital_code_4,patientid_Hospital_code_5,patientid_Hospital_code_6,patientid_Hospital_code_7,patientid_Hospital_code_8,patientid_Hospital_code_9,patientid_Hospital_code_10,patientid_Hospital_code_11,patientid_Hospital_code_12,patientid_Hospital_code_13,patientid_Hospital_code_14,patientid_Hospital_code_15,patientid_Hospital_code_16,patientid_Hospital_code_17,patientid_Hospital_code_18,patientid_Hospital_code_19,patientid_Hospital_code_20,patientid_Hospital_code_21,patientid_Hospital_code_22,patientid_Hospital_code_23,patientid_Hospital_code_24,patientid_Hospital_code_25,patientid_Hospital_code_26,patientid_Hospital_code_27,patientid_Hospital_code_28,patientid_Hospital_code_29,patientid_Hospital_code_30,patientid_Hospital_code_31,hosp_patient_same,Gender,Num_hospitals
0,8,2,3,2,3,3,2,5,2.0,7,0,0,2,51,4911.0,0.116914,0.28745,0.400205,0.122809,0.036219,0.020899,0.007062,0.004172,0.001396,0.000782,0.002091,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0.0
1,2,2,5,2,2,3,3,5,2.0,7,1,0,2,51,5954.0,0.027747,0.190355,0.392714,0.165411,0.032654,0.106617,0.014557,0.04015,0.003371,0.010898,0.015526,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,6.0
2,10,4,1,0,2,1,3,4,2.0,7,1,0,2,51,4745.0,0.035707,0.360571,0.387216,0.116276,0.017152,0.054798,0.005022,0.011942,0.002822,0.002926,0.005568,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,3.0
3,26,1,2,1,2,3,2,3,2.0,7,1,0,2,51,7272.0,0.050094,0.12393,0.384715,0.253173,0.098284,0.048091,0.022287,0.010201,0.002599,0.00295,0.003675,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,6.0
4,26,1,2,1,2,3,3,3,2.0,7,1,0,2,51,5558.0,0.049521,0.16811,0.333094,0.194637,0.062984,0.092277,0.027937,0.037002,0.006329,0.011842,0.016268,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,3.0


In [None]:
# 10-fold cross-validation. Besides reporting accuracy per fold, the held-out
# probabilities of every fold are collected so that each training row ends up with a
# prediction from a model that did not see it.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
scores = list()
train_preds = np.zeros((len(Y), len(set(Y))), dtype=np.float32)  # one column per class present in Y
for train, test in kfold.split(X_train):
    x_train, x_test = X_train[train], X_train[test]
    y_train, y_test = Y[train], Y[test]

    model = LGBMClassifier(random_state=27, boosting_type="gbdt", num_leaves=80, min_data_in_leaf=307, max_depth=7, learning_rate=0.1)
    model.fit(x_train, y_train, categorical_feature=cols_cat_inds)
    preds_proba = model.predict_proba(x_test)
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    train_preds[test] = preds_proba  # fill only the rows held out in this fold
    scores.append(score)
    print(score)
print("Average: ", sum(scores)/len(scores))

# Write out the train predictions into a csv.
# The context manager closes the file even if a write fails part-way through.
with open("/content/drive/My Drive/JanataHack_Healthcare/submit_proba_train_lgbm.csv", "w") as fp:
  fp.write("case_id,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days\n")
  for id_, pred in zip(train_case_ids, train_preds):
    fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")

New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4302223338776536


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4373508353221957


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.43600050244944105


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4282125361135536


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4305991709584223


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.43471297575681445


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4305363647782942


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4255746765481723


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4272838614452156


New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


0.4305498853751217
Average:  0.4311043142624884


In [None]:
# Refit on the complete training set with the same hyper-parameters used in
# cross-validation, then predict the test set.
model = LGBMClassifier(random_state=27, boosting_type="gbdt", num_leaves=80, min_data_in_leaf=307, max_depth=7, learning_rate=0.1)
model.fit(X_train, Y, categorical_feature=cols_cat_inds)
preds_proba = model.predict_proba(X_test)

# Write out the test-set class probabilities into a csv.
# Context managers guarantee the files are closed even if a write raises.
with open("/content/drive/My Drive/JanataHack_Healthcare/submit_proba_test_lgbm.csv", "w") as fp:
  fp.write("case_id,0-10,11-20,21-30,31-40,41-50,51-60,61-70,71-80,81-90,91-100,More Than 100 Days\n")
  for id_, pred in zip(case_ids, preds_proba):
    fp.write(str(id_)+","+",".join([str(i) for i in pred])+"\n")

# Write the submission file: predicted class index mapped back to its Stay label.
preds = model.predict(X_test)
with open("/content/drive/My Drive/JanataHack_Healthcare/submit.csv", "w") as fp:
  fp.write("case_id,Stay\n")
  for id_, pred in zip(case_ids, preds):
    fp.write(str(id_)+","+class_map_rev[pred]+"\n")

New categorical_feature is [0, 1, 2, 3, 5, 6, 7, 9, 58, 59]
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))
