In [1]:
## libraries

import pandas as pd
from tqdm import tqdm
import torch
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import lightgbm as lgb

In [2]:
## data load

train1 = pd.read_csv("train.csv")
train2 = pd.read_csv("submit_proba_train.csv")


test1 = pd.read_csv("test.csv")
test2 = pd.read_csv("submit_proba_test.csv")

train = pd.merge(train1, train2, on='case_id')
test = pd.merge(test1, test2, on='case_id')

In [3]:
#super feature

hc_un = train["Hospital_code"].unique()
hc_un.sort()
map_hc, map_pid = dict(), dict()
for index, val in enumerate(hc_un):
  map_hc[val] = index

for pid, df in tqdm(train.groupby("patientid")):
  tmp = np.zeros(len(map_hc))
  for val in df["Hospital_code"].values:
    tmp[map_hc[val]]+=1
  map_pid[pid] = tmp

# For train
tmp = list()
for val in tqdm(train["patientid"].values):
  tmp.append(map_pid[val])
tmp = np.array(tmp)
for i in range(len(map_hc)):
  train["patientid_hospital_code_"+str(i)] = tmp[:,i]







hc_un = test["Hospital_code"].unique()
hc_un.sort()
map_hc, map_pid = dict(), dict()
for index, val in enumerate(hc_un):
  map_hc[val] = index

for pid, df in tqdm(test.groupby("patientid")):
  tmp = np.zeros(len(map_hc))
  for val in df["Hospital_code"].values:
    tmp[map_hc[val]]+=1
  map_pid[pid] = tmp

# For test
tmp = list()
for val in tqdm(test["patientid"].values):
  tmp.append(map_pid[val])
tmp = np.array(tmp)
for i in range(len(map_hc)):
  test["patientid_hospital_code_"+str(i)] = tmp[:,i]

100%|██████████████████████████████████████████████████████████████████████████| 92017/92017 [00:19<00:00, 4613.05it/s]
100%|█████████████████████████████████████████████████████████████████████| 318438/318438 [00:00<00:00, 1158718.44it/s]
100%|██████████████████████████████████████████████████████████████████████████| 39607/39607 [00:08<00:00, 4595.74it/s]
100%|█████████████████████████████████████████████████████████████████████| 137057/137057 [00:00<00:00, 1184679.71it/s]


In [4]:
## filling nan and specifying categorical variables

#train["Bed Grade"] = train.groupby(["Hospital_code"], sort=False)["Bed Grade"].apply(lambda x: x.fillna(x.mean()))
#test["Bed Grade"] = test.groupby(["Hospital_code"], sort=False)["Bed Grade"].apply(lambda x: x.fillna(x.mean()))

# City_Code_Patient has nulls. Treat them as a separate code
train["City_Code_Patient"] = train["City_Code_Patient"].fillna(39)
test["City_Code_Patient"] = test["City_Code_Patient"].fillna(39)

cols_cat = ['Hospital_code', 'City_Code_Hospital', "Hospital_type_code", "Hospital_region_code", "Department", "Ward_Type", "Ward_Facility_Code", 'City_Code_Patient']
cols_cont = ['Available Extra Rooms in Hospital','Bed Grade', 'Visitors with Patient', 'Age', 'Admission_Deposit', "Severity of Illness", "Type of Admission"]

In [5]:
train = train.dropna()
train = train.drop(['patientid'], axis = 1)
test = test.drop(['patientid'], axis = 1)
train_case_ids = train['case_id']
case_ids = test['case_id']
train = train.drop(['case_id'], axis = 1)
test = test.drop(['case_id'], axis = 1)

In [6]:
train['Gender'] = [1 if i=='gynecology' else 0 for i in train.Department.values]
test['Gender'] = [1 if i=='gynecology' else 0 for i in test.Department.values]

df = train
df_test = test1.copy(deep=True)

In [7]:
le = LabelEncoder()
train["Department"] = le.fit_transform(train["Department"])
test["Department"] = le.transform(test["Department"])

In [8]:
mp = {'P':1, 'Q':2, 'R':3, 'S':4, 'T':5, 'U':6}

train["Ward_Type"] = train["Ward_Type"].map(mp)
test["Ward_Type"] = test["Ward_Type"].map(mp)

In [9]:
mp = {'a':1,'b':2,'c':3,'d':4,'e':5,'f':6,'g':7}

train["Hospital_type_code"] = train["Hospital_type_code"].map(mp)
test["Hospital_type_code"] = test["Hospital_type_code"].map(mp)

In [10]:
train["Hospital_region_code"] = le.fit_transform(train["Hospital_region_code"])
test["Hospital_region_code"] = le.transform(test["Hospital_region_code"])

In [11]:
mp = {'A':6,'B':5,'C':4,'D':3,'E':2,'F':1}
train["Ward_Facility_Code"] = train["Ward_Facility_Code"].map(mp)
test["Ward_Facility_Code"] = test["Ward_Facility_Code"].map(mp)

In [12]:
m2 = {'Emergency':2, 'Urgent':3, 'Trauma':1}
m3 = {'Minor':1, 'Moderate':2, 'Extreme':3}

train['Type of Admission'] = train['Type of Admission'].map(m2)
test['Type of Admission'] = test['Type of Admission'].map(m2)

train['Severity of Illness'] = train['Severity of Illness'].map(m3)
test['Severity of Illness'] = test['Severity of Illness'].map(m3)

In [13]:
m = {'0-10':0, '11-20':1, '21-30':2, '31-40':3,'41-50':4,'51-60':5,'61-70':6,'71-80':7,'81-90':8,'91-100':9,'More than 100 Days':10 }
train['Age'] = train['Age'].map(m)
train['Stay'] = train['Stay'].map(m)

test['Age'] = test['Age'].map(m)

In [14]:
#train = train.drop_duplicates(keep='first')

In [15]:
#if hosp and patient are in same city
city = train['City_Code_Hospital'].values
patient = train['City_Code_Patient'].values
val = []
        
for i in range(len(city)):
  if city[i] == patient[i]:
    val.append(1)
  else:
    val.append(0)
train['hosp_patient_same'] = val

city = test['City_Code_Hospital'].values
patient = test['City_Code_Patient'].values
val = []

for i in range(len(city)):
  if city[i] == patient[i]:
    val.append(1)
  else:
    val.append(0)
test['hosp_patient_same'] = val
#hospitals per city
import numpy as np
df_comb = pd.concat([train,test], axis = 0)

count = []
for i in range(14):
  count.append([])


hosp = df_comb.Hospital_code.values
city = df_comb.City_Code_Hospital.values

for i in range(len(hosp)):
  if hosp[i] not in count[city[i]]:
    count[city[i]].append(hosp[i])

res = []

city = train.City_Code_Patient.values

for i in range(len(city)):
  try:
    res.append( len(count[ int( city[i] ) ] ))
  except:
    res.append(0)

train['max_hospitals'] = res

res = []

city = test.City_Code_Patient.values

for i in range(len(city)):
  try:
    res.append( len(count[ int( city[i] ) ] ))
  except:
    res.append(0)
test['max_hospitals'] = res

In [16]:
train.drop(['91-100' ], axis=1, inplace=True)
test.drop(['91-100'  ], axis=1, inplace=True)
#train['ward_score'] = train['Ward_Facility_Code'] * train['Ward_Type']
#test['ward_score'] = test['Ward_Facility_Code'] * test['Ward_Type']
#train['ward_bed_score'] = train['Ward_Facility_Code'] * train['Bed Grade']
#test['ward_bed_score'] = test['Ward_Facility_Code'] * test['Bed Grade']
train['add_age'] = train['Age'] * train['Type of Admission']
test['add_age'] =test['Age'] * test['Type of Admission']
comb = pd.concat([train,test], axis = 0)
hosp = df_comb.Hospital_code.values
dep = df_comb.Department.values

#X_train, Y = train.drop(["Stay"], axis=1).values, train["Stay"].values
#X_test = test.values

In [17]:
X_train, Y = train.drop(['max_hospitals',"Stay"], axis=1).values, train["Stay"].values
X_test = test.drop(['max_hospitals'], axis=1).values

In [19]:
#oof predictions
import numpy as np

i=0
setused=X_train
targ=Y

scores=[]
splits=10

cnf_matrix=[]
sc = 0
#oof_preds = [np.zeros((len(X_test)))]
kfold, scores = KFold(n_splits=splits,shuffle= True, random_state=True), list()
for train2, test2 in kfold.split(setused,targ):
    x_train, x_test = setused[train2], setused[test2]
    y_train, y_test = targ[train2], targ[test2]
    eval_set = [(x_test,y_test)]
    cat_feat = [df.columns.get_loc(i) for  i in ['Hospital_region_code','Hospital_code', 'City_Code_Hospital','Hospital_type_code','Department','City_Code_Patient', 'Gender']]
    #model = LGBMClassifier(boosting_type='gbdt',learning_rate=0.1,n_estimstors=500,max_depth=15,random_state=22,categorical_feature=cat_feat) # n_estimaators=5000, max_depth=16,
    
    model = lgb.LGBMClassifier(boosting_type='gbdt', categorical_feature=cat_feat, objective= 'multiclass', num_leaves=80, min_data_in_leaf=307, max_depth=7, learning_rate=0.1)
    #model = lgb.LGBMClassifier(random_state=22,categorical_feature=cat_feat)
    #model = LogisticRegression(max_iter=1000, random_state=22)
    model.fit(x_train, y_train)
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)
    print(score)    
    sc += 1
    if i == 0:
      oof_preds = model.predict_proba(X_test)
      i += 1
    else:
      oof_preds += model.predict_proba(X_test)
oof_preds = oof_preds/sc

print("Average: ", np.sum(scores)/len(scores))

Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.43316683944334494


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.42986837558508467


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.4374077215468225


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.43037099864920053


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.4334495649169101


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.4321751696406132


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.4339029907011812


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.4285624528776074


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.430635838150289


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


0.435819301331993
Average:  0.43253592528430457


In [30]:
preds = np.argmax(oof_preds, axis = 1)

new_preds2 = list()
for i in range(len(preds)):
    if preds[i] == 0:
        new_preds2.append('0-10')
    elif preds[i] == 1:
        new_preds2.append('11-20')
    elif preds[i] == 2:
        new_preds2.append('21-30')
    elif preds[i] == 3:
        new_preds2.append('31-40')
    elif preds[i] == 4:
        new_preds2.append('41-50')
    elif preds[i] == 5:
        new_preds2.append('51-60')
    elif preds[i] == 6:
        new_preds2.append('61-70')
    elif preds[i] == 7:
        new_preds2.append('71-80')
    elif preds[i] == 8:
        new_preds2.append('81-90')
    elif preds[i] == 9:
        new_preds2.append('91-100')
    elif preds[i] == 10:
        new_preds2.append('More than 100 Days')

df_submit = pd.DataFrame({'case_id': df_test['case_id'].values, 'Stay':new_preds2})
df_submit.to_csv('submit.csv', index = False)

In [31]:
df_nn = pd.read_csv('submit_proba.csv')
id = df_nn.case_id.values
df_nn.drop(['case_id'], axis=1, inplace=True)

val = df_nn.values

In [32]:
## total predictions
cat_feat = [df.columns.get_loc(i) for  i in ['Hospital_region_code','Hospital_code', 'City_Code_Hospital','Hospital_type_code','Department','City_Code_Patient', 'Gender']]
clf = lgb.LGBMClassifier(boosting_type='gbdt', categorical_feature=cat_feat, objective= 'multiclass', num_leaves=80, min_data_in_leaf=307, max_depth=7, learning_rate=0.1)
#clf = lgb.LGBMClassifier(categorical_feature=cat_feat, randome_state=22)
clf.fit(X_train, Y)
probs = clf.predict_proba(X_test)

Please use categorical_feature argument of the Dataset constructor to pass this parameter.




In [131]:
## model blending
import numpy as np
lgbm = 0.1
nn = 0.9
final_preds, new_pre = [], []

for i in range(len(preds)):
  temp = []
  for j in range(11):
    x = 0.95*oof_preds[i][j] + 0.05*probs[i][j]
    temp.append(x)
  new_pre.append(temp)

for i in range(len(preds)):
  temp = []
  for j in range(11):
    x = lgbm*new_pre[i][j] + nn*val[i][j]
    temp.append(x)
  final_preds.append(temp)

final_preds = np.argmax(final_preds, axis = 1)
#final_preds = np.argmax(new_pre, axis = 1)

In [132]:
preds = final_preds

In [133]:
new_preds2 = list()
for i in range(len(preds)):
    if preds[i] == 0:
        new_preds2.append('0-10')
    elif preds[i] == 1:
        new_preds2.append('11-20')
    elif preds[i] == 2:
        new_preds2.append('21-30')
    elif preds[i] == 3:
        new_preds2.append('31-40')
    elif preds[i] == 4:
        new_preds2.append('41-50')
    elif preds[i] == 5:
        new_preds2.append('51-60')
    elif preds[i] == 6:
        new_preds2.append('61-70')
    elif preds[i] == 7:
        new_preds2.append('71-80')
    elif preds[i] == 8:
        new_preds2.append('81-90')
    elif preds[i] == 9:
        new_preds2.append('91-100')
    elif preds[i] == 10:
        new_preds2.append('More than 100 Days')

df_submit = pd.DataFrame({'case_id': df_test['case_id'].values, 'Stay':new_preds2})
df_submit.to_csv('submit.csv', index = False)