In [165]:
# --- Imports (standard library, third-party, then notebook helpers) ----------
import warnings

import numpy as np
import pandas as pd

# NOTE(review): star import kept as-is because it is unclear which of its names
# other cells rely on (it supplies train_test_split among others); replace it
# with explicit imports once the full set of used names is known.
from sklearn.model_selection import *

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score

import lightgbm as lgb
# The modelling cell refers to the bare name LGBMClassifier, so import it explicitly.
from lightgbm import LGBMClassifier

# `IPython.display` is the public location of these helpers; the
# `IPython.core.display` path used previously is deprecated.
from IPython.display import display, HTML

# --- Notebook-wide configuration ---------------------------------------------
# Shared encoder instance; the encoding cell re-fits it for every column.
le = preprocessing.LabelEncoder()

# NOTE(review): this silences *all* warnings, including pandas
# chained-assignment and deprecation warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Show every row and column when a frame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [166]:
# Read the three CSV files from the working directory, in the same order as before:
# training rows, test rows, and the data dictionary.
train, test, data_dict = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'train_data_dict.csv')
)

In [167]:
# (rows, columns) of the training frame and of the test frame.
(train.shape, test.shape)

((318438, 18), (137057, 17))

In [168]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [169]:
# Distinct-value count for every training column, as a two-column frame
# ('cols_' = column name, 'unique_' = number of unique non-null values).
# DataFrame.nunique() computes all counts in one call, replacing the manual
# counter and per-cell iloc writes.
df = train.nunique().rename_axis('cols_').reset_index(name='unique_')

# Transposed so the wide result reads left-to-right.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
cols_,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
unique_,318438,32,7,11,3,18,5,6,6,4,92017,37,3,3,28,10,7300,11


In [170]:
# Label -> integer class code for the target. The codes are arbitrary class
# ids (not ordered by length of stay); the submission cell inverts this mapping.
dict_stay={'0-10':0, '41-50':1, '31-40':2, '11-20':3, '51-60':4, '21-30':5, '71-80':6,
       'More than 100 Days':7, '81-90':8, '61-70':9, '91-100':10}

# Encode the target by explicit assignment. The previous chained
# `train['Stay'].replace(..., inplace=True)` acts on a temporary object under
# pandas Copy-on-Write and would leave `train` unchanged.
# The dtype guard keeps the cell safe to re-run: mapping already-encoded
# integers a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train['Stay']):
    train['Stay'] = train['Stay'].map(dict_stay)

# Feature frame for the training rows, tagged so it can be separated again
# after the shared preprocessing.
train_X = train.drop(columns=['Stay']).assign(type='train')

# Tag the test rows the same way and stack both sets so imputation and label
# encoding see one consistent set of categories.
test['type']='test'
data=pd.concat([train_X,test])

In [171]:
# Count missing values per column of the combined train+test frame.
data.isnull().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             148
patientid                               0
City_Code_Patient                    6689
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
type                                    0
dtype: int64

In [172]:
# Median-fill the two columns that contain NaNs. The imputer is re-fitted per
# column, so after the loop it holds the statistics of 'City_Code_Patient'.
impute_median = SimpleImputer(missing_values=np.nan, strategy='median')
for col in ('Bed Grade', 'City_Code_Patient'):
    data[col] = impute_median.fit_transform(data[[col]]).ravel()

# Columns treated as categorical (label-encoded below and later passed to
# LightGBM as categorical features).
cat_col = [
    'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
    'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code',
    'Bed Grade', 'City_Code_Patient', 'Type of Admission',
    'Severity of Illness', 'Age',
]
# Remaining numeric feature columns (not referenced again in this cell).
float_col = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

# Replace each categorical column by integer codes; the shared encoder `le`
# is re-fitted on every column in turn.
for col in cat_col:
    encoded = le.fit_transform(data[col])
    data[col] = encoded


In [173]:
# Split the combined frame back into its train / test parts and remove the
# helper tag. Non-inplace drops return fresh frames, which avoids mutating a
# boolean-mask slice of `data` (the SettingWithCopy situation).
train_X = data[data['type']=='train'].drop(columns=['type'])
test_ = data[data['type']=='test'].drop(columns=['type'])

# Model inputs: every remaining column except the two identifier columns.
X = train_X.drop(columns=['case_id','patientid'])
# Encoded target taken from the original training frame.
y = train['Stay']
# Give the test features exactly the same columns, in the same order, as X.
test_ = test_[X.columns]

# Positional indices (within X) of the categorical columns, in cat_col order.
cat_cols = [X.columns.get_loc(c) for c in cat_col]

In [174]:
# Multiclass gradient-boosted model over the 11 encoded `Stay` classes.
# The estimator is reached through the `lgb` module alias (the only lightgbm
# import this cell can rely on) and bound to its own name, so the module is not
# shadowed and the cell can be executed repeatedly.
lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt',
                      objective='multiclass',
                      num_class=11,
                      num_iteration=5000,

                      max_depth=5,
                      num_leaves=4,
                      min_data_in_leaf=4,

                      learning_rate=0.1,
                      # positional indices of the label-encoded columns in X
                      categorical_feature = cat_cols,
                      random_state=101
                     )

# Optional hold-out check, left disabled:
# x_train,x_test,y_train,y_test=train_test_split(X,y)
# clf=lgbm_model.fit(x_train,y_train)
# print(accuracy_score(y_test,clf.predict(x_test)))

# Fit on the full training set and predict encoded classes for the test rows.
clf=lgbm_model.fit(X,y)
pred=clf.predict(test_)

# Feature / importance pairs. sorted() on (name, value) tuples orders the rows
# alphabetically by feature name, not by importance.
feature_imp = pd.DataFrame(sorted(zip(X.columns,clf.feature_importances_)), columns=['Feature','Value'])

In [175]:
# Show the feature / importance table (rows are ordered by feature name).
feature_imp

Unnamed: 0,Feature,Value
0,Admission_Deposit,28816
1,Age,12945
2,Available Extra Rooms in Hospital,8751
3,Bed Grade,5837
4,City_Code_Hospital,138
5,City_Code_Patient,28422
6,Department,4304
7,Hospital_code,42118
8,Hospital_region_code,442
9,Hospital_type_code,35


In [186]:
# Integer class code -> original `Stay` label (inverse of dict_stay).
inv_map_dict_stay = {v: k for k, v in dict_stay.items()}

# Build the submission as its own frame: one 'Stay' column of decoded labels,
# indexed by the test case ids.
# - The decode is an explicit .map() instead of a chained
#   `replace(..., inplace=True)`, which is a no-op under pandas Copy-on-Write
#   and would write integer codes to the file.
# - The feature frame `test_` is left untouched, so the prediction cell can be
#   re-run after this one.
submission = pd.DataFrame(
    {'Stay': pd.Series(pred).map(inv_map_dict_stay).to_numpy()},
    index=pd.Index(test['case_id'], name='case_id'),
)

submission.to_csv('try1.csv')
submission.head()

Unnamed: 0_level_0,Stay
case_id,Unnamed: 1_level_1
318439,0-10
318440,51-60
318441,21-30
318442,21-30
318443,51-60


In [188]:
# Baseline: logistic regression scored on a hold-out split.
# The split is created here with a fixed seed so the cell runs on a fresh
# kernel instead of relying on leftover x_train / x_test variables.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=101)

lr=LogisticRegression()
lr.fit(x_train,y_train)
# Hold-out accuracy of the baseline.
print(accuracy_score(y_test,lr.predict(x_test)))

0.3608968722522296


In [190]:
# Baseline: random forest scored on a hold-out split.
# The seeded split is (re)built here so this cell does not depend on leftover
# kernel state; the same seed always yields the same partition of X / y.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=101)

# Seed the forest as well so the reported accuracy is reproducible.
rfc=RandomForestClassifier(random_state=101)
rfc.fit(x_train,y_train)
# Hold-out accuracy of the baseline.
print(accuracy_score(y_test,rfc.predict(x_test)))

0.3577063183017209
