In [110]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *

from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from sklearn.impute import SimpleImputer

import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [111]:
train=pd.read_csv('Untitled Folder/train.csv')
test=pd.read_csv('Untitled Folder/test.csv')

dict_stay={'0-10':0, '41-50':1, '31-40':2, '11-20':3, '51-60':4, '21-30':5, '71-80':6,
       'More than 100 Days':7, '81-90':8, '61-70':9, '91-100':10}
train['Stay'].replace(dict_stay,inplace=True)

In [112]:
train.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [113]:
patient_train=train.groupby(['patientid']).agg({\
                                   'case_id':'count',
                                   'Hospital_code':'nunique',
                                   'Hospital_type_code':'nunique',
                                   'City_Code_Hospital':'nunique',
                                   'Hospital_region_code':'nunique',
                                   'Department':'nunique',
                                   'Type of Admission':'nunique',
                                   'Admission_Deposit':'mean',
                                  })

patient_test=test.groupby(['patientid']).agg({\
                                   'case_id':'count',
                                   'Hospital_code':'nunique',
                                   'Hospital_type_code':'nunique',
                                   'City_Code_Hospital':'nunique',
                                   'Hospital_region_code':'nunique',
                                   'Department':'nunique',
                                   'Type of Admission':'nunique',
                                   'Admission_Deposit':'mean',
                                  })

cols=['total_visits','uniqueHos','uniqueHosType','uniqueHosCity','uniqueHosRegion','uniqueDepartment','uniqueAdm','avgDeposit']
patient_train.columns=cols
patient_test.columns=cols

train=pd.merge(train,patient_train,on='patientid',how='left')
test=pd.merge(test,patient_test,on='patientid',how='left')

In [115]:
train_X=train.drop(['Stay'],axis=1)

train_X['type']='train'
test['type']='test'
data=pd.concat([train_X,test])



impute_median=SimpleImputer(missing_values=np.nan,strategy='median')
data['Bed Grade']=impute_median.fit_transform(data[['Bed Grade']]).ravel()
data['City_Code_Patient']=impute_median.fit_transform(data[['City_Code_Patient']]).ravel()

cat_col=['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code','Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'City_Code_Patient', 'Type of Admission',
       'Severity of Illness','Age']

float_col=['Available Extra Rooms in Hospital','Visitors with Patient','Admission_Deposit',
          'total_visits','uniqueHos','uniqueHosType','uniqueHosCity','uniqueHosRegion','uniqueDepartment','uniqueAdm','avgDeposit']

for col in cat_col:
    data[col]=le.fit_transform(data[col])

for col in float_col:
    data[col]=data[col].astype(float)

In [116]:
train_X=data[data['type']=='train']
test_=data[data['type']=='test']
train_X.drop(['type'],axis=1,inplace=True)
test_.drop(['type'],axis=1,inplace=True)



X = train_X.drop(['case_id','patientid','City_Code_Hospital','Hospital_type_code','Ward_Facility_Code'],axis=1)
y = train['Stay']
test_=test_[X.columns]


cat_col=['Hospital_code',
       'Hospital_region_code','Department', 'Ward_Type', 'Bed Grade',
       'City_Code_Patient', 'Type of Admission',
       'Severity of Illness','Age']



cat_cols=X.columns
cat_cols=[X.columns.get_loc(c) for c in cat_col]

In [117]:
clf_lgb = lgb.LGBMClassifier(boosting_type='gbdt', 
                      objective='multiclass',
                      num_class=11,
                      num_iteration=1000, 
                      
                      max_depth=7,
                      num_leaves=None,
                      min_data_in_leaf=4,
                     
                      learning_rate=0.1,
                      categorical_feature = cat_cols,
                      random_state=101
                     )

In [127]:
#grid search for best param
kf = KFold(n_splits=5, shuffle=True, random_state=42).split(X,y)

param_grid = {
    'objective':'multiclass',
    'num_class':[11],
    'num_iteration':[1000],
    'max_depth':[5,7],
    'num_leaves': [30, 100],
    'min_data_in_leaf': [5,10],
    'learning_rate':[0.1],
    }

gsearch=GridSearchCV(estimator=clf_lgb,param_grid=param_grid,n_jobs=-1,verbose=0,return_train_score=True,cv=kf)

lgb_model = gsearch.fit(X,y)
print(lgb_model.best_params_, lgb_model.best_score_)

In [91]:
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=101)
cv_score = []
pred_test =np.zeros((len(test_),11))

for train_index,test_index in skf.split(X,y):
    x_train,x_val = X.iloc[train_index],X.iloc[test_index]
    y_train,y_val = y.iloc[train_index],y.iloc[test_index]
    clf = clf_lgb
    clf.fit(x_train,y_train)
    score = round(accuracy_score(y_val,clf.predict(x_val)),2)
    cv_score.append(score)
    print(score,end=",")
    
    #predictions
    pred_test += clf.predict_proba(test_).reshape(-1,11)
    
pred_test = pred_test/10

# feature_imp = pd.DataFrame(sorted(zip(X.columns,clf.feature_importances_)), columns=['Feature','Value'])
# feature_imp=feature_imp.sort_values(by='Value',ascending=False)

arg_pred=[]
for item in pred_test:
    arg_pred.append(np.argmax(item))
    

0.43,0.43,0.43,0.43,0.43,0.44,0.43,0.43,0.44,0.44,

In [105]:
test_['Stay']=arg_pred
test_.index=test.case_id
test_=test_[['Stay']]
inv_map_dict_stay = {v: k for k, v in dict_stay.items()}
test_['Stay'].replace(inv_map_dict_stay,inplace=True)
test_.to_csv('try1.csv')
test_.head()

Unnamed: 0_level_0,Stay
case_id,Unnamed: 1_level_1
318439,0-10
318440,51-60
318441,21-30
318442,21-30
318443,51-60


In [None]:
cat=CatBoostClassifier(iterations=1000,  
                         learning_rate=0.1,
                         loss_function='MultiClass',
                         cat_features = cat_cols,
                        verbose=False
                        )

x_train,x_test,y_train,y_test=train_test_split(X,y)
clf=cat.fit(x_train,y_train)
print(accuracy_score(y_test,clf.predict(x_test)))