In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
# Shared label encoder. It is re-fitted on each categorical column it is applied to,
# so after the encoding loop it only remembers the classes of the last column.
le = preprocessing.LabelEncoder()


import warnings
# NOTE: this silences *every* warning for the rest of the session, including
# pandas / scikit-learn deprecation and chained-assignment warnings.
warnings.filterwarnings("ignore")

# `IPython.display` is the public location of `display` and `HTML`; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Never truncate rows or columns when a DataFrame is rendered.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [4]:
# Raw competition data; both files are expected in the working directory.
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')


# Ordinal encoding of the target: every length-of-stay bucket maps to its rank (0..10).
dict_stay={'0-10':0,'11-20':1, '21-30':2, '31-40':3, '41-50':4, '51-60':5, '61-70':6, '71-80':7,
        '81-90':8, '91-100':9, 'More than 100 Days':10}

# Assign the result back instead of calling `replace(..., inplace=True)` on
# `train['Stay']`: the in-place form mutates an intermediate Series, which under
# pandas Copy-on-Write leaves `train` untouched (the target would stay as strings).
# Labels that are not keys of `dict_stay` are kept as-is, exactly as before.
train['Stay'] = train['Stay'].replace(dict_stay)

In [6]:
def _fill_with_group_mode(frame, group_col, target_col):
    """Return `target_col` with NaNs replaced by the mode of the row's `group_col` group.

    Parameters:
        frame: DataFrame holding both columns.
        group_col: name of the column that defines the groups.
        target_col: name of the column whose missing values are filled.

    Returns:
        A Series aligned with `frame`. Groups whose values are all missing have
        no mode and are left as NaN.
    """
    def _fill(values):
        modes = values.mode()  # computed once per group
        return values.fillna(modes[0] if not modes.empty else np.nan)

    return frame.groupby([group_col])[target_col].transform(_fill)


train_X=train.drop(['Stay'],axis=1)

# Tag every row with its origin so the combined frame can be split back later.
train_X['type']='train'
test['type']='test'
data=pd.concat([train_X,test])

#fill na using group by
data['Bed Grade']=_fill_with_group_mode(data,'Hospital_code','Bed Grade')
data['City_Code_Patient']=_fill_with_group_mode(data,'Hospital_region_code','City_Code_Patient')

# Columns that are label-encoded below and passed to the model as categorical.
cat_col=['Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code','Department', 'Ward_Type', 'Ward_Facility_Code',
       'City_Code_Patient', 'Type of Admission',
       'Severity of Illness','Age']

# Columns that are cast to float below.
float_col=['Available Extra Rooms in Hospital','Visitors with Patient','Admission_Deposit','Bed Grade']

# The encoder is fitted on train and test rows together, so both share one code table.
for col in cat_col:
    data[col]=le.fit_transform(data[col])

for col in float_col:
    data[col]=data[col].astype(float)

In [7]:
# Split the combined frame back into its train / test parts. The helper column is
# removed with a non-mutating `drop` rather than `inplace=True` on a boolean-mask
# slice of `data`, which is the chained-assignment pattern pandas warns about.
train_X=data[data['type']=='train'].drop(columns=['type'])
test_=data[data['type']=='test'].drop(columns=['type'])

# Identifier columns are excluded from the feature matrix.
X = train_X.drop(['case_id','patientid'],axis=1)
y = train['Stay']
# Give the test features exactly the same columns, in the same order, as X.
test_=test_[X.columns]

In [16]:
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.datasets import dsutils
from deeptables.models import deeptable
from tensorflow import keras
from tensorflow.keras.callbacks import LearningRateScheduler , EarlyStopping

In [17]:
def build_model():
    """Create a new, unfitted DeepTable with the notebook's fixed configuration.

    The network uses the 'pnn_nets' component followed by two identical dense
    layers, 20-dimensional embeddings for the columns listed in `cat_col`,
    and reports accuracy.

    Returns:
        DeepTable: a fresh model instance; nothing is trained here.
    """
    # Presumably (units, dropout rate, use batch-norm) - confirm against the DeepTables docs.
    dense_layer = (300, 0.3, True)
    dnn_settings = {
        'hidden_units': (dense_layer, dense_layer),
        'dnn_activation': 'relu',
    }

    model_config = ModelConfig(
        dnn_params=dnn_settings,
        fixed_embedding_dim=True,
        embeddings_output_dim=20,
        nets=['pnn_nets'],
        stacking_op='add',
        output_use_bias=False,
        metrics=['accuracy'],
        categorical_columns=cat_col,
    )
    return DeepTable(config=model_config)

In [18]:
# Training hyper-parameters shared by every cross-validation fold.
epochs, batch_size = 100, 256
seeds = [32, 432, 73]

# Stop once `val_loss` has not improved for 5 consecutive epochs and roll the
# model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)


def _decayed_lr(epoch_idx):
    """Learning rate for a given epoch: 1e-3 shrunk by 5% per epoch."""
    return 1e-3 * 0.95 ** epoch_idx


# Fixed exponential decay schedule; it does not react to validation plateaus.
annealer = LearningRateScheduler(_decayed_lr)

In [None]:
# Per-fold validation accuracy (in percent) and validation loss.
scores = []
avg_loss = []

X_train_cv,y_train_cv = X.copy(), y.copy()

# Five independent stratified shuffles, each holding out 45% of the rows.
sssf = StratifiedShuffleSplit(n_splits=5, test_size = 0.45 ,random_state=1)

for i, (idxT, idxV) in enumerate(sssf.split(X_train_cv, y_train_cv)):

    # Slice each partition once and reuse it for fitting and evaluation.
    X_fit, y_fit = X_train_cv.iloc[idxT], y_train_cv.iloc[idxT]
    X_hold, y_hold = X_train_cv.iloc[idxV], y_train_cv.iloc[idxV]

    # Floor division: a trailing partial batch is not counted as a step.
    steps_per_epoch = len(X_fit)//batch_size
    validation_steps = len(X_hold)//batch_size

    print('Fold',i)
    print(' rows of train =',len(idxT),'rows of holdout =',len(idxV))

    # A new model per fold so no weights carry over between folds.
    dt_cv =  build_model()
    model_dnn_cv, history_cv = dt_cv.fit(X_fit, y_fit,
                                         validation_data = (X_hold, y_hold),
                                         steps_per_epoch = steps_per_epoch,
                                         validation_steps = validation_steps,
                                         batch_size=batch_size, epochs=epochs,
                                         verbose=0, callbacks=[early_stop,annealer])

    val_stats = dt_cv.evaluate(X_hold, y_hold, batch_size=batch_size, verbose=0)

    acc= val_stats['accuracy']*100
    scores.append(acc)
    avg_loss.append(val_stats['loss'])
    # The label used to read 'LGB', but the model evaluated here is a DeepTable.
    print ('DeepTable Val CV=',acc)
    print('#'*100)
    print('\n')

# Mean and standard deviation across folds: loss first, then accuracy.
print("Multi Log Loss Stats {0:.5f},{1:.5f}".format(np.mean(avg_loss), np.std(avg_loss)))
print('%.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Fold 0
 rows of train = 175140 rows of holdout = 143298
11 class detected, inferred as a [multiclass classification] task
Preparing features cost:0.019106149673461914
Imputation cost:0.08170890808105469
Categorical encoding cost:0.16249394416809082
fit_transform cost:0.29991698265075684
transform_X cost:9.318440198898315
transform_y cost:0.0046389102935791016
>>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< 
---------------------------------------------------------
inputs:
---------------------------------------------------------
['all_categorical_vars: (11)', 'input_continuous_all: (4)']
---------------------------------------------------------
embeddings:
---------------------------------------------------------
input_dims: [34, 9, 13, 5, 7, 8, 8, 39, 5, 5, 12]
output_dims: [20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
dropout: 0.3
---------------------------------------------------------
dense: dropout: 0
batch_normalization: False
-------------------------------------------