<a href="https://colab.research.google.com/github/agastya1995/Market-Basket-Analysis/blob/main/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Embedding, Dense, Input, Reshape, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import backend  
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical
import tensorflow_addons as tfa

In [None]:
# Load the training and test splits from CSV files in the working directory.
train = pd.read_csv('DNN Data on Healthcare Analytics_Train.csv')
test = pd.read_csv('DNN Data on Healthcare Analytics_Test.csv')

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Compare the (rows, columns) shapes of the two splits.
train.shape, test.shape

In [None]:
# Preview the first rows of the raw test data.
test.head()

**Identify which variables are categorical and which are numeric.**

**Categorical Variables:**
1. Hospital Code
2. Hospital Type Code
3. City Code Hospital
4. Hospital Region Code (Drop?)
5. Department
6. Ward Type
7. Ward Code
8. Patient ID (Can be dropped)
9. City_Code_Patient (Can be dropped)
10. Type of Admission
11. Severity of Illness (Has order, so label encode)
12. Age (Label encode)
13. Stay (Convert to separate class - Label Encode for now)

In [None]:
def convert_target(length_of_stay):
    """Collapse a raw 'Stay' label into one of five ordinal classes (1-5).

    Each pair of adjacent stay ranges shares a class; any value that is not
    one of the listed labels falls into the last class, 5.
    """
    grouped_labels = (
        (('0-10', '11-20'), 1),
        (('21-30', '31-40'), 2),
        (('41-50', '51-60'), 3),
        (('61-70', '71-80'), 4),
    )
    for labels, stay_class in grouped_labels:
        if length_of_stay in labels:
            return stay_class
    # Everything else (longer stays, unknown labels) is the catch-all class.
    return 5

In [None]:
def preprocess(df, random_state=None):
    """Clean, encode and scale a raw frame and build its one-hot target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must contain the columns 'Stay', 'patientid',
        'City_Code_Patient', 'Severity of Illness' and 'Age'. The caller's
        frame is not modified.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for the row shuffle. The default
        ``None`` keeps the original non-deterministic shuffle; pass an int
        for a reproducible row order.

    Returns
    -------
    tuple
        ``(features, cat_cols, target)`` where ``features`` is the encoded
        and scaled DataFrame, ``cat_cols`` is the Index of the columns that
        were label encoded, and ``target`` is the ``pd.get_dummies`` one-hot
        encoding of 'Stay' after it is collapsed by ``convert_target``
        (one column per class present, at most 5).

    Notes
    -----
    A fresh LabelEncoder per column and a fresh MinMaxScaler are fitted on
    every call, so the codes and scaling produced by separate calls (e.g.
    train vs. test) are not guaranteed to agree.
    """
    # Drop rows with missing values. This is done on a new frame rather than
    # in place so the caller's raw DataFrame is left untouched.
    df = df.dropna()
    # Shuffle the rows
    df = df.sample(frac=1, random_state=random_state)
    target = df['Stay']
    # Drop columns that are not needed (identifiers and the target itself)
    df = df.drop(['patientid', 'City_Code_Patient', 'Stay'], axis=1)

    # Since there is an order to severity of illness, manually map the integers to it
    # (anything that is neither 'Minor' nor 'Moderate' maps to 2)
    df['Severity of Illness'] = df['Severity of Illness'].map(lambda x: 0 if x=='Minor' else 1 if x=='Moderate' else 2)
    # Take age as the approximate mid point: lower bound of the range + 4
    # (assumes Age values are strings shaped like 'lo-hi')
    df['Age'] = df['Age'].map(lambda x: int(x.split('-')[0])+4)

    # Label encode all the other categorical columns (those still of dtype object)
    cat_cols = df.columns[df.dtypes=='object']
    for i in cat_cols:
        le = LabelEncoder()
        df[i] = le.fit_transform(df[i])

    # Normalize the other columns to the [0, 1] range
    non_cat_columns = [i for i in df.columns if i not in cat_cols]
    min_max = MinMaxScaler()
    df[non_cat_columns] = min_max.fit_transform(df[non_cat_columns])

    # Convert the 11 classes in target to 5
    target_converted = target.map(convert_target)

    # Make dummies
    target_converted = pd.get_dummies(target_converted)

    return (df, cat_cols, target_converted)

In [None]:
# Preprocess the training data; `train` is rebound to the encoded/scaled
# features, so this cell is not safe to run twice without reloading the CSV.
train, cat_cols, train_target = preprocess(train)

In [None]:
# Inspect the one-hot encoded training target.
train_target

In [None]:
# Inspect the preprocessed training features.
train

In [None]:
# Summary statistics of the preprocessed training features.
train.describe()

### Embedding ###

In [None]:
# Reset Keras' global state so re-running this cell builds a fresh model.
backend.clear_session()
models = []   # output tensor of every input branch, concatenated below
inputs = []   # matching Input layers, in the order the data list is built

# One single-integer input and one embedding per label-encoded categorical column.
for feature in cat_cols:
    number_of_unique_cat = train[feature].nunique()
    # Embedding width: half the number of categories, rounded up.
    embedding_size = int(np.ceil(number_of_unique_cat/2))
    input_layer = Input(shape=(1,))
    embedding_layer = Embedding(number_of_unique_cat+1, embedding_size, input_length=1)(input_layer)
    # Drop the length-1 sequence axis: (batch, 1, size) -> (batch, size).
    output_layer = Reshape((embedding_size,))(embedding_layer)
    models.append(output_layer)
    inputs.append(input_layer)

# Remaining (non-categorical) columns share one dense branch. Its width is
# derived from the data rather than hard-coded, using the same column
# selection that Convert_Df_To_List applies when building the input list.
number_of_rest_features = int((~train.columns.isin(cat_cols)).sum())
input_rest = Input(shape=(number_of_rest_features,))
output_rest = Dense(32)(input_rest)
models.append(output_rest)
inputs.append(input_rest)

# Merge all branches and classify into the 5 stay classes.
full_model = concatenate(models)
x1 = Dense(32, activation = 'relu')(full_model)
x1 = Dense(64, activation = 'relu')(x1)

output = Dense(5, activation='softmax')(x1)
final_model = Model(inputs, output)



In [None]:
def Convert_Df_To_List(df, cat_cols):
    """Split a feature frame into the list of inputs the model expects.

    Returns one array of values per column named in ``cat_cols`` (in that
    order), followed by a single DataFrame holding every remaining column.
    """
    model_inputs = []
    for column in cat_cols:
        model_inputs.append(df[column].values)
    # Everything that is not a categorical column goes in as one block.
    is_categorical = df.columns.isin(cat_cols)
    model_inputs.append(df.loc[:, ~is_categorical])
    return model_inputs

In [None]:
# Print the layer-by-layer structure of the assembled model.
final_model.summary()

In [None]:
# Convert the training dataframe into the list of per-input arrays the
# multi-input model expects (categorical columns first, the rest last).
train_set_for_model = Convert_Df_To_List(train, cat_cols)

In [None]:
# Number of model inputs: one per categorical column plus one for the remaining columns.
len(train_set_for_model)

In [None]:
# Compile with categorical cross-entropy (targets are one-hot) and track
# accuracy plus Cohen's kappa. No optimizer is passed, so the Keras default is used.
final_model.compile(loss='categorical_crossentropy', metrics=['accuracy', tfa.metrics.CohenKappa(num_classes=5)])
# Train for 50 epochs, holding out 30% of the training rows for validation.
hist = final_model.fit(x=train_set_for_model, y=train_target.values, batch_size=128, epochs=50, validation_split=0.3)

In [None]:
# Preprocess the test data the same way; `test` is rebound to the processed features.
# NOTE(review): preprocess() fits new LabelEncoders and a new MinMaxScaler on
# each call, so the category codes and scaling here are learned from the test
# data and may not match those the model was trained on — confirm this is intended.
test, _, test_target = preprocess(test)

In [None]:
# Build the model input list for the test data, reusing the categorical
# column names identified on the training data.
test_for_model = Convert_Df_To_List(test, cat_cols)

In [None]:
# Inspect the list of test inputs.
test_for_model

In [None]:
# Predict class probabilities (5-way softmax output) for the test inputs.
preds = final_model.predict(test_for_model)

In [None]:
# Stand-alone Cohen's kappa metric over 5 classes, configured for integer
# (sparse) labels rather than one-hot vectors.
cohen_kappa = tfa.metrics.CohenKappa(5, sparse_labels=True)

In [None]:
# Predicted class index per row: position of the highest softmax probability.
preds_t = preds.argmax(axis=1)

In [None]:
# True class index per row, recovered from the one-hot test target.
actual_t = np.argmax(test_target.values, axis=1)

In [None]:
# update_state expects (y_true, y_pred): pass the actual labels first and the
# predictions second so the call matches the metric's documented signature.
cohen_kappa.update_state(actual_t, preds_t)

In [None]:
# Final Cohen's kappa on the test set, as a plain NumPy value.
cohen_kappa.result().numpy()