In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

## Neighbourhood-wise modelling

In [None]:
def onehot_encode(X, cols):
    """Fit a one-hot encoder on ``X[cols]`` and replace those columns with indicator columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the columns to encode (the training data). It is not modified;
        a new frame is returned.
    cols : list of str
        Names of the columns in ``X`` to one-hot encode.

    Returns
    -------
    tuple
        ``(encoded_X, onehot_enc)`` where ``encoded_X`` is ``X`` with ``cols`` dropped and
        the indicator columns appended (same index as ``X``), and ``onehot_enc`` is the
        fitted ``OneHotEncoder`` so the same mapping can be applied to other data.
    """
    # Treat new categories as a new 'unknown' category (all onehot columns are 0)
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    # Fit encoder on training data
    onehot_enc.fit(X[cols])
    # Get the names of the new columns created.
    # `get_feature_names` was removed in scikit-learn 1.2; prefer its replacement
    # `get_feature_names_out` and only fall back to the legacy name on old versions.
    name_getter = getattr(onehot_enc, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = onehot_enc.get_feature_names
    colnames = list(name_getter(input_features=cols))
    # Transform the data (the encoder returns a sparse matrix, hence `.toarray()`)
    onehot_vals = onehot_enc.transform(X[cols]).toarray()
    # Put transformed data into dataframe, aligned on the original index
    enc_df = pd.DataFrame(onehot_vals, columns=colnames, index=X.index)
    # Add onehot columns back onto original dataframe and drop the original columns
    X = pd.concat([X, enc_df], axis=1).drop(cols, axis=1)
    return X, onehot_enc

In [None]:
## Read the data for modelling
# The path is relative, so it is resolved against the notebook's working directory.
df = pd.read_csv('../data/final_data_for_modelling.csv')

# Print a summary of the loaded frame as a quick sanity check of what was read.
df.info()

In [None]:
## Get the list of neighbourhoods that have enough rows to be modelled on their own
# A neighbourhood is kept only if it has strictly more rows than this threshold.
MIN_ROWS_PER_NEIGHBOURHOOD = 1000

# All distinct neighbourhoods, in order of first appearance in df.
neigh_list = list(df["neighbourhood"].unique())

# Count the rows of every neighbourhood in a single pass instead of re-filtering
# the whole frame once per neighbourhood.
neigh_counts = df["neighbourhood"].value_counts()

# `.get(..., 0)` covers values that value_counts() omits (e.g. missing values),
# which matched zero rows in an equality filter as well.
filter_neigh_list = [
    neigh for neigh in neigh_list
    if neigh_counts.get(neigh, 0) > MIN_ROWS_PER_NEIGHBOURHOOD
]

filter_neigh_list

In [None]:
## Build one modelling dataframe per neighbourhood
# Columns removed from every per-neighbourhood frame before modelling.
# "neighbourhood" is constant within each subset, so it carries no information there.
COLS_TO_DROP = ["age_group", "neighbourhood", "patientid", "appointmentid",
                "scheduledday", "appointmentday", "showed"]

# Maps neighbourhood name -> dataframe of that neighbourhood's rows.
data = {}
for neigh in filter_neigh_list:
    # Boolean indexing already returns a new frame, so there is no need to deep-copy
    # the full dataframe on every iteration; drop() then returns another new frame
    # and df itself is never mutated.
    data[neigh] = df[df["neighbourhood"] == neigh].drop(columns=COLS_TO_DROP)

In [None]:
## Create one model for each neighbourhood

def _apply_onehot(X, cols, onehot_enc):
    """Apply an already-fitted one-hot encoder to ``X[cols]``.

    Returns a new frame in which ``cols`` are replaced by the encoder's indicator
    columns (same index as ``X``). ``X`` itself is not modified.
    """
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the legacy name only on old versions.
    name_getter = getattr(onehot_enc, "get_feature_names_out", None)
    if name_getter is None:
        name_getter = onehot_enc.get_feature_names
    colnames = list(name_getter(input_features=cols))
    onehot_vals = onehot_enc.transform(X[cols]).toarray()
    enc_df = pd.DataFrame(onehot_vals, columns=colnames, index=X.index)
    return pd.concat([X, enc_df], axis=1).drop(cols, axis=1)


# Columns that get one-hot encoded.
cols = ["appointment_day_of_week"]

# Create the parameter grid based on the results of random search.
# The grid is identical for every neighbourhood, so it is built once outside the loop.
param_grid = {
    'max_depth' : [3,4,5,6],
    'min_samples_leaf': [2, 3, 4, 5],
    'n_estimators': [25, 50, 75],
    'random_state':[0],
    'criterion' :['gini', 'entropy'],
    'class_weight': [{1:4}]
}

for key, datafr in data.items():
    # Splitting dataset into test and train
    X_train, X_test, y_train, y_test = train_test_split(
        datafr.drop(columns=["no_show"]), datafr["no_show"],
        random_state=0, test_size=0.2)
    # Take explicit copies so the column assignments below operate on independent
    # frames and do not trigger SettingWithCopyWarning.
    X_train, X_test = X_train.copy(), X_test.copy()

    ## Ordinal encoder for features; categories unseen during fit are encoded as -1
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    ## Fit encoder on train and apply to test data as well
    X_train[["gender"]] = enc.fit_transform(X_train[["gender"]])
    X_test[["gender"]] = enc.transform(X_test[["gender"]])

    ## One hot encode the train data (fits the encoder on train only)
    X_train, onehot_enc = onehot_encode(X_train, cols)

    # Apply the fitted onehot encoder to the test data
    X_test = _apply_onehot(X_test, cols, onehot_enc)

    # Create a base model; its hyper-parameters are set by the grid search
    rf = RandomForestClassifier()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                                cv = 3, n_jobs = -1, verbose = 0, scoring='precision')

    # Fit the grid search to the data
    grid_search.fit(X_train, y_train)

    ## Best model: with the default refit=True, GridSearchCV has already refitted the
    ## best parameter combination on the whole training set, so no extra fit is needed.
    best_model = grid_search.best_estimator_

    ## Classification report
    y_pred_train = best_model.predict(X_train)
    print(f"Train Report: {key}")
    print(classification_report(y_train, y_pred_train))
    y_pred_test = best_model.predict(X_test)
    print("Test Report")
    print(classification_report(y_test, y_pred_test))
