In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve

## Neighbourhood-wise modelling

In [7]:
def onehot_encode(X, cols):
    """One-hot encode the columns ``cols`` of ``X`` with a freshly fitted encoder.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed in ``cols``. It is not
        modified; a new frame is returned.
    cols : list of str
        Names of the columns to one-hot encode.

    Returns
    -------
    tuple
        ``(X_encoded, onehot_enc)`` where ``X_encoded`` is ``X`` with ``cols``
        removed and the one-hot columns appended (same index as ``X``), and
        ``onehot_enc`` is the fitted encoder so it can be re-applied to other data.
    """
    # Categories unseen at fit time are encoded as all-zero one-hot columns
    onehot_enc = OneHotEncoder(handle_unknown='ignore')
    # Fit encoder on the supplied (training) data
    onehot_enc.fit(X[cols])
    # Names of the generated columns. `get_feature_names` was deprecated in
    # scikit-learn 1.0 and removed in 1.2; prefer the replacement and fall back
    # only on older installations that do not provide it.
    if hasattr(onehot_enc, "get_feature_names_out"):
        colnames = list(onehot_enc.get_feature_names_out(cols))
    else:
        colnames = list(onehot_enc.get_feature_names(input_features=cols))
    # Transform the data (the encoder output is converted to a dense array)
    onehot_vals = onehot_enc.transform(X[cols]).toarray()
    # Keep the original index so the concat below aligns row-by-row
    enc_df = pd.DataFrame(onehot_vals, columns=colnames, index=X.index)
    # Append the one-hot columns and drop the original categorical columns
    X = pd.concat([X, enc_df], axis=1).drop(cols, axis=1)
    return X, onehot_enc

In [8]:
## Load the prepared modelling dataset from disk
df = pd.read_csv(filepath_or_buffer='data/final_data_for_modelling.csv')

# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106980 entries, 0 to 106979
Data columns (total 26 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   gender                                      106980 non-null  object 
 1   age                                         106980 non-null  int64  
 2   neighbourhood                               106980 non-null  object 
 3   scholarship                                 106980 non-null  int64  
 4   hypertension                                106980 non-null  int64  
 5   diabetes                                    106980 non-null  int64  
 6   alcoholism                                  106980 non-null  int64  
 7   handicap                                    106980 non-null  int64  
 8   sms_received                                106980 non-null  int64  
 9   showed                                      106980 non-null  int64  
 

In [9]:
## Get the list of neighbourhoods that have enough rows to be modelled on their own
# A neighbourhood is kept only when it has strictly more rows than this threshold.
MIN_APPOINTMENTS = 2000

# Count the rows per neighbourhood once, instead of re-filtering the whole
# DataFrame for every unique neighbourhood name.
neigh_counts = df["neighbourhood"].value_counts()

# `unique()` yields values in order of first appearance; that order is kept below.
neigh_list = list(df.neighbourhood.unique())
filter_neigh_list = [neigh for neigh in neigh_list if neigh_counts.get(neigh, 0) > MIN_APPOINTMENTS]

filter_neigh_list

['JARDIM DA PENHA',
 'ANDORINHAS',
 'NOVA PALESTINA',
 'DA PENHA',
 'TABUAZEIRO',
 'SÃO PEDRO',
 'SANTA MARTHA',
 'SANTO ANDRÉ',
 'BONFIM',
 'JARDIM CAMBURI',
 'MARIA ORTIZ',
 'JABOUR',
 'RESISTÊNCIA',
 'SANTO ANTÔNIO',
 'ITARARÉ',
 'CENTRO',
 'CARATOÍRA',
 'JESUS DE NAZARETH',
 'ILHA DO PRÍNCIPE',
 'ROMÃO']

In [10]:
## Build one feature/target DataFrame per selected neighbourhood
# `data` maps neighbourhood name -> rows of `df` for that neighbourhood with the
# listed columns removed.
data = {}
for neigh in filter_neigh_list:
    # Boolean filtering already returns a new frame, so the full `df.copy()` per
    # iteration is unnecessary; a non-inplace `drop` avoids mutating a filtered slice.
    df_temp = df.loc[df["neighbourhood"] == neigh].drop(
        columns=["age_group", "Neighbourhood", "neighbourhood", "showed"]
    )
    data[neigh] = df_temp

In [11]:
## Create one model for each neighbourhood

# Column(s) that get one-hot encoded; identical for every neighbourhood.
cols = ["appointment_day_of_week"]

# Hyper-parameter grid searched for every neighbourhood. It does not depend on
# the loop variable, so it is built once instead of on every iteration.
param_grid = {
    'max_depth' : [3,4,5,6],
    'min_samples_leaf': [2, 3, 4, 5],
    'n_estimators': [25, 50, 75],
    'random_state':[0],          # fixed seed so every fit is reproducible
    'criterion' :['gini', 'entropy'],
    'class_weight': [{1:4}]      # weight class 1 by a factor of 4
}

for key, datafr in data.items():
    # Split this neighbourhood's rows into train (80%) and test (20%).
    # NOTE(review): the split is not stratified on `no_show`; consider `stratify=`
    # if the class balance should be identical in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        datafr.drop(columns=["no_show"]), datafr["no_show"],
        random_state=0, test_size=0.2
    )
    # Work on independent copies so the column assignments below never write
    # into a slice of `datafr` (avoids SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    ## Ordinal encoder for `gender`; categories unseen during fit are mapped to -1
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    ## Fit encoder on train and apply the same mapping to test
    X_train[["gender"]] = enc.fit_transform(X_train[["gender"]])
    X_test[["gender"]] = enc.transform(X_test[["gender"]])

    ## One-hot encode the train data (the encoder is fitted on train only)
    X_train, onehot_enc = onehot_encode(X_train, cols)

    # Apply the fitted one-hot encoder to the test data.
    # `get_feature_names` was removed in scikit-learn 1.2; use its replacement
    # when available and fall back only on older installations.
    if hasattr(onehot_enc, "get_feature_names_out"):
        colnames = list(onehot_enc.get_feature_names_out(cols))
    else:
        colnames = list(onehot_enc.get_feature_names(input_features=cols))
    onehot_vals = onehot_enc.transform(X_test[cols]).toarray()

    # Put transformed data into a dataframe aligned on the test index
    enc_df = pd.DataFrame(onehot_vals, columns=colnames, index=X_test.index)
    # Add one-hot columns onto the test frame and drop the original columns
    X_test = pd.concat([X_test, enc_df], axis=1).drop(cols, axis=1)

    # Base estimator; its hyper-parameters are supplied by the grid search
    rf = RandomForestClassifier()
    # 3-fold grid search over `param_grid`, selecting on precision
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                                cv = 3, n_jobs = -1, verbose = 0, scoring='precision')

    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)

    ## Best model. GridSearchCV (default refit=True) has already refitted it on
    ## the whole training set, so fitting it again would only repeat that work.
    best_model = grid_search.best_estimator_

    ## Classification report on train and test
    y_pred_train = best_model.predict(X_train)
    print(f"Train Report: {key}")
    print(classification_report(y_train, y_pred_train))
    y_pred_test = best_model.predict(X_test)
    print("Test Report")
    print(classification_report(y_test, y_pred_test))




Train Report: JARDIM DA PENHA
              precision    recall  f1-score   support

           0       0.87      0.89      0.88      2590
           1       0.33      0.30      0.31       490

    accuracy                           0.79      3080
   macro avg       0.60      0.59      0.60      3080
weighted avg       0.78      0.79      0.79      3080

Test Report
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       634
           1       0.26      0.24      0.25       137

    accuracy                           0.75       771
   macro avg       0.55      0.55      0.55       771
weighted avg       0.74      0.75      0.74       771





Train Report: ANDORINHAS
              precision    recall  f1-score   support

           0       0.96      0.51      0.67      1335
           1       0.38      0.94      0.54       421

    accuracy                           0.62      1756
   macro avg       0.67      0.73      0.60      1756
weighted avg       0.82      0.62      0.64      1756

Test Report
              precision    recall  f1-score   support

           0       0.94      0.51      0.66       351
           1       0.31      0.86      0.45        88

    accuracy                           0.58       439
   macro avg       0.62      0.69      0.56       439
weighted avg       0.81      0.58      0.62       439





Train Report: NOVA PALESTINA
              precision    recall  f1-score   support

           0       0.95      0.77      0.85      1407
           1       0.43      0.80      0.56       310

    accuracy                           0.77      1717
   macro avg       0.69      0.78      0.70      1717
weighted avg       0.85      0.77      0.79      1717

Test Report
              precision    recall  f1-score   support

           0       0.90      0.74      0.81       356
           1       0.33      0.62      0.43        74

    accuracy                           0.72       430
   macro avg       0.62      0.68      0.62       430
weighted avg       0.81      0.72      0.75       430





Train Report: DA PENHA
              precision    recall  f1-score   support

           0       0.95      0.72      0.82      1391
           1       0.42      0.84      0.56       333

    accuracy                           0.74      1724
   macro avg       0.68      0.78      0.69      1724
weighted avg       0.85      0.74      0.77      1724

Test Report
              precision    recall  f1-score   support

           0       0.86      0.70      0.77       342
           1       0.33      0.57      0.42        89

    accuracy                           0.67       431
   macro avg       0.60      0.63      0.59       431
weighted avg       0.75      0.67      0.70       431





Train Report: TABUAZEIRO
              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1976
           1       0.39      0.81      0.52       438

    accuracy                           0.73      2414
   macro avg       0.67      0.76      0.67      2414
weighted avg       0.84      0.73      0.76      2414

Test Report
              precision    recall  f1-score   support

           0       0.89      0.66      0.76       491
           1       0.30      0.65      0.41       113

    accuracy                           0.66       604
   macro avg       0.60      0.65      0.58       604
weighted avg       0.78      0.66      0.69       604





Train Report: SÃO PEDRO
              precision    recall  f1-score   support

           0       0.94      0.61      0.74      1475
           1       0.37      0.85      0.52       401

    accuracy                           0.66      1876
   macro avg       0.65      0.73      0.63      1876
weighted avg       0.82      0.66      0.69      1876

Test Report
              precision    recall  f1-score   support

           0       0.89      0.58      0.70       374
           1       0.31      0.73      0.43        96

    accuracy                           0.61       470
   macro avg       0.60      0.65      0.57       470
weighted avg       0.77      0.61      0.65       470





Train Report: SANTA MARTHA
              precision    recall  f1-score   support

           0       0.96      0.71      0.82      2041
           1       0.36      0.83      0.50       391

    accuracy                           0.73      2432
   macro avg       0.66      0.77      0.66      2432
weighted avg       0.86      0.73      0.77      2432

Test Report
              precision    recall  f1-score   support

           0       0.94      0.72      0.81       511
           1       0.34      0.77      0.47        98

    accuracy                           0.72       609
   macro avg       0.64      0.74      0.64       609
weighted avg       0.84      0.72      0.76       609





Train Report: SANTO ANDRÉ
              precision    recall  f1-score   support

           0       0.95      0.69      0.80      1586
           1       0.40      0.84      0.54       380

    accuracy                           0.72      1966
   macro avg       0.67      0.77      0.67      1966
weighted avg       0.84      0.72      0.75      1966

Test Report
              precision    recall  f1-score   support

           0       0.89      0.63      0.74       386
           1       0.35      0.72      0.47       106

    accuracy                           0.65       492
   macro avg       0.62      0.67      0.60       492
weighted avg       0.77      0.65      0.68       492





Train Report: BONFIM
              precision    recall  f1-score   support

           0       0.96      0.67      0.79      1736
           1       0.39      0.88      0.54       424

    accuracy                           0.71      2160
   macro avg       0.67      0.77      0.66      2160
weighted avg       0.85      0.71      0.74      2160

Test Report
              precision    recall  f1-score   support

           0       0.89      0.67      0.77       427
           1       0.36      0.69      0.47       113

    accuracy                           0.68       540
   macro avg       0.62      0.68      0.62       540
weighted avg       0.78      0.68      0.70       540





Train Report: JARDIM CAMBURI
              precision    recall  f1-score   support

           0       0.92      0.61      0.74      4927
           1       0.32      0.78      0.45      1141

    accuracy                           0.64      6068
   macro avg       0.62      0.70      0.59      6068
weighted avg       0.81      0.64      0.68      6068

Test Report
              precision    recall  f1-score   support

           0       0.89      0.59      0.71      1227
           1       0.28      0.68      0.40       291

    accuracy                           0.61      1518
   macro avg       0.58      0.63      0.55      1518
weighted avg       0.77      0.61      0.65      1518





Train Report: MARIA ORTIZ
              precision    recall  f1-score   support

           0       0.96      0.55      0.70      3490
           1       0.36      0.91      0.52       960

    accuracy                           0.63      4450
   macro avg       0.66      0.73      0.61      4450
weighted avg       0.83      0.63      0.66      4450

Test Report
              precision    recall  f1-score   support

           0       0.95      0.51      0.66       879
           1       0.33      0.90      0.48       234

    accuracy                           0.59      1113
   macro avg       0.64      0.71      0.57      1113
weighted avg       0.82      0.59      0.62      1113





Train Report: JABOUR
              precision    recall  f1-score   support

           0       0.94      0.65      0.77      1557
           1       0.35      0.84      0.50       359

    accuracy                           0.68      1916
   macro avg       0.65      0.74      0.63      1916
weighted avg       0.83      0.68      0.72      1916

Test Report
              precision    recall  f1-score   support

           0       0.93      0.64      0.76       407
           1       0.26      0.73      0.39        73

    accuracy                           0.65       480
   macro avg       0.60      0.68      0.57       480
weighted avg       0.83      0.65      0.70       480





Train Report: RESISTÊNCIA
              precision    recall  f1-score   support

           0       0.95      0.59      0.73      2678
           1       0.37      0.89      0.52       709

    accuracy                           0.66      3387
   macro avg       0.66      0.74      0.63      3387
weighted avg       0.83      0.66      0.69      3387

Test Report
              precision    recall  f1-score   support

           0       0.91      0.55      0.69       682
           1       0.30      0.78      0.43       165

    accuracy                           0.60       847
   macro avg       0.61      0.67      0.56       847
weighted avg       0.79      0.60      0.64       847





Train Report: SANTO ANTÔNIO
              precision    recall  f1-score   support

           0       0.94      0.72      0.82      1760
           1       0.37      0.77      0.50       376

    accuracy                           0.73      2136
   macro avg       0.65      0.75      0.66      2136
weighted avg       0.84      0.73      0.76      2136

Test Report
              precision    recall  f1-score   support

           0       0.88      0.68      0.77       438
           1       0.29      0.59      0.39        97

    accuracy                           0.67       535
   macro avg       0.59      0.64      0.58       535
weighted avg       0.78      0.67      0.70       535





Train Report: ITARARÉ
              precision    recall  f1-score   support

           0       0.95      0.56      0.70      2021
           1       0.42      0.91      0.57       708

    accuracy                           0.65      2729
   macro avg       0.68      0.73      0.64      2729
weighted avg       0.81      0.65      0.67      2729

Test Report
              precision    recall  f1-score   support

           0       0.88      0.52      0.65       493
           1       0.39      0.82      0.53       190

    accuracy                           0.60       683
   macro avg       0.64      0.67      0.59       683
weighted avg       0.74      0.60      0.62       683





Train Report: CENTRO
              precision    recall  f1-score   support

           0       0.94      0.62      0.74      2062
           1       0.38      0.86      0.53       563

    accuracy                           0.67      2625
   macro avg       0.66      0.74      0.64      2625
weighted avg       0.82      0.67      0.70      2625

Test Report
              precision    recall  f1-score   support

           0       0.90      0.59      0.72       528
           1       0.31      0.74      0.44       129

    accuracy                           0.62       657
   macro avg       0.61      0.67      0.58       657
weighted avg       0.79      0.62      0.66       657





Train Report: CARATOÍRA
              precision    recall  f1-score   support

           0       0.95      0.59      0.73      1530
           1       0.40      0.89      0.55       461

    accuracy                           0.66      1991
   macro avg       0.67      0.74      0.64      1991
weighted avg       0.82      0.66      0.68      1991

Test Report
              precision    recall  f1-score   support

           0       0.92      0.64      0.75       373
           1       0.43      0.83      0.57       125

    accuracy                           0.68       498
   macro avg       0.68      0.73      0.66       498
weighted avg       0.80      0.68      0.71       498





Train Report: JESUS DE NAZARETH
              precision    recall  f1-score   support

           0       0.96      0.60      0.74      1633
           1       0.43      0.93      0.59       542

    accuracy                           0.68      2175
   macro avg       0.70      0.76      0.67      2175
weighted avg       0.83      0.68      0.70      2175

Test Report
              precision    recall  f1-score   support

           0       0.93      0.58      0.71       410
           1       0.40      0.87      0.55       134

    accuracy                           0.65       544
   macro avg       0.66      0.72      0.63       544
weighted avg       0.80      0.65      0.67       544





Train Report: ILHA DO PRÍNCIPE
              precision    recall  f1-score   support

           0       0.96      0.58      0.72      1354
           1       0.41      0.91      0.56       423

    accuracy                           0.66      1777
   macro avg       0.68      0.75      0.64      1777
weighted avg       0.82      0.66      0.68      1777

Test Report
              precision    recall  f1-score   support

           0       0.92      0.57      0.70       343
           1       0.36      0.83      0.51       102

    accuracy                           0.63       445
   macro avg       0.64      0.70      0.61       445
weighted avg       0.79      0.63      0.66       445





Train Report: ROMÃO
              precision    recall  f1-score   support

           0       0.95      0.48      0.63      1329
           1       0.33      0.91      0.48       371

    accuracy                           0.57      1700
   macro avg       0.64      0.69      0.56      1700
weighted avg       0.81      0.57      0.60      1700

Test Report
              precision    recall  f1-score   support

           0       0.94      0.46      0.62       331
           1       0.32      0.91      0.48        95

    accuracy                           0.56       426
   macro avg       0.63      0.68      0.55       426
weighted avg       0.81      0.56      0.58       426

