In [29]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [30]:
df = pd.read_csv('data/final_data_for_modelling.csv')

df.head()

Unnamed: 0,gender,age,neighbourhood,scholarship,hypertension,diabetes,alcoholism,handicap,sms_received,showed,...,tempmax,temp,feelslikemax,feelslike,humidity,windspeed,solarradiation,solarenergy,uvindex,rating
0,F,62,JARDIM DA PENHA,0,1,0,0,0,0,1,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5
1,M,56,JARDIM DA PENHA,0,0,0,0,0,0,1,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5
2,F,62,MATA DA PRAIA,0,0,0,0,0,0,1,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,5.0
3,F,8,PONTAL DE CAMBURI,0,0,0,0,0,0,1,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,5.0
4,F,56,JARDIM DA PENHA,0,1,1,0,0,0,1,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5


In [31]:
## Ordinal encoder for features
enc = OrdinalEncoder()

## One hot encoder for categorical features
ohe = OneHotEncoder(sparse=False)

enc.fit(df[["gender"]])

df[["gender"]] = enc.transform(df[["gender"]])
encoded_columns = ohe.fit_transform(df[['appointment_day_of_week']])
data_hot_encoded = pd.DataFrame(encoded_columns, index=df.index)
data_hot_encoded.columns = ohe.get_feature_names(['appointment_day_of_week'])

data_other_cols = df.drop(columns='appointment_day_of_week')

#Concatenate the two dataframes :
df = pd.concat([data_hot_encoded, data_other_cols], axis=1)

df.head()



Unnamed: 0,appointment_day_of_week_Friday,appointment_day_of_week_Monday,appointment_day_of_week_Saturday,appointment_day_of_week_Thursday,appointment_day_of_week_Tuesday,appointment_day_of_week_Wednesday,gender,age,neighbourhood,scholarship,...,tempmax,temp,feelslikemax,feelslike,humidity,windspeed,solarradiation,solarenergy,uvindex,rating
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,62,JARDIM DA PENHA,0,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,56,JARDIM DA PENHA,0,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,62,MATA DA PRAIA,0,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,5.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8,PONTAL DE CAMBURI,0,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,5.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56,JARDIM DA PENHA,0,...,75.2,73.3,75.2,73.3,76.7,16.1,52.9,4.3,2.0,4.5


In [32]:
y = df['showed']
X = df.drop('showed', axis=1)
X = X[['gender', 'age', 'scholarship', 'hypertension', 'diabetes', 'alcoholism', 'handicap', 'sms_received',
       'days_between_appointment_and_scheduled_day', 'tempmax', 'temp', 'feelslikemax', 'feelslike', 'humidity',
       'windspeed', 'solarradiation', 'solarenergy', 'uvindex', 'appointment_day_of_week_Friday',
       'appointment_day_of_week_Monday', 'appointment_day_of_week_Tuesday', 'appointment_day_of_week_Wednesday',
       'appointment_day_of_week_Thursday', 'appointment_day_of_week_Friday', 'appointment_day_of_week_Saturday',
       'rating']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [33]:

clf = RandomForestClassifier()
param_grid = {
    'n_estimators': [5, 10, 15, 20],
    'max_depth': [2, 5, 7, 9]
}
model = GridSearchCV(clf, param_grid,scoring='recall', cv=10)
model.fit(X_train, y_train)
# model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))
print(model.best_params_)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4319
           1       0.80      1.00      0.89     17077

    accuracy                           0.80     21396
   macro avg       0.40      0.50      0.44     21396
weighted avg       0.64      0.80      0.71     21396

{'max_depth': 2, 'n_estimators': 5}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
## THings to do for modelling
# 1) Optimize model with paraemeter tuning (grid search)
# 2) Results - precision, recall (for both classes) f1, AUROC
# 3) read data from DB instead of CSV
