In [1]:
# importing the necessary code libraries
import pandas as pd
import numpy as np
import warnings
import pickle
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report


In [2]:
# load the processed usage data from disk
usage_csv = "../data/processed/usage.csv"
df = pd.read_csv(usage_csv)

In [3]:
df.head()

Unnamed: 0,inserted_at,institution_id,in-patient,out-patient,total
0,2021-01-01,09415354-4eb1-4866-87bb-efffe23e196a,0.0,11.0,11.0
1,2021-01-01,097d4418-6d7c-4922-a703-bde5a6b64919,1.0,19.0,20.0
2,2021-01-01,0da8e9a5-855d-42c5-9435-d69e1019cafc,0.0,2.0,2.0
3,2021-01-01,0fa2e546-c292-4060-a05e-beb53fc31e72,0.0,3.0,3.0
4,2021-01-01,18cb6e95-6073-4102-989e-1b133ac99b29,0.0,1.0,1.0


In [4]:
# parse the `inserted_at` strings into datetime values so the `.dt` accessor can be used
df["inserted_at"] = pd.to_datetime(df.inserted_at)

In [5]:
df.head()

Unnamed: 0,inserted_at,institution_id,in-patient,out-patient,total
0,2021-01-01,09415354-4eb1-4866-87bb-efffe23e196a,0.0,11.0,11.0
1,2021-01-01,097d4418-6d7c-4922-a703-bde5a6b64919,1.0,19.0,20.0
2,2021-01-01,0da8e9a5-855d-42c5-9435-d69e1019cafc,0.0,2.0,2.0
3,2021-01-01,0fa2e546-c292-4060-a05e-beb53fc31e72,0.0,3.0,3.0
4,2021-01-01,18cb6e95-6073-4102-989e-1b133ac99b29,0.0,1.0,1.0


In [6]:
# expanding the time column to day, ISO week, day of the week and month
# `Series.dt.week` is deprecated (removed in pandas 2.0); `dt.isocalendar().week`
# returns the same ISO-8601 week number (e.g. 53 for 2021-01-01). It comes back as
# UInt32, so it is cast to int64 to keep the dtype the old accessor produced.
df["day"] = df.inserted_at.dt.day
df["week"] = df.inserted_at.dt.isocalendar().week.astype("int64")
df["day_of_the_week"] = df.inserted_at.dt.dayofweek  # Monday=0 ... Sunday=6
df["month"] = df.inserted_at.dt.month

In [7]:
df.head()

Unnamed: 0,inserted_at,institution_id,in-patient,out-patient,total,day,week,day_of_the_week,month
0,2021-01-01,09415354-4eb1-4866-87bb-efffe23e196a,0.0,11.0,11.0,1,53,4,1
1,2021-01-01,097d4418-6d7c-4922-a703-bde5a6b64919,1.0,19.0,20.0,1,53,4,1
2,2021-01-01,0da8e9a5-855d-42c5-9435-d69e1019cafc,0.0,2.0,2.0,1,53,4,1
3,2021-01-01,0fa2e546-c292-4060-a05e-beb53fc31e72,0.0,3.0,3.0,1,53,4,1
4,2021-01-01,18cb6e95-6073-4102-989e-1b133ac99b29,0.0,1.0,1.0,1,53,4,1


In [8]:
df.inserted_at.min(), df.inserted_at.max()

(Timestamp('2021-01-01 00:00:00'), Timestamp('2021-12-01 00:00:00'))

In [105]:
# length of each split as a share of the covered time span:
# 60% train, 20% test, 20% validation
duration = df.inserted_at.max() - df.inserted_at.min()
train = duration * 0.6
test = duration * 0.2
val = duration * 0.2


In [106]:
# turn the durations into cut-off timestamps: `train` becomes the end of the training
# window and `test` becomes the start of the validation window
train = df.inserted_at.min() + train
test = df.inserted_at.max() - val

In [107]:
# partition the rows chronologically using the two cut-off timestamps
before_train_cutoff = df.inserted_at.lt(train)
from_train_cutoff = df.inserted_at.ge(train)
before_test_cutoff = df.inserted_at.lt(test)
from_test_cutoff = df.inserted_at.ge(test)

train_set = df[before_train_cutoff]
test_set = df[from_train_cutoff & before_test_cutoff]
val_set = df[from_test_cutoff]

In [108]:
# dropping the time column
# Reassign instead of using `inplace=True`: the three sets are filtered slices of `df`,
# and mutating such a slice in place raises SettingWithCopyWarning (silenced in this
# notebook by `warnings.filterwarnings('ignore')`).
train_set = train_set.drop(columns="inserted_at")
test_set = test_set.drop(columns="inserted_at")
val_set = val_set.drop(columns="inserted_at")

In [109]:
# checking the shape of the data
train_set.shape, test_set.shape, val_set.shape

((14541, 8), (5513, 8), (5881, 8))

In [110]:
# separate the target (`total`) from the features; the in-/out-patient counts are removed
# from the features as well (presumably because they make up the total -- confirm)
target_columns = ["in-patient", "out-patient", "total"]

X_train = train_set.drop(columns=target_columns)
y_train = train_set["total"]

X_test = test_set.drop(columns=target_columns)
y_test = test_set["total"]

In [111]:
# Checking the shape of the data
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((14541, 5), (14541,), (5513, 5), (5513,))

In [112]:
X_train.head(), y_train.head()

(                         institution_id  day  week  day_of_the_week  month
 0  09415354-4eb1-4866-87bb-efffe23e196a    1    53                4      1
 1  097d4418-6d7c-4922-a703-bde5a6b64919    1    53                4      1
 2  0da8e9a5-855d-42c5-9435-d69e1019cafc    1    53                4      1
 3  0fa2e546-c292-4060-a05e-beb53fc31e72    1    53                4      1
 4  18cb6e95-6073-4102-989e-1b133ac99b29    1    53                4      1,
 0    11.0
 1    20.0
 2     2.0
 3     3.0
 4     1.0
 Name: total, dtype: float64)

In [113]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

In [114]:
# Fit the encoder ONCE, on every institution id in the data set, so that the train and
# test features (and the pickled encoder) share a single id -> integer mapping.
# Re-fitting on the test split gave the same institution different codes in train and
# test, and saved the test-only mapping to disk.
le = LabelEncoder()
le.fit(df["institution_id"])

# Encode from the untouched split frames (which still hold the raw string ids) so this
# cell gives the same result when it is re-run.
X_train["institution_id"] = le.transform(train_set["institution_id"])
X_test["institution_id"] = le.transform(test_set["institution_id"])

with open("../models/encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [104]:
X_train.head()

Unnamed: 0,institution_id,day,week,day_of_the_week,month
0,2,1,53,4,1
1,3,1,53,4,1
2,5,1,53,4,1
3,6,1,53,4,1
4,11,1,53,4,1


In [100]:
# baseline pipeline: standardise the features, then fit a plain linear regression
baseline_steps = [("scaler", StandardScaler()), ("model", LinearRegression())]
pipe = Pipeline(steps=baseline_steps)

In [101]:
# fit the baseline pipeline on the training split
pipe.fit(X=X_train, y=y_train)

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso

In [41]:
# One pipeline per candidate model, each preceded by the same standardisation step.
# The random forest gets a fixed seed so that the grid search result and the pickled
# model are reproducible across kernel restarts.
rf_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", RandomForestRegressor(random_state=42)),
])
ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge()),
])
lasso_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Lasso()),
])

In [42]:
# hyper-parameter search spaces; keys follow the "<pipeline step>__<parameter>" convention
rf_grid = dict(
    model__n_estimators=list(range(100, 501, 100)),
    model__max_depth=list(range(1, 6)),
)

# Ridge and Lasso are searched over the same regularisation strengths and iteration caps
linear_alphas = [0.1, 0.2, 0.3, 0.4, 0.5]
linear_max_iters = list(range(100, 501, 100))
ridge_grid = {
    "model__alpha": list(linear_alphas),
    "model__max_iter": list(linear_max_iters),
}
lasso_grid = {
    "model__alpha": list(linear_alphas),
    "model__max_iter": list(linear_max_iters),
}

In [59]:
# parallel lists: position i in each list describes the same candidate
model_names = [
    "Random Forest",
    "Ridge",
    "Lasso",
]
models = [
    rf_pipe,
    ridge_pipe,
    lasso_pipe,
]
grids = [
    rf_grid,
    ridge_grid,
    lasso_grid,
]

# creating a function to fit the models
def train_models(models, grids, model_names, X_train, y_train, X_test, y_test):
    """Grid-search every candidate, report its test score and pickle the best estimator.

    For each (model, grid, name) triple a 5-fold ``GridSearchCV`` is fitted on the
    training data. The score on the test data, the best parameters and the best
    estimator are printed, and the best estimator is written to
    ``../models/<name>.pkl``.

    Parameters
    ----------
    models : sequence
        Estimators or pipelines to tune.
    grids : sequence of dict
        Parameter grid for the model at the same position.
    model_names : sequence of str
        Display name for the model at the same position; also used as the pickle
        file name.
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Data used for the reported score.

    Returns
    -------
    The fitted ``GridSearchCV`` of the LAST model, or ``None`` when ``models`` is empty.

    Raises
    ------
    ValueError
        If ``models``, ``grids`` and ``model_names`` differ in length.
    """
    if not (len(models) == len(grids) == len(model_names)):
        raise ValueError(
            "models, grids and model_names must have the same length, got "
            f"{len(models)}, {len(grids)} and {len(model_names)}"
        )

    grid = None  # stays None when there is nothing to train
    for model, param_grid, name in zip(models, grids, model_names):
        location = f"../models/{name}.pkl"
        grid = GridSearchCV(model, param_grid, cv=5, verbose=1)
        grid.fit(X_train, y_train)
        print(f"{name}: {grid.score(X_test, y_test)}")
        print(grid.best_params_)
        print(grid.best_estimator_)
        print("\n")
        with open(location, "wb") as f:
            pickle.dump(grid.best_estimator_, f)
    return grid

In [60]:
# tune, score and persist every candidate model
train_models(
    models=models,
    grids=grids,
    model_names=model_names,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Random Forest: -0.22435736206650425
{'model__max_depth': 5, 'model__n_estimators': 300}
Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestRegressor(max_depth=5, n_estimators=300))])


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Ridge: 0.020606256709984283
{'model__alpha': 0.5, 'model__max_iter': 100}
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', Ridge(alpha=0.5, max_iter=100))])


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Lasso: 0.019751124169617662
{'model__alpha': 0.2, 'model__max_iter': 100}
Pipeline(steps=[('scaler', StandardScaler()),
                ('model', Lasso(alpha=0.2, max_iter=100))])




This algorithm predicts the number of patients the facilities are going to see.

Given the dataset that was provided, it is quite hard to make a reasonable forecast or to advise on resource allocation, because the only information we have about each facility is the facility id, the type of institution and the patients it saw, with very little detail about the patients themselves. However, one piece of information that can be extracted is whether a visit was in-patient or out-patient. We can therefore make a reasonable forecast of the number of in-patients or out-patients to expect, which in turn informs resource prioritisation (an expected high number of in-patients indicates the need for more beds, blood transfusion and other facilities, human resources, admission consumables and drugs to stock).

## Explainable AI

In [115]:
import os

In [116]:
os.getcwd()

'/Users/madeofajala/helium-test/notebooks'

In [None]:
# `os.path.exists` needs a path argument; calling it with none raises TypeError.
# NOTE(review): checking the models directory is an assumption about what was intended here.
os.path.exists("../models")

In [119]:
MODEL_NAME = "Ridge"

# Load the artefacts inside `with` blocks so the file handles are closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load files written by this notebook.
with open("../models/encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
with open(f"../models/{MODEL_NAME}.pkl", "rb") as f:
    model = pickle.load(f)


In [120]:
model

I could not add the remaining parts of the explainable AI section because of time constraints. Here is how I would go about it: since I am using very simple ML models, checking the coefficients gives information about which columns the model prioritised. LIME and SHAP would also be great alternatives.
