In [1]:
import warnings

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

In [2]:
# Source workbook; the relative path is resolved against the kernel's working directory.
file_path = "OR_AE2_Project_Adjusted.xlsx"

# Load the workbook into a DataFrame using the openpyxl engine.
df = pd.read_excel(io=file_path, engine="openpyxl")

In [3]:
# Ordinal codes for the binned 'Wait_Time' labels: a larger code means a later bin.
# NOTE(review): the bins jump from '300-329' straight to '360+' — there is no
# '330-359' entry. Confirm against the data whether that bucket can occur.
range_mapping = {
    '00-29': 1,
    '30-59': 2,
    '60-89': 3,
    '90-119': 4,
    '120-149': 5,
    '150-179': 6,
    '180-209': 7,
    '210-239': 8,
    '240-269': 9,
    '270-299': 10,
    '300-329': 11,
    '360+': 12
}

# Raw target labels exactly as stored in the workbook.
wait_time_labels = df['Wait_Time']

# Map the ranges in y to numerical values
y = wait_time_labels.map(range_mapping)

# Series.map yields NaN for every label that is not a key of the dict, without
# any error. Fail right here, naming the offending labels, instead of handing a
# target with silent NaNs to the rest of the notebook. Labels that were already
# missing in the workbook are left as NaN, exactly as before.
unmapped_wait_labels = wait_time_labels[y.isna() & wait_time_labels.notna()].unique()
if len(unmapped_wait_labels) > 0:
    raise ValueError(
        "Wait_Time labels missing from range_mapping: "
        f"{sorted(map(str, unmapped_wait_labels))}"
    )

In [4]:
def _five_unit_bucket_codes(n_buckets):
    """Build the ordinal codes for consecutive 5-wide buckets.

    Returns a dict whose keys are the labels '00 to 05', '05 to 10', ...
    (zero-padded to two digits) and whose values are 1, 2, ..., n_buckets,
    in ascending bucket order.
    """
    return {
        f"{5 * index:02d} to {5 * (index + 1):02d}": index + 1
        for index in range(n_buckets)
    }


# Mapping for 'Drive_Distance_Miles': '00 to 05' -> 1 ... '50 to 55' -> 11
distance_mapping = _five_unit_bucket_codes(11)

# Mapping for 'Driving_Time_mins': '00 to 05' -> 1 ... '85 to 90' -> 18
time_mapping = _five_unit_bucket_codes(18)

# Feature frame: the raw data minus the target and the columns left out of the model.
X = df.drop(columns=['Wait_Time','Site_X','Site_Y','Pat_X','Pat_Y','Year','Month', 'Number_Of_Attendances'])

# Apply the mappings to transform the columns into numerical values
for bucket_column, bucket_codes in (
    ('Drive_Distance_Miles', distance_mapping),
    ('Driving_Time_mins', time_mapping),
):
    bucket_labels = X[bucket_column]
    encoded = bucket_labels.map(bucket_codes)

    # Series.map leaves NaN for any label absent from the dict, and a later
    # imputation step would then replace those NaNs without a trace. Surface
    # the unmapped labels here; the encoded values themselves are unchanged.
    unmapped_labels = bucket_labels[encoded.isna() & bucket_labels.notna()].unique()
    if len(unmapped_labels) > 0:
        warnings.warn(
            f"{bucket_column}: labels missing from the mapping were encoded as NaN: "
            f"{sorted(map(str, unmapped_labels))}"
        )

    X[bucket_column] = encoded

X.head()

Unnamed: 0,Site_Code,Site_Type,Site_Loc_GPs,Site_Loc_GP_List,Site_Pop_20miles,Pat_Loc_GPs,Pat_Loc_GP_List,Drive_Distance_Miles,Driving_Time_mins,Attendance_Type,Age_Group
0,2,ED,50,210000,1814482,0,0,1,1,New - unplanned,20-39
1,2,ED,50,210000,1814482,0,0,1,1,New - unplanned,20-39
2,2,ED,50,210000,1814482,0,0,1,1,New - unplanned,20-39
3,2,ED,50,210000,1814482,0,0,1,1,New - unplanned,20-39
4,2,ED,50,210000,1814482,0,0,1,1,New - unplanned,20-39


In [5]:
# Distinct Age_Group labels, listed to check for placeholder categories before encoding.
X.loc[:, "Age_Group"].unique()

array(['20-39', '40-59', '60-79', '80+', 'Missing'], dtype=object)

In [6]:
# Split the columns by dtype: numeric ones go through imputation + scaling,
# object/category ones through one-hot encoding.
numerical_features = X.select_dtypes(include=['number']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform
# from failing on categories that were not present when the encoder was fitted.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing: route each column group to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a random forest classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Fit the pipeline on the training split (fit returns the pipeline itself),
# then predict the held-out rows.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Model evaluation: rows are the true classes, columns the predicted classes.
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", test_confusion, sep="\n")

Confusion Matrix:
[[ 210 1192  783  649  631  486  439  711   34   45   37  124]
 [ 395 1348 1296 1078  965  729  507  981   43   53   43  131]
 [ 309 1539 1041 1197 1159  913  609 1280   48   44   52  183]
 [ 306 1131 1277  951 1191  913  667 1418   50   54   62  194]
 [ 238  885 1106 1237  879  960  670 1518   46   54   48  202]
 [ 201  640  927 1153 1082  665  691 1585   43   65   50  184]
 [ 173  550  851 1079 1064  896  443 1520   45   48   49  201]
 [ 161  481  923 1109 1159  965  757 1408   44   45   42  205]
 [  84  198  272  385  435  381  385  644   25   56   53  116]
 [  91  243  305  451  497  439  387  760   40   29   47  147]
 [  86  213  290  389  459  421  338  645   37   48   26  131]
 [  98  309  473  599  631  556  474 1070   42   43   48  129]]


1. Analizzare ogni singola variabile: capire se ha senso inserirla in un modello predittivo per `Wait_Time` e come va trasformata per trarne il massimo.