In [None]:
import pandas as pd

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/cleaned_df.csv',
)

In [None]:
# Give the distance and coordinate columns identifier-friendly names
# (no spaces or parentheses). The frame is modified in place.
df.rename(
    mapper={
        'Distance (Trip to Destination)': 'Distance_Trip_to_Destination',
        'Distance From Trip Origin': 'Distance_From_Trip_Origin',
        'lat': 'latitude',
        'lng': 'longitude',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# Label-encode every non-numeric column of `df` into integer codes (in place).
#
# A separate LabelEncoder is fitted per column and stored in `label_encoders`,
# so each column's code -> original label mapping stays available afterwards via
# `label_encoders[col].classes_` / `.inverse_transform(...)`. Refitting a single
# shared encoder would overwrite the mapping on every iteration and keep only
# the last column's classes.
le = LabelEncoder()  # keeps `le` defined even when there is nothing to encode
label_encoders = {}

non_numeric_columns = list(df.select_dtypes(exclude='number').columns)

for col in non_numeric_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Recode driver_action: the value 1 becomes 0 and every other value becomes 1.
# Done as a vectorised comparison rather than a per-row Python lambda.
# NOTE(review): which original label corresponds to code 1 depends on how the
# column was encoded upstream -- confirm this is the intended positive class.
# NOTE: the recode is not idempotent; running this cell a second time flips the
# 0/1 values back.
df['driver_action'] = df['driver_action'].ne(1).astype('int64')

In [None]:
# Bin each continuous feature into 10 equal-width ordinal buckets.
# Every column gets its own discretizer instance so the fitted bin edges are
# kept separately per feature.
(
    discretizer_distance_origin,
    discretizer_distance_destination,
    discretizer_latitude,
    discretizer_longitude,
) = (
    KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
    for _ in range(4)
)

# Fit on the whole frame and overwrite each column with its integer bin index.
for column_name, binner in (
    ('Distance_From_Trip_Origin', discretizer_distance_origin),
    ('Distance_Trip_to_Destination', discretizer_distance_destination),
    ('latitude', discretizer_latitude),
    ('longitude', discretizer_longitude),
):
    df[column_name] = binner.fit_transform(df[[column_name]]).astype(int)


In [None]:
# Columns kept for the reduced ("selected variables") models, including the
# driver_action target.
cols = [
    'is_weekend',
    'driver_action',
    'latitude',
    'longitude',
    'Distance_From_Trip_Origin',
    'Distance_Trip_to_Destination',
]

In [None]:
from sklearn.model_selection import train_test_split
# Variable roles: the treatment column and the covariates used alongside it.
covariates = [
    'is_weekend',
    'latitude',
    'longitude',
    'Distance_From_Trip_Origin',
    'Distance_Trip_to_Destination',
]
treatment = 'driver_action'

# 80/20 train/test split of the reduced frame (only the columns in `cols`),
# seeded so the partition is reproducible.
train, test = train_test_split(
    df[cols],
    random_state=42,
    test_size=0.2,
)

In [None]:
# 80/20 train/test split of the full frame (all columns), using a fixed seed
# so the partition is reproducible.
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# --- Models trained on every column of the full split ---------------------
# driver_action is the prediction target; all remaining columns are features.
y_train = train_data['driver_action']
X_train = train_data.drop('driver_action', axis=1)

y_holdout = test_data['driver_action']
X_holdout = test_data.drop('driver_action', axis=1)

rf_all = RandomForestClassifier(random_state=42).fit(X_train, y_train)
xgb_all = XGBClassifier(random_state=42).fit(X_train, y_train)

# --- Models trained only on the variables selected by the graph -----------
y_train_selected = train['driver_action']
X_train_selected = train.drop(columns=['driver_action'])

y_holdout_selected = test['driver_action']
X_holdout_selected = test.drop(columns=['driver_action'])

rf_selected = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train_selected)
xgb_selected = XGBClassifier(random_state=42).fit(X_train_selected, y_train_selected)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
def evaluate_model(model, X_train, y_train, X_holdout, y_holdout):
    """Score a fitted classifier on its training data and on a holdout set.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Features the model was trained on.
        y_train: True labels for ``X_train``.
        X_holdout: Held-out features.
        y_holdout: True labels for ``X_holdout``.

    Returns:
        dict with three entries:
            'train_accuracy': accuracy of the predictions on ``X_train``.
            'holdout_accuracy': accuracy of the predictions on ``X_holdout``.
            'classification_report': ``classification_report`` output for the
                holdout predictions only.
    """
    predicted = {
        'train': model.predict(X_train),
        'holdout': model.predict(X_holdout),
    }

    scores = {}
    scores['train_accuracy'] = accuracy_score(y_train, predicted['train'])
    scores['holdout_accuracy'] = accuracy_score(y_holdout, predicted['holdout'])
    scores['classification_report'] = classification_report(y_holdout, predicted['holdout'])
    return scores

# Score the two models that were trained on all variables.
rf_all_eval = evaluate_model(
    model=rf_all,
    X_train=X_train, y_train=y_train,
    X_holdout=X_holdout, y_holdout=y_holdout,
)
xgb_all_eval = evaluate_model(
    model=xgb_all,
    X_train=X_train, y_train=y_train,
    X_holdout=X_holdout, y_holdout=y_holdout,
)

# Score the two models that were trained on the selected variables only.
rf_selected_eval = evaluate_model(
    model=rf_selected,
    X_train=X_train_selected, y_train=y_train_selected,
    X_holdout=X_holdout_selected, y_holdout=y_holdout_selected,
)
xgb_selected_eval = evaluate_model(
    model=xgb_selected,
    X_train=X_train_selected, y_train=y_train_selected,
    X_holdout=X_holdout_selected, y_holdout=y_holdout_selected,
)

# Report each model: heading, train/holdout accuracy, then the holdout
# classification report. One print call per model, one item per line.
print(
    "Random Forest with All Variables:",
    f"Train Accuracy: {rf_all_eval['train_accuracy']}",
    f"Holdout Accuracy: {rf_all_eval['holdout_accuracy']}",
    rf_all_eval['classification_report'],
    sep="\n",
)

print(
    "\nXGBoost with All Variables:",
    f"Train Accuracy: {xgb_all_eval['train_accuracy']}",
    f"Holdout Accuracy: {xgb_all_eval['holdout_accuracy']}",
    xgb_all_eval['classification_report'],
    sep="\n",
)

print(
    "\nRandom Forest with Selected Variables:",
    f"Train Accuracy: {rf_selected_eval['train_accuracy']}",
    f"Holdout Accuracy: {rf_selected_eval['holdout_accuracy']}",
    rf_selected_eval['classification_report'],
    sep="\n",
)

print(
    "\nXGBoost with Selected Variables:",
    f"Train Accuracy: {xgb_selected_eval['train_accuracy']}",
    f"Holdout Accuracy: {xgb_selected_eval['holdout_accuracy']}",
    xgb_selected_eval['classification_report'],
    sep="\n",
)

# Measure overfitting

Random Forest with All Variables:
Train Accuracy: 0.9999470579403109
Holdout Accuracy: 0.9828224862958296
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    283486
           1       0.01      0.00      0.00      4565

    accuracy                           0.98    288051
   macro avg       0.50      0.50      0.50    288051
weighted avg       0.97      0.98      0.98    288051


XGBoost with All Variables:
Train Accuracy: 0.9843603948262589
Holdout Accuracy: 0.9841312823076469
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    283486
           1       0.36      0.00      0.00      4565

    accuracy                           0.98    288051
   macro avg       0.67      0.50      0.50    288051
weighted avg       0.97      0.98      0.98    288051


Random Forest with Selected Variables:
Train Accuracy: 0.9843499799948446
Holdout Accuracy: 0.9841278107001885
              precision    

In [None]:
def measure_overfitting(train_accuracy, holdout_accuracy):
    """Return the train-minus-holdout accuracy gap.

    A positive value means the model scored higher on its training data than
    on the holdout data; a negative value means the opposite.
    """
    accuracy_gap = train_accuracy - holdout_accuracy
    return accuracy_gap

# Accuracy gap (train - holdout) for the models trained on all variables.
rf_all_overfit = measure_overfitting(
    train_accuracy=rf_all_eval['train_accuracy'],
    holdout_accuracy=rf_all_eval['holdout_accuracy'],
)
xgb_all_overfit = measure_overfitting(
    train_accuracy=xgb_all_eval['train_accuracy'],
    holdout_accuracy=xgb_all_eval['holdout_accuracy'],
)

# Accuracy gap (train - holdout) for the models trained on selected variables.
rf_selected_overfit = measure_overfitting(
    train_accuracy=rf_selected_eval['train_accuracy'],
    holdout_accuracy=rf_selected_eval['holdout_accuracy'],
)
xgb_selected_overfit = measure_overfitting(
    train_accuracy=xgb_selected_eval['train_accuracy'],
    holdout_accuracy=xgb_selected_eval['holdout_accuracy'],
)

# Heading followed by one line per model.
print(
    "\nOverfitting Measure (Train - Holdout Accuracy):",
    f"Random Forest with All Variables: {rf_all_overfit}",
    f"XGBoost with All Variables: {xgb_all_overfit}",
    f"Random Forest with Selected Variables: {rf_selected_overfit}",
    f"XGBoost with Selected Variables: {xgb_selected_overfit}",
    sep="\n",
)


Overfitting Measure (Train - Holdout Accuracy):
Random Forest with All Variables: 0.017124571644481335
XGBoost with All Variables: 0.00022911251861201798
Random Forest with Selected Variables: 0.00022216929465612711
XGBoost with Selected Variables: 0.00018484949112751448
