## Training ExtraTreesClassifier and XGBoost

### 1. Data Preparation and cleaning

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Load the raw bookings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="fitness_class_2212.csv")

In [None]:
# Clean the raw bookings. Every step is written so that the cell can be
# re-run on an already-cleaned frame without raising.

# Missing weights are imputed with the median because they need to be handled
# for ExtraTreesClassifier (not for XGBoost).
# NOTE(review): the median is computed on the full dataset, i.e. before the
# train/validation/test split, so it also sees validation and test rows.
df['weight'] = df['weight'].fillna(df['weight'].median())

# booking_id is irrelevant to the analysis. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df = df.drop(columns='booking_id', errors='ignore')

# Remove a trailing ' days' and convert to integers.
# - astype(str) lets the cell run again after the column is already numeric.
# - to_numeric keeps its default errors='raise', so an unparseable value is
#   reported directly instead of being coerced to NaN and then failing inside
#   astype(int) with a less helpful message.
df['days_before'] = pd.to_numeric(
    df['days_before'].astype(str).str.replace(r' days$', '', regex=True)
).astype(int)

# Normalise day names to the three-letter form: 'Wednesday' -> 'Wed',
# 'Monday' -> 'Mon', and 'Fri.' -> 'Fri' (raw string so the dot is literal).
df['day_of_week'] = (
    df['day_of_week']
    .replace({'Wednesday': 'Wed', 'Monday': 'Mon'})
    .str.replace(r'Fri\.$', 'Fri', regex=True)
)

# Replace '-' with "Unknown" instead of NaN to prevent information loss.
df['category'] = df['category'].replace('-', 'Unknown')

### 2. Splitting the data, extracting target variables, dropping target column and transforming features

In [None]:
# Hold out 20% of the rows for testing, then take 25% of the remainder for
# validation; both splits use random_state=1.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each split from zero.
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

# pop() hands back the target column and removes it from the feature frame.
y_train = df_train.pop('attended').values
y_val = df_val.pop('attended').values
y_test = df_test.pop('attended').values

### 3a. Training ExtraTreesClassifier
- This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. https://scikit-learn.org/dev/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

In [None]:
# Turn each split into a list of per-row dicts for the vectorizer.
train_dicts, val_dicts, test_dicts = (
    split.to_dict(orient='records') for split in (df_train, df_val, df_test)
)

# The vectorizer is fitted on the training rows only; validation and test
# rows are transformed with that fitted vectorizer.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val, X_test = dv.transform(val_dicts), dv.transform(test_dicts)

In [None]:
# Baseline model: 100 trees, fixed seed, everything else left at its default.
et = ExtraTreesClassifier(random_state=1, n_estimators=100)
et.fit(X=X_train, y=y_train)

In [None]:
# Column 1 of predict_proba is used as the score for the AUC computation.
y_pred = et.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_true=y_val, y_score=y_pred)
print("Validation AUC:", val_auc)

#### 3b. Tuning the parameters with GridSearchCV (to search for the best combination of parameters) 

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter values explored by the grid search (every combination is tried).
param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)

In [None]:
# Fresh, unfitted estimator to hand to the grid search. This rebinds `et`,
# replacing the baseline model fitted earlier in the notebook.
et = ExtraTreesClassifier(random_state=1)

In [None]:
from tqdm.notebook import tqdm
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over param_grid, scored by ROC AUC with 5-fold
# cross-validation, silent, using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    et,
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=0,
)

In [None]:
# Run the grid search on the training data.
# No progress bar is wrapped around this call: fit() is one blocking call, so
# a tqdm bar could only tick after the whole search has finished, and
# len(param_grid) counts hyper-parameter names rather than model fits.
# To see progress, raise `verbose` when constructing GridSearchCV.
grid_search.fit(X_train, y_train)

In [None]:
# best_score_ is the mean cross-validated AUC of the best parameter
# combination, computed on folds of X_train. It is not a score on the
# separate validation split (X_val), so it is labelled as a CV score.
print("Best Parameters:", grid_search.best_params_)
print("Best Mean CV AUC:", grid_search.best_score_)

##### Explanation of result:
- `max_depth=10`: This limits the depth of each tree to 10 levels, balancing model complexity and overfitting. A smaller depth prevents the model from memorizing the training data.
- `max_features=None`: The model considers all features when determining splits, potentially increasing performance because there are only 6 features.
- `min_samples_leaf=2`: Each leaf node must have at least 2 samples, preventing splits that would result in very small or pure leaf nodes. This can help reduce overfitting.
- `min_samples_split=10`: A node must have at least 10 samples to be considered for splitting, further controlling overfitting.
- `n_estimators=50`: The model uses 50 trees in the ensemble, which is reasonable for the dataset size (1500 rows). More trees would increase training time but may not significantly improve performance.

#### 3c. Training the model with the Best Parameters.

In [None]:
# ExtraTrees model configured with the tuned hyper-parameter values and the
# same seed as before, then fitted on the training split.
best_et = ExtraTreesClassifier(
    n_estimators=50,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=2,
    max_features=None,
    random_state=1,
)
best_et.fit(X=X_train, y=y_train)

In [None]:
# Validation-split AUC of the tuned model (scores are column 1 of predict_proba).
y_pred = best_et.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_true=y_val, y_score=y_pred)
print("Validation AUC:", val_auc)

In [None]:
# Test-split AUC of the tuned model (scores are column 1 of predict_proba).
y_pred = best_et.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print("Test AUC:", test_auc)

#### The model appears to be slightly overfitting to the training and validation data given that the `Test AUC: 0.745` is lower than the `Validation AUC: 0.818`.

#### 3d. Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC of the tuned model on the training split;
# the mean and standard deviation across folds are reported.
cv_scores = cross_val_score(best_et, X_train, y_train, scoring="roc_auc", cv=5)
print("Cross-Validation AUC:", cv_scores.mean(), "+-", cv_scores.std())

#### The `Cross-Validation AUC: 0.815` indicates that the model is performing consistently across folds. Moving on...

### 4a. Training, tuning and evaluating an XGBoost model
- XG Boost: XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. https://www.nvidia.com/en-us/glossary/xgboost/

In [None]:
pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
# Wrap each split's features and labels in a DMatrix for the native
# xgb.train / Booster.predict API.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dval = xgb.DMatrix(data=X_val, label=y_val)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [None]:
# Booster parameters passed to xgb.train.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    max_depth=10,
    eta=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=1,
)

In [None]:
# Evaluation sets reported during training; "val" is listed last.
watchlist = [(dtrain, "train"), (dval, "val")]

# Up to 100 boosting rounds with early stopping (patience of 10 rounds),
# logging the evaluation metric every 10 rounds.
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    evals=watchlist,
    num_boost_round=100,
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
# Score the test split with the trained booster and report its ROC AUC.
y_pred = model.predict(dtest)
test_auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print("XGBoost Test AUC:", test_auc)

#### An XGBoost Test AUC of 0.747 is slightly better than that of the ExtraTreesClassifier Test AUC of 0.745 but it's still not a significant improvement.

#### The imbalance in the dataset appears to be affecting the performance of XGBoost

#### 4b. Ensemble Models: Combine predictions from multiple models, such as XGBoost, ExtraTrees, and LightGBM, using a soft voting approach. This often improves generalization.

In [None]:
from sklearn.ensemble import VotingClassifier

# `best_xgb` was referenced by this notebook but never defined, so the cell
# failed with NameError on a fresh kernel. VotingClassifier needs a
# scikit-learn-style estimator, so the native Booster (`model`) cannot be
# passed directly. Build an XGBClassifier mirroring `xgb_params`, with the
# number of trees taken from the early-stopped booster.
# NOTE(review): this reconstructs the presumably intended estimator; confirm
# it matches what `best_xgb` was meant to be.
best_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=10,
    learning_rate=0.1,  # "eta" in xgb_params
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=model.best_iteration + 1,
    random_state=1,
)
# Fit it on its own as well: VotingClassifier fits clones of its estimators,
# and later cells read best_xgb.feature_importances_ directly.
best_xgb.fit(X_train, y_train)

# Soft voting averages the predicted class probabilities of both models.
ensemble_model = VotingClassifier(
    estimators=[
        ('xgb', best_xgb),
        ('et', best_et),
    ],
    voting='soft'
)

ensemble_model.fit(X_train, y_train)

# Evaluate on test set
y_test_pred = ensemble_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_pred)
print("Ensemble Test AUC:", test_auc)

In [None]:
# Fine tuning the Ensemble model: same two estimators, soft voting, but with
# XGBoost weighted twice as heavily as ExtraTrees.
# `best_xgb` and `best_et` are not created in this cell; they must already exist.
ensemble_model = VotingClassifier(
    voting='soft',
    weights=[2, 1],  # Give more weight to XGBoost if it performs better
    estimators=[('xgb', best_xgb), ('et', best_et)],
)
ensemble_model.fit(X=X_train, y=y_train)

# Evaluate on test set
y_test_pred = ensemble_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_test_pred)
print("Weighted Ensemble Test AUC:", test_auc)

#### The ensemble has successfully captured complementary strengths from both XGBoost and ExtraTreesClassifier.
- The weighted voting scheme gave XGBoost, the stronger individual performer, more influence on the final predictions.
- This AUC improvement suggests that the ensemble is robust and better at distinguishing between classes on unseen data compared to individual models.

### 5. Selecting the final model
- Choosing between logistic regression, ExtraTreesClassifier and XGBoost
- Training the final model
- Saving the model

##### Feature Importance for further insights

In [None]:
# Element-wise mean of the two models' feature-importance vectors.
feature_importances = (
    np.asarray(best_xgb.feature_importances_) + np.asarray(best_et.feature_importances_)
) / 2

# Column names in the order produced by the fitted DictVectorizer.
feature_names = dv.feature_names_

# Bar chart of the averaged importances, largest first.
indices = np.flip(np.argsort(feature_importances))
plt.figure(figsize=(10, 6))
plt.bar(
    range(feature_importances.size),
    feature_importances[indices],
    align="center",
)
plt.xticks(
    range(feature_importances.size),
    [feature_names[idx] for idx in indices],
    rotation=90,
)
plt.title("Ensemble Feature Importance")
plt.show()