In [105]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings

In [106]:
warnings.filterwarnings("ignore")

In [107]:
# Load the training data
train = pd.read_csv("../data/train.csv")
# Load the test data
test = pd.read_csv("../data/test.csv")

# Build a classifier which predicts churn

In [108]:
# Function to fit a model on train data
def fit_model(train_data, model):
    # First split the data into features and target
    X = train_data.drop(columns=["Churn"])
    y = train_data["Churn"]

    model.fit(X, y)
    return model

In [109]:
# Function to search for the best hyperparameters
def search_hyperparameters(train_data, model, param_grid):
    # First split the data into features and target
    X = train_data.drop(columns=["Churn"])
    y = train_data["Churn"]

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        scoring="recall",
        return_train_score=True,
        n_jobs=-1,
    )
    grid_search.fit(X, y)
    return grid_search

In [110]:
# Function to score a model on given data
def score_model(data, model):
    X = data.drop(columns=["Churn"])
    y = data["Churn"]
    predictions = model.predict(X)
    precision = precision_score(y, predictions)
    accuracy = accuracy_score(y, predictions)
    recall = recall_score(y, predictions)
    f1 = f1_score(y, predictions)
    return f"Precision: {precision} Accuracy: {accuracy}, Recall: {recall}, F1: {f1}"

In [111]:
# Function to evaluate the model predictions
def evaluate_model(data, model):
    X = data.drop(columns=["Churn"])
    y = data["Churn"]
    predictions = model.predict(X)
    report = classification_report(y, predictions, digits=3)
    return report

In [112]:
# Fit the adaboost model
ada_boost_model = fit_model(train, AdaBoostClassifier())

In [113]:
# Score the model on train data
ada_boost_accuracy = score_model(train, ada_boost_model)
print(f"Adaboost accuracy on train data: {ada_boost_accuracy}")
# Evaluate the model predictions
print(evaluate_model(train, ada_boost_model))

Adaboost accuracy on train data: Precision: 0.6528384279475983 Accuracy: 0.65, Recall: 0.7119047619047619, F1: 0.6810933940774487
              precision    recall  f1-score   support

       False      0.646     0.582     0.612       380
        True      0.653     0.712     0.681       420

    accuracy                          0.650       800
   macro avg      0.650     0.647     0.647       800
weighted avg      0.650     0.650     0.648       800



In [114]:
ada_boost_accuracy = score_model(test, ada_boost_model)
print(f"Adaboost accuracy on test data: {ada_boost_accuracy}")
# Evaluate the model predictions
print(evaluate_model(test, ada_boost_model))

Adaboost accuracy on test data: Precision: 0.5547445255474452 Accuracy: 0.545, Recall: 0.7169811320754716, F1: 0.6255144032921811
              precision    recall  f1-score   support

       False      0.524     0.351     0.420        94
        True      0.555     0.717     0.626       106

    accuracy                          0.545       200
   macro avg      0.539     0.534     0.523       200
weighted avg      0.540     0.545     0.529       200



In [115]:
print("Adaboost confusion matrix on test data:")
print(
    confusion_matrix(
        test["Churn"], ada_boost_model.predict(test.drop(columns=["Churn"]))
    )
)

Adaboost confusion matrix on test data:
[[33 61]
 [30 76]]


In [116]:
# Grid search for best parameters
param_grid = {
    "estimator": [DecisionTreeClassifier(max_depth=2)],
    "n_estimators": [150, 175, 200],  # Number of boosting rounds
    "learning_rate": [0.75, 1.0, 1.5, 1.75, 2.0, 2.2, 2.5],  # Learning rate
}
ada_boost_grid_search = search_hyperparameters(
    train, AdaBoostClassifier(algorithm="SAMME", random_state=42), param_grid
)
print(f"Best parameters for Adaboost: {ada_boost_grid_search.best_params_}")

Best parameters for Adaboost: {'estimator': DecisionTreeClassifier(max_depth=2), 'learning_rate': 2.2, 'n_estimators': 175}


In [117]:
# Fit the adaboost model with best parameters
ada_boost_model_best = fit_model(
    train, AdaBoostClassifier(**ada_boost_grid_search.best_params_, random_state=0)
)

In [118]:
# Score the model on train data
ada_boost_best = score_model(train, ada_boost_model_best)
print(f"Adaboost accuracy on train data: {ada_boost_best}")
# Evaluate the model predictions
print(evaluate_model(train, ada_boost_model_best))

Adaboost accuracy on train data: Precision: 0.5514809590973202 Accuracy: 0.56625, Recall: 0.930952380952381, F1: 0.6926483613817538
              precision    recall  f1-score   support

       False      0.681     0.163     0.263       380
        True      0.551     0.931     0.693       420

    accuracy                          0.566       800
   macro avg      0.616     0.547     0.478       800
weighted avg      0.613     0.566     0.489       800



In [119]:
# Score the model on test data
ada_boost_best = score_model(test, ada_boost_model_best)
print(f"Adaboost accuracy on test data: {ada_boost_best}")
# Evaluate the model predictions
print(evaluate_model(test, ada_boost_model_best))

Adaboost accuracy on test data: Precision: 0.5444444444444444 Accuracy: 0.55, Recall: 0.9245283018867925, F1: 0.6853146853146853
              precision    recall  f1-score   support

       False      0.600     0.128     0.211        94
        True      0.544     0.925     0.685       106

    accuracy                          0.550       200
   macro avg      0.572     0.526     0.448       200
weighted avg      0.571     0.550     0.462       200



In [120]:
confusion_matrix(
    test["Churn"], ada_boost_model_best.predict(test.drop(columns=["Churn"]))
)

array([[12, 82],
       [ 8, 98]])

# Random Forest

In [121]:
# Fit the random forest model
random_forest_model = fit_model(
    train, RandomForestClassifier(max_depth=1, random_state=0)
)

In [122]:
# Score the model on train data
random_forest_accuracy = score_model(train, random_forest_model)
print(f"Random forest accuracy on train data: {random_forest_accuracy}")
# Evaluate the model predictions
print(evaluate_model(train, random_forest_model))

Random forest accuracy on train data: Precision: 0.5293367346938775 Accuracy: 0.5325, Recall: 0.9880952380952381, F1: 0.6893687707641196
              precision    recall  f1-score   support

       False      0.688     0.029     0.056       380
        True      0.529     0.988     0.689       420

    accuracy                          0.532       800
   macro avg      0.608     0.509     0.372       800
weighted avg      0.604     0.532     0.388       800



In [123]:
# Score the model on test data
random_forest_accuracy = score_model(test, random_forest_model)
print(f"Random forest accuracy on test data: {random_forest_accuracy}")
# Evaluate the model predictions
print(evaluate_model(test, random_forest_model))

Random forest accuracy on test data: Precision: 0.5303030303030303 Accuracy: 0.53, Recall: 0.9905660377358491, F1: 0.6907894736842105
              precision    recall  f1-score   support

       False      0.500     0.011     0.021        94
        True      0.530     0.991     0.691       106

    accuracy                          0.530       200
   macro avg      0.515     0.501     0.356       200
weighted avg      0.516     0.530     0.376       200



In [124]:
# Grid search for best parameters
param_grid = {
    "n_estimators": [175, 200, 225, 250],  # Number of trees in the forest
    "max_depth": [1, 2, 3, 4],  # Maximum depth of the tree
}
random_forest_grid_search = search_hyperparameters(
    train, RandomForestClassifier(random_state=42), param_grid
)
print(f"Best parameters for Random Forest: {random_forest_grid_search.best_params_}")

Best parameters for Random Forest: {'max_depth': 1, 'n_estimators': 250}


In [125]:
random_forest_best_model = fit_model(
    train,
    RandomForestClassifier(**random_forest_grid_search.best_params_, random_state=0),
)

In [126]:
# Score the model on train data
random_forest_accuracy_best = score_model(train, random_forest_best_model)
print(f"Random forest accuracy on train data: {random_forest_accuracy_best}")
# Evaluate the model predictions
print(evaluate_model(train, random_forest_best_model))

Random forest accuracy on train data: Precision: 0.5318877551020408 Accuracy: 0.5375, Recall: 0.9928571428571429, F1: 0.6926910299003323
              precision    recall  f1-score   support

       False      0.812     0.034     0.066       380
        True      0.532     0.993     0.693       420

    accuracy                          0.537       800
   macro avg      0.672     0.514     0.379       800
weighted avg      0.665     0.537     0.395       800



In [127]:
# Score the model on test data
random_forest_accuracy_best = score_model(test, random_forest_best_model)
print(f"Random forest accuracy on test data: {random_forest_accuracy_best}")
# Evaluate the model predictions
print(evaluate_model(test, random_forest_best_model))

Random forest accuracy on test data: Precision: 0.5282051282051282 Accuracy: 0.525, Recall: 0.9716981132075472, F1: 0.6843853820598007
              precision    recall  f1-score   support

       False      0.400     0.021     0.040        94
        True      0.528     0.972     0.684       106

    accuracy                          0.525       200
   macro avg      0.464     0.496     0.362       200
weighted avg      0.468     0.525     0.382       200



In [128]:
confusion_matrix(
    test["Churn"], random_forest_best_model.predict(test.drop(columns=["Churn"]))
)

array([[  2,  92],
       [  3, 103]])