In [1]:
import pandas as pd
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    RandomForestClassifier,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from metrics import classification_summary
from pandas_util import normalize_columns

## Bagging and Boosting classification trees (personal loan data)

In [2]:
# Load the Universal Bank customer data.
bank_df = pd.read_csv("./datasets/dmba/UniversalBank.csv")
# NOTE(review): normalize_columns is a project helper whose return value is
# not used here, so it presumably rewrites the column names in place — confirm.
normalize_columns(bank_df)
# Remove the two columns that are not used as predictors. Rebinding the name
# (rather than inplace=True) keeps this an explicit, chainable transformation.
bank_df = bank_df.drop(columns=["id", "zip_code"])

# Predictors are every remaining column except the target.
X = bank_df.drop(columns=["personal_loan"])
y = bank_df["personal_loan"]
# Hold out 40% of the rows for validation; the seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.4, random_state=3
)

In [3]:
# Baseline: one decision tree fitted on the training partition.
# fit() returns the estimator itself, so construction and fitting are chained.
single_tree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Report performance on the held-out validation partition.
classification_summary(
    y_valid,
    single_tree.predict(X_valid),
)

Accuracy: 0.9825

Confusion matrix:
[[1778   15]
 [  20  187]]

Classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1793
           1       0.93      0.90      0.91       207

    accuracy                           0.98      2000
   macro avg       0.96      0.95      0.95      2000
weighted avg       0.98      0.98      0.98      2000



In [4]:
# Bagging: an ensemble of 100 decision trees, seeded for repeatable results.
bagging = BaggingClassifier(
    DecisionTreeClassifier(random_state=1),
    random_state=1,
    n_estimators=100,
).fit(X_train, y_train)
# Report performance on the held-out validation partition.
classification_summary(
    y_valid,
    bagging.predict(X_valid),
)

Accuracy: 0.9855

Confusion matrix:
[[1781   12]
 [  17  190]]

Classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1793
           1       0.94      0.92      0.93       207

    accuracy                           0.99      2000
   macro avg       0.97      0.96      0.96      2000
weighted avg       0.99      0.99      0.99      2000



In [5]:
# Boosting: AdaBoost over 100 decision trees, seeded for repeatable results.
boost = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=1),
    random_state=1,
    n_estimators=100,
).fit(X_train, y_train)
# Report performance on the held-out validation partition.
classification_summary(
    y_valid,
    boost.predict(X_valid),
)

Accuracy: 0.984

Confusion matrix:
[[1776   17]
 [  15  192]]

Classification report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1793
           1       0.92      0.93      0.92       207

    accuracy                           0.98      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.98      0.98      0.98      2000



# Uplift

In [7]:
# Load the voter persuasion data used for the uplift example.
voter_df = pd.read_csv(filepath_or_buffer="./datasets/dmba/Voter-Persuasion.csv")
# NOTE(review): the helper's return value is unused, so it presumably
# rewrites the column names in place — confirm.
normalize_columns(voter_df)

In [9]:
# Target column for the uplift model.
outcome = "moved_ad"
# Model inputs; message_a is the flag that gets toggled when computing uplift.
predictors = [
    "age",
    "nh_white",
    "comm_pt",
    "h_f1",
    "reg_days",
    "pr_pelig",
    "e_pelig",
    "politicalc",
    "message_a",
]
# Distinct outcome labels, in order of first appearance in the data.
classes = voter_df[outcome].unique().tolist()
classes

['N', 'Y']

In [10]:
# Split the voter data: 60% for training, 40% held out for validation.
y = voter_df[outcome]
X = voter_df[predictors]
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.4,
)

In [11]:
# Fit a 100-tree random forest on the training partition (seeded).
rf = RandomForestClassifier(
    random_state=1,
    n_estimators=100,
)
# Kept as the final expression so the fitted estimator is displayed.
rf.fit(X_train, y_train)

RandomForestClassifier(random_state=1)

In [12]:
# Calculating the uplift
# Score each validation record twice with the fitted forest: once with the
# message flag forced on and once with it forced off. The difference in the
# predicted probability of the "Y" outcome is the estimated uplift.
uplift_df = X_valid.copy()  # Work on a copy so X_valid itself is not modified
# Bracket assignment always targets the column; attribute-style assignment
# would fall back to setting a plain attribute if the column were missing.
uplift_df["message_a"] = 1
y_pred = rf.predict_proba(uplift_df)
uplift_df["message_a"] = 0
y_control = rf.predict_proba(uplift_df)

# predict_proba orders its columns like rf.classes_, so look up the "Y"
# column explicitly instead of assuming it sits at position 1.
moved_col = list(rf.classes_).index("Y")
prob_message = y_pred[:, moved_col]
prob_no_message = y_control[:, moved_col]

uplift_result_df = pd.DataFrame(
    {
        "prob_message": prob_message,
        "prob_no_message": prob_no_message,
        "uplift": prob_message - prob_no_message,
    },
    index=uplift_df.index,  # keep the validation row labels
)
uplift_result_df.head()

Unnamed: 0,prob_message,prob_no_message,uplift
9953,0.77,0.62,0.15
3850,0.39,0.39,0.0
4962,0.2,0.14,0.06
3886,0.86,0.62,0.24
5437,0.1,0.28,-0.18
