In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

sys.path.append('../src')  # add src/ to kernel path
from test_results import score, test_results

In [2]:
# Read the training and test CSV files from ../data into DataFrames,
# then preview the first five rows of the training frame as the cell output.
train_data = pd.read_csv(filepath_or_buffer="../data/training.csv")
test_data = pd.read_csv(filepath_or_buffer="../data/Test.csv")
train_data.head(n=5)

Unnamed: 0,ID,Promotion,purchase,V1,V2,V3,V4,V5,V6,V7
0,1,No,0,2,30.443518,-1.165083,1,1,3,2
1,3,No,0,3,32.15935,-0.645617,2,3,2,2
2,4,No,0,2,30.431659,0.133583,1,1,4,2
3,5,No,0,0,26.588914,-0.212728,2,1,4,2
4,8,Yes,0,3,28.044331,-0.385883,1,1,2,2


In [3]:
# Split the training rows by the Promotion column:
#   g1 (Group A) -> rows where Promotion is "No"
#   g2 (Group B) -> rows where Promotion is "Yes"
g1 = train_data.loc[train_data["Promotion"].eq("No")]
g2 = train_data.loc[train_data["Promotion"].eq("Yes")]

In [4]:
# Build the modelling data from group 2 only: the seven V* columns are the
# features and the "purchase" column is the target. Hold out 20% of the rows
# for testing, with a fixed seed so the split is reproducible.
X = g2.loc[:, ["V1", "V2", "V3", "V4", "V5", "V6", "V7"]]
y = g2.loc[:, "purchase"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [5]:
# Fit three baseline classifiers on the training split. The data itself is not
# resampled; class imbalance is handled by weighting instead:
#   - the scikit-learn models use class_weight="balanced"
#   - XGBoost uses scale_pos_weight = (# label-0 rows) / (# label-1 rows)
rf = RandomForestClassifier(class_weight="balanced", random_state=42).fit(X_train, y_train)
lr = LogisticRegression(class_weight="balanced", random_state=42).fit(X_train, y_train)

# Plain Python ints keep the ratio a built-in float.
scale_pos_weight = int((y_train == 0).sum()) / int((y_train == 1).sum())
xgb_clf = XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
xgb_clf.fit(X_train, y_train)

In [6]:
# Accumulator for the baseline scores: one independent, initially empty list
# per column ("Model", "IRR", "NIR"); one entry is appended per evaluated model.
results = {column: [] for column in ("Model", "IRR", "NIR")}

In [7]:
def make_promotion_strategy(model):
    """Build a promotion strategy backed by one fitted classifier.

    The original cell defined ``promotion_strategy`` three times, differing
    only in which model was called; this factory removes the duplication.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict(df)``. A predicted label of 1 is
        treated as "send the promotion".

    Returns
    -------
    callable
        ``promotion_strategy(df)`` returning a NumPy array with "Yes" where
        the model predicts 1 and "No" everywhere else.
    """
    def promotion_strategy(df):
        # df is whatever test_results passes in (presumably the V1-V7
        # feature columns -- confirm against ../src/test_results.py).
        y_pred = model.predict(df)

        # use numpy.where to replace 1 with "Yes", else with "No"
        return np.where(y_pred == 1, "Yes", "No")

    return promotion_strategy


# Baselines to score, in the order they are reported.
baseline_models = [
    ("Random Forest Classifier", rf),
    ("Logistic Regression", lr),
    ("XGBoost Classifier", xgb_clf),
]

for model_name, model in baseline_models:
    # Rebinding promotion_strategy / irr / nir on each pass leaves them
    # pointing at the last (XGBoost) baseline, exactly as before.
    promotion_strategy = make_promotion_strategy(model)
    irr, nir = test_results(promotion_strategy)
    results["Model"].append(model_name)
    results["IRR"].append(irr)
    results["NIR"].append(nir)

Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0000.

Your nir with this strategy is -2.40.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?
Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0151.

Your nir with this strategy is 18.40.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?
Nice job!  See how well your strategy worked on our test data below!

Your irr with this strategy is 0.0228.

Your nir with this strategy is 137.65.
We came up with a model with an irr of 0.0188 and an nir of 189.45 on the test set.

 How did you do?


In [8]:
# Turn the accumulated baseline scores into a table (one row per model,
# columns taken from the dict keys) and show it as the cell output.
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,Model,IRR,NIR
0,Random Forest Classifier,0.0,-2.4
1,Logistic Regression,0.01514,18.4
2,XGBoost Classifier,0.022779,137.65


In [9]:
# save model baseline results as a CSV file
# to_csv does not create missing parent directories, so make sure the report
# folder exists first; otherwise a fresh checkout fails here with OSError.
baseline_csv = Path('../reports/baselines/model_baseline.csv')
baseline_csv.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(baseline_csv, index=False)