In [13]:
#  import packages

import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from pygam import LogisticGAM, s, f
import xgboost as xgb


from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the balanced dataset from its project-relative CSV file
dataset_path = '../Project_datasets/balanced_dataset.csv'
data = pd.read_csv(dataset_path)

In [3]:
# Preview the first two rows of the loaded frame
data.head(2)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,user_id,product_id,total_cost_pennies,source,total_cost_pre_discount_pennies,discount%,campaign_id,sent_at,...,avg_email_sent,avg_days_between_purchases,prev_discount,no_discount,row_num,max_purchase_discount,same_day,weekend,bought_before,avg_item_discount
0,0,333834,233231,210,2600,0,2600,0.0,0,2017-02-28,...,0.0,0.0,0,0,235,0.0,0,0,50,6.181228
1,1,259057,853216,110,3000,0,3000,0.0,0,2018-04-19,...,0.0,0.0,0,0,3,0.0,0,0,2,5.877251


In [4]:
# Print the column names, non-null counts and dtypes of the frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45300 entries, 0 to 45299
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Unnamed: 0                       45300 non-null  int64  
 1   Unnamed: 0.1                     45300 non-null  int64  
 2   user_id                          45300 non-null  int64  
 3   product_id                       45300 non-null  int64  
 4   total_cost_pennies               45300 non-null  int64  
 5   source                           45300 non-null  int64  
 6   total_cost_pre_discount_pennies  45300 non-null  int64  
 7   discount%                        45300 non-null  float64
 8   campaign_id                      45300 non-null  int64  
 9   sent_at                          45300 non-null  object 
 10  name_campaigns                   45300 non-null  int64  
 11  opened_email                     45300 non-null  int64  
 12  purchase          

In [5]:
# Shuffle every row (frac=1) with a fixed seed, then renumber the index from 0.
# Note: this rebinds `data`, so running the cell a second time shuffles the
# already-shuffled frame and produces a different row order.
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Build the model inputs: the target is `purchase`; the features are every
# remaining column except the ones listed in `excluded_columns`.
target_column = 'purchase'
excluded_columns = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'sent_at',
    'user_id',
    'total_cost_pre_discount_pennies',
    target_column,
]

y = data[target_column]
X = data.drop(columns=excluded_columns)


In [7]:
# Display the first 100 rows of the feature matrix
X.head(100)

Unnamed: 0,product_id,total_cost_pennies,source,discount%,campaign_id,name_campaigns,opened_email,last_email,last_pos_email,avg_email_sent,avg_days_between_purchases,prev_discount,no_discount,row_num,max_purchase_discount,same_day,weekend,bought_before,avg_item_discount
0,107,1380,0,40.0,0,0,1,0,3,2.000000,0.000000,0,0,3,40.0,0,1,3,7.475338
1,2055,3900,0,0.0,0,0,0,0,3,13.750000,28.000000,0,0,59,10.0,0,0,18,0.000000
2,1888,2550,0,15.0,0,0,0,0,48,23.000000,45.111111,0,9,240,0.0,0,0,50,7.395038
3,1339,3100,0,0.0,0,0,0,0,156,0.000000,0.000000,0,0,155,0.0,0,0,7,15.674044
4,2602,3400,0,0.0,0,0,1,1,9,1.142857,0.333333,0,7,15,0.0,1,0,12,7.409584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,331,2400,0,0.0,0,0,0,0,590,94.875000,4.571429,0,5,767,0.0,0,0,133,0.000000
96,1795,2975,0,15.0,0,0,0,0,32,19.888889,33.875000,0,2,188,15.0,0,0,24,5.748503
97,709,2970,0,10.0,0,0,0,0,0,17.000000,105.000000,1,4,90,0.0,0,0,23,2.659574
98,1680,4800,0,0.0,0,0,0,0,106,0.000000,0.000000,0,0,105,0.0,0,0,105,0.000000


In [8]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The ratio is set explicitly through test_size (it is not the library
# default); the fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
#  Random Forest Classifier

# Configure the forest; the hyperparameters are spelled out explicitly so
# they are easy to locate and tune, and the seed is fixed for repeatability.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1
)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the held-out set and score it. Results are stored under
# model-specific names (like the *_gam names used in the GAM cell) so that
# another model's cell cannot overwrite them.
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Keep the generic names bound for any cell that still reads them.
y_pred = y_pred_rf
accuracy = accuracy_rf

print(f"Random Forest Accuracy: {accuracy_rf:.4f}")

# Detailed performance metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 0.9269

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6806
           1       0.94      0.91      0.93      6784

    accuracy                           0.93     13590
   macro avg       0.93      0.93      0.93     13590
weighted avg       0.93      0.93      0.93     13590


Confusion Matrix:
[[6408  398]
 [ 595 6189]]


In [14]:
# Alias the train/test feature frames for the GAM experiment
# (plain name bindings -- nothing is re-read or copied here)
X_train_gam = X_train
X_test_gam = X_test

# Hand the model array inputs rather than pandas objects
X_train_array = X_train_gam.values
X_test_array = X_test_gam.values
# Use y_train as-is when it exposes no `.values` attribute
y_train_array = getattr(y_train, 'values', y_train)

# Logistic GAM with its default configuration, fitted on the training arrays
gam_model = LogisticGAM()
gam_model.fit(X_train_array, y_train_array)

# Class predictions and predicted probabilities for the test set
y_pred_gam = gam_model.predict(X_test_array)
y_pred_proba_gam = gam_model.predict_proba(X_test_array)

# Score the class predictions against the held-out labels
accuracy_gam = accuracy_score(y_test, y_pred_gam)

print("\nGAM Model Results:")
print(f"Accuracy: {accuracy_gam:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_gam))


  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2



GAM Model Results:
Accuracy: 0.8662

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6806
           1       0.89      0.83      0.86      6784

    accuracy                           0.87     13590
   macro avg       0.87      0.87      0.87     13590
weighted avg       0.87      0.87      0.87     13590



In [15]:
# Alias the train/test feature frames for the boosting experiment
# (plain name bindings -- nothing is re-read or copied here)
X_train_boost = X_train
X_test_boost = X_test


# Create XGBoost classifier with a fixed seed
model = xgb.XGBClassifier(random_state=42)

# Train on the aliases declared above so they are actually used
model.fit(X_train_boost, y_train)

# Predict on the held-out set and score it. Results are stored under
# model-specific names (like the *_gam names used in the GAM cell) so the
# models' outputs stay distinguishable.
y_pred_xgb = model.predict(X_test_boost)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

# Keep the generic names bound for any cell that still reads them.
y_pred = y_pred_xgb
accuracy = accuracy_xgb

print(f"Accuracy: {accuracy_xgb:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

Accuracy: 0.9358

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      6806
           1       0.94      0.93      0.94      6784

    accuracy                           0.94     13590
   macro avg       0.94      0.94      0.94     13590
weighted avg       0.94      0.94      0.94     13590


Confusion Matrix:
[[6386  420]
 [ 452 6332]]
