In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.metrics import f1_score, roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay

In [3]:
# Loading data
customers = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/customers_final.csv')
transactions = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/transactions_final.csv')
engagement = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/engagements_final.csv')
marketing = pd.read_csv('https://raw.githubusercontent.com/delinai/schulich_ds1_2024/main/Datasets/marketing_final.csv')

# Initial exploration
print(customers.head())
print(transactions.head())
print(engagement.head())
print(marketing.head())

   customer_id   join_date last_purchase_date   age  gender           location
0            1  2023-11-20         2024-03-17  56.0  Female  North Shannonbury
1            2  2021-09-08         2023-10-25   NaN    Male          Hillville
2            3  2021-06-01         2022-11-27   NaN     NaN   North Latoyatown
3            4  2022-01-01         2022-09-01  29.0    Male          Grossstad
4            5  2022-01-24         2023-06-02   NaN    Male   East Matthewfort
   transaction_id  customer_id transaction_date  transaction_amount  \
0               1            1       2024-02-03              165.56   
1               2            1       2024-03-02              699.01   
2               3            1       2024-03-12              146.86   
3               4            1       2024-01-20              927.46   
4               5            1       2024-02-25             1395.87   

  product_category  
0         Clothing  
1       Home Goods  
2       Home Goods  
3      Electron

In [12]:
# DATA CLEANING AND PREPARATION

# Data Preparation Approach
# Handling missing values
customers['age'].fillna(customers['age'].median(), inplace=True)
customers['gender'].fillna('Unknown', inplace=True)

# Converting dates to DateTime
customers['join_date'] = pd.to_datetime(customers['join_date'])
customers['last_purchase_date'] = pd.to_datetime(customers['last_purchase_date'])
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date'])
marketing['campaign_date'] = pd.to_datetime(marketing['campaign_date'])

# Imputing missing values
imputer = KNNImputer(n_neighbors=5)
customers[['age']] = imputer.fit_transform(customers[['age']])

# Calculating Months from Join Date to Last Purchase Date
customers['months_active'] = (customers['last_purchase_date'] - customers['join_date']).dt.days // 30
customers['months_active'].replace(0, 1, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['age'].fillna(customers['age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['gender'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whic

In [13]:
# FEATURE ENGINEERING


monthly_revenue = transactions.groupby('customer_id')['transaction_amount'].sum() / customers['months_active']
customers['CLV'] = monthly_revenue

# RFM analysis
customers['recency'] = (pd.Timestamp.now() - customers['last_purchase_date']).dt.days
frequency = transactions.groupby('customer_id').size()
customers['frequency'] = customers['customer_id'].map(frequency)
monetary = transactions.groupby('customer_id')['transaction_amount'].sum()
customers['monetary'] = customers['customer_id'].map(monetary)

# Merging with Engagement Data
customers = customers.merge(engagement, on='customer_id', how='left')
customers.fillna(0, inplace=True)

# Positive Engagement Response
customers['click_per_transaction'] = customers['number_of_clicks_y'] / customers['frequency'].replace(0, 1)
customers['visit_per_transaction'] = customers['number_of_site_visits_y'] / customers['frequency'].replace(0, 1)
customers['email_per_transaction'] = customers['number_of_emails_opened_y'] / customers['frequency'].replace(0, 1)

# Calculating response rate
positive_responses = marketing[marketing['response'] == 'Yes'].groupby('customer_id').size()
total_responses = marketing.groupby('customer_id').size()
customers['response_rate'] = customers['customer_id'].map(positive_responses) / customers['customer_id'].map(total_responses)
customers['response_rate'].fillna(0, inplace=True)

# Verify that all required columns are present
print(customers.head())


   customer_id  join_date last_purchase_date   age   gender  \
0            1 2023-11-20         2024-03-17  56.0   Female   
1            2 2021-09-08         2023-10-25  44.0     Male   
2            3 2021-06-01         2022-11-27  44.0  Unknown   
3            4 2022-01-01         2022-09-01  29.0     Male   
4            5 2022-01-24         2023-06-02  44.0     Male   

            location  months_active         CLV  recency  frequency  ...  \
0  North Shannonbury              3    0.000000      119          6  ...   
1          Hillville             25  140.379200      263          9  ...   
2   North Latoyatown             18  337.851111      595          6  ...   
3          Grossstad              8  181.858750      682         20  ...   
4   East Matthewfort             16  492.167500      408         24  ...   

   avg_transaction_amount  total_transactions  avg_visit_per_transaction  \
0                0.000000                 0.0                   0.000000   
1           

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customers['response_rate'].fillna(0, inplace=True)


In [14]:
# MODEL BUILDING AND EVALUATION



# Splitting data into features and target variable
X = customers[['age', 'gender', 'recency', 'frequency', 'response_rate', 'click_per_transaction']]
y = customers['CLV']

# One-hot encode the categorical variables
X = pd.get_dummies(X, columns=['gender'], drop_first=True)

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)

# Random Forest Regressor Model with Hyperparameter Tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_
rf_preds = best_rf_model.predict(X_test)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [15]:
# Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Linear Regression Evaluation
lr_mse = mean_squared_error(y_test, lr_preds)
lr_r2 = r2_score(y_test, lr_preds)
print(f'Linear Regression MSE: {lr_mse}, R²: {lr_r2}')

# Random Forest Evaluation
rf_mse = mean_squared_error(y_test, rf_preds)
rf_r2 = r2_score(y_test, rf_preds)
print(f'Random Forest MSE: {rf_mse}, R²: {rf_r2}')

Linear Regression MSE: 10392378.446592022, R²: 0.04720108714689375
Random Forest MSE: 7080322.433566194, R²: 0.3508585592777764
