In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [2]:
retail_df = pd.read_csv('../data/retail_data.csv')
df_copy = retail_df.copy()
df_copy.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [3]:
# feature engineering for the target variable

df_copy['InvoiceDate'] = pd.to_datetime(df_copy['InvoiceDate'])  # convert to datetime type

df_copy['PurchaseNextMonth'] = df_copy.groupby('CustomerID')['InvoiceDate'].shift(-1)
df_copy['PurchaseNextMonth'] = (df_copy['PurchaseNextMonth'].dt.month == (df_copy['InvoiceDate'].dt.month + 1)).astype(int)

df_copy['TotalAmount'] = df_copy['Quantity'] * df_copy['UnitPrice']
df_copy['TotalSpending'] = df_copy.groupby('CustomerID')['TotalAmount'].transform('sum')
df_copy['Frequency'] = df_copy.groupby('CustomerID')['InvoiceNo'].transform('nunique')

X = df_copy[['TotalSpending', 'Frequency']]
y = df_copy['PurchaseNextMonth']

In [4]:
# time-series split
train_size = int(len(df_copy) * 0.8)
train, test = df_copy.iloc[:train_size], df_copy.iloc[train_size:]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# impute missing values using mean imputation
imputer = SimpleImputer(strategy='mean')

X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)

In [6]:
# using random forest classifier model and training
model = RandomForestClassifier(random_state=42)
model.fit(X_train_imputed, y_train)

In [7]:
# evaluation of random forest classifier model
predictions = model.predict(X_val_imputed)
print("Accuracy:", accuracy_score(y_val, predictions))
print("Classification Report:\n", classification_report(y_val, predictions))

Accuracy: 0.9912070269971028
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    107432
           1       0.00      0.00      0.00       950

    accuracy                           0.99    108382
   macro avg       0.50      0.50      0.50    108382
weighted avg       0.98      0.99      0.99    108382



In [8]:
# test set predictions
X_test = test[['TotalSpending', 'Frequency']]
y_test = test['PurchaseNextMonth']

X_test_imputed = imputer.transform(X_test)

test_predictions = model.predict(X_test_imputed)

In [9]:
accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy on the test set:", accuracy)

Accuracy on the test set: 0.9959679651602664


In [10]:
# count of predicted purchases (1) and non-purchases (0)
num_purchases = sum(test_predictions)
num_non_purchases = len(test_predictions) - num_purchases

print("Number of customers predicted to purchase:", num_purchases)
print("Number of customers predicted not to purchase:", num_non_purchases)


Number of customers predicted to purchase: 2
Number of customers predicted not to purchase: 108380


### Only two customers are predicted to purchase again next month
#### Doesn't 'feel' right, Could be an issue with the model or data

In [16]:
# Random Forest algorithm could be overly complex

In [19]:
# Logistic regression model;
# Accuracy of the logistic regression model on the validation and test set are high;
# Precision, recall, and F1-score for the positive class are all zero,
# indicating that the model did not correctly predict any positive instances.

logistic_model = LogisticRegression(random_state=42)

logistic_model.fit(X_train_imputed, y_train)

# reports
logistic_predictions = logistic_model.predict(X_val_imputed)
logistic_accuracy = accuracy_score(y_val, logistic_predictions)
print("Accuracy of Logistic Regression model:", logistic_accuracy)
print("Classification Report:\n", classification_report(y_val, logistic_predictions))

# predictions and accuracy
logistic_test_predictions = logistic_model.predict(X_test_imputed)

logistic_accuracy_test = accuracy_score(y_test, logistic_test_predictions)
print("Accuracy on the test set using Logistic Regression:", logistic_accuracy_test)


Accuracy of Logistic Regression model: 0.9912347068701445
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00    107432
           1       0.00      0.00      0.00       950

    accuracy                           0.99    108382
   macro avg       0.50      0.50      0.50    108382
weighted avg       0.98      0.99      0.99    108382

Accuracy on the test set using Logistic Regression: 0.9959864184089608


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Define class weights to handle imbalanced data
class_weights = {0: 1, 1: 10}  # Adjust the weights as per your dataset's imbalance

# Create a logistic regression model with class weights
logistic_model_balanced = LogisticRegression(class_weight=class_weights, random_state=42)

# Train the logistic regression model
logistic_model_balanced.fit(X_train_imputed, y_train)

# Evaluate the logistic regression model
logistic_predictions_balanced = logistic_model_balanced.predict(X_val_imputed)
precision = precision_score(y_val, logistic_predictions_balanced)
recall = recall_score(y_val, logistic_predictions_balanced)
f1 = f1_score(y_val, logistic_predictions_balanced)
roc_auc = roc_auc_score(y_val, logistic_predictions_balanced)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print("ROC AUC Score:", roc_auc)

Precision: 0.0
Recall: 0.0
F1-score: 0.0
ROC AUC Score: 0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Create a random forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20],        # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],    # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]       # Minimum number of samples required to be at a leaf node
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_imputed, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the best model on the validation set
best_rf_model = grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_val_imputed)
precision_rf = precision_score(y_val, rf_predictions)
recall_rf = recall_score(y_val, rf_predictions)
f1_rf = f1_score(y_val, rf_predictions)
roc_auc_rf = roc_auc_score(y_val, rf_predictions)
print("Precision (Random Forest):", precision_rf)
print("Recall (Random Forest):", recall_rf)
print("F1-score (Random Forest):", f1_rf)
print("ROC AUC Score (Random Forest):", roc_auc_rf)