In [1]:
# Import basic libraries
import pandas as pd
import numpy as np

# For visualization (optional but helpful)
import seaborn as sns
import matplotlib.pyplot as plt

# For modeling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Read the churn feature table from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer="churn_features.csv")

# Display the first five rows as a quick look at the columns
df.head(n=5)

Unnamed: 0,customer_unique_id,num_orders,total_payment,avg_payment,days_since_last_purchase,churn
0,0000366f3b9a7992bf8c76cfdf3221e2,1,141.9,141.9,115,0
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1,27.19,27.19,118,0
2,0000f46a3911fa3c0805444483337064,1,86.22,86.22,541,1
3,0000f6ccb0745a6a4b88665a16c9f078,1,43.62,43.62,325,1
4,0004aac84e0df4da2b147fca70cf8255,1,196.89,196.89,292,1


In [2]:
# Define Features (X) and Target (y) for Classification Model
## Features
## Drop the customer identifier, the target, and days_since_last_purchase so
## that only the remaining columns are used as predictors.
X = df.drop(columns=["customer_unique_id", "churn", "days_since_last_purchase"])

## The data preview lists an "Unnamed: 0" column, which looks like a row index
## that was written out with the CSV (TODO confirm). A row counter is not a
## customer attribute, so remove it when it is present; errors="ignore" makes
## this a no-op when the column does not exist.
X = X.drop(columns=["Unnamed: 0"], errors="ignore")

## Target
y = df["churn"]

In [3]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the churn ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

In [4]:
# Train a Logistic Regression Model
## Initialize the model
## LogisticRegression applies an L2 penalty by default, and that penalty depends
## on the scale of each feature. Standardizing inside a pipeline puts all
## columns on a comparable scale, and the scaler statistics are learned from the
## rows passed to fit() only. The pipeline exposes the same fit/predict methods
## as the bare estimator.
## NOTE(review): the recorded run of the unscaled model shows recall 0.00 for
## class 0. If that persists after scaling, class_weight="balanced" is worth
## trying.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

## Train it
logreg.fit(X_train, y_train)

In [5]:
# Score the held-out rows with the fitted logistic regression
## Predicted churn labels for the test set
y_pred_lr = logreg.predict(X=X_test)

In [6]:
# Report how the logistic regression did on the held-out rows
## Each entry pairs the printed heading with the metric computed under it
lr_reports = [
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
    ("Accuracy Score:", accuracy_score),
]

for heading, metric in lr_reports:
    print(heading)
    print(metric(y_test, y_pred_lr))


Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.00      0.00     11378
           1       0.60      1.00      0.75     17244

    accuracy                           0.60     28622
   macro avg       0.50      0.50      0.38     28622
weighted avg       0.52      0.60      0.45     28622

Confusion Matrix:
[[    9 11369]
 [   14 17230]]
Accuracy Score:
0.6022989308923206


In [7]:
# Second candidate: a random forest, which often does better when the patterns in the data are more complex
## Build a forest of 100 trees with a fixed seed
rf = RandomForestClassifier(random_state=42, n_estimators=100)

In [8]:
## Fit the forest on the training partition
rf.fit(X=X_train, y=y_train)

In [9]:
## Predicted churn labels for the test set: which customers the forest expects to stop buying
y_pred_rf = rf.predict(X=X_test)

In [10]:
## Report how the random forest did on the held-out rows
## Each entry pairs the printed heading with the metric computed under it
rf_reports = [
    ("Random Forest Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
    ("Accuracy Score:", accuracy_score),
]

for heading, metric in rf_reports:
    print(heading)
    print(metric(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.67      0.68     11378
           1       0.78      0.79      0.79     17244

    accuracy                           0.74     28622
   macro avg       0.73      0.73      0.73     28622
weighted avg       0.74      0.74      0.74     28622

Confusion Matrix:
[[ 7602  3776]
 [ 3544 13700]]
Accuracy Score:
0.7442526727691985


In [13]:
# Side-by-side headline metrics: Logistic Regression vs. Random Forest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Printed label -> scoring function, in the order the lines are printed
scorers = {
    "Accuracy:": accuracy_score,
    "Precision:": precision_score,
    "Recall:": recall_score,
    "F1 Score:": f1_score,
}

## One (heading, predictions) pair per model; the second heading starts with
## a newline so a blank line separates the two sections
model_runs = [
    ("📊 Logistic Regression Performance:", y_pred_lr),
    ("\n📊 Random Forest Performance:", y_pred_rf),
]

for heading, predictions in model_runs:
    print(heading)
    for label, scorer in scorers.items():
        print(label, scorer(y_test, predictions))


📊 Logistic Regression Performance:
Accuracy: 0.6022989308923206
Precision: 0.6024686177838386
Recall: 0.9991881234052424
F1 Score: 0.7516960059332942

📊 Random Forest Performance:
Accuracy: 0.7442526727691985
Precision: 0.7839322499427787
Recall: 0.7944792391556483
F1 Score: 0.7891705069124423


In [12]:
# Save the test labels and each model's predictions
## Only these three arrays are written; the fitted models themselves are not
## saved by this cell. numpy (np) is already imported in the first cell, so it
## is not imported again here.
for filename, values in [
    ("y_test.npy", y_test),
    ("y_pred_lr.npy", y_pred_lr),
    ("y_pred_rf.npy", y_pred_rf),
]:
    np.save(filename, values)