In [31]:
!pip install seaborn matplotlib scikit-learn xgboost joblib

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib
import os

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
# Load dataset
df = pd.read_csv("data/Grocery_Customer_Churn_Data.csv")

In [33]:
# 1. Data Preprocessing
# Handle negative values
df['days_since_last_purchase'] = df['days_since_last_purchase'].abs()  # Fix negative days
df['is_negative_sales'] = df['total_sales'] < 0  # Flag negative sales
df['total_sales'] = df['total_sales'].abs()  # Convert to positive


In [34]:
# Handle missing values
df['avg_purchase_value'].fillna(df['avg_purchase_value'].median(), inplace=True)
df['promotion_type'].fillna('None', inplace=True)
df['purchase_frequency'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_purchase_value'].fillna(df['avg_purchase_value'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['promotion_type'].fillna('None', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate 

In [35]:
# Drop irrelevant columns
df = df.drop(['customer_id', 'transaction_id', 'transaction_date', 'last_purchase_date'], axis=1)

In [36]:
# Encode categorical variables
categorical_cols = ['gender', 'income_bracket', 'marital_status', 'education_level', 
                    'occupation', 'product_category', 'purchase_frequency', 'promotion_type']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [37]:
# Scale numerical features
numerical_cols = ['age', 'membership_years', 'quantity', 'unit_price', 'avg_purchase_value', 
                  'total_sales', 'total_transactions', 'total_items_purchased', 
                  'avg_discount_used', 'online_purchases', 'in_store_purchases', 
                  'days_since_last_purchase']
scaler = StandardScaler()
df_encoded[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

In [38]:
# Save scaler
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [39]:
# Save cleaned dataset
df_encoded.to_csv("cleaned_grocery_churn_data.csv", index=False)
print("Cleaned data saved as 'cleaned_grocery_churn_data.csv'")

Cleaned data saved as 'cleaned_grocery_churn_data.csv'


In [40]:
# 2. EDA
# Churn distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='churn', data=df)
plt.title('Churn Distribution')
plt.savefig('churn_distribution.png')
plt.close()

In [41]:
# Feature importance (preliminary correlation)
plt.figure(figsize=(15, 15))
sns.heatmap(df[numerical_cols + ['churn']].corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.savefig('correlation_matrix.png')
plt.close()

In [51]:
# 3. Model Training with XGBoost


X = df_encoded.drop('churn', axis=1)
y = df_encoded['churn']
X = pd.get_dummies(X, drop_first=True)

In [52]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(X.dtypes)


age                                    float64
membership_years                       float64
number_of_children                       int64
quantity                               float64
unit_price                             float64
avg_purchase_value                     float64
avg_discount_used                      float64
online_purchases                       float64
in_store_purchases                     float64
total_sales                            float64
total_transactions                     float64
total_items_purchased                  float64
days_since_last_purchase               float64
is_negative_sales                         bool
gender_Male                               bool
gender_Other                              bool
income_bracket_Low                        bool
income_bracket_Medium                     bool
marital_status_Married                    bool
marital_status_Single                     bool
education_level_High School               bool
education_lev

In [53]:
# Define and tune XGBoost
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss', enable_categorical=True)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0]
}
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [54]:
# Best model
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score (CV):", grid_search.best_score_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
Best ROC-AUC score (CV): 0.8317516527294952


In [55]:
# Evaluate
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Test ROC-AUC:", roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]))


Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.93      0.84      5038
           1       0.68      0.37      0.48      2131

    accuracy                           0.76      7169
   macro avg       0.73      0.65      0.66      7169
weighted avg       0.75      0.76      0.73      7169

Test ROC-AUC: 0.8287446192605834


In [56]:
# Feature importance
importances = pd.DataFrame({'feature': X.columns, 'importance': best_model.feature_importances_})
importances = importances.sort_values('importance', ascending=False)
print("\nTop 10 Feature Importances:")
print(importances.head(10))


Top 10 Feature Importances:
                        feature  importance
33    purchase_frequency_Weekly    0.197426
34    purchase_frequency_Yearly    0.175424
31   purchase_frequency_Monthly    0.164209
5            avg_purchase_value    0.061255
32   purchase_frequency_Unknown    0.059652
21     education_level_Master's    0.016711
29  product_category_Home Goods    0.013706
25           occupation_Student    0.013454
18       marital_status_Married    0.012767
26        occupation_Unemployed    0.012587


In [57]:
# Visualize feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=importances.head(10))
plt.title('Top 10 Feature Importances')
plt.savefig('feature_importance.png')
plt.close()

In [58]:
# Save model
joblib.dump(best_model, 'churn_model.pkl')
print("Model saved as 'churn_model.pkl'")

Model saved as 'churn_model.pkl'
