In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw customer-behavior records; the relative path is resolved
# against the kernel's current working directory.
data = pd.read_csv(filepath_or_buffer="Ecommerce_Customer_Behavior.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Task 1: Understanding Customer Behavior - Overview Statistics and Visualizations

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the key numeric columns.
key_columns = ['Total Spend', 'Items Purchased', 'Average Rating', 'Days Since Last Purchase']
summary_stats = data[key_columns].describe()

# Histogram with a KDE overlay for each metric. The two plots differ only in
# the column they show, so both are drawn from a single loop.
for metric in ['Total Spend', 'Items Purchased']:
    plt.figure(figsize=(10, 6))
    sns.histplot(data[metric], kde=True)
    plt.title(f'Distribution of {metric}')
    plt.xlabel(metric)
    plt.ylabel('Frequency')
    plt.show()

# `ace_tools` may be missing in a regular Jupyter environment. Without a guard
# the import would abort this cell and every later cell that calls `tools.`.
# Fall back to a tiny stand-in that exposes the same function name so the
# notebook survives Restart & Run All either way.
try:
    import ace_tools as tools
except ImportError:
    class _NotebookDisplayFallback:
        """Stand-in exposing the single `ace_tools` function this notebook calls."""

        @staticmethod
        def display_dataframe_to_user(name, dataframe):
            """Print `name` as a caption, then render `dataframe` with rich display."""
            print(name)
            display(dataframe)  # `display` is provided as a builtin by the IPython kernel

    tools = _NotebookDisplayFallback

# Display summary statistics to user
tools.display_dataframe_to_user(name="Summary Statistics for Key Customer Behavior Metrics", dataframe=summary_stats)

# Task 2: Data Cleaning and Exploratory Data Analysis (EDA)

In [None]:
# Count the null entries in every column of the raw frame.
missing_values = data.isna().sum()

# Discard each row that has at least one missing value. `data` itself is left
# untouched; the filtered rows live in `data_cleaned`.
data_cleaned = data.dropna(axis=0, how='any')

#Check for and handle outliers using the Interquartile Range (IQR) method for numerical columns
def remove_outliers(df, column, factor=1.5):
    """Return the rows of `df` whose `column` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``df[column]``. Both fences are
    inclusive. A NaN value never satisfies the range check, so such rows are
    dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with their original index, as an independent copy
        so callers can add columns without triggering SettingWithCopyWarning.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    spread = q3 - q1
    # Series.between is inclusive on both ends by default (same as >= and <=).
    inside_fences = df[column].between(q1 - factor * spread, q3 + factor * spread)
    return df.loc[inside_fences].copy()

# Apply the IQR filter one column at a time; each pass works on the rows that
# survived the previous pass.
columns_to_clean = ['Total Spend', 'Items Purchased', 'Days Since Last Purchase']
for col in columns_to_clean:
    data_cleaned = remove_outliers(df=data_cleaned, column=col)

# EDA box plots on the cleaned rows: Total Spend grouped by Membership Type,
# then Days Since Last Purchase grouped by Satisfaction Level.
for group_col, value_col in [('Membership Type', 'Total Spend'),
                             ('Satisfaction Level', 'Days Since Last Purchase')]:
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data_cleaned, x=group_col, y=value_col)
    plt.title(f'{value_col} by {group_col}')
    plt.xlabel(group_col)
    plt.ylabel(value_col)
    plt.show()

# Report the per-column missing-value counts that were computed on the raw frame.
tools.display_dataframe_to_user(name="Missing Values in the Dataset", dataframe=missing_values)

# Task 3: Feature Engineering

In [None]:
# Create new features to enhance the predictive model's accuracy

# Work on an explicit copy: `data_cleaned` was produced by boolean filtering,
# and adding columns to such a frame directly can raise SettingWithCopyWarning.
data_cleaned = data_cleaned.copy()

# Feature: Average Spend per Item
# NOTE(review): a row with 0 items purchased would produce inf/NaN here —
# confirm the dataset cannot contain one.
data_cleaned['Avg Spend per Item'] = data_cleaned['Total Spend'] / data_cleaned['Items Purchased']

# Feature: Recency Category based on Days Since Last Purchase
# - include_lowest=True keeps a value of exactly 0 in the first bin instead of
#   turning it into NaN (bins are right-closed, so 0 would otherwise be excluded).
# - The last edge is open-ended (np.inf) rather than the observed maximum, so the
#   edges stay strictly increasing even when no customer exceeds 45 days.
data_cleaned['Recency Category'] = pd.cut(
    data_cleaned['Days Since Last Purchase'],
    bins=[0, 15, 30, 45, np.inf],
    labels=['Very Recent', 'Recent', 'Less Recent', 'Not Recent'],
    include_lowest=True,
)

# Feature: High Spender (flag for customers who spend above the median spend)
median_spend = data_cleaned['Total Spend'].median()
data_cleaned['High Spender'] = data_cleaned['Total Spend'] > median_spend

# Visualize newly created features

# Visualization of Average Spend per Item distribution
plt.figure(figsize=(10, 6))
sns.histplot(data_cleaned['Avg Spend per Item'], kde=True)
plt.title('Distribution of Average Spend per Item')
plt.xlabel('Average Spend per Item')
plt.ylabel('Frequency')
plt.show()

# Visualization of High Spenders by Satisfaction Level
plt.figure(figsize=(10, 6))
sns.countplot(x='Satisfaction Level', hue='High Spender', data=data_cleaned)
plt.title('High Spenders by Satisfaction Level')
plt.xlabel('Satisfaction Level')
plt.ylabel('Count')
plt.legend(title='High Spender')
plt.show()

# Displaying the cleaned dataset with new features to the user
tools.display_dataframe_to_user(name="Enhanced Dataset with New Features", dataframe=data_cleaned)

# Task 4: Building and Training Machine Learning Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode categorical variables for model compatibility.
# Each fitted encoder is kept so the integer codes can be mapped back to labels.
data_encoded = data_cleaned.copy()
label_encoders = {}
for column in ['Gender', 'City', 'Membership Type', 'Satisfaction Level', 'Recency Category']:
    le = LabelEncoder()
    data_encoded[column] = le.fit_transform(data_encoded[column])
    label_encoders[column] = le

# Define the target variable and feature set
X = data_encoded[['Age', 'Total Spend', 'Items Purchased', 'Average Rating', 'Days Since Last Purchase',
                  'Avg Spend per Item', 'High Spender', 'Recency Category']]
y = data_encoded['Satisfaction Level']  # Assuming Satisfaction Level as a proxy for churn indicator

# Split the data into training and test sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Model evaluation.
# Pin the full label set explicitly: by default the metrics infer labels from
# the values present in y_test / y_pred, so a class missing from the test split
# would shrink the matrix and misalign it with the tick labels below.
class_names = [str(name) for name in label_encoders['Satisfaction Level'].classes_]
class_ids = np.arange(len(class_names))  # LabelEncoder codes are 0..n_classes-1
classification_rep = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)
confusion_mat = confusion_matrix(y_test, y_pred, labels=class_ids)

# Visualization of Confusion Matrix (rows = true class, columns = predicted class)
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()

# Displaying classification report to the user
print("Classification Report:\n", classification_rep)