## Import Libraries

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

## Retrieve Dataset

In [None]:
# Load the raw transactions from the CSV file, decoding it as ISO-8859-1.
df = pd.read_csv('Dataset/OnlineRetail.csv', encoding='ISO-8859-1')

# df.info() prints its report itself, so it can run mid-cell. A notebook cell
# only renders the value of its final expression, so df.head() has to come
# last; placed before df.info() its result was silently discarded.
df.info()
df.head()

## Data Cleaning

In [4]:
# Columns screened for outliers with the IQR rule further down.
numeric_cols = ['Quantity', 'UnitPrice']

# Keep only rows that carry a customer ID and have strictly positive
# quantity and unit price, then parse the invoice timestamps.
df = df.dropna(subset=['CustomerID'])
has_positive_values = (df['Quantity'] > 0) & (df['UnitPrice'] > 0)
df = df[has_positive_values]
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# IQR rule: a row is an outlier when either column lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

too_low = df[numeric_cols] < (Q1 - 1.5 * IQR)
too_high = df[numeric_cols] > (Q3 + 1.5 * IQR)
df = df[~(too_low | too_high).any(axis=1)]

# Exact duplicate rows are removed last, after the outlier filter.
df = df.drop_duplicates()



## RFM Analysis

### Create RFM Columns

In [23]:
# Fixed snapshot date that recency is measured from (noted by the author as
# the last date in the dataset).
reference_date = datetime(2011, 12, 10)


def days_since_last_purchase(invoice_dates):
    """Return whole days between a customer's latest invoice and reference_date."""
    return (reference_date - invoice_dates.max()).days


# Line-item revenue.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# One row per customer:
#   Recency   - days since the most recent invoice
#   Frequency - number of non-null InvoiceNo rows for the customer
#               NOTE(review): this counts rows, not distinct invoices - confirm
#               that is the intended definition of frequency.
#   Monetary  - total spend
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', days_since_last_purchase),
    Frequency=('InvoiceNo', 'count'),
    Monetary=('TotalPrice', 'sum'),
)

### Boxplot of RFM Columns

In [None]:
# RFM columns inspected here; the same list is reused by the outlier step.
features = ['Recency', 'Frequency', 'Monetary']

# Draw one boxplot per metric, each on its own figure with the values laid
# out along the x-axis.
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(x=rfm[feature], ax=ax)
    ax.set_title(f"Boxplot of {feature}")
    plt.show()

### Handle Outliers

In [None]:
# Calculate the IQR for each RFM feature.
Q1 = rfm[features].quantile(0.25)
Q3 = rfm[features].quantile(0.75)
IQR = Q3 - Q1

# Values more than 1.5 * IQR outside the quartiles are treated as outliers.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Build the mask once, restricted to the feature columns. The bounds are
# indexed by `features`, so comparing only those columns keeps both sides
# aligned even if `rfm` has gained other columns (e.g. when this cell is
# re-run after cluster labels were added).
outlier_mask = (rfm[features] < lower_bound) | (rfm[features] > upper_bound)

# Report how many values fall outside the bounds for each feature.
outliers = outlier_mask.sum()
print(f"Outliers per feature:\n{outliers}")

# Drop every customer that is an outlier in at least one feature.
rfm = rfm[~outlier_mask.any(axis=1)]
print(f"Data shape after removing outliers: {rfm.shape}")


### Normalize Dataset

In [None]:
# Pick out the three RFM columns that feed the clustering step.
rfm_values = rfm.loc[:, ['Recency', 'Frequency', 'Monetary']]

# Standardize each column with StandardScaler (default settings): learn the
# scaling parameters first, then apply them to the same data.
scaler = StandardScaler()
scaler.fit(rfm_values)
rfm_scaled = scaler.transform(rfm_values)

# Preview the first 5 scaled rows.
print(rfm_scaled[:5])

## Clustering

### Find the Optimal Number of Clusters

In [None]:
# Inertia (sum of squared distances of samples to their closest centroid)
# for each candidate number of clusters.
inertia = []
k_values = range(1, 11)

for k in k_values:
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, so relying on it makes the curve depend on the installed
    # version. random_state fixes the centroid initialization.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(rfm_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve: the k where the drop in inertia flattens out is the
# candidate number of clusters.
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method to Determine Optimal k')
plt.show()


### Apply K-Means with the Chosen Number of Clusters

In [None]:
# Number of clusters chosen for the final model.
optimal_k = 4

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it (together with random_state) keeps the cluster
# assignment independent of the installed version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)

# Fit on the scaled features and store each customer's cluster label
# alongside the unscaled RFM values.
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# Preview the clusters
print(rfm.head())


### Group By Cluster & Calculate Mean RFM Values

In [None]:
# Average Recency / Frequency / Monetary within each cluster, rounded to
# two decimals for display.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
cluster_means = rfm.groupby('Cluster')[rfm_columns].mean()
cluster_summary = cluster_means.round(2)

print(cluster_summary)

### Pairplot to Visualize Clusters

In [None]:
# Pairwise plots of the three RFM metrics, coloured by assigned cluster.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
sns.pairplot(
    data=rfm,
    vars=rfm_columns,
    hue='Cluster',
    palette='viridis',
)
plt.show()

### Validate Clusters

In [None]:
# Unrounded per-cluster mean of every remaining column in rfm.
by_cluster = rfm.groupby('Cluster')
cluster_summary = by_cluster.mean()
print(cluster_summary)

### Visualize Cluster Size

In [None]:
# Number of customers assigned to each cluster, drawn as a bar chart in the
# order value_counts() returns them.
cluster_sizes = rfm['Cluster'].value_counts()
cluster_sizes.plot.bar(title='Cluster Sizes')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-cluster mean RFM values, derived from the current cluster assignment
# instead of numbers copied by hand from an earlier run; hard-coded values go
# stale as soon as the data, the outlier filter or the clustering changes.
cluster_means = (
    rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
    .mean()
    .round(2)
)

# Same structure as before: one list of per-cluster means for each metric.
cluster_summary = cluster_means.to_dict(orient='list')

# Label the rows "Cluster <id>" using the cluster ids actually present.
df_summary = pd.DataFrame(
    cluster_summary,
    index=[f"Cluster {label}" for label in cluster_means.index],
)
df_summary.plot(kind='bar', figsize=(10, 6), title='Cluster Comparison')
plt.show()