In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found beneath it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Read the Online Retail transactions into a DataFrame (the file is decoded
# as ISO-8859-1) and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/online-retail-customer-clustering/OnlineRetail.csv',
    encoding='ISO-8859-1',
)
df.head()

In [None]:
# Size of the freshly loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that contains at least one missing value, then print a
# structural summary (columns, non-null counts, dtypes) of what remains.
# `info` has to be *called*: the bare attribute only evaluates to the bound
# method, so the cell would display its repr instead of the summary.
df = df.dropna()
df.info()

In [None]:
# Relationship between the quantity purchased (Quantity) and the unit price
# (UnitPrice) of each remaining line item.
fig, ax = plt.subplots()
ax.scatter(df['Quantity'], df['UnitPrice'])
ax.set_xlabel('Quantity')
ax.set_ylabel('UnitPrice')
ax.set_title('Quantity vs UnitPrice')
plt.show()

In [None]:
# Line items with a non-positive quantity presumably represent cancelled or
# returned orders (assumption -- TODO confirm against the InvoiceNo codes), so
# only positive quantities are kept. The explicit copy gives later cells an
# independent frame to add columns to instead of a slice of the old one.
df = df[df['Quantity'] > 0].copy()

# Visualise the distributions of the numeric columns for small orders only.
# No lower bound is needed: the filter above already guarantees Quantity > 0.
df[df['Quantity'] < 25].hist(bins=50, figsize=(10, 6))
plt.show()

In [None]:
# Build per-customer RFM (Recency, Frequency, Monetary) scores.

# Reference date that recency is measured against.
current_date = pd.to_datetime('2023-06-17')

# Add the line-item level helper columns. `assign` returns a new frame, so this
# cell can be re-run safely and never writes into a slice of an earlier frame.
# NOTE(review): InvoiceDate is parsed with pandas' default format inference --
# confirm the day/month order of the raw strings.
df = (
    df.assign(InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']))
      .assign(
          # Days between the reference date and this invoice.
          Recency=lambda d: (current_date - d['InvoiceDate']).dt.days,
          # Distinct invoices of the customer, repeated on each of their rows.
          Frequency=lambda d: d.groupby('CustomerID')['InvoiceNo'].transform('nunique'),
          # Value of the line item.
          Monetary=lambda d: d['Quantity'] * d['UnitPrice'],
      )
      .reset_index(drop=True)
)

# Collapse to one row per customer:
#   Recency   -> days since the most recent invoice (smallest value),
#   Frequency -> number of distinct invoices (counted directly; summing the
#                per-row Frequency column would multiply it by the number of
#                line items),
#   Monetary  -> total spend.
rfm_scores = (
    df.groupby('CustomerID')
      .agg(
          Recency=('Recency', 'min'),
          Frequency=('InvoiceNo', 'nunique'),
          Monetary=('Monetary', 'sum'),
      )
      .reset_index()
)

rfm_scores.head()


In [None]:
# Report how many distinct customers are present in the transaction frame.
n_customers = df['CustomerID'].nunique()
print('Total Number of Cus: ', n_customers)

In [None]:
from sklearn.cluster import KMeans

import seaborn as sns

# Cluster customers on their three RFM values.
# NOTE(review): the features are used on their raw scales, so the column with
# the largest numeric range dominates the Euclidean distance -- consider
# standardising before clustering.
rfm_features = rfm_scores[['Recency', 'Frequency', 'Monetary']]

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, and leaving it implicit makes the result version-dependent.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(rfm_features)

# Store the cluster label of every customer next to its RFM scores.
rfm_scores['Cluster'] = kmeans.labels_

# Recency vs Monetary, coloured by cluster.
fig, ax = plt.subplots(figsize=(5, 3))
sns.scatterplot(
    data=rfm_scores,
    x='Recency',
    y='Monetary',
    hue='Cluster',
    palette='viridis',
    ax=ax,
)
ax.set_title('RFM Clustering')
ax.set_xlabel('Recency')
ax.set_ylabel('Monetary')
plt.show()


In [None]:
# Remove customers whose RFM values are extreme.
# The fences are computed on the three RFM columns only: identifier and
# cluster-label columns are not measurements, and a label column whose 5th and
# 95th percentiles coincide would produce a zero-width fence that discards
# every customer outside the majority cluster.
rfm_cols = ['Recency', 'Frequency', 'Monetary']
rfm_values = rfm_scores[rfm_cols]

# Despite the names, Q1/Q3 are the 5th and 95th percentiles, and IQR is the
# spread between them.
Q1 = rfm_values.quantile(0.05)
Q3 = rfm_values.quantile(0.95)
IQR = Q3 - Q1

# A customer is an outlier when any RFM value lies beyond 1.5 spreads from
# either percentile.
is_outlier = (
    (rfm_values < (Q1 - 1.5 * IQR)) | (rfm_values > (Q3 + 1.5 * IQR))
).any(axis=1)

rfm_filtered = rfm_scores[~is_outlier]


In [None]:
# Side-by-side box plots of the RFM values before and after outlier removal.
# Only the three RFM columns are drawn; identifier and cluster-label columns
# are not features and would distort the shared value axis.
rfm_cols = ['Recency', 'Frequency', 'Monetary']

fig, axes = plt.subplots(1, 2, figsize=(10, 7))

# Before outlier removal
axes[0].boxplot(rfm_scores[rfm_cols].values, labels=rfm_cols)
axes[0].set_title('Before Outlier Removal')
axes[0].set_xlabel('Features')
axes[0].set_ylabel('Values')

# After outlier removal
axes[1].boxplot(rfm_filtered[rfm_cols].values, labels=rfm_cols)
axes[1].set_title('After Outlier Removal')
axes[1].set_xlabel('Features')
axes[1].set_ylabel('Values')

plt.tight_layout()
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler

# Compare the raw RFM values with the standardised, outlier-free values.
# Only the three RFM columns are scaled and drawn; identifier and
# cluster-label columns are not features.
rfm_cols = ['Recency', 'Frequency', 'Monetary']

fig, axes = plt.subplots(1, 2, figsize=(10, 7))

# Before outlier removal (raw values)
axes[0].boxplot(rfm_scores[rfm_cols].values, labels=rfm_cols)
axes[0].set_title('Before Outlier Removal')
axes[0].set_xlabel('Features')
axes[0].set_ylabel('Values')

# After outlier removal (each column standardised to zero mean, unit variance)
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_filtered[rfm_cols])
axes[1].boxplot(rfm_scaled, labels=rfm_cols)
axes[1].set_title('After Outlier Removal')
axes[1].set_xlabel('Features')
axes[1].set_ylabel('Scaled Values')

plt.tight_layout()
plt.show()


In [None]:
# Elbow method: fit K-Means for K = 1..10 and plot the inertia
# (within-cluster sum of squares) against K.
rfm_features = rfm_scores[['Recency', 'Frequency', 'Monetary']]

inertia = []
k_values = range(1, 11)  # Try K values from 1 to 10

for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would make the curve version-dependent.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_features)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve; the ticks follow k_values so the axis stays in sync
# with the range that was actually evaluated.
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.title('Elbow Curve')
plt.xticks(k_values)
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Silhouette analysis: fit K-Means for K = 2..10 and plot the mean silhouette
# score of each fit (the score is undefined for a single cluster, hence K >= 2).
rfm_features = rfm_scores[['Recency', 'Frequency', 'Monetary']]

silhouette_scores = []
k_values = range(2, 11)  # Try K values from 2 to 10

for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would make the scores version-dependent.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_features)
    labels = kmeans.labels_
    silhouette_scores.append(silhouette_score(rfm_features, labels))

# Plot the silhouette scores; the ticks follow k_values so the axis stays in
# sync with the range that was actually evaluated.
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')
plt.xticks(k_values)
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import silhouette_samples
import matplotlib.cm as cm

# Per-sample silhouette plots for K = 2..10, followed by a summary curve of
# the mean silhouette score per K.
rfm_features = rfm_scores[['Recency', 'Frequency', 'Monetary']]
n_samples = len(rfm_features)
band_gap = 10  # blank rows left between the bands of consecutive clusters

silhouette_scores = []
k_values = range(2, 11)  # Try K values from 2 to 10

for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases, which would make the scores version-dependent.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(rfm_features)
    labels = kmeans.labels_
    silhouette_avg = silhouette_score(rfm_features, labels)
    silhouette_scores.append(silhouette_avg)
    sample_silhouette_values = silhouette_samples(rfm_features, labels)

    plt.figure(figsize=(8, 6))
    # Vertical reference at 0 separating negative from positive coefficients.
    # (Plotting the points (0, 0) and (1, n) would draw a diagonal instead.)
    plt.axvline(x=0, color='green', linestyle='--')
    y_lower = band_gap

    for i in range(k):
        # Sorted coefficients of cluster i, drawn as one horizontal band.
        ith_cluster_silhouette_values = np.sort(sample_silhouette_values[labels == i])
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)
        plt.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            alpha=0.7,
        )
        # Label the band with its cluster number at mid-height.
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + band_gap

    plt.title(f'Silhouette Analysis (K = {k})')
    plt.xlabel('Silhouette Coefficient Values')
    plt.ylabel('Cluster Labels')
    plt.yticks([])
    plt.xlim(-0.1, 1)
    plt.ylim(0, n_samples + (k + 1) * band_gap)
    # Mean silhouette score for this K.
    plt.axvline(x=silhouette_avg, color='red', linestyle='--')
    plt.show()

# Plot the mean silhouette score for every K that was evaluated.
plt.figure(figsize=(10, 6))
plt.plot(k_values, silhouette_scores, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Analysis')
plt.xticks(k_values)
plt.grid(True)
plt.show()


In [None]:
# Fit the final K-Means model explicitly instead of reusing whatever `kmeans`
# object the last executed cell left behind (after the K-search loops that is
# the K=10 model). K=4 matches the cluster count used for the first K-Means
# fit and for the hierarchical clustering in this notebook.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
kmeans.fit(rfm_scores[['Recency', 'Frequency', 'Monetary']])

rfm_scores['Cluster_id (K-means)'] = kmeans.labels_

rfm_scores.head()

In [None]:
# Box plots of each RFM value grouped by K-Means cluster id, one panel per value.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))

# Named `rfm_columns` rather than `list` so the builtin `list` is not shadowed
# for the rest of the kernel session.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
for panel, column in zip(ax, rfm_columns):
    sns.boxplot(ax=panel, x='Cluster_id (K-means)', y=column, data=rfm_scores)

plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Ward-linkage hierarchical clustering of the customers on their RFM values,
# shown as a dendrogram.
rfm_features = rfm_scores.loc[:, ['Recency', 'Frequency', 'Monetary']]

# Linkage matrix describing the successive merges.
linked = linkage(rfm_features, method='ward')

plt.figure(figsize=(10, 6))
dendrogram_options = {
    'orientation': 'top',
    'distance_sort': 'descending',
    'show_leaf_counts': True,
}
dendrogram(linked, **dendrogram_options)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample Index')
plt.ylabel('Distance')
plt.show()


In [None]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

# Agglomerative (Ward) clustering of the customers into four groups, with a
# dendrogram and two scatter views of the resulting clusters.
rfm_features = rfm_scores[['Recency', 'Frequency', 'Monetary']]

# `affinity` is intentionally not passed: Euclidean distance is the default
# and the only metric Ward linkage supports, and the keyword is not accepted
# by newer scikit-learn releases.
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
hc.fit(rfm_features)
cluster_labels = hc.labels_

# Plot the dendrogram. The returned dict is bound to `dendro` so it does not
# overwrite a `dendrogram` function imported elsewhere in the notebook.
plt.figure(figsize=(12, 6))
dendro = sch.dendrogram(sch.linkage(rfm_features, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean Distances')
plt.show()

# Visualise the clusters: Monetary against Recency and against Frequency.
fig, axes = plt.subplots(1, 2, figsize=(10, 8))

sns.scatterplot(
    data=rfm_scores, x='Recency', y='Monetary',
    hue=cluster_labels, palette='viridis', ax=axes[0],
)
axes[0].set_title('Hierarchical Clustering')
axes[0].set_xlabel('Recency')
axes[0].set_ylabel('Monetary')

sns.scatterplot(
    data=rfm_scores, x='Frequency', y='Monetary',
    hue=cluster_labels, palette='viridis', ax=axes[1],
)
axes[1].set_title('Hierarchical Clustering')
axes[1].set_xlabel('Frequency')
axes[1].set_ylabel('Monetary')

plt.tight_layout()
plt.show()


In [None]:
# Attach the hierarchical-clustering label of each customer as a new column
# and preview the result.
rfm_scores.loc[:, 'Cluster_id (Hierarchical)'] = cluster_labels
rfm_scores.head()

In [None]:
# Box plots of each RFM value grouped by hierarchical cluster id, one panel
# per value.
fig, ax = plt.subplots(1, 3, figsize=(16, 6))

# Named `rfm_columns` rather than `list` so the builtin `list` is not shadowed
# for the rest of the kernel session.
rfm_columns = ['Recency', 'Frequency', 'Monetary']
for panel, column in zip(ax, rfm_columns):
    sns.boxplot(ax=panel, x='Cluster_id (Hierarchical)', y=column, data=rfm_scores)

plt.show()