In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import joblib
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation notices. Consider narrowing it
# to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Load the raw transaction log; the file is not UTF-8, hence the explicit encoding.
df = pd.read_csv('../data/online_retail.csv', encoding='ISO-8859-1', low_memory=False)
print("Dataset loaded successfully.\n")

# Rows without a CustomerID cannot be attributed to a customer, so drop them
# before casting the id to an integer and parsing the invoice timestamp.
# Reassigning (instead of inplace=True) keeps every step an explicit new frame.
df = df.dropna(axis=0, subset=['CustomerID'])
df = df.assign(
    CustomerID=df['CustomerID'].astype(int),
    InvoiceDate=pd.to_datetime(df['InvoiceDate']),
)

# Remove returns/cancelled orders (invoice numbers starting with 'C').
# .copy() makes the filtered result an independent frame, so the column
# assignment below never writes to a view of the unfiltered data.
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
df = df[~is_cancelled].copy()

# Calculate the total price for each transaction line
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
print("Data cleaning complete.\n")


Dataset loaded successfully.

Data cleaning complete.



In [7]:

# Customer Segmentation (Unsupervised Learning with K-Means)
#
# Builds one Recency/Frequency/MonetaryValue row per customer from the cleaned
# transaction frame `df`, standardises the three features and assigns each
# customer to one of `optimal_k` K-Means clusters (stored in `Cluster`).


# Feature Engineering for Clustering (RFM Analysis)
# The snapshot is one day after the latest invoice, so Recency is always >= 1.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Built-in aggregations only; Recency is derived afterwards in a single
# vectorised subtraction instead of a Python lambda evaluated per customer.
rfm_df = df.groupby('CustomerID').agg(
    LastPurchase=('InvoiceDate', 'max'),
    Frequency=('InvoiceNo', 'nunique'),     # number of distinct invoices
    MonetaryValue=('TotalPrice', 'sum'),    # total spend
)
last_purchase = rfm_df.pop('LastPurchase')
# Whole days since the customer's most recent invoice, placed as first column.
rfm_df.insert(0, 'Recency', (snapshot_date - last_purchase).dt.days)

# Fit on the named feature columns so the model input is explicit and cannot
# accidentally pick up extra columns added to rfm_df.
RFM_FEATURES = ['Recency', 'Frequency', 'MonetaryValue']
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_df[RFM_FEATURES])

optimal_k = 4
# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmeans.fit(rfm_scaled)

# cluster labels back to the customer dataframe (row order matches rfm_scaled)
rfm_df['Cluster'] = kmeans.labels_
# Report the configured cluster count rather than a hard-coded number.
print(f"Customer segmentation complete. Found {optimal_k} distinct customer groups.\n",rfm_df)


Customer segmentation complete. Found 4 distinct customer groups.
             Recency  Frequency  MonetaryValue  Cluster
CustomerID                                            
12346           326          1       77183.60        0
12347             2          7        4310.00        3
12348            75          4        1797.24        3
12349            19          1        1757.55        3
12350           310          1         334.40        1
...             ...        ...            ...      ...
18280           278          1         180.60        1
18281           181          1          80.82        1
18282             8          2         178.05        3
18283             4         16        2094.88        0
18287            43          3        1837.28        3

[4339 rows x 4 columns]


In [8]:

# Final Data for the Recommendation App
#
# Attaches each customer's cluster label to the transaction rows, keeps the
# columns the recommendation logic needs, and writes both frames with joblib.


# Merging and Saving
# Move CustomerID from the index into a column exactly once. The guard makes
# the cell safe to re-run: an unconditional in-place reset_index would add a
# stray 'index' column on the second run and that column would be pickled.
if 'CustomerID' not in rfm_df.columns:
    rfm_df = rfm_df.reset_index()

# validate='many_to_one' raises if a CustomerID appears more than once on the
# segment side, instead of silently duplicating transaction rows.
df_final = pd.merge(
    df,
    rfm_df[['CustomerID', 'Cluster']],
    on='CustomerID',
    how='left',
    validate='many_to_one',
)

# columns needed for the recommendation logic
recommendation_data = df_final[['CustomerID', 'StockCode', 'Description', 'Cluster']]
recommendation_data = recommendation_data.drop_duplicates()

# Save the Final Data Files
joblib.dump(recommendation_data, '../recommendation_data.pkl')
joblib.dump(rfm_df, '../customer_segments.pkl')

print("\n Data files saved successfully :")
print("- recommendation_data.pkl")
print("- customer_segments.pkl")




 Data files saved successfully :
- recommendation_data.pkl
- customer_segments.pkl
