In [3]:
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Page-level configuration (browser tab title, wide layout), followed by the
# main heading shown at the top of the dashboard.
st.set_page_config(
    layout="wide",
    page_title="Online Retail Dashboard",
)
st.title("🛍️ Online Retail Customer Segmentation Dashboard")

2025-04-20 19:56:57.785 
  command:

    streamlit run C:\Users\mrsta\AppData\Roaming\Python\Python310\site-packages\ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [5]:
# Load data
# Build the cleaned transaction frame in one non-mutating chain, so that
# re-running this cell always starts from the file and never writes a new
# column into a filtered slice of another frame:
#   1. drop rows that have no CustomerID,
#   2. keep only rows with positive Quantity and positive UnitPrice,
#   3. add TotalPrice = Quantity * UnitPrice for each remaining row.
df = (
    pd.read_excel("Online_retail.xlsx")
    .dropna(subset=['CustomerID'])
    .loc[lambda d: (d['Quantity'] > 0) & (d['UnitPrice'] > 0)]
    .assign(TotalPrice=lambda d: d['Quantity'] * d['UnitPrice'])
)


In [6]:
# RFM features
# Reference point for recency: one day after the latest invoice in the data,
# so every customer's Recency is at least 1.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)


def _days_since_last_invoice(invoice_dates):
    """Whole days between a customer's latest invoice and snapshot_date."""
    return (snapshot_date - invoice_dates.max()).days


# One row per customer:
#   Recency   - days since the customer's latest invoice
#   Frequency - number of distinct invoice numbers
#   Monetary  - sum of TotalPrice
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', _days_since_last_invoice),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

In [7]:
# Standardize
# Fit the scaler on the three RFM columns, then transform those same columns.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler().fit(rfm_features)
rfm_scaled = scaler.transform(rfm_features)

In [8]:
# Clustering
# Partition customers into 4 groups using the standardized RFM matrix and
# store each customer's label in rfm['Cluster'].
# n_init is pinned explicitly: scikit-learn's default for it changed between
# releases (10 -> 'auto'), so leaving it out makes the number of restarts,
# and therefore the resulting labels, depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

In [9]:
# PCA
# Reduce the standardized RFM matrix to two components and keep each
# component as its own column on rfm for plotting.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(rfm_scaled)
# Transposing turns the two component columns into two rows to unpack.
rfm['PCA1'], rfm['PCA2'] = pca_components.T

In [10]:
# Sidebar filters
# The selector lists "All" first, then every cluster label present in rfm in
# ascending order.
st.sidebar.header("Filters")
cluster_ids = sorted(rfm['Cluster'].unique().tolist())
cluster_option = st.sidebar.selectbox("Select Cluster", options=["All", *cluster_ids])

2025-04-20 19:58:12.175 Session state does not function when running a script without `streamlit run`


In [11]:
# KPI metrics
# Compute the three headline figures first, then render them side by side.
total_customers = len(rfm)                     # one rfm row per customer
total_revenue = df['TotalPrice'].sum()         # sum over all cleaned rows
avg_order_size = df.groupby('InvoiceNo')['TotalPrice'].sum().mean()  # mean invoice total

col1, col2, col3 = st.columns(3)
col1.metric("Total Customers", total_customers)
col2.metric("Total Revenue", f"${total_revenue:,.2f}")
col3.metric("Avg Order Size", f"${avg_order_size:.2f}")



DeltaGenerator()

In [12]:
# Cluster distribution
# Bar chart counting how many customers fall in each cluster.
st.subheader("📈 Cluster Distribution")
fig1, ax1 = plt.subplots()
sns.countplot(data=rfm, x='Cluster', ax=ax1)
st.pyplot(fig1)
# pyplot keeps figures created with plt.subplots() registered until they are
# closed; release this one once it has been rendered so repeated runs of the
# script do not accumulate open figures.
plt.close(fig1)




DeltaGenerator()

In [13]:
# PCA scatter plot
# Scatter of customers on the two PCA components, coloured by cluster.
st.subheader("🔍 Customer Segmentation (PCA View)")
fig2, ax2 = plt.subplots()
# Show every customer when "All" is selected, otherwise only the chosen cluster.
plot_data = rfm if cluster_option == "All" else rfm[rfm['Cluster'] == cluster_option]
sns.scatterplot(data=plot_data, x='PCA1', y='PCA2', hue='Cluster', palette='tab10', ax=ax2)
st.pyplot(fig2)
# pyplot keeps figures created with plt.subplots() registered until they are
# closed; release this one once it has been rendered so repeated runs of the
# script do not accumulate open figures.
plt.close(fig2)



DeltaGenerator()

In [14]:
# Cluster summary table
# Per-cluster means of the three RFM columns plus a row count (taken on
# Monetary), rounded to two decimals.
st.subheader("📋 Cluster Summary")
summary_spec = {
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': ['mean', 'count'],
}
cluster_summary = rfm.groupby('Cluster').agg(summary_spec).round(2)
st.dataframe(cluster_summary)



DeltaGenerator()

In [15]:
# Download segmented data
# Serialize the full rfm table (without the index) and offer it as a CSV file.
st.subheader("📥 Download Segmented Customer Data")
segmented_csv = rfm.to_csv(index=False)
st.download_button(
    "Download CSV",
    data=segmented_csv,
    file_name="segmented_customers.csv",
    mime="text/csv",
)



False