<a href="https://colab.research.google.com/github/aditi2212-bot/Major-Project--Synthetic-Customer-Profiles-for-Product-Recommendation/blob/main/Final_Major_Project_Group1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#  Synthetic Customer Profiles for Product Recommendation

In today’s digital commerce environment, customer data plays a crucial role in understanding buying behavior, preferences, and trends.  
However, using real customer data often raises privacy and security concerns. To address this challenge, this project focuses on **generating synthetic customer profiles** that closely mimic real-world shopping behaviors — enabling businesses to build and test intelligent **recommendation systems** without compromising sensitive information.


## 1] Installation of Required Libraries

In [1]:
# Install SDV (pinned to 1.27.0) quietly; it provides the CTGAN synthesizer used below.
!pip install -q sdv==1.27.0 --upgrade
# NOTE(review): these packages are installed without version pins, so results may drift between runs.
!pip install -q scikit-learn pandas numpy

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/186.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m186.8/186.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.7/52.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.4/198.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### 1.1] Imports & Setup

> Importing

In [2]:
import os, warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, message="missing ScriptRunContext")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, ConfusionMatrixDisplay, classification_report)



import joblib

# Figure defaults shared by every matplotlib/seaborn plot in this notebook.
plt.rcParams["figure.dpi"] = 120
plt.rcParams["font.size"] = 11
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
sns.set_style("whitegrid")

print(" Libraries loaded successfully")

 Libraries loaded successfully


### 1.2] Uploading data

In [5]:
# Read the real shopping dataset from the working directory and preview it.
df = pd.read_csv(filepath_or_buffer="shopping_trends_updated.csv")

print(" Dataset loaded successfully")
display(df.head(5))

 Dataset loaded successfully


Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


### 1.3] Cleaning the Dataset

In [6]:
# Quick health check: column names, missing values, duplicates, summary stats.
column_names = df.columns.tolist()
missing_top10 = df.isnull().sum().sort_values(ascending=False).head(10)
duplicate_count = df.duplicated().sum()
summary_table = df.describe(include='all').T.head(12)

print("Columns:", column_names)
print("\nMissing values (top 10):\n", missing_top10)
print("Duplicate rows:", duplicate_count)
display(summary_table)

Columns: ['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category', 'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season', 'Review Rating', 'Subscription Status', 'Shipping Type', 'Discount Applied', 'Promo Code Used', 'Previous Purchases', 'Payment Method', 'Frequency of Purchases']

Missing values (top 10):
 Customer ID              0
Age                      0
Gender                   0
Item Purchased           0
Category                 0
Purchase Amount (USD)    0
Location                 0
Size                     0
Color                    0
Season                   0
dtype: int64
Duplicate rows: 0


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Customer ID,3900.0,,,,1950.5,1125.977353,1.0,975.75,1950.5,2925.25,3900.0
Age,3900.0,,,,44.068462,15.207589,18.0,31.0,44.0,57.0,70.0
Gender,3900.0,2.0,Male,2652.0,,,,,,,
Item Purchased,3900.0,25.0,Blouse,171.0,,,,,,,
Category,3900.0,4.0,Clothing,1737.0,,,,,,,
Purchase Amount (USD),3900.0,,,,59.764359,23.685392,20.0,39.0,60.0,81.0,100.0
Location,3900.0,50.0,Montana,96.0,,,,,,,
Size,3900.0,4.0,M,1755.0,,,,,,,
Color,3900.0,25.0,Olive,177.0,,,,,,,
Season,3900.0,4.0,Spring,999.0,,,,,,,


### 1.4] Shape and Unique Values in Dataset

In [38]:
# Distinct-value count for the first ten columns, followed by the frame shape.
print("Unique Values per Column (Top 10):")
for column_name, distinct_count in df.iloc[:, :10].nunique().items():
    print(f"  {column_name}: {distinct_count} unique")
df.shape

Unique Values per Column (Top 10):
  Customer ID: 3900 unique
  Age: 51 unique
  Gender: 2 unique
  Item Purchased: 25 unique
  Category: 4 unique
  Purchase Amount (USD): 80 unique
  Location: 50 unique
  Size: 4 unique
  Color: 25 unique
  Season: 4 unique


(3900, 19)

### 1.5] Model Training using CTGAN for Generation of Synthetic Data


Model Training-CTGAN


In [39]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

# Infer column types from the training frame once and validate the result.
# NOTE(review): `df` must still be the real dataset when this cell runs; the
# recorded outputs show a 19-column frame (including `Source`), which suggests
# it was re-run after `df` had been rebound elsewhere.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)
metadata.validate()

# Train the CTGAN synthesizer on the frame described by `metadata`.
synth = CTGANSynthesizer(
    metadata=metadata,
    epochs=500,
    batch_size=128,
    pac=1,
    verbose=True
)

synth.fit(df)


Gen. (-0.29) | Discrim. (-0.07): 100%|██████████| 500/500 [08:18<00:00,  1.00it/s]


### 1.6] Synthetic Data Generation

**For EDA**

In [40]:
# Draw 3,900 synthetic rows for the EDA comparisons and persist them; later
# cells reload this CSV from disk.
synthetic_df = synth.sample(num_rows=3900)
print("Synthetic Data Shape:", synthetic_df.shape)
synthetic_df.to_csv("synthetic_shopping_3900_rows.csv", index=False)

Synthetic Data Shape: (3900, 19)


In [41]:
# Preview the first rows of the 3,900-row synthetic sample.
synthetic_df.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Source
0,884077,55,Female,Sweater,Clothing,95,Texas,L,Pink,Winter,4.1,No,Standard,No,No,5,Bank Transfer,Monthly,Synthetic Data
1,9327565,66,Male,Shoes,Footwear,24,Alaska,L,Cyan,Fall,4.3,No,Express,Yes,Yes,43,PayPal,Fortnightly,Synthetic Data
2,9387911,70,Male,Coat,Outerwear,56,Hawaii,M,Beige,Summer,3.8,No,Free Shipping,No,No,21,Bank Transfer,Every 3 Months,Synthetic Data
3,13624215,63,Female,Hoodie,Clothing,65,Rhode Island,L,Violet,Summer,2.8,No,Free Shipping,Yes,Yes,20,Debit Card,Quarterly,Synthetic Data
4,15153106,70,Male,Belt,Clothing,92,Minnesota,L,Charcoal,Fall,4.5,Yes,Standard,Yes,Yes,12,Debit Card,Monthly,Synthetic Data


**For Training**

In [42]:
# Larger synthetic sample for model training; saved so later cells can reload it.
synth_df = synth.sample(num_rows=20_000)
print("Synthetic Data Shape:", synth_df.shape)
synth_df.to_csv(path_or_buf="synthetic_shopping_20000_rows.csv", index=False)

Synthetic Data Shape: (20000, 19)


### REPORTS

In [43]:
# Install SDMetrics for the quality reports below.
# NOTE(review): installed without a version pin.
!pip install -q sdmetrics

Data Quality Report (3900)

In [13]:
from sdmetrics.reports.single_table import QualityReport

# Score the 3,900-row synthetic sample against the training frame.
metadata_dict = metadata.to_dict()
report = QualityReport()
report.generate(real_data=df, synthetic_data=synthetic_df, metadata=metadata_dict)

overall_score = report.get_score()
print("\n Overall Synthetic Data Quality Score:", round(overall_score, 3))

# Show the head of each per-property breakdown.
for property_name in ("Column Shapes", "Column Pair Trends"):
    print(f"\n {property_name} (first 5 rows):")
    display(report.get_details(property_name).head())


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 18/18 [00:00<00:00, 127.16it/s]|
Column Shapes Score: 92.38%

(2/2) Evaluating Column Pair Trends: |██████████| 153/153 [00:01<00:00, 104.28it/s]|
Column Pair Trends Score: 85.88%

Overall Score (Average): 89.13%


 Overall Synthetic Data Quality Score: 0.891

 Column Shapes (first 5 rows):


Unnamed: 0,Column,Metric,Score
0,Age,KSComplement,0.861795
1,Gender,TVComplement,0.968462
2,Item Purchased,TVComplement,0.932564
3,Category,TVComplement,0.978462
4,Purchase Amount (USD),KSComplement,0.913077



 Column Pair Trends (first 5 rows):


Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,Age,Gender,ContingencySimilarity,0.844359,,
1,Age,Item Purchased,ContingencySimilarity,0.778462,,
2,Age,Category,ContingencySimilarity,0.832051,,
3,Age,Purchase Amount (USD),CorrelationSimilarity,0.99611,-0.010424,-0.018203
4,Age,Location,ContingencySimilarity,0.734359,,


Data Quality Report (20000)

In [44]:

from sdmetrics.reports.single_table import QualityReport

# Score the 20,000-row synthetic sample against the training frame.
metadata_dict = metadata.to_dict()
report = QualityReport()
report.generate(real_data=df, synthetic_data=synth_df, metadata=metadata_dict)

overall_score = report.get_score()
print("\n Overall Synthetic Data Quality Score:", round(overall_score, 3))

# Show the head of each per-property breakdown.
for property_name in ("Column Shapes", "Column Pair Trends"):
    print(f"\n {property_name} (first 5 rows):")
    display(report.get_details(property_name).head())


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 19/19 [00:00<00:00, 116.52it/s]|
Column Shapes Score: 92.45%

(2/2) Evaluating Column Pair Trends: |██████████| 171/171 [00:03<00:00, 50.09it/s]|
Column Pair Trends Score: 83.58%

Overall Score (Average): 88.02%


 Overall Synthetic Data Quality Score: 0.88

 Column Shapes (first 5 rows):


Unnamed: 0,Column,Metric,Score
0,Age,KSComplement,0.828491
1,Gender,TVComplement,0.998812
2,Item Purchased,TVComplement,0.926483
3,Category,TVComplement,0.957755
4,Purchase Amount (USD),KSComplement,0.875844



 Column Pair Trends (first 5 rows):


Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,Age,Gender,ContingencySimilarity,0.777072,,
1,Age,Item Purchased,ContingencySimilarity,0.730454,,
2,Age,Category,ContingencySimilarity,0.769974,,
3,Age,Purchase Amount (USD),CorrelationSimilarity,0.980267,-0.018203,-0.057669
4,Age,Location,ContingencySimilarity,0.710585,,


In [45]:
# Persist the fitted synthesizer.
# NOTE(review): the absolute /content path is specific to the Colab filesystem.
model_path = "/content/final_ctgan_model.pkl"
synth.save(filepath=model_path)
print(f" Model saved successfully at {model_path}")

 Model saved successfully at /content/final_ctgan_model.pkl


# 2] Exploratory Data Analysis
- Real VS Synthetic Data comparison By Categories

###  2.1] Gender Distribution comparison

In [46]:
import plotly.express as px

# Tally gender frequencies per dataset in the same long format so the two
# tables can be stacked and drawn as side-by-side pies.
gender_frames = []
for dataset_label, frame in (("Real Data", df), ("Synthetic Data", synthetic_df)):
    tally = frame['Gender'].value_counts().reset_index()
    tally.columns = ['Gender', 'Count']
    tally['Dataset'] = dataset_label
    gender_frames.append(tally)

real_gender_counts, synth_gender_counts = gender_frames
combined_df = pd.concat(gender_frames)

fig = px.pie(
    combined_df,
    names='Gender',
    values='Count',
    color='Gender',
    facet_col='Dataset',  # one pie per dataset
    title=' Gender Distribution Comparison: Real vs Synthetic Data',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_traces(textinfo='percent+label')
fig.update_layout(title_x=0.3, showlegend=True)
fig.show()

### 2.2] Review Rating, Age and Purchase Amount Comparison

In [47]:
import plotly.graph_objects as go

metrics = ["Purchase Amount (USD)", "Age", "Review Rating"]

# A metric is averaged only when both frames contain it; otherwise both
# entries are None so the bars stay aligned.
in_both = [name in df.columns and name in synthetic_df.columns for name in metrics]
real_means = [df[name].mean() if ok else None for name, ok in zip(metrics, in_both)]
synth_means = [synthetic_df[name].mean() if ok else None for name, ok in zip(metrics, in_both)]

fig = go.Figure()
for trace_name, averages, bar_color in (
    ("Real Data", real_means, "royalblue"),
    ("Synthetic Data", synth_means, "darkorange"),
):
    fig.add_trace(
        go.Bar(
            y=metrics,
            x=averages,
            name=trace_name,
            orientation='h',
            marker=dict(color=bar_color),
        )
    )

fig.update_layout(
    title="Real vs Synthetic Data Comparison",
    xaxis_title="Average Value",
    barmode='group',
    template="plotly_white",
    bargap=0.4,
    legend=dict(orientation="h", y=-0.2),
)

fig.show()


### 2.3] Product comparison

In [48]:
import pandas as pd
import plotly.graph_objects as go

# Frequency of each category (at most the ten largest) in both datasets.
real_counts = df["Category"].value_counts().nlargest(10)
synth_counts = synthetic_df["Category"].value_counts().nlargest(10)

real_bar = go.Bar(
    x=real_counts.values,
    y=real_counts.index,
    orientation='h',
    name='Real Data',
    marker_color='teal',
)
synth_bar = go.Bar(
    x=synth_counts.values,
    y=synth_counts.index,
    orientation='h',
    name='Synthetic Data',
    marker_color='salmon',
)

fig = go.Figure(data=[real_bar, synth_bar])
fig.update_layout(
    title=" Product Popularity: Real vs Synthetic Data",
    xaxis_title="Count",
    yaxis_title="Product Category",
    barmode='group',
    title_font=dict(size=20, color="darkblue"),
    plot_bgcolor="white",
)
fig.show()

### 2.4] Customer Insights



In [49]:
import pandas as pd
import plotly.express as px
import numpy as np

def create_sunburst_comparison(df, source_name, top_n=10):
    """Build a sunburst and a top-items bar chart for one dataset.

    The input frame is left untouched; derived columns are added to an
    internal copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Purchase Amount (USD)', 'Age', 'Gender' and
        'Item Purchased'.
    source_name : str
        Label inserted into both chart titles.
    top_n : int, default 10
        Number of highest-affinity items shown in the bar chart.

    Returns
    -------
    tuple
        ``(fig_sunburst, fig_bar)`` Plotly figures.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    data = df.copy()

    # Left-closed age bands: [0, 18), [18, 35), [35, 55), [55, inf).
    # NOTE(review): ages 55 and above receive the label 'Middle-Age'.
    bins = [0, 18, 35, 55, np.inf]
    labels = ['Under 18', 'Youth', 'Adult', 'Middle-Age']
    data['Age Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)

    # Affinity = percentile rank of each item's purchase count within this frame.
    item_affinity = (
        data.groupby('Item Purchased')['Purchase Amount (USD)']
        .count()
        .rank(pct=True)
        .to_dict()
    )
    # Rows whose item has no entry in the mapping fall back to 0.3.
    data['Affinity_Score'] = data['Item Purchased'].map(item_affinity).fillna(0.3)

    fig_sunburst = px.sunburst(
        data,
        path=['Gender', 'Age Group', 'Item Purchased'],
        values='Purchase Amount (USD)',
        color='Affinity_Score',
        color_continuous_scale='Viridis',
        title=f"Customer Insights: {source_name} - Gender → Age Group → Product",
        hover_data={'Affinity_Score':':.2f', 'Purchase Amount (USD)':':.2f'}
    )
    fig_sunburst.update_layout(template='plotly_white')

    # Per-item affinity, highest first, for the bar chart.
    affinity_sorted = (
        data.groupby('Item Purchased')['Affinity_Score']
        .mean()
        .sort_values(ascending=False)
    )

    fig_bar = px.bar(
        affinity_sorted.head(top_n).reset_index(),
        x='Item Purchased',
        y='Affinity_Score',
        title=f"Top {top_n} High-Affinity Items - {source_name}",
        text='Affinity_Score',
        color='Affinity_Score',
        color_continuous_scale='Viridis'
    )
    fig_bar.update_traces(texttemplate='%{text:.2f}', textposition='outside')
    fig_bar.update_layout(template='plotly_white', yaxis_range=[0, 1])

    return fig_sunburst, fig_bar



# Reload both datasets from disk and render the same pair of charts for each.
df_real = pd.read_csv('shopping_trends_updated.csv')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv')

fig_real_sun, fig_real_bar = create_sunburst_comparison(df_real.copy(), "REAL DATA")
fig_syn_sun, fig_syn_bar = create_sunburst_comparison(df_synthetic.copy(), "SYNTHETIC DATA")

for chart in (fig_real_sun, fig_real_bar, fig_syn_sun, fig_syn_bar):
    chart.show()


### 2.5] Payment Method

In [50]:
import pandas as pd
import plotly.express as px

df_real = pd.read_csv('shopping_trends_updated.csv')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv')


def create_loyalty_sunburst(df, source_name):
    """Return a sunburst of purchase amount by subscription, payment method and frequency.

    `source_name` is appended to the chart title.
    """
    hierarchy = ['Subscription Status', 'Payment Method', 'Frequency of Purchases']
    sunburst = px.sunburst(
        df,
        path=hierarchy,
        values='Purchase Amount (USD)',
        color='Subscription Status',
        title=f" Payment Behavior: {source_name}",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        template='plotly_white',
    )
    sunburst.update_layout(
        font=dict(size=13, color='black'),
        plot_bgcolor='white',
        paper_bgcolor='white',
    )
    return sunburst


# Render the real-data chart first, then the synthetic one.
fig_real = create_loyalty_sunburst(df_real.copy(), "REAL DATA")
fig_real.show()

fig_synthetic = create_loyalty_sunburst(df_synthetic.copy(), "SYNTHETIC DATA")
fig_synthetic.show()


### 2.6] Product Affinity

In [51]:
import pandas as pd
import plotly.express as px

df_real = pd.read_csv('shopping_trends_updated.csv')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv')


def create_affinity_treemap(df, source_name):
    """Return a treemap of mean purchase amount per (Category, Item Purchased).

    The purchase-amount column of `df` is coerced to numeric in place, so
    callers pass a copy. `source_name` is inserted into the title.
    """
    amount_col = 'Purchase Amount (USD)'
    df[amount_col] = pd.to_numeric(df[amount_col], errors='coerce')

    # Average spend for every category/item pair, largest first.
    mean_amounts = df.groupby(['Category', 'Item Purchased'])[amount_col].mean().reset_index()
    affinity = mean_amounts.sort_values(by=amount_col, ascending=False)

    treemap = px.treemap(
        affinity,
        path=['Category', 'Item Purchased'],
        values=amount_col,
        color=amount_col,
        color_continuous_scale='Blues',
        title=f"Product Affinity TreeMap: {source_name} - Avg Purchase Amount",
    )
    treemap.update_layout(font=dict(size=13), template='plotly_white')
    return treemap


fig_real = create_affinity_treemap(df_real.copy(), "REAL DATA")
fig_real.show()

fig_synthetic = create_affinity_treemap(df_synthetic.copy(), "SYNTHETIC DATA")
fig_synthetic.show()

### 2.7] Purchase Amount

In [52]:
import pandas as pd
import plotly.express as px

# Tag each dataset with its origin, then stack them for grouped plotting.
df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

season_order = ['Spring', 'Summer', 'Fall', 'Winter']

fig = px.box(
    df_combined,
    x='Season',
    y='Purchase Amount (USD)',
    color='Source',
    title="Comparison of Purchase Amount Distribution by Season (Real vs. Synthetic)",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
# Fix the x-axis to calendar order.
fig.update_xaxes(categoryorder='array', categoryarray=season_order)
fig.update_layout(
    xaxis_title="Season",
    yaxis_title="Purchase Amount (USD)",
    boxmode='group',
)

fig.show()

### 2.8] Shipping Type vs Purchase Frequency

In [53]:
import pandas as pd
import plotly.express as px

df_real = pd.read_csv('shopping_trends_updated.csv')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv')


def create_shipping_frequency_treemap(df, source_name):
    """Return a two-level treemap: shipping type, then purchase frequency.

    Tiles are coloured by purchase frequency; `source_name` goes into the title.
    """
    levels = ['Shipping Type', 'Frequency of Purchases']
    treemap = px.treemap(
        df,
        path=levels,
        title=f'Hierarchical Comparison: {source_name} - Shipping Type vs Purchase Frequency',
        color='Frequency of Purchases',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    treemap.update_layout(font=dict(size=13), template='plotly_white')
    return treemap


fig_real = create_shipping_frequency_treemap(df_real.copy(), "REAL DATA")
fig_real.show()

fig_synthetic = create_shipping_frequency_treemap(df_synthetic.copy(), "SYNTHETIC DATA")
fig_synthetic.show()

### 2.9] Category Distribution

In [54]:
# Row count for every (Category, Source) pair in the stacked frame.
category_source_sizes = df_combined.groupby(['Category', 'Source']).size()
df_cat_counts = category_source_sizes.reset_index(name='Count')

fig_cat_comp = px.bar(
    df_cat_counts,
    x='Category',
    y='Count',
    color='Source',
    barmode='group',
    title=" Category Distribution (Real vs. Synthetic)",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
fig_cat_comp.update_layout(xaxis_title="Item Category", yaxis_title="Count")
fig_cat_comp.show()

### 2.10] Purchase Amount Distribution

In [55]:
# Overlaid, semi-transparent histograms of purchase amount, one per source.
histogram_options = dict(
    x='Purchase Amount (USD)',
    color='Source',
    barmode='overlay',
    opacity=0.6,
    title=" Purchase Amount Distribution (Real vs. Synthetic)",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
fig_hist_comp = px.histogram(df_combined, **histogram_options)
fig_hist_comp.update_layout(yaxis_title="Count")
fig_hist_comp.show()

### 2.11] Purchase Amount by Category

In [56]:
# Box plot of purchase amount per category, split by data source.
box_options = dict(
    x='Category',
    y='Purchase Amount (USD)',
    color='Source',
    title=" Purchase Amount by Category (Real vs. Synthetic)",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
fig_box_comp = px.box(df_combined, **box_options)
fig_box_comp.update_layout(
    xaxis_title="Item Category",
    yaxis_title="Purchase Amount (USD)",
)
fig_box_comp.show()

### 2.12] Correlation Heatmap


In [57]:
numerical_cols = ['Age', 'Purchase Amount (USD)', 'Review Rating', 'Previous Purchases']


def update_heatmap_axes(fig):
    """Move the x-axis labels to the bottom and pin ticks to the column names."""
    tick_positions = list(range(len(numerical_cols)))
    fig.update_xaxes(side="bottom")
    fig.update_layout(
        xaxis=dict(tickmode='array', tickvals=tick_positions, ticktext=numerical_cols),
        yaxis=dict(tickmode='array', tickvals=list(tick_positions), ticktext=numerical_cols),
    )


# Pairwise correlations of the numeric columns in each dataset.
corr_real = df_real[numerical_cols].corr()
corr_synthetic = df_synthetic[numerical_cols].corr()

fig_heatmap_real = px.imshow(
    corr_real,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title="2.2.1 Real Data: Correlation Heatmap",
)
fig_heatmap_synthetic = px.imshow(
    corr_synthetic,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    title="2.2.2 Synthetic Data: Correlation Heatmap",
)

heatmaps = (fig_heatmap_real, fig_heatmap_synthetic)
for heatmap in heatmaps:
    update_heatmap_axes(heatmap)
for heatmap in heatmaps:
    heatmap.show()

### 2.13] Total Purchase Amount by State

In [58]:
import pandas as pd
import plotly.express as px


# Full state name -> two-letter code, as required by locationmode='USA-states'.
state_map = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA',
    'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
    'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

df_real = pd.read_csv('shopping_trends_updated.csv')
df_real['Source'] = 'Real Data'
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv')
df_synthetic['Source'] = 'Synthetic Data'

df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

# Total purchase amount per state and data source.
df_combined['State Code'] = df_combined['Location'].map(state_map)
df_geo_agg = (
    df_combined.groupby(['State Code', 'Source'])['Purchase Amount (USD)']
    .sum()
    .reset_index()
    .rename(columns={'Purchase Amount (USD)': 'Total Purchase Amount (USD)'})
)

fig = px.choropleth(df_geo_agg,
                    locations='State Code',
                    locationmode='USA-states',
                    scope='usa',
                    color="Total Purchase Amount (USD)",
                    facet_col='Source',
                    color_continuous_scale="Plasma",
                    hover_name='State Code',
                    # ',.0f' = thousands separator with no decimals; the previous
                    # ',0f' is not a valid d3-format specifier.
                    hover_data={'Total Purchase Amount (USD)': ':,.0f'},
                    title=(" Total Purchase Amount by State (Real vs. Synthetic)")
)

fig.update_geos(showsubunits=True, subunitcolor="Black")

fig.show()

### 2.14] Top 10 Customer Locations

In [59]:

# Plot the ten most frequent customer locations for each dataset.
# A loop-local name is used on purpose: rebinding the notebook-wide `df`
# here would make any re-run of the CTGAN training cell fit on whichever
# frame was assigned last instead of the original real dataset.
for source_label, source_df in (("REAL DATA", df_real), ("SYNTHETIC DATA", df_synthetic)):
    if 'Location' not in source_df.columns:
        continue

    top_loc = source_df['Location'].value_counts().nlargest(10).reset_index()
    top_loc.columns = ["Location", "Count"]

    fig = px.scatter(top_loc, x="Location", y="Count", size="Count",
                     color="Location", text="Location",
                     title=f"Top 10 Customer Locations - {source_label}",
                     color_discrete_sequence=px.colors.qualitative.Bold)
    fig.update_traces(textposition="top center")
    fig.show()


### 2.15] Comparison of Top Categories

In [60]:
import pandas as pd
import plotly.express as px

df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

product_col = "Category"

if product_col not in df_combined.columns:
    print(f"Column '{product_col}' not found in the combined dataset.")
else:
    # Purchases per (category, source) pair.
    top_products_combined = (
        df_combined.groupby([product_col, 'Source']).size().reset_index(name='Count')
    )

    # Order the bars by how often each category occurs in the real data.
    is_real = top_products_combined['Source'] == 'Real Data'
    real_order = top_products_combined[is_real].sort_values('Count', ascending=False)[product_col]

    fig = px.bar(
        top_products_combined,
        x="Count",
        y=product_col,
        color="Source",
        orientation="h",
        barmode='group',
        text="Count",
        title=f"Comparison of Top {product_col}",
        category_orders={product_col: real_order.tolist()},
        color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    )
    fig.update_layout(
        yaxis_title=product_col,
        xaxis_title="Number of Purchases",
        title_font=dict(size=20, color="darkblue"),
        plot_bgcolor="white",
    )
    fig.update_traces(textposition="outside")
    fig.show()

### 2.16] Customer Age distribution

In [61]:
import pandas as pd
import plotly.express as px

df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

if "Age" not in df_combined.columns:
    print(" 'Age' column not found in the combined dataset.")
else:
    # Overlaid age histograms with a marginal box plot per source.
    fig = px.histogram(
        df_combined,
        x="Age",
        color="Source",
        nbins=30,
        marginal="box",
        opacity=0.6,
        barmode='overlay',
        title="Comparison of Customer Age Distribution (Real vs. Synthetic)",
        color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    )
    fig.update_layout(
        xaxis_title="Age",
        yaxis_title="Count",
        title_font=dict(size=20, color="darkblue"),
        bargap=0.05,
        plot_bgcolor="white",
    )
    fig.show()

### 2.17] Customer age distribution by gender

In [62]:
import pandas as pd
import plotly.express as px


df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

required_columns = {"Age", "Gender"}

if not required_columns.issubset(df_combined.columns):
    print(" 'Age' and 'Gender' columns not found in the combined dataset.")
else:
    # One facet per source, genders overlaid inside each facet.
    fig = px.histogram(
        df_combined,
        x="Age",
        color="Gender",
        facet_col="Source",
        nbins=30,
        barmode="overlay",
        title="Comparison of Age Distribution by Gender (Real vs. Synthetic)",
        color_discrete_sequence=px.colors.qualitative.Bold,
    )
    fig.update_layout(
        xaxis_title="Age",
        yaxis_title="Count",
        title_font=dict(size=20, color="darkblue"),
        plot_bgcolor="white",
        bargap=0.05,
    )
    fig.update_traces(opacity=0.6)
    # Let each facet keep its own x-axis range.
    fig.update_xaxes(matches=None)

    fig.show()

### 2.18] Total Purchase amount by season

In [63]:
import plotly.express as px
import pandas as pd


# Total spend per season and data source.
df_season_agg = (
    df_combined.groupby(['Season', 'Source'])['Purchase Amount (USD)']
    .sum()
    .reset_index()
    .rename(columns={'Purchase Amount (USD)': 'Total Purchase Amount (USD)'})
)

season_order = ['Spring', 'Summer', 'Fall', 'Winter']

fig = px.bar(
    df_season_agg,
    x='Season',
    y='Total Purchase Amount (USD)',
    color='Source',
    barmode='group',
    text='Total Purchase Amount (USD)',
    title="Comparison of Total Purchase Amount by Season (Real vs. Synthetic)",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
fig.update_xaxes(categoryorder='array', categoryarray=season_order)
# Bar labels as abbreviated dollar amounts; labels that would need a font
# smaller than 8 are hidden.
fig.update_traces(texttemplate='$%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show()

### 2.19] Purchase Amount based on Discount Applied

In [64]:
import plotly.express as px


df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

# Purchase amount split by discount flag, one violin per data source.
fig = px.violin(
    df_combined,
    x='Discount Applied',
    y='Purchase Amount (USD)',
    color='Source',
    box=True,
    title=" Purchase Amount based on Discount Applied",
    color_discrete_sequence=['#1f77b4', '#ff7f0e'],
    template='plotly_white',
)
# update_layout returns the figure, so leaving it as the last expression
# is what renders the chart in the notebook.
fig.update_layout(
    xaxis_title="Discount Applied",
    yaxis_title="Purchase Amount (USD)",
    violinmode='group',
)


### 2.20] Payment Method by Gender

In [65]:

df_real = pd.read_csv('shopping_trends_updated.csv').assign(Source='Real Data')
df_synthetic = pd.read_csv('synthetic_shopping_3900_rows.csv').assign(Source='Synthetic Data')
df_combined = pd.concat([df_real, df_synthetic], ignore_index=True)

# Row count for every (payment method, gender, source) combination.
grouping_keys = ['Payment Method', 'Gender', 'Source']
df_grouped = df_combined.groupby(grouping_keys).size().reset_index(name='Count')

fig = px.bar(
    df_grouped,
    x='Payment Method',
    y='Count',
    color='Gender',
    facet_col='Source',
    barmode='group',
    title="Payment Method by Gender ",
    color_discrete_sequence=px.colors.qualitative.D3,
    template='plotly_white',
)
# update_layout returns the figure, so leaving it as the last expression
# is what renders the chart in the notebook.
fig.update_layout(
    xaxis_title="Payment Method",
    yaxis_title="Count",
)


## 3] Model Training
Models trained: Random Forest, XGBoost, and LightGBM, on a combined dataset of 20,000 synthetic samples and the 3,900 real samples.


In [66]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, top_k_accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")


# ---------------------------------------------------------------------------
# Category classification: Random Forest vs. XGBoost vs. LightGBM
#
# Trains three classifiers that predict the (label-encoded) product Category
# from customer/purchase attributes, using the 20,000-row synthetic dataset
# stacked on top of the real dataset, and reports accuracy, top-k accuracy and
# a per-class classification report for each model.
# ---------------------------------------------------------------------------

# ---- Load -----------------------------------------------------------------
synthetic_data = pd.read_csv("synthetic_shopping_20000_rows.csv")
real_data = pd.read_csv("shopping_trends_updated.csv")

numeric_cols = ['Age', 'Review Rating', 'Previous Purchases', 'Purchase Amount (USD)']

categorical_cols = ['Gender', 'Location', 'Size', 'Color', 'Season',
                    'Subscription Status', 'Shipping Type', 'Discount Applied',
                    'Promo Code Used', 'Payment Method', 'Frequency of Purchases']

target_cols = ['Category', 'Item Purchased']
all_cols = categorical_cols + target_cols

# ---- Clean ----------------------------------------------------------------
# Missing values are filled per column type. A frame-wide fillna(0) would put
# the integer 0 into string label columns, and LabelEncoder refuses to fit a
# column that mixes strings and numbers.
for frame in (synthetic_data, real_data):
    for col in numeric_cols:
        if col in frame.columns:
            # Unparseable entries become NaN and are then replaced by 0.
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0)
    for col in all_cols:
        if col in frame.columns:
            frame[col] = frame[col].fillna('Unknown').astype(str)

# ---- Encode ---------------------------------------------------------------
# Each encoder is fitted on the union of both datasets so that a label only
# present in one of them still gets a code.
encoders = {}
for col in all_cols:
    if col in synthetic_data.columns and col in real_data.columns:
        enc = LabelEncoder()
        enc.fit(pd.concat([synthetic_data[col], real_data[col]], ignore_index=True))
        synthetic_data[col] = enc.transform(synthetic_data[col])
        real_data[col] = enc.transform(real_data[col])
        encoders[col] = enc

# ---- Assemble features / target -------------------------------------------
combined_data = pd.concat([synthetic_data, real_data], ignore_index=True)
features = numeric_cols + categorical_cols
# Explicit copy: X is modified below and must not be a view/slice of
# combined_data (avoids pandas' chained-assignment warning).
X = combined_data[features].copy()
y_category = combined_data['Category']

# NOTE(review): the scaler is fitted on all rows before the split, so test rows
# contribute to the scaling statistics. Move the fit after the split if these
# features are ever fed to a scale-sensitive model.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X_train, X_test, y_train, y_test = train_test_split(
    X, y_category, test_size=0.2, random_state=42, stratify=y_category
)

# ---- Train ----------------------------------------------------------------
rf_model = RandomForestClassifier(n_estimators=500, max_depth=25, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# `use_label_encoder` is a deprecated XGBoost argument and is no longer passed;
# the target is already integer-encoded above.
xgb_model = xgb.XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.1,
                              eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

lgb_model = lgb.LGBMClassifier(n_estimators=500, max_depth=15, learning_rate=0.1, random_state=42)
lgb_model.fit(X_train, y_train)
y_pred_lgb = lgb_model.predict(X_test)

# ---- Evaluate -------------------------------------------------------------
top_k = 3
results_summary = []

models = {
    "Random Forest": (rf_model, y_pred_rf),
    "XGBoost": (xgb_model, y_pred_xgb),
    "LightGBM": (lgb_model, y_pred_lgb)
}

classification_reports = {}

for name, (model, y_pred) in models.items():
    acc = accuracy_score(y_test, y_pred)
    # The columns of predict_proba follow model.classes_; passing that order
    # explicitly keeps the score correct even if a class is absent from y_test.
    topk = top_k_accuracy_score(
        y_test, model.predict_proba(X_test), k=top_k, labels=model.classes_
    )
    results_summary.append({
        "Model": name,
        "Accuracy": round(acc, 4),
        f"Top-{top_k} Accuracy": round(topk, 4)
    })

    report_dict = classification_report(y_test, y_pred, output_dict=True)
    classification_reports[name] = pd.DataFrame(report_dict).transpose().round(4)

# ---- Report ---------------------------------------------------------------
df_summary = pd.DataFrame(results_summary)
print("Model Summary")
display(df_summary)

for model_name, df_report in classification_reports.items():
    print(f"\n {model_name} Classification Report ")
    display(df_report)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 19120, number of used features: 15
[LightGBM] [Info] Start training from score -1.218195
[LightGBM] [Info] Start training from score -0.881470
[LightGBM] [Info] Start training from score -1.712941
[LightGBM] [Info] Start training from score -2.209750
Model Summary


Unnamed: 0,Model,Accuracy,Top-3 Accuracy
0,Random Forest,0.4387,0.9004
1,XGBoost,0.4487,0.905
2,LightGBM,0.4579,0.9059



 Random Forest Classification Report 


Unnamed: 0,precision,recall,f1-score,support
0,0.448,0.268,0.3354,1414.0
1,0.4408,0.8298,0.5758,1980.0
2,0.375,0.0731,0.1223,862.0
3,0.3077,0.0229,0.0426,524.0
accuracy,0.4387,0.4387,0.4387,0.4387
macro avg,0.3929,0.2985,0.269,4780.0
weighted avg,0.4165,0.4387,0.3645,4780.0



 XGBoost Classification Report 


Unnamed: 0,precision,recall,f1-score,support
0,0.4459,0.4371,0.4414,1414.0
1,0.4807,0.653,0.5537,1980.0
2,0.3359,0.203,0.2531,862.0
3,0.3224,0.1126,0.1669,524.0
accuracy,0.4487,0.4487,0.4487,0.4487
macro avg,0.3962,0.3514,0.3538,4780.0
weighted avg,0.4269,0.4487,0.4239,4780.0



 LightGBM Classification Report 


Unnamed: 0,precision,recall,f1-score,support
0,0.4597,0.4399,0.4496,1414.0
1,0.4892,0.6641,0.5634,1980.0
2,0.3551,0.2146,0.2675,862.0
3,0.3073,0.1279,0.1806,524.0
accuracy,0.4579,0.4579,0.4579,0.4579
macro avg,0.4028,0.3616,0.3653,4780.0
weighted avg,0.4364,0.4579,0.4344,4780.0


### Visuals of Trained Models


In [67]:
import pandas as pd
import plotly.express as px


# Bar chart comparing plain accuracy and top-k accuracy, one facet per model.
# `top_k` must match the value used when `df_summary` was built, because it is
# part of the metric column name.
top_k = 3
metric_cols = ['Accuracy', f'Top-{top_k} Accuracy']

# Long format: one row per (model, metric) pair.
df_summary_melted = df_summary.melt(
    id_vars='Model',
    value_vars=metric_cols,
    var_name='Metric',
    value_name='Score',
)

fig_bar = px.bar(
    df_summary_melted,
    x='Metric',
    y='Score',
    color='Metric',
    barmode='group',
    facet_col='Model',
    text=df_summary_melted['Score'].map('{:.4f}'.format),
    title=f'Model Performance Summary (Accuracy & Top-{top_k} Accuracy)'
)

fig_bar.update_layout(
    yaxis_title="Score",
    showlegend=True
)

# A faceted figure has one x/y axis pair per facet. Layout keys such as
# `xaxis_title` only address the first facet, so axis settings that should
# hold for every model are applied through update_xaxes / update_yaxes.
fig_bar.update_xaxes(title_text="")
fig_bar.update_yaxes(range=[0, 1.05])  # head-room for the labels drawn above the bars

fig_bar.update_traces(textposition='outside')
fig_bar.show()

In [68]:
import pandas as pd
import plotly.express as px


# Pie chart of how many test samples fall into each encoded category.
# The "support" column is the same for every model, so the Random Forest
# report is used as the source.
df_rf_report = classification_reports["Random Forest"]

# Keep only the per-class rows; the report also carries aggregate rows.
aggregate_rows = ['accuracy', 'macro avg', 'weighted avg', 'support']
class_rows = df_rf_report.index.difference(aggregate_rows)

df_support = df_rf_report.loc[class_rows, ['support']]
df_support = df_support.reset_index()
df_support.columns = ['Category_Encoded', 'Support']

fig_pie = px.pie(
    df_support,
    names='Category_Encoded',
    values='Support',
    hover_data=['Support'],
    labels={'Category_Encoded': 'Category (Encoded)'},
    title='Distribution of Test Set Samples by Encoded Category (Support)',
)

# Slightly detach every slice and outline it in black.
slice_outline = dict(line=dict(color='#000000', width=1))
fig_pie.update_traces(
    pull=[0.03] * len(df_support),
    marker=slice_outline,
    textinfo='percent+label',
)

fig_pie.show()

### Top 15 Feature Importances for Category Prediction

In [69]:
# Random Forest feature importances for Category prediction.
#
# NOTE(review): this cell re-fits a Random Forest on `df_real` / `df_synthetic`
# as left behind by the plotting cells (the 3,900-row synthetic file), not on
# the 20,000-row synthetic file used by the training cell. It also rebinds
# `X`, `y_category`, `scaler`, `X_train`, `y_train` and `rf_model`. Confirm that
# plotting importances for this smaller re-fit is intended.

# Define features and prepare data
numeric_cols = ['Age', 'Review Rating', 'Previous Purchases', 'Purchase Amount (USD)']
categorical_cols = ['Gender', 'Location', 'Size', 'Color', 'Season',
                    'Subscription Status', 'Shipping Type', 'Discount Applied',
                    'Promo Code Used', 'Payment Method', 'Frequency of Purchases']
features = numeric_cols + categorical_cols
target_cols = ['Category', 'Item Purchased']

# Work on copies so the frames used by the plotting cells stay untouched.
temp_real = df_real.copy()
temp_synthetic = df_synthetic.copy()

# Coerce numeric columns; unparseable entries become NaN and are replaced by 0.
for col in numeric_cols:
    temp_real[col] = pd.to_numeric(temp_real[col], errors='coerce').fillna(0)
    temp_synthetic[col] = pd.to_numeric(temp_synthetic[col], errors='coerce').fillna(0)

# Label-encode categorical and target columns; each encoder is fitted on the
# union of both datasets so both share the same codes.
for col in categorical_cols + target_cols:
    enc = LabelEncoder()
    enc.fit(pd.concat([temp_real[col], temp_synthetic[col]], ignore_index=True))
    temp_real[col] = enc.transform(temp_real[col])
    temp_synthetic[col] = enc.transform(temp_synthetic[col])

temp_combined = pd.concat([temp_real, temp_synthetic], ignore_index=True)
# Explicit copy: X is modified below and must not be a slice of temp_combined
# (avoids pandas' chained-assignment warning).
X = temp_combined[features].copy()
y_category = temp_combined['Category']

# Scaling
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Only the training part is needed here. The held-out part gets named
# throwaway variables instead of `_`, which IPython reserves for the last
# cell output.
X_train, _X_unused, y_train, _y_unused = train_test_split(
    X, y_category, test_size=0.2, random_state=42, stratify=y_category
)

# Random Forest Classifier (re-fit)
rf_model = RandomForestClassifier(n_estimators=500, max_depth=25, random_state=42, class_weight='balanced')
rf_model.fit(X_train, y_train)

# Importances sorted from most to least important.
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

fig_imp = px.bar(
    feature_importances.head(15),
    x='Importance',
    y='Feature',
    orientation='h',
    title='Top 15 Feature Importances for Category Prediction (Random Forest)',
    color='Importance',
    color_continuous_scale=px.colors.sequential.Agsunset,
    template='plotly_white'
)

# Put the most important feature at the top; the returned figure is the cell's
# last expression and is therefore rendered.
fig_imp.update_yaxes(autorange="reversed")

### 3.1] Two-Stage Item Recommendation using LightGBM

In [70]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score
import lightgbm as lgb
import warnings
import joblib
warnings.filterwarnings("ignore")


# ---------------------------------------------------------------------------
# Two-stage recommender
#   stage 1: one LightGBM model predicts the product Category
#   stage 2: one LightGBM model per Category predicts the Item Purchased
# Both stages use the same feature columns (`features`) in the same order.
# ---------------------------------------------------------------------------

# ---- Load -----------------------------------------------------------------
synthetic_data = pd.read_csv("synthetic_shopping_20000_rows.csv")
real_data = pd.read_csv("shopping_trends_updated.csv")

numeric_cols = ['Age', 'Review Rating', 'Previous Purchases', 'Purchase Amount (USD)']
categorical_cols = ['Gender', 'Location', 'Size', 'Color', 'Season',
                    'Subscription Status', 'Shipping Type', 'Discount Applied',
                    'Promo Code Used', 'Payment Method', 'Frequency of Purchases']
target_cols = ['Category', 'Item Purchased']
all_cols = categorical_cols + target_cols
features = numeric_cols + categorical_cols

# ---- Clean ----------------------------------------------------------------
# Missing values are filled per column type. A frame-wide fillna(0) would put
# the integer 0 into string label columns, and LabelEncoder refuses to fit a
# column that mixes strings and numbers.
# The loop variable is called `frame` so that a notebook-level `df` is not
# overwritten as a side effect.
for frame in (synthetic_data, real_data):
    for col in numeric_cols:
        if col in frame.columns:
            frame[col] = pd.to_numeric(frame[col], errors='coerce').fillna(0)
    for col in all_cols:
        frame[col] = frame[col].fillna('Unknown').astype(str)

# ---- Encode ---------------------------------------------------------------
# Each encoder is fitted on the union of both datasets; the fitted encoders
# are kept because they are needed to decode predictions later.
encoders = {}
for col in all_cols:
    enc = LabelEncoder()
    enc.fit(pd.concat([synthetic_data[col], real_data[col]], ignore_index=True))
    for frame in (synthetic_data, real_data):
        frame[col] = enc.transform(frame[col])
    encoders[col] = enc

combined_data = pd.concat([synthetic_data, real_data], ignore_index=True)

# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so test rows contribute to the scaling statistics.
scaler = StandardScaler()
combined_data[numeric_cols] = scaler.fit_transform(combined_data[numeric_cols])

# ---- Gender filter --------------------------------------------------------
# NOTE(review): this mapping differs from the one embedded in app.py (there
# 'T-shirt' is also listed for 'Male') — confirm which list is intended.
allowed_items_per_gender = {
    'Male': ['Shirt','Pants','Shoes','Jacket','Sweater','Hoodie','Shorts','Socks',
             'Belt','Hat','Gloves','Scarf','Backpack','Sneakers','Coat','Boots',
             'Sunglasses','Jeans','Watch'],
    'Female': ['Blouse','Dress','Skirt','Heels','Sandals','Shirt','Pants','Shoes',
               'Jacket','Sweater','Hoodie','Shorts','Socks','Belt','Hat','Gloves',
               'Scarf','Backpack','Sneakers','Coat','Boots','Handbag','Jewelry',
               'Sunglasses','Watch','T-shirt']
}

def gender_filter(gender_enc, pred_items_enc):
    """Decode predicted item codes and keep only items allowed for a gender.

    Args:
        gender_enc: label-encoded gender (a code of ``encoders['Gender']``).
        pred_items_enc: sequence of label-encoded items (codes of
            ``encoders['Item Purchased']``), in ranking order.

    Returns:
        List of item names, in the input order, that appear in
        ``allowed_items_per_gender`` for the decoded gender. An unknown gender
        yields an empty list.
    """
    gender = encoders['Gender'].inverse_transform([gender_enc])[0]
    allowed = allowed_items_per_gender.get(gender, [])
    items = encoders['Item Purchased'].inverse_transform(pred_items_enc)
    return [item for item in items if item in allowed]

# ---- Stage 1: category model ----------------------------------------------
X_cat = combined_data[features]
y_cat = combined_data['Category']

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_cat, y_cat, test_size=0.2, random_state=42, stratify=y_cat
)

lgb_cat_model = lgb.LGBMClassifier(n_estimators=500, max_depth=15, learning_rate=0.1, verbose=-1)
lgb_cat_model.fit(X_train_cat, y_train_cat)

y_pred_cat = lgb_cat_model.predict(X_test_cat)
cat_acc = accuracy_score(y_test_cat, y_pred_cat)
# The columns of predict_proba follow lgb_cat_model.classes_, so that is the
# label order handed to the metric.
topk_cat = top_k_accuracy_score(
    y_test_cat,
    lgb_cat_model.predict_proba(X_test_cat),
    k=3,
    labels=lgb_cat_model.classes_
)

print("\nCategory Model Summary : ")
print(pd.DataFrame({
    'Model': ['Category Model'],
    'Accuracy': [cat_acc],
    'Top-3 Accuracy': [topk_cat]
}).to_string(index=False))

# ---- Stage 2: one item model per category ----------------------------------
unique_categories = combined_data['Category'].unique()
item_models = {}   # encoded category -> fitted item classifier
item_results = []

# Kept for reference: codes of every known item across all categories.
all_item_labels = np.arange(len(encoders['Item Purchased'].classes_))

for cat in unique_categories:
    cat_df = combined_data[combined_data['Category'] == cat]
    X_item = cat_df[features]
    y_item = cat_df['Item Purchased']

    # Stratification needs at least two rows per item; fall back to a plain
    # random split when a category contains a singleton item.
    stratify_on = y_item if y_item.value_counts().min() >= 2 else None
    X_train_item, X_test_item, y_train_item, y_test_item = train_test_split(
        X_item, y_item, test_size=0.2, random_state=42, stratify=stratify_on
    )

    lgb_item = lgb.LGBMClassifier(n_estimators=500, max_depth=10, learning_rate=0.1, verbose=-1)
    lgb_item.fit(X_train_item, y_train_item)

    y_pred_item = lgb_item.predict(X_test_item)
    acc = accuracy_score(y_test_item, y_pred_item)
    # An item model only knows the items seen in its own category, so its
    # probability columns follow lgb_item.classes_ — not the global item list.
    # Passing the global list fails as soon as a category lacks any item.
    topk_item = top_k_accuracy_score(
        y_test_item,
        lgb_item.predict_proba(X_test_item),
        k=3,
        labels=lgb_item.classes_
    )

    item_results.append({
        'Category': encoders['Category'].inverse_transform([cat])[0],
        'Item Accuracy': acc,
        'Top-3 Accuracy': topk_item
    })

    item_models[cat] = lgb_item

print("\nItem Models Summary : ")
print(pd.DataFrame(item_results).to_string(index=False))


Category Model Summary : 
         Model  Accuracy  Top-3 Accuracy
Category Model   0.45795        0.905858

Item Models Summary : 
   Category  Item Accuracy  Top-3 Accuracy
   Clothing       0.161111        0.379798
   Footwear       0.306265        0.643852
Accessories       0.183876        0.456153
  Outerwear       0.483810        0.819048


### 3.2] Saving the trained LightGBM models

In [71]:
# Persist everything the Streamlit app loads at start-up: the category model,
# the per-category item models, the label encoders and the numeric scaler.
artifacts = {
    "lgb_category_model.pkl": lgb_cat_model,
    "lgb_item_models.pkl": item_models,
    "label_encoders.pkl": encoders,
    "scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("\nAll models saved successfully!")


All models saved successfully!


## 4] Deployment Using Streamlit

### 4.1] Installing Libraries





In [72]:
!pip install -q streamlit pyngrok plotly


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/10.1 MB[0m [31m35.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m7.9/10.1 MB[0m [31m110.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.1/10.1 MB[0m [31m120.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/6.9 MB[0m [31m206.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
[?25

### 4.2] Creating UI Using Streamlit

In [73]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
import plotly.express as px

# PAGE CONFIG
# Kept ahead of everything else so that no other Streamlit call (including the
# cached loaders below) runs before the page is configured.
st.set_page_config(page_title="AI-Powered Product Recommendation Dashboard", page_icon="☄️", layout="wide")


# LOAD MODELS AND DATA
# Streamlit re-executes this script on every widget interaction; the loaders
# are cached so the pickles and the CSV are read from disk only once.
@st.cache_resource
def load_artifacts():
    """Load the category model, item models, label encoders and scaler.

    Returns:
        Tuple ``(category_model, item_models, encoders, scaler)`` as written
        by the notebook's saving cell.

    Note:
        joblib files are pickles; only load files produced by this project.
    """
    return (
        joblib.load("lgb_category_model.pkl"),
        joblib.load("lgb_item_models.pkl"),
        joblib.load("label_encoders.pkl"),
        joblib.load("scaler.pkl"),
    )


@st.cache_data
def load_reference_data():
    """Read the real shopping dataset used for the sidebar options and charts."""
    return pd.read_csv("shopping_trends_updated.csv")


lgb_cat_model, lgb_item_models, encoders, scaler = load_artifacts()

# Only the real dataset is loaded here (no synthetic rows).
data = load_reference_data()

# Custom CSS injected into the page: colours, fonts, buttons, separators.
CUSTOM_CSS = """
<style>
[data-testid="stAppViewContainer"] {
    background-color: #ffffff;
    color: #03045e;
    font-family: 'Helvetica', sans-serif;
}
[data-testid="stSidebar"] {
    background-color: #caf0f8;
    border-radius: 10px;
    color: #03045e;
}
h1, h2, h3, h4, h5, h6 {
    color: #023e8a !important;
    font-weight: 600;
}
p, label {
    color: #03045e !important;
    font-size: 16px;
}
div.stButton > button {
    background-color: #0077b6;
    color: white;
    border-radius: 8px;
    border: none;
    padding: 8px 20px;
    font-weight: 500;
    transition: 0.3s;
}
div.stButton > button:hover {
    background-color: #00b4d8;
    color: #03045e;
}
hr {
    border: none;
    border-top: 2px solid #00b4d8;
    margin: 25px 0;
}
.block-container {
    border-radius: 12px;
    padding: 20px;
}
.js-plotly-plot .gtitle {
    text-anchor: start !important;
}
</style>
"""

# Centered title and tagline shown at the top of the page.
HEADER_HTML = """
<h2 style="text-align:center; color:#023e8a;">AI-Powered Product Recommendation Dashboard</h2>
<p style="text-align:center; color:#0077b6;">
Synthetic customer profiling for accurate, data-driven shopping recommendations.
</p>
"""

st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
st.markdown(HEADER_HTML, unsafe_allow_html=True)
st.markdown("<hr>", unsafe_allow_html=True)

# SIDEBAR — the customer attributes the user can set; widgets appear in the
# order they are created.
sidebar = st.sidebar
sidebar.header("Enter Your Details")

location_options = sorted(data["Location"].unique())

age = sidebar.slider("Age", 15, 70, 25)
gender = sidebar.selectbox("Gender", ["Male", "Female"])
season = sidebar.selectbox("Season", ["Winter", "Spring", "Summer", "Fall"])
prev_purchases = sidebar.slider("Previous Purchases", 0, 50, 3)
location = sidebar.selectbox("Location", location_options)

if st.sidebar.button("Get Recommendations"):
    # Feature columns in the exact order used to train the models
    # (numeric columns first, then categorical). LightGBM reads DataFrame
    # columns by position, so the input row must follow this order.
    numeric_cols = ['Age', 'Review Rating', 'Previous Purchases', 'Purchase Amount (USD)']
    categorical_cols = ['Gender', 'Location', 'Size', 'Color', 'Season',
                        'Subscription Status', 'Shipping Type', 'Discount Applied',
                        'Promo Code Used', 'Payment Method', 'Frequency of Purchases']
    feature_order = numeric_cols + categorical_cols

    # PREPARE USER INPUT
    # Attributes not asked for in the sidebar are filled with fixed defaults.
    user_df = pd.DataFrame({
        "Age": [age],
        "Gender": [gender],
        "Season": [season],
        "Previous Purchases": [prev_purchases],
        "Location": [location],
        "Review Rating": [4.0],
        "Purchase Amount (USD)": [50],
        "Size": ["M"],
        "Color": ["Blue"],
        "Subscription Status": ["Yes"],
        "Shipping Type": ["Standard"],
        "Discount Applied": ["No"],
        "Promo Code Used": ["No"],
        "Payment Method": ["Credit Card"],
        "Frequency of Purchases": ["Monthly"]
    })

    # ENCODE + SCALE
    for col in encoders:
        if col in user_df.columns:
            user_df[col] = encoders[col].transform(user_df[col])

    user_df[numeric_cols] = scaler.transform(user_df[numeric_cols])

    # Align the columns with the training layout before predicting; without
    # this the models would receive e.g. the encoded gender in the slot of
    # the review rating.
    user_df = user_df[feature_order]

    # CATEGORY PREDICTION
    cat_probs = lgb_cat_model.predict_proba(user_df)[0]
    top3_cat_pos = np.argsort(cat_probs)[-3:][::-1]
    # Probability columns follow model.classes_; translate positions into the
    # encoded labels before decoding them to names.
    top3_cat_codes = lgb_cat_model.classes_[top3_cat_pos]
    top3_categories = encoders['Category'].inverse_transform(top3_cat_codes)

    st.markdown("<h3 style='text-align:center; color:#023e8a;'>Top Predicted Product Categories</h3>", unsafe_allow_html=True)
    st.caption("These categories are ranked based on model confidence for your profile.")
    for rank, (cat, pos) in enumerate(zip(top3_categories, top3_cat_pos), 1):
        st.markdown(f"<p style='color:#03045e;'><strong>{rank}. {cat}</strong> — Confidence Score: <code>{cat_probs[pos]:.2f}</code></p>", unsafe_allow_html=True)

    st.markdown("<hr>", unsafe_allow_html=True)

    # GENDER FILTER DICTIONARY
    allowed_items_per_gender = {
        'Male': ['Shirt','Pants','Shoes','Jacket','Sweater','Hoodie','Shorts','Socks',
                 'Belt','Hat','Gloves','Scarf','Backpack','Sneakers','Coat','Boots',
                 'Sunglasses','Jeans','Watch','T-shirt'],
        'Female': ['Blouse','Dress','Skirt','Heels','Sandals','Shirt','Pants','Shoes',
                   'Jacket','Sweater','Hoodie','Shorts','Socks','Belt','Hat','Gloves',
                   'Scarf','Backpack','Sneakers','Coat','Boots','Handbag','Jewelry',
                   'Sunglasses','Watch','T-shirt']
    }
    allowed_for_user = allowed_items_per_gender[gender]

    # ITEM PREDICTION (GENDER FILTERED)
    st.markdown("<h3 style='text-align:center; color:#023e8a;'>Recommended Items Within Each Category</h3>", unsafe_allow_html=True)
    st.caption("Based on your predicted categories, here are the top items you’re most likely to purchase (gender-filtered).")

    for cat in top3_categories:
        # Item models are keyed by the encoded category.
        item_model = lgb_item_models[encoders['Category'].transform([cat])[0]]
        item_probs = item_model.predict_proba(user_df)[0]
        # Rank the items this model knows from most to least likely, mapping
        # probability-column positions to item codes via model.classes_.
        ranked_codes = item_model.classes_[np.argsort(item_probs)[::-1]]
        ranked_items = encoders['Item Purchased'].inverse_transform(ranked_codes)

        # Apply gender filter and keep at most three items.
        top3_items = [item for item in ranked_items if item in allowed_for_user][:3]

        if top3_items:
            st.markdown(f"<p style='color:#03045e;'><strong>{cat} — </strong> {', '.join(top3_items)}</p>", unsafe_allow_html=True)
        else:
            st.markdown(f"<p style='color:#03045e;'><strong>{cat} — </strong>No gender-appropriate items available.</p>", unsafe_allow_html=True)

    st.markdown("<hr>", unsafe_allow_html=True)

    # Layout settings shared by all three charts below.
    chart_theme = dict(
        title_x=0.0,
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
        font_color="#03045e",
    )
    # Rows of the real dataset that belong to the recommended categories.
    top_cat_rows = data[data['Category'].isin(top3_categories)]

    # Average Spending
    avg_prices = (
        top_cat_rows
        .groupby('Category')['Purchase Amount (USD)']
        .mean()
        .reset_index()
    )
    fig1 = px.bar(
        avg_prices, x='Category', y='Purchase Amount (USD)',
        color='Category', text_auto=".2f",
        color_discrete_sequence=["#0077b6", "#0096c7", "#48cae4"],
        title="Average Spending per Category"
    )
    fig1.update_layout(**chart_theme, yaxis_title="Average Purchase Amount (USD)")
    st.plotly_chart(fig1, use_container_width=True)
    st.caption("This chart shows how much customers typically spend in each of your top recommended categories.")

    st.markdown("<hr>", unsafe_allow_html=True)

    # Seasonal Popularity
    seasonal_pop = (
        top_cat_rows
        .groupby(['Season', 'Category'])
        .size()
        .reset_index(name='Count')
    )
    fig2 = px.bar(
        seasonal_pop, x='Season', y='Count', color='Category',
        barmode='group', title="Seasonal Popularity Trends",
        color_discrete_sequence=["#0077b6", "#0096c7", "#48cae4"]
    )
    fig2.update_layout(**chart_theme, yaxis_title="Number of Purchases")
    st.plotly_chart(fig2, use_container_width=True)
    st.caption("This chart highlights how product demand changes across different seasons for your top categories.")

    # Top 5 Location By Average Purchase
    st.markdown("<hr>", unsafe_allow_html=True)
    st.markdown("<h3 style='color:#023e8a;'>Top 5 Locations by Average Purchase</h3>", unsafe_allow_html=True)

    top_locations = (
        data.groupby('Location')['Purchase Amount (USD)']
        .mean().nlargest(5).reset_index()
    )

    fig3 = px.bar(
        top_locations,
        x='Purchase Amount (USD)',
        y='Location',
        orientation='h',
        color='Location',
        color_discrete_sequence=["#0077b6", "#0096c7", "#00b4d8", "#48cae4", "#023e8a"],
        title="Top 5 Locations by Average Purchase"
    )

    fig3.update_layout(
        **chart_theme,
        xaxis_title="Average Purchase Amount (USD)",
        yaxis_title=None,
        showlegend=False
    )

    st.plotly_chart(fig3, use_container_width=True)
    st.caption("This chart shows the top 5 locations where customers spend the most on average.")

else:
    st.markdown("<p style='text-align:center; color:#0077b6;'>Fill in your details on the left and click <strong>Get Recommendations</strong> to view results.</p>", unsafe_allow_html=True)


Writing app.py


### 4.3] Launching ngrok

In [None]:
from pyngrok import ngrok

NGROK_AUTH_TOKEN = "33fBeYzcWpPnErKaVDXIZrOfX5q_3i9Cjbm9JLbRuh5jSrZmb"

ngrok.set_auth_token(NGROK_AUTH_TOKEN)
public_url = ngrok.connect(8501)
print("Public URL:", public_url)

!streamlit run app.py --server.port 8501



Public URL: NgrokTunnel: "https://squiffy-shirley-cosmetically.ngrok-free.dev" -> "http://localhost:8501"

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.187.133.251:8501[0m
[0m


### 4.4] Killing active tunnels

In [None]:
#from pyngrok import ngrok
#ngrok.kill()
