In [180]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set the default template to 'plotly_dark' for all Plotly figures.
# Done once here so the individual figures below do not each need a `template=` argument.
import plotly.io as pio
pio.templates.default = 'plotly_dark'

In [181]:
# Load the clustered customer table with the `id` column as the row index,
# then preview the first rows.
df = pd.read_csv('../data/clean/clustered.csv', index_col='id')
df.head()

Unnamed: 0_level_0,age,gender,income,spending_score,membership_years,purchase_frequency,preferred_category,last_purchase_amount,age_range,income_level,spending_score_category,purchase_frequency_category,membership_score,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,38,Female,99342,90,3,24,Groceries,113.53,"(30, 40]",Medium,Premium,Medium,Bronze,1
2,21,Female,78852,60,2,42,Sports,41.93,"(20, 30]",Medium,High,High,Bronze,1
3,60,Female,126573,30,2,28,Clothing,424.36,"(50, 60]",High,Medium,Medium,Bronze,1
4,40,Other,47099,74,9,5,Home & Garden,991.93,"(30, 40]",Low,High,Low,Gold,2
5,65,Female,140621,21,3,25,Electronics,347.08,"(60, 70]",High,Low,Medium,Bronze,1


In [None]:
# Store the cluster label as a pandas categorical: it is a discrete label, not a quantity.
df = df.astype({'cluster': 'category'})

In [183]:
# Names of the object-dtype columns; these are the dimensions profiled per cluster below.
df.select_dtypes(include='object').columns

Index(['gender', 'preferred_category', 'age_range', 'income_level',
       'spending_score_category', 'purchase_frequency_category',
       'membership_score'],
      dtype='object')

In [184]:
# For every object-dtype column, draw a stacked bar chart showing what percentage
# of each cluster falls into each value of that column.
dimensions = df.select_dtypes('object').columns

for dimension in dimensions:

    # observed=True: `cluster` is categorical, so without it pandas emits a
    # FutureWarning and adds zero-count rows for cluster/value pairs that never occur.
    data = (
        df.groupby(['cluster', dimension], observed=True)
        .size()
        .reset_index(name='count')
    )
    data['group_total'] = data.groupby('cluster', observed=True)['count'].transform('sum')
    data['pct_of_group_total'] = (data['count'] / data['group_total']) * 100

    # Same cast the chart always used for its x values (categorical -> plain objects).
    data['cluster'] = data['cluster'].astype("object")

    fig = px.bar(
        data,
        x='cluster',
        y='pct_of_group_total',
        color=dimension,
        title=f"Share of {dimension} within each cluster",
        labels={'cluster': 'Cluster', 'pct_of_group_total': '% of cluster'},
    )
    fig.show()











































In [185]:
# Per-cluster totals of last purchase amount and purchase frequency, one bar chart each.
value_cols = ['last_purchase_amount', 'purchase_frequency']

# observed=True avoids the pandas FutureWarning raised when grouping on a categorical key.
data = df.groupby('cluster', observed=True)[value_cols].sum()

for col in value_cols:
    fig = px.bar(
        data,
        x=data.index,
        y=col,
        title=f"Total {col} by cluster",
        labels={col: f"Total {col}"},
    )
    fig.show()





In [186]:
# Per-cluster averages of last purchase amount and purchase frequency, one bar chart each.
value_cols = ['last_purchase_amount', 'purchase_frequency']

# observed=True avoids the pandas FutureWarning raised when grouping on a categorical key.
data = df.groupby('cluster', observed=True)[value_cols].mean()

for col in value_cols:
    fig = px.bar(
        data,
        x=data.index,
        y=col,
        title=f"Mean {col} by cluster",
        labels={col: f"Mean {col}"},
    )
    fig.show()





In [190]:
# Show the dtype of every column in `df`.
df.dtypes

age                               int64
gender                           object
income                            int64
spending_score                    int64
membership_years                  int64
purchase_frequency                int64
preferred_category               object
last_purchase_amount            float64
age_range                        object
income_level                     object
spending_score_category          object
purchase_frequency_category      object
membership_score                 object
cluster                        category
dtype: object

In [188]:
# One bubble per cluster: mean last purchase amount (x) against mean purchase
# frequency (y), with bubble size equal to the number of rows in the cluster.
#
# The group size is taken with 'size' under its own name (`customers`) rather than by
# counting the grouping key itself. Aggregating `cluster` produced a column with the
# same name as the index, which makes reset_index() fail and mislabels the bubble size.
# observed=True avoids the pandas FutureWarning for categorical grouping keys.
data = (
    df.groupby('cluster', observed=True)
    .agg(
        last_purchase_amount=('last_purchase_amount', 'mean'),
        purchase_frequency=('purchase_frequency', 'mean'),
        customers=('last_purchase_amount', 'size'),
    )
    .round(0)
    .reset_index()
)
fig = px.scatter(
    data,
    x='last_purchase_amount',
    y='purchase_frequency',
    size='customers',
    color='cluster',
    size_max=100,
    title='Mean purchase frequency vs. mean last purchase amount by cluster',
)
fig.show()





In [196]:
# For each dimension, plot one bubble per (cluster, dimension value): mean last purchase
# amount (x) against mean purchase frequency (y), sized by the number of customers.
df_reset = df.reset_index()  # expose `id` as a column so it can be counted
dims = ['age_range', 'gender', 'income_level', 'spending_score_category', 'membership_score', 'preferred_category']
x_axis = 'last_purchase_amount'
y_axis = 'purchase_frequency'
g1 = 'cluster'
for dim in dims:
    # observed=True keeps only pairs that occur in the data; otherwise the categorical
    # `cluster` key adds NaN-mean / zero-count rows for unobserved pairs (and pandas warns).
    data = (
        df_reset.groupby([g1, dim], observed=True)
        .agg({x_axis: 'mean', y_axis: 'mean', 'id': 'count'})
        .round(0)
        .rename(columns={'id': 'customers'})  # the count of ids is a customer count
        .reset_index()
    )
    fig = px.scatter(
        data,
        x=x_axis,
        y=y_axis,
        size='customers',
        color=g1,
        size_max=60,
        hover_data=[dim],
        title=f"Mean {y_axis} vs. mean {x_axis} by {g1} and {dim}",
    )
    fig.show()

























In [203]:
# Sunburst of the customer base: `cluster` is the first level of the hierarchy and
# `membership_score` the second. A title is set so the figure can be read on its own.
fig = px.sunburst(
    df,
    path=['cluster', 'membership_score'],
    title='Membership score breakdown within each cluster',
)
fig.show()







In [None]:
# Mean spending score per preferred category, drawn as one horizontal-bar panel per cluster.
# observed=True keeps only (cluster, category) pairs present in the data; without it the
# categorical `cluster` key yields NaN rows for unobserved pairs and pandas emits a FutureWarning.
data = (
    df.groupby(['cluster', 'preferred_category'], observed=True)['spending_score']
    .mean()
    .reset_index()
    .round(0)
    .sort_values(by=['cluster', 'spending_score'], ascending=[True, True])
)

clusters = data['cluster'].unique()

fig = make_subplots(
    rows=len(clusters),
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05
)

# The wide left margin leaves room for the "Cluster: N" annotation beside each panel.
fig.update_layout(
    margin=dict(l=250),
    title_text='Mean spending score by preferred category, per cluster',
)

for i, cluster in enumerate(clusters):

    cat_df = data[data['cluster'] == cluster]

    fig.add_trace(go.Bar(
        x=cat_df['spending_score'],
        y=cat_df['preferred_category'],
        orientation='h',
        name=f"Cluster: {cluster}",
        text=cat_df['spending_score'],
        textfont=dict(color='white')
    ),
    row=i+1,
    col=1)

    # Label the panel to the left of its y-axis, vertically anchored on the middle category.
    fig.add_annotation(
        xref='paper',
        yref='y' + str(i + 1),
        xanchor='right',
        x=-0.12,
        y=cat_df['preferred_category'].iloc[len(cat_df) // 2],
        text=f"Cluster: {cluster}",
        showarrow=False,
        font=dict(size=12)
    )

# The y-axis styling is the same for every panel, so apply it once to all y-axes
# instead of repeating the call for each row inside the loop.
fig.update_yaxes(
    showline=True,
    linecolor='lightgrey',
    linewidth=1,
    ticklabelposition='outside',
    ticklen=7,
    tickcolor='white'
)

# The x-axis is shared between panels; title it on the bottom panel only.
fig.update_xaxes(title_text='Mean spending score', row=len(clusters), col=1)

fig.show()





In [270]:
# Share of each preferred category within every cluster (as a rounded percentage),
# drawn as one horizontal-bar panel per cluster.
# observed=True keeps only (cluster, category) pairs present in the data and avoids the
# pandas FutureWarning raised when grouping on the categorical `cluster` key.
data = (
    df.groupby(['cluster', 'preferred_category'], observed=True)
    .size()
    .reset_index(name='count')
    .sort_values(by=['cluster', 'count'], ascending=[True, True])
)
data['group_total'] = data.groupby('cluster', observed=True)['count'].transform('sum')
data['pct_of_group_total'] = ((data['count'] / data['group_total']) * 100).round(0)
display(data)

clusters = data['cluster'].unique()

fig = make_subplots(
    rows=len(clusters),
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.05
)

# The wide left margin leaves room for the "Cluster: N" annotation beside each panel.
fig.update_layout(
    margin=dict(l=250),
    title_text='Preferred category share within each cluster',
)

for i, cluster in enumerate(clusters):

    cat_df = data[data['cluster'] == cluster]

    fig.add_trace(go.Bar(
        x=cat_df['pct_of_group_total'],
        y=cat_df['preferred_category'],
        orientation='h',
        name=f"Cluster: {cluster}",
        text=cat_df['pct_of_group_total'],
        textfont=dict(color='white')
    ),
    row=i+1,
    col=1)

    # Label the panel to the left of its y-axis, vertically anchored on the middle category.
    fig.add_annotation(
        xref='paper',
        yref='y' + str(i + 1),
        xanchor='right',
        x=-0.12,
        y=cat_df['preferred_category'].iloc[len(cat_df) // 2],
        text=f"Cluster: {cluster}",
        showarrow=False,
        font=dict(size=12)
    )

# The y-axis styling is the same for every panel, so apply it once to all y-axes
# instead of repeating the call for each row inside the loop.
fig.update_yaxes(
    showline=True,
    linecolor='lightgrey',
    linewidth=1,
    ticklabelposition='outside',
    ticklen=7,
    tickcolor='white'
)

# The x-axis is shared between panels; title it on the bottom panel only.
fig.update_xaxes(title_text='% of cluster', row=len(clusters), col=1)

fig.show()







Unnamed: 0,cluster,preferred_category,count,group_total,pct_of_group_total
0,0,Clothing,70,431,16.0
2,0,Groceries,82,431,19.0
4,0,Sports,91,431,21.0
1,0,Electronics,93,431,22.0
3,0,Home & Garden,95,431,22.0
8,1,Home & Garden,58,318,18.0
5,1,Clothing,64,318,20.0
7,1,Groceries,64,318,20.0
9,1,Sports,64,318,20.0
6,1,Electronics,68,318,21.0
