<a href="https://colab.research.google.com/github/cbonnin88/EDA_Projects/blob/main/E_commerce_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import polars as pl
import plotly.express as px
import numpy as np
import random
from datetime import datetime, timedelta

In [None]:
random.seed(42)
np.random.seed(42)

# **Generating Customers Data**

In [None]:
num_customers = 1000
customer_ids = [f'C{i:03d}' for i in range(1,num_customers + 1)]
countries = ['France','Denmark','Ireland','Luxembourg']
segments = ['Consumer','Corporate','Home Office']


customer_df = pl.DataFrame({
    'customer_id': customer_ids,
    'countries': np.random.choice(countries, num_customers),
    'segment': np.random.choice(segments, num_customers),
    'signup_date': [
        (datetime(2023,1,1) + timedelta(days=random.randint(0,365))).strftime('%Y-%m-%d')
        for _ in range(num_customers)
    ]
})

# **Generate Products Data**

In [None]:
num_products = 20
product_ids = [f'P{i:03d}' for i in range(1,num_products + 1)]
categories = ['Electronics', 'Furnitures','Clothing']

products_df = pl.DataFrame({
    'product_id': product_ids,
    'category': np.random.choice(categories, num_products),
    'price': np.round(np.random.uniform(10,500, num_products),2),
    'cost': np.round(np.random.uniform(5,300,num_products),2)
})

# **Generate Orders Data**

In [None]:
num_orders = 500
order_ids = [f'0{i:04d}' for i in range(1, num_orders + 1)]

orders_df = pl.DataFrame({
    'order_id': order_ids,
    'customer_id': np.random.choice(customer_ids, num_orders),
    'product_id': np.random.choice(product_ids, num_orders),
    'order_date': [
        (datetime(2023,1,1) + timedelta(days=random.randint(0,365))).strftime('%Y-%m-%d')
        for _ in range(num_orders)
    ],
    'quantity': np.random.randint(1,5, num_orders)
})

# **Save to CSV**

In [None]:
customer_df.write_csv('customers.csv')
products_df.write_csv('products.csv')
orders_df.write_csv('orders.csv')

print('Files created: customers.csv, products.csv, orders.csv')

Files created: customers.csv, products.csv, orders.csv


# **Polars - EDA**

In [None]:
df_cust = pl.read_csv('customers.csv')
df_prod = pl.read_csv('products.csv')
df_ord = pl.read_csv('orders.csv')

In [None]:
df_cust.head()

customer_id,countries,segment,signup_date
str,str,str,str
"""C001""","""Ireland""","""Corporate""","""2023-11-24"""
"""C002""","""Luxembourg""","""Home Office""","""2023-02-27"""
"""C003""","""France""","""Consumer""","""2023-01-13"""
"""C004""","""Ireland""","""Consumer""","""2023-05-21"""
"""C005""","""Ireland""","""Consumer""","""2023-05-06"""


In [None]:
df_prod.head()

product_id,category,price,cost
str,str,f64,f64
"""P001""","""Electronics""",88.6,29.81
"""P002""","""Electronics""",173.66,101.3
"""P003""","""Electronics""",126.96,170.15
"""P004""","""Furnitures""",465.74,39.17
"""P005""","""Electronics""",61.86,116.13


In [None]:
df_prod.head()

product_id,category,price,cost
str,str,f64,f64
"""P001""","""Electronics""",88.6,29.81
"""P002""","""Electronics""",173.66,101.3
"""P003""","""Electronics""",126.96,170.15
"""P004""","""Furnitures""",465.74,39.17
"""P005""","""Electronics""",61.86,116.13


# **Joining Tables to create a Master Table**

In [None]:
df_master = (
    df_ord
    .join(df_cust, on='customer_id',how='left')
    .join(df_prod, on='product_id',how='left')
)

In [None]:
df_master = df_master.with_columns(
    (pl.col('quantity')* pl.col('price')).alias('total_sales')
)

In [None]:
display(df_master.head())

order_id,customer_id,product_id,order_date,quantity,countries,segment,signup_date,category,price,cost,total_sales
i64,str,str,str,i64,str,str,str,str,f64,f64,f64
1,"""C524""","""P011""","""2023-08-06""",3,"""Ireland""","""Home Office""","""2023-10-03""","""Furnitures""",440.75,132.51,1322.25
2,"""C918""","""P010""","""2023-02-28""",1,"""Luxembourg""","""Corporate""","""2023-11-23""","""Clothing""",238.08,10.44,238.08
3,"""C401""","""P018""","""2023-10-06""",4,"""Luxembourg""","""Corporate""","""2023-10-11""","""Electronics""",380.75,232.4,1523.0
4,"""C906""","""P018""","""2023-04-26""",2,"""Denmark""","""Corporate""","""2023-06-04""","""Electronics""",380.75,232.4,761.5
5,"""C886""","""P017""","""2023-11-28""",3,"""France""","""Corporate""","""2023-04-09""","""Clothing""",282.53,56.26,847.59


In [None]:
display(df_master.null_count())

order_id,customer_id,product_id,order_date,quantity,countries,segment,signup_date,category,price,cost,total_sales
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0


# **Category Performance Analysis**

In [None]:
cat_stats = (
    df_master
    .group_by('category')
    .agg([
        pl.col('total_sales').sum().alias('revenue'),
        pl.col('quantity').mean().alias('avg_qty')
    ])
    .sort('revenue',descending=True)
)
display(cat_stats)

category,revenue,avg_qty
str,f64,f64
"""Electronics""",121139.37,2.456522
"""Furnitures""",117712.63,2.5078125
"""Clothing""",79400.51,2.605634


In [None]:
fig_cpa = px.bar(
    cat_stats.to_pandas(),
    x='category',
    y='revenue',
    title='Total Revenue by Product Category',
    text_auto='.2s',
    color='category',
    color_discrete_sequence=px.colors.qualitative.Pastel

)

fig_cpa.show()

# **Monthly Sales Trend (Time Series)**

In [None]:
monthly_sales = (
    df_master
    .with_columns(
        pl.col('order_date').str.to_date('%Y-%m-%d').dt.truncate('1mo').alias('month')
    )
    .group_by('month')
    .agg(
        pl.col('total_sales').sum().alias('revenue')
    )
    .sort('month')
)

display(monthly_sales)

month,revenue
date,f64
2023-01-01,23465.44
2023-02-01,20445.44
2023-03-01,32055.91
2023-04-01,16592.44
2023-05-01,24038.33
…,…
2023-08-01,32614.81
2023-09-01,26176.86
2023-10-01,34360.65
2023-11-01,30407.79


In [None]:
fig_ms = px.line(
    monthly_sales.to_pandas(),
    x='month',
    y='revenue',
    title='Monthly Revenue Trend',
    markers=True
)

fig_ms.update_layout(xaxis_title='Month',yaxis_title='Revenue (€)')
fig_ms.show()

# **Customer Segmentation (Scatter Plot)**

In [None]:
customer_stats = (
    df_master
    .group_by(['customer_id','segment'])
    .agg([
        pl.col('order_id').n_unique().alias('order_count'),
        pl.col('total_sales').sum().alias('total_spend')
    ])
)

display(customer_stats)

customer_id,segment,order_count,total_spend
str,str,u32,f64
"""C955""","""Consumer""",1,1443.12
"""C936""","""Corporate""",1,694.64
"""C285""","""Consumer""",2,529.71
"""C849""","""Home Office""",1,25.04
"""C243""","""Home Office""",1,88.6
…,…,…,…
"""C401""","""Corporate""",2,2043.98
"""C617""","""Corporate""",1,418.56
"""C455""","""Corporate""",1,50.08
"""C335""","""Home Office""",1,244.06


In [None]:
fig_cs = px.scatter(
    customer_stats.to_pandas(),
    x='order_count',
    y='total_spend',
    color='segment',
    title='Customer Value: Frequency vs Spend',
    hover_data = ['customer_id'],
    size='total_spend'
)

fig_cs.show()