In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import duckdb
from scipy.stats import ks_2samp

# 1. Sampling Process 

## 1.1 Sampling

In [None]:
# STATUS: this whole cell is commented out, so nothing here runs.
# When enabled it (1) opens an in-memory DuckDB connection `con`, (2) builds
# `sampled_customers` by keeping each (store_id, cust_id) pair with ~10%
# probability, (3) builds `sampled_transactions` with every transaction of the
# kept customers, and (4) exports that table to 'sampled_combined.parquet'.
# Later cells rely on these artefacts (`con`, `sampled_transactions`, and the
# exported parquet file).
# Note that `spending_segment` is computed but the WHERE clause samples
# uniformly; it does not stratify by segment.
# NOTE(review): RANDOM() is not seeded here, so re-enabling this cell would
# presumably draw a different sample on each run -- confirm and seed if
# reproducibility matters.

# # Connect with memory limit config
# con = duckdb.connect(database=':memory:')


# # Step 1: Segment customers and sample ~10%
# con.execute("""
#     CREATE TABLE sampled_customers AS
#     WITH customer_segments AS (
#       SELECT 
#         store_id,
#         cust_id,
#         SUM(sales_amt) AS total_sales,
#         CASE 
#           WHEN SUM(sales_amt) < 0 THEN 'negative_spender'
#           WHEN SUM(sales_amt) < 50 THEN 'low_spender'
#           WHEN SUM(sales_amt) < 100 THEN 'mid_spender'
#           WHEN SUM(sales_amt) < 1000 THEN 'high_spender'
#           ELSE 'vip'
#         END AS spending_segment
#       FROM read_parquet('combined.parquet')
#       GROUP BY store_id, cust_id
#     )
#     SELECT * 
#     FROM customer_segments 
#     WHERE RANDOM() <= 0.10
# """)

# # Step 2: Join transactions with the sampled customers
# con.execute("""
#     CREATE TABLE sampled_transactions AS
#     SELECT t.*
#     FROM read_parquet('combined.parquet') t
#     JOIN sampled_customers cs
#       ON t.store_id = cs.store_id AND t.cust_id = cs.cust_id
# """)

# con.execute("COPY sampled_transactions TO 'sampled_combined.parquet' (FORMAT 'parquet')")


In [None]:
# The sampling cell above is disabled, so on a fresh kernel neither `con` nor
# the in-memory `sampled_transactions` table exists. Read the sample back from
# the parquet file that the sampling step exported instead; this also works
# if the sampling cell is re-enabled, because it writes the same file.
con = duckdb.connect()
df_sample = con.execute(
    "SELECT * FROM read_parquet('sampled_combined.parquet')"
).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
# Show the first five sampled transactions (rich display as the cell's value).
df_sample.head(n=5)

## 1.2 Distribution KS Test

### 1.2.1 Sales Amount

In [None]:
# Open a fresh DuckDB connection for scanning the full dataset.
con = duckdb.connect()

# Only the sales_amt column of the full parquet file is pulled into pandas.
sales_amt_query = """
    SELECT sales_amt
    FROM 'combined.parquet'
"""
df_all_sales_amt = con.execute(sales_amt_query).fetchdf()


In [None]:
# Two-sample Kolmogorov-Smirnov test: full dataset vs. sample, per column.
columns_to_test = ['sales_amt']
for col in columns_to_test:
    # Drop NA values.
    # Select the column explicitly so ks_2samp receives a 1-D Series; the
    # previous code passed the whole one-column DataFrame (2-D) for the full
    # dataset, unlike the store/product tests.
    col_all = df_all_sales_amt[col].dropna()
    col_sample = df_sample[col].dropna()

    # Cap both sides at 100k observations (fixed seed) to keep the test cheap.
    sample_size = 100_000
    if len(col_all) > sample_size:
        col_all = col_all.sample(sample_size, random_state=42)
    if len(col_sample) > sample_size:
        col_sample = col_sample.sample(sample_size, random_state=42)

    ks_stat, p_val = ks_2samp(col_all, col_sample)
    print(f"Kolmogorov-Smirnov test for '{col}':")
    print(f"  KS statistic: {ks_stat:.6f}")
    print(f"  p-value:      {p_val:.6g}")
    print("---------------------------------------------------")


### 1.2.2 Customer

In [None]:
# Pull only the cust_id column of the full dataset into pandas.
cust_id_query = """
    SELECT cust_id
    FROM 'combined.parquet'
"""
df_all_cust = con.execute(cust_id_query).fetchdf()


In [None]:
# Two-sample Kolmogorov-Smirnov test on cust_id: full dataset vs. sample.
# NOTE(review): cust_id looks like an identifier; the test only compares how
# the ID values are distributed -- confirm this is the intended check.
cust_all = df_all_cust['cust_id'].dropna()
cust_sample = df_sample['cust_id'].dropna()

ks_stat, p_val = ks_2samp(cust_all, cust_sample)

print("Kolmogorov-Smirnov test for 'cust_id':")
print(f"  KS statistic: {ks_stat:.6f}")
# p_val was computed but never reported; print it like the other KS cells do.
print(f"  p-value:      {p_val:.6g}")
print("---------------------------------------------------")

### 1.2.3 Store

In [None]:
# Pull only the store_id column of the full dataset into pandas.
store_id_query = """
    SELECT store_id
    FROM 'combined.parquet'
"""
df_all_store = con.execute(store_id_query).fetchdf()


In [None]:

# Two-sample Kolmogorov-Smirnov test on store_id: full dataset vs. sample.
columns_to_test = ['store_id']

for col in columns_to_test:
    # Missing values are removed on both sides before testing.
    col_all = df_all_store[col].dropna()
    col_sample = df_sample[col].dropna()

    # Cap each side at 100k observations, drawn with a fixed seed.
    sample_size = 100_000
    col_all = (
        col_all.sample(sample_size, random_state=42)
        if len(col_all) > sample_size
        else col_all
    )
    col_sample = (
        col_sample.sample(sample_size, random_state=42)
        if len(col_sample) > sample_size
        else col_sample
    )

    ks_stat, p_val = ks_2samp(col_all, col_sample)
    print(
        f"Kolmogorov-Smirnov test for '{col}':",
        f"  KS statistic: {ks_stat:.6f}",
        f"  p-value:      {p_val:.6g}",
        "---------------------------------------------------",
        sep="\n",
    )


### 1.2.4 Product

In [None]:
# Re-open a DuckDB connection (this replaces the earlier `con`).
con = duckdb.connect()

# Pull only the prod_category column of the full dataset into pandas.
prod_category_query = """
    SELECT prod_category
    FROM 'combined.parquet'
"""
df_all_product = con.execute(prod_category_query).fetchdf()


In [None]:
# Two-sample Kolmogorov-Smirnov test on prod_category: full dataset vs. sample.
# NOTE(review): if prod_category holds category labels rather than numbers, a
# KS test may fail or be statistically meaningless -- confirm the dtype.
columns_to_test = ['prod_category']

for col in columns_to_test:
    # Missing values are removed on both sides before testing.
    col_all = df_all_product[col].dropna()
    col_sample = df_sample[col].dropna()

    # Cap each side at 100k observations, drawn with a fixed seed.
    sample_size = 100_000
    col_all = (
        col_all.sample(sample_size, random_state=42)
        if len(col_all) > sample_size
        else col_all
    )
    col_sample = (
        col_sample.sample(sample_size, random_state=42)
        if len(col_sample) > sample_size
        else col_sample
    )

    ks_stat, p_val = ks_2samp(col_all, col_sample)
    print(
        f"Kolmogorov-Smirnov test for '{col}':",
        f"  KS statistic: {ks_stat:.6f}",
        f"  p-value:      {p_val:.6g}",
        "---------------------------------------------------",
        sep="\n",
    )


# 2. EDA

## 2.1 Load Samples and Observe Dataset

In [None]:
# Read the exported sample back from disk and show its size and first rows.
df_sampled = pd.read_parquet(path="sampled_combined.parquet")
print("Sampled transactions shape:", df_sampled.shape)
print(df_sampled.head(n=3))

In [None]:
## Understand the data structure
# --- 1. Product frequencies in the sampled transactions ---
# value_counts() yields one count per prod_desc, most frequent first.
product_counts = df_sampled['prod_desc'].value_counts()

# Headline numbers
print("Total unique products in sampled data:", len(product_counts))
print("\nTop 10 most popular products:")
print(product_counts.head(10))

# Share of products that occur exactly once
single_purchase_count = product_counts.eq(1).sum()
print(
    f"\nProducts purchased only once: {single_purchase_count} "
    f"({100 * single_purchase_count / len(product_counts):.2f}%)"
)


# --- 2. Rank / frequency arrays for the long-tail plots ---
# The counts are already in descending order, so rank i is simply position i.
sorted_counts = product_counts.to_numpy()
ranks = np.arange(1, sorted_counts.size + 1)

# --- 3. Long tail on a linear scale ---
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ranks, sorted_counts, color='blue', lw=2)
ax.set_title("Product Frequency Long Tail (Linear Scale)")
ax.set_xlabel("Rank (Products sorted by frequency)")
ax.set_ylabel("Frequency (Number of Purchases)")
ax.grid(True)
plt.show()


# --- 4. Long tail on a log-log scale ---
fig, ax = plt.subplots(figsize=(10, 5))
ax.loglog(ranks, sorted_counts, color='blue', lw=2)
ax.set_title("Product Frequency Long Tail (Log-Log Scale)")
ax.set_xlabel("Rank (log scale)")
ax.set_ylabel("Frequency (log scale)")
ax.grid(True, which="both", ls="--")
plt.show()

# --- 5. Cumulative share of purchases by product rank ---
cumulative_sum = sorted_counts.cumsum()
total_purchases = cumulative_sum[-1]
cumulative_percentage = (cumulative_sum / total_purchases) * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(ranks, cumulative_percentage, marker='o', linestyle='-')
ax.set_xlabel("Rank of Product")
ax.set_ylabel("Cumulative Percentage of Total Purchases (%)")
ax.set_title("CDF of Product Purchases")
ax.grid(True)
plt.show()


# --- 6. 80/20 analysis ---
# From here on product_counts is re-bound: units sold (sales_qty) per prod_id,
# no longer row counts per prod_desc.
product_counts = df_sampled.groupby('prod_id')['sales_qty'].sum()

# Per-product share of all units, then the running total from the top seller down
total_sales = product_counts.sum()
percentage = (product_counts / total_sales) * 100
cumulative_percentage = percentage.sort_values(ascending=False).cumsum()
total_unique = len(cumulative_percentage)

# First position at which the running share reaches 80%
idx_80 = np.nonzero(cumulative_percentage.to_numpy() >= 80)[0][0]
num_products_80 = idx_80 + 1  # positions are zero-based
perc_products_needed = 100 * num_products_80 / total_unique

print(f"\nNumber of products to reach 80% of total purchases: {num_products_80}")
print(f"Percentage of total products needed to reach 80% of sales: {perc_products_needed:.2f}%")


The sampled dataset is highly skewed: a small share of products accounts for most of the purchases. We therefore remove the long-tail products.

## 2.2 Remove longtail


In [None]:
## Remove longtail products + consumers to reduce dimensionality
# === 1. Keep the products that make up the first 80% of units sold ===
# Units sold per product, best seller first
product_sales = (
    df_sampled
    .groupby('prod_id')['sales_qty']
    .sum()
    .sort_values(ascending=False)
)

# Running share of total units, expressed in percent
total_sales = product_sales.sum()
cumulative_sales = product_sales.cumsum()
cumulative_percentage = (cumulative_sales / total_sales) * 100

# First position where the running share reaches 80%; that product is kept too
threshold_index = np.nonzero(cumulative_percentage.to_numpy() >= 80)[0][0]
selected_products = product_sales.index[:threshold_index + 1]

print("Number of unique products before filtering:", df_sampled['prod_id'].nunique())
print("Number of products selected (top 80% cumulative sales):", len(selected_products))

# Rows for any other product are dropped
keep_product_rows = df_sampled['prod_id'].isin(selected_products)
df_filtered = df_sampled.loc[keep_product_rows]

# === 2. Keep customers with at least 3 rows in the product-filtered data ===
# Rows per customer after the product filter
user_transactions = df_filtered['cust_id'].value_counts()
active_users = user_transactions.loc[user_transactions.ge(3)].index

keep_customer_rows = df_filtered['cust_id'].isin(active_users)
df_filtered = df_filtered.loc[keep_customer_rows]

print("Number of unique customers before filtering:", df_sampled['cust_id'].nunique())
print("Number of customers after filtering:", df_filtered['cust_id'].nunique())

# Size of the final filtered frame
print("Filtered DataFrame shape:", df_filtered.shape)


In [None]:
# Persist the filtered transactions (without the index) for the next stage.
df_filtered.to_parquet(path="df_filtered.parquet", index=False)
print("Filtered dataset saved to df_filtered.parquet")

# 3. DATA PREPROCESSING

In [None]:
# Reload the filtered transactions saved at the end of the long-tail removal.
df_filtered = pd.read_parquet("df_filtered.parquet")
# Report the frame that was just loaded; the previous line printed
# df_sampled.shape under a "Sampled transactions" label (copy-paste slip).
print("Filtered transactions shape:", df_filtered.shape)
print(df_filtered.head(3))

In [None]:
import duckdb
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, save_npz

#########################################
# Part 2: Pre-Aggregation – Aggregate Purchases
#########################################

# Expose the filtered transactions to DuckDB under the view name "df_sample".
# Note: the view points at df_filtered, NOT at the Python variable df_sample.
con.register("df_sample", df_filtered)

# One row per (customer, product description) with the number of matching rows.
query_agg = """
SELECT 
  cust_id,
  prod_desc,
  COUNT(*) AS purchase_count
FROM df_sample
GROUP BY cust_id, prod_desc
"""

df_agg = con.execute(query_agg).fetchdf()
print("New Aggregated data shape:", df_agg.shape)
print(df_agg.head())

# Save aggregated data for later use
df_agg.to_parquet("aggregated_data_new.parquet", index=False)

#########################################
# Part 3: Building the Full Sparse User–Item Matrix
#########################################

# Instead of pivoting to a dense DataFrame, build a sparse matrix directly.
# Step 1: Use LabelEncoder to convert cust_id and prod_desc into numeric indices.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_indices = user_encoder.fit_transform(df_agg['cust_id'])
item_indices = item_encoder.fit_transform(df_agg['prod_desc'])

# Step 2: Use the purchase_count as the data/values for the sparse matrix.
values = df_agg['purchase_count'].astype(np.float32)

# Step 3: Build the CSR sparse matrix (rows = customers, columns = products).
user_item_sparse = csr_matrix(
    (values, (user_indices, item_indices)),
    shape=(len(user_encoder.classes_), len(item_encoder.classes_))
)

print("Sparse matrix shape:", user_item_sparse.shape)

# Save the sparse matrix to disk
save_npz("user_item_matrix_sparse_new.npz", user_item_sparse)

# Save the mappings so you can recover the original customer and product labels later.
# Row i of each file is the label that the encoder mapped to index i.
pd.DataFrame({'cust_id': user_encoder.classes_}).to_csv("user_id_mapping_new.csv", index=False)
pd.DataFrame({'prod_desc': item_encoder.classes_}).to_csv("item_id_mapping_new.csv", index=False)

#########################################
# Part 4: Extract Product Metadata
#########################################

# Extract distinct product metadata columns.
# Take them from df_filtered -- the same frame the aggregation and matrix were
# built from -- so the metadata covers exactly the products in the matrix.
# The previous code read the unfiltered Python variable df_sample, which also
# contains the long-tail products removed earlier and is not created by this
# stage of the notebook.
metadata_cols = ['prod_desc', 'prod_category', 'prod_subcategory', 'prod_mfc_brand_cd']
product_metadata_df = df_filtered[metadata_cols].drop_duplicates('prod_desc')
print("Product metadata shape:", product_metadata_df.shape)
print(product_metadata_df.head())

# Save product metadata for later use.
product_metadata_df.to_parquet("product_metadata_new.parquet", index=False)
product_metadata_df.to_csv("product_metadata_new.csv", index=False)

#########################################
# Final Print Summary
#########################################
print("\n=== Preprocessing Complete ===")
# The old first line announced a "df_sample_new" that this cell never creates.
print("1. Filtered data (df_filtered) registered with DuckDB as view 'df_sample'.")
print("2. NEW Aggregated data saved to aggregated_data_new.parquet.")
print("3. NEW Sparse user–item matrix saved to user_item_matrix_sparse_new.npz.")
print("   (Mappings saved in  user_id_mapping_new.csv and  item_id_mapping_new.csv)")
print("4. Product Metadata saved to product_metadata_new.parquet and product_metadata_new.csv.")


## Brand Tagging
This tags each row with boolean flags indicating whether its product belongs to the Budweiser (BUDW) or Molson (MLSON) brand.

In [None]:
def tag_brands(data):
    """Add Budweiser / Molson flag columns to ``data`` in place and return it.

    A row is flagged when either its product description or its manufacturer
    brand code matches the brand:

    * ``is_budweiser``: ``prod_desc`` contains "bud" (case-insensitive) or
      ``prod_mfc_brand_cd`` contains "BUDW" (case-sensitive).
    * ``is_molson``: ``prod_desc`` contains "mlson" or "molson"
      (case-insensitive) or ``prod_mfc_brand_cd`` contains "MLSON"
      (case-sensitive).

    Missing descriptions or brand codes count as "no match".

    Parameters
    ----------
    data : pandas.DataFrame
        Must have the string columns ``prod_desc`` and ``prod_mfc_brand_cd``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two boolean columns
        added (or overwritten).

    Raises
    ------
    KeyError
        If ``prod_desc`` or ``prod_mfc_brand_cd`` is not a column of ``data``.
    """
    # (flag column, description pattern, brand-code pattern); the patterns are
    # regular expressions, which is what makes "mlson|molson" an alternation.
    brand_rules = (
        ("is_budweiser", "bud", "BUDW"),
        ("is_molson", "mlson|molson", "MLSON"),
    )

    for flag_col, desc_pattern, code_pattern in brand_rules:
        desc_hit = data["prod_desc"].str.contains(desc_pattern, case=False, na=False)
        code_hit = data["prod_mfc_brand_cd"].str.contains(code_pattern, na=False)
        data[flag_col] = desc_hit | code_hit

    return data

In [None]:
# df_agg only has cust_id / prod_desc / purchase_count, but tag_brands also
# reads prod_mfc_brand_cd, so calling it directly raises KeyError. Bring the
# brand code in from the per-product metadata first (one row per prod_desc).
# The guard keeps the cell re-runnable once the column is present.
if "prod_mfc_brand_cd" not in df_agg.columns:
    df_agg = df_agg.merge(
        product_metadata_df[["prod_desc", "prod_mfc_brand_cd"]],
        on="prod_desc",
        how="left",
        validate="many_to_one",
    )
df_agg = tag_brands(df_agg)