In [5]:
# 📦 Customer Segmentation + Market Basket Analysis Pipeline

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from mlxtend.frequent_patterns import apriori, association_rules
from google.colab import drive
drive.mount('/content/drive')

# === Load & Clean Data ===
import pandas as pd

file_path = '/content/drive/My Drive/Online Retail.xlsx'
df = pd.read_excel(file_path)


# Remove cancelled transactions (InvoiceNo starts with 'C')
df = df[~df['InvoiceNo'].astype(str).str.startswith('C')]

# Drop rows with missing CustomerID
df = df.dropna(subset=['CustomerID'])

# Filter out non-positive quantities or prices
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]

# Convert InvoiceDate to datetime
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Create TotalPrice
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# === RFM Analysis ===
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (df['InvoiceDate'].max() - x.max()).days,
    'InvoiceNo': 'nunique',
    'TotalPrice': 'sum'
}).reset_index()

rfm.columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary']

# Scale the data
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm[['Recency', 'Frequency', 'Monetary']])

# K-Means Clustering
kmeans = KMeans(n_clusters=4, random_state=42)
rfm['Cluster'] = kmeans.fit_predict(rfm_scaled)

# === Market Basket Analysis ===
# Prepare basket format
df_basket = df[df['Country'] == 'United Kingdom']  # optional filter for one region
basket = df_basket.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().fillna(0)
basket = basket.applymap(lambda x: 1 if x > 0 else 0)

# Apply Apriori
frequent_itemsets = apriori(basket, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Output useful columns
rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Save results
rfm.to_csv("rfm_segments.csv", index=False)
rules.to_csv("market_basket_rules.csv", index=False)

print("✅ Segmentation and Market Basket Analysis complete!")


Mounted at /content/drive


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
  basket = basket.applymap(lambda x: 1 if x > 0 else 0)


✅ Segmentation and Market Basket Analysis complete!


In [6]:
from google.colab import files
files.download("rfm_segments.csv")
files.download("market_basket_rules.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>