**Official Example from OpenAI:**

[Clusring.ipynb](https://github.com/openai/openai-cookbook/blob/main/examples/Clustering.ipynb)

In [1]:
'''Import libraries'''
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import json


# Initial setup

In [3]:
'''Load the data'''
df = pd.read_csv("data.csv", encoding='ISO-8859-1')

# Basic cleaning
df = df.dropna(subset=['CustomerID', 'Description'])
df = df[df['Quantity'] > 0]
df['Amount'] = df['Quantity'] * df['UnitPrice']
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype(str)


# Generate Text Embeddings for Product Descriptions

In [5]:
# Get unique product descriptions
unique_descriptions = df['Description'].dropna().unique()
print(f"{len(unique_descriptions)} unique descriptions found.")


3877 unique descriptions found.


## Load OpenAI and define embedding function

In [10]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()

client = OpenAI()

EMBED_MODEL = "text-embedding-ada-002"

def get_embedding(text, model=EMBED_MODEL):
    try:
        response = client.embeddings.create(input=[text], model=model)
        return response.data[0].embedding
    except Exception as e:
        print(f"Error embedding '{text[:30]}...': {e}")
        return None


## Generate saved embeddings for product descriptions

In [13]:
# Optional: Load saved embeddings if available
embedding_path = "data/desc_embeddings.json"

if os.path.exists(embedding_path):
    with open(embedding_path, "r") as f:
        desc_embeddings = json.load(f)
else:
    desc_embeddings = {}

# Generate embeddings for new descriptions only
for desc in tqdm(unique_descriptions):
    if desc not in desc_embeddings:
        embedding = get_embedding(desc)
        if embedding:
            desc_embeddings[desc] = embedding

# Save embeddings for future use
with open(embedding_path, "w") as f:
    json.dump(desc_embeddings, f)


# Aggregate Embeddings to the User Level

In [25]:
# Replace product description with its embedding vector
df['desc_vector'] = df['Description'].map(desc_embeddings)

print("Check the first few rows of product description vectors:")
print(df[['Description', 'desc_vector']].head())

# Convert desc_vector -> np.ndarray
df['desc_vector'] = df['desc_vector'].apply(lambda x: np.array(x) if isinstance(x, list) else x)

# Average product vectors per user (representing interest)
user_vectors = {}
for cid, group in df.groupby('CustomerID'):
    vectors = [v for v in group['desc_vector'] if isinstance(v, np.ndarray)]
    if vectors:
        user_vectors[cid] = np.mean(vectors, axis=0)


Check the first few rows of product description vectors:
                           Description  \
0   WHITE HANGING HEART T-LIGHT HOLDER   
1                  WHITE METAL LANTERN   
2       CREAM CUPID HEARTS COAT HANGER   
3  KNITTED UNION FLAG HOT WATER BOTTLE   
4       RED WOOLLY HOTTIE WHITE HEART.   

                                         desc_vector  
0  [-0.024560807272791862, -0.010772868990898132,...  
1  [0.003209017449989915, -0.023925375193357468, ...  
2  [-0.035232964903116226, -0.0016704994486644864...  
3  [-0.021004119887948036, -0.015705782920122147,...  
4  [-0.04536698758602142, -0.020229782909154892, ...  


In [27]:
'''Convert to dataframe'''
user_df = pd.DataFrame.from_dict(user_vectors, orient='index')

# Rename columns for clarity
user_df.columns = [f'emb_{i}' for i in range(user_df.shape[1])]
user_df['CustomerID'] = user_df.index


# Cluster Users Based on Interest Embeddings

In [29]:
from sklearn.cluster import KMeans

# Use only the embedding columns for clustering
X = user_df[[col for col in user_df.columns if col.startswith("emb_")]]

# Fit KMeans with 5 clusters (you can tune this)
kmeans = KMeans(n_clusters=5, random_state=42)
user_df['InterestCluster'] = kmeans.fit_predict(X)


In [30]:
user_df[['CustomerID', 'InterestCluster']].head()


Unnamed: 0,CustomerID,InterestCluster
12346.0,12346.0,4
12347.0,12347.0,3
12348.0,12348.0,3
12349.0,12349.0,4
12350.0,12350.0,2


# Build RFM Table

In [32]:
NOW = df['InvoiceDate'].max() + pd.Timedelta(days=1)


In [33]:
'''Calculate Recency, Frequency, and Monetary values'''
rfm = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (NOW - x.max()).days,   # Recency
    'InvoiceNo': 'nunique',                          # Frequency
    'Amount': 'sum'                                  # Monetary
}).rename(columns={
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'Amount': 'Monetary'
}).reset_index()


In [34]:
'''Add RFM scoring and user type'''
rfm['R_score'] = pd.qcut(rfm['Recency'], 5, labels=[5,4,3,2,1]).astype(int)
rfm['F_score'] = pd.qcut(rfm['Frequency'].rank(method='first'), 5, labels=[1,2,3,4,5]).astype(int)
rfm['M_score'] = pd.qcut(rfm['Monetary'], 5, labels=[1,2,3,4,5]).astype(int)

rfm['RFM_Score'] = rfm['R_score'] + rfm['F_score'] + rfm['M_score']

# Optional: classify user types
def classify_user(row):
    if row['RFM_Score'] >= 13:
        return 'High-Value'
    elif row['RFM_Score'] >= 9:
        return 'Potential'
    elif row['RFM_Score'] >= 5:
        return 'Regular'
    else:
        return 'Churn Risk'

rfm['UserType'] = rfm.apply(classify_user, axis=1)


#  Merge RFM Profiles with Interest Clusters

In [35]:
# Merge on CustomerID to enrich RFM table with interest clusters
result = pd.merge(rfm, user_df[['CustomerID', 'InterestCluster']], on='CustomerID', how='left')

# Preview combined segmentation
result.head()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R_score,F_score,M_score,RFM_Score,UserType,InterestCluster
0,12346.0,326,1,77183.6,1,1,5,7,Regular,4
1,12347.0,2,7,4310.0,5,5,5,15,High-Value,3
2,12348.0,75,4,1797.24,2,4,4,10,Potential,3
3,12349.0,19,1,1757.55,4,1,4,9,Potential,4
4,12350.0,310,1,334.4,1,1,2,4,Churn Risk,2


# Visualize Interest Clusters in 3D

In [38]:
import plotly.graph_objs as go
import plotly.offline as py

trace = go.Scatter3d(
    x=result['Recency'],
    y=result['Frequency'],
    z=result['Monetary'],
    mode='markers',
    marker=dict(
        color=result['InterestCluster'],
        size=6,
        line=dict(width=1),
        opacity=0.8
    )
)

layout = go.Layout(
    scene=dict(
        xaxis=dict(title='Recency'),
        yaxis=dict(title='Frequency'),
        zaxis=dict(title='Monetary')
    ),
    margin=dict(l=0, r=0, b=0, t=0)
)

fig = go.Figure(data=[trace], layout=layout)
py.plot(fig)


'temp-plot.html'