In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

## Feature Engineering Functions

In [2]:
# remove consecutive duplicates from list
def remove_consec_duplicates(raw_lst):
  previous_value = None
  new_lst = []

  for elem in raw_lst:
    if elem != previous_value:
        new_lst.append(elem)
        previous_value = elem
        
  return new_lst

# generate sequential products
def generate_sequential_products(data):
  import pandas as pd
  from sklearn.preprocessing import LabelEncoder

  data_modified = data.copy()
  # Label encode the product names
  encoder = LabelEncoder()
  data_modified['product_id'] = encoder.fit_transform(data_modified['product_name'])

  # Remove sessions where only a single product is viewed
  data_modified = data_modified.groupby("session_id").filter(lambda x: len(x) > 1)
  # Group product view sequences by session id
  data_modified = data_modified.groupby("session_id")["product_id"].apply(list)
  # Remove consecutive duplicate product views from the sequences genereated in the previous step
  data_modified = data_modified.apply(remove_consec_duplicates)

  #Convert series to data frame and reset index
  data_modified = data_modified.to_frame().reset_index().rename(columns={"product_id": "chronological_product_sequence"})

  return data_modified, encoder

# create product embeddings using word2vec
def create_product_embeddings(data_modified):
  import gensim
  from gensim.models import Word2Vec
  import pandas as pd

  # Convert the product sequences to list of lists
  session_based_product_sequences = data_modified.copy()
  
  # Create Gensim CBOW model
  session_product_sequences = session_based_product_sequences['chronological_product_sequence'].apply(list)
  word2vec_model = gensim.models.Word2Vec(session_product_sequences, min_count = 1, vector_size = 10, window = 5)
  
  # numpy.ndarrays of product vectors
  product_vectors = word2vec_model.wv.vectors

  # productID_list = word2vec_model.wv.vocab.keys()
  productID_list = word2vec_model.wv.index_to_key
  vector_list = word2vec_model.wv.vectors.tolist()
  data_tuples = list(zip(productID_list,vector_list))
  product_ids_and_vectors = pd.DataFrame(data_tuples, columns=['Product_ID','Vectors'])

  return product_ids_and_vectors

## Plotting Functions

In [3]:
# Plotting the cluster distribution
def plot_cluster_distribution(kmeans_model):
  import matplotlib.pyplot as plt
  import numpy as np

  plt.figure(figsize=(10, 6))
  plt.hist(kmeans_model.labels_, bins=np.arange(kmeans_model.n_clusters + 1) - 0.5, rwidth=0.7, color='skyblue', edgecolor='black')
  plt.xticks(np.arange(kmeans_model.n_clusters))
  plt.ylabel("Number of Products", fontsize=14)
  plt.xlabel("Cluster No", fontsize=14)
  plt.title("Cluster Distribution", fontsize=18, fontweight='bold')
  plt.grid(axis='y', linestyle='--', linewidth=0.7)
  plt.tight_layout()
  plt.savefig('cluster_distribution.png')

  # Hide the plot
  plt.close()

# Plotting the cluster scatter
def plot_cluster_scatter(product_vectors):
  import matplotlib.pyplot as plt
  from sklearn.decomposition import PCA
  from sklearn.cluster import KMeans
  import numpy as np

  pca = PCA(n_components=2)
  two_dimensions_vectors = pca.fit_transform(product_vectors)

  kmeans_model = KMeans(n_clusters=10, random_state=0).fit(two_dimensions_vectors)
  labels = kmeans_model.labels_

  # Getting the Centroids
  centroids = kmeans_model.cluster_centers_
  unique_labels = np.unique(labels)

  plt.figure(figsize=(12, 8))
  
  # Plotting the results
  for label in unique_labels:
      plt.scatter(two_dimensions_vectors[labels == label, 0], 
                  two_dimensions_vectors[labels == label, 1], 
                  label=f'Cluster {label}', s=50, alpha=0.6)

  plt.scatter(centroids[:, 0], centroids[:, 1], s=200, color='black', marker='X', label='Centroids')

  plt.xlabel('PCA Component 1', fontsize=14)
  plt.ylabel('PCA Component 2', fontsize=14)
  plt.title("Cluster Scatter", fontsize=18, fontweight='bold')
  plt.legend(loc='best', fontsize=12)
  plt.grid(True, linestyle='--', linewidth=0.7)
  plt.tight_layout()
  plt.savefig('cluster_scatter.png')

  # Hide the plot
  plt.close()

## Model Building Functions

In [4]:
# Clustering model
def fit_kmeans(product_ids_and_vectors):
  import gensim
  from gensim.models import Word2Vec
  from sklearn.cluster import KMeans
  import matplotlib.pyplot as plt
  import numpy as np
  
  product_ids_and_vectors_train = product_ids_and_vectors.copy()

  # Get product vectors from Word2Vec
  array_product_vectors = np.array(product_ids_and_vectors_train["Vectors"].values.tolist())

  # Fit K-Means algorithm on those embeddings
  kmeans_model = KMeans(n_clusters=10, random_state=0).fit(array_product_vectors)

  # Cluster Distribution Plot
  plot_cluster_distribution(kmeans_model)

  # Cluster Scatter Plot
  plot_cluster_scatter(array_product_vectors)
  
  return kmeans_model

# Final product clusters
def save_final_product_clusters(clustering_model, product_ids_and_vectors):
  import numpy as np

  model = clustering_model
  
  product_ids_and_vectors_modified = product_ids_and_vectors.copy()
  array_product_vectors = np.array(product_ids_and_vectors_modified["Vectors"].values.tolist())

  assigned_cluster_no = model.fit_predict(array_product_vectors).tolist()

  product_ids_and_vectors_modified["Cluster_No"] = assigned_cluster_no
  cluster_members_df = product_ids_and_vectors_modified[["Product_ID","Cluster_No"]].groupby("Cluster_No")['Product_ID'].apply(list).to_frame().reset_index().rename(columns={'Product_ID': 'Cluster_Member_List'})


  return cluster_members_df

## Execution

In [5]:
# Import the data
data = pd.read_csv('data/session_data.csv')

# Display the first few rows of the data
data.head()

Unnamed: 0,session_id,product_name
0,000ed966131fcb96e0efc4ff2b716a3e,beetroot
1,000ed966131fcb96e0efc4ff2b716a3e,cucumber
2,0013eab657eaf2d82d7f1e13023d95c2,onion
3,0013eab657eaf2d82d7f1e13023d95c2,long shelf life milk
4,0013fabde1e543dd541be925266aadbc,dates


In [6]:
# Generate sequential products
data, encoder = generate_sequential_products(data)

# Create product embeddings
product_ids_and_vectors = create_product_embeddings(data)

# Fit KMeans clustering model
clustering_model = fit_kmeans(product_ids_and_vectors)

# Save final product clusters
final_product_clusters = save_final_product_clusters(clustering_model, product_ids_and_vectors)

## Example

In [7]:
# Product ID to generate recommendations
product_id = 260 # notebook
# Get product name from product ID
product_name = encoder.inverse_transform([product_id])[0]

# Get Vector (Embedding) array of the given product
vector_array = np.array(product_ids_and_vectors[product_ids_and_vectors["Product_ID"]==product_id]["Vectors"].tolist())

# Get cluster number for the given product assigned by the model
cluster_no = clustering_model.predict(vector_array)[0]

# Get members list of the cluster that the given product is assigned to 
cluster_members_list = final_product_clusters[final_product_clusters['Cluster_No']==cluster_no]['Cluster_Member_List'].iloc[0]

# Randomly select 5 product recommendations from the cluster members excluding the given product
from random import sample
# cluster_members_list.remove(product_id)
five_product_recommendations = sample(cluster_members_list, 5)

# Map the recommended product IDs to product names in initial data
five_product_recommendations = encoder.inverse_transform(five_product_recommendations)


print("5 Similar Product Recommendations for {}: ".format(product_name),five_product_recommendations)

5 Similar Product Recommendations for notebook:  ['notebook' 'colours' 'sharpener' 'craft paper' 'pencil']


## Model Evaluation

In [9]:
# train-test split
from sklearn.model_selection import train_test_split

# Import the data
data = pd.read_csv('data/session_data.csv')

train_data, test_data = train_test_split(data, test_size=0.2, random_state=0)

# Generate sequential products
train_data, encoder = generate_sequential_products(train_data)
test_data, encoder = generate_sequential_products(test_data)

# Create product embeddings
train_product_ids_and_vectors = create_product_embeddings(train_data)
test_product_ids_and_vectors = create_product_embeddings(test_data)

# Fit KMeans clustering model
clustering_model = fit_kmeans(train_product_ids_and_vectors)

# Evaluate the model
from sklearn.metrics import silhouette_score

# Get product vectors from Word2Vec
array_train_product_vectors = np.array(train_product_ids_and_vectors["Vectors"].values.tolist())
array_test_product_vectors = np.array(test_product_ids_and_vectors["Vectors"].values.tolist())

# Get cluster number for the given product assigned by the model
train_assigned_cluster_no = clustering_model.predict(array_train_product_vectors)
test_assigned_cluster_no = clustering_model.predict(array_test_product_vectors)

# Calculate silhouette score
train_silhouette_score = silhouette_score(array_train_product_vectors, train_assigned_cluster_no)
test_silhouette_score = silhouette_score(array_test_product_vectors, test_assigned_cluster_no)

print("Train Silhouette Score: ", train_silhouette_score)
print("Test Silhouette Score: ", test_silhouette_score)

Train Silhouette Score:  0.21531029411878816
Test Silhouette Score:  0.3921875518440196
