## 2. Load Modules and Data

In [None]:
!pip install umap-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap        
import random
import os
import pickle
import logging


from datetime import datetime, timedelta
from tqdm import tqdm
from collections import Counter
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from PIL import Image

Configure logging and plotting styles.

In [None]:
# Log records are emitted at INFO level and above, prefixed with a timestamp and the level name.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Optional plotting / display settings, currently disabled:
# plt.style.use('seaborn-v0_8-<seaborn-whitegrid>')
# sns.set_palette('viridis')
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', 1000)

Load in data:

In [None]:
# Every competition file lives under the same Kaggle input directory.
DATA_DIR = '/kaggle/input/h-and-m-personalized-fashion-recommendations'

txs_path = f'{DATA_DIR}/transactions_train.csv'
cxs_path = f'{DATA_DIR}/customers.csv'
axs_path = f'{DATA_DIR}/articles.csv'
img_path = f'{DATA_DIR}/images'  # root directory of the product images

# article_id is read as text in both tables so the two frames share one key dtype;
# t_dat is parsed into datetimes while reading.
transactions_raw = pd.read_csv(txs_path, dtype={'article_id': str}, parse_dates=['t_dat'])
articles_raw = pd.read_csv(axs_path, dtype={'article_id': str})
customers_raw = pd.read_csv(cxs_path)

data = dict(
    transactions=transactions_raw,
    articles=articles_raw,
    customers=customers_raw,
)

In [None]:
# Short aliases for the three loaded tables.
transactions = data['transactions']
articles = data['articles']

# Replace every missing customer attribute with 0. fillna returns a new frame, so
# the raw table kept in `data` is untouched and re-running this cell is idempotent.
customers = data['customers'].fillna(0)

for table_label, table in (
    ("Transactions", transactions),
    ("Articles", articles),
    ("Customers", customers),
):
    print(f"{table_label} shape: {table.shape}")

## 3. Data Exploration

### 3.1 Cursory View

Explore transactions data:

In [None]:
# Shape, column names and first rows of the transactions table.
print(
    "Transactions dataset info:",
    f"Shape: {transactions.shape}",
    f"Columns: {transactions.columns.tolist()}",
    "\nSample transactions:",
    sep="\n",
)
transactions.head()

Explore articles of clothing:

In [None]:
# Shape, column names and first rows of the articles table.
print(
    "Articles dataset info:",
    f"Shape: {articles.shape}",
    f"Columns: {articles.columns.tolist()}",
    "\nSample articles:",
    sep="\n",
)
articles.head()

Explore customer data:

In [None]:
# Shape, column names and first rows of the customers table.
print(
    "Customers dataset info:",
    f"Shape: {customers.shape}",
    f"Columns: {customers.columns.tolist()}",
    "\nSample customers:",
    sep="\n",
)
customers.head()

### 3.2 Transactions Analysis

Take a look at purchases by day, week, and month. This is important for identifying trends in customer behavior, and may be useful for providing recommendations relevant to the season or the current week.

In [None]:
first_day = transactions['t_dat'].min()
last_day = transactions['t_dat'].max()
print(f"Date range: {first_day} to {last_day}")

# Calendar buckets, stored as extra columns on `transactions`.
transactions['week'] = transactions['t_dat'].dt.to_period('W')
transactions['month'] = transactions['t_dat'].dt.to_period('M')

# Number of transaction rows per day / week / month.
daily_purchases = transactions.groupby('t_dat').size()
weekly_purchases = transactions.groupby('week').size()
monthly_purchases = transactions.groupby('month').size()

# One stacked panel per granularity: daily, weekly, monthly.
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 10))

trend_panels = (
    (daily_purchases, ax1, 'Daily Purchase Volume', 'Day'),
    (weekly_purchases, ax2, 'Weekly Purchase Volume', 'Week'),
    (monthly_purchases, ax3, 'Monthly Purchase Volume', 'Month'),
)
for purchase_counts, panel, panel_title, x_label in trend_panels:
    purchase_counts.plot(ax=panel)
    panel.set_title(panel_title)
    panel.set_ylabel('Number of Purchases')
    panel.set_xlabel(x_label)
    panel.grid(True)

plt.tight_layout()
plt.show()

### 3.3 Customer Analysis

Look at customer purchase frequency on both linear and logarithmic scales so we can see the long tail of the distribution. Every non-outlier customer has fewer than 1250 purchases, which is evident on the log scale. Most customers are concentrated near zero purchases.

In [None]:
# Number of transaction rows per customer.
customer_purchase_counts = transactions.groupby('customer_id').size()

purchase_stats = (
    ("Average", customer_purchase_counts.mean()),
    ("Median", customer_purchase_counts.median()),
    ("Min", customer_purchase_counts.min()),
    ("Max", customer_purchase_counts.max()),
)
for stat_label, stat_value in purchase_stats:
    print(f"{stat_label} purchases per customer: {stat_value}")

# The same histogram twice: linear counts first, then log-scaled counts
# so the long tail becomes visible.
histogram_variants = (
    (False, 'Distribution of Purchases per Customer', 'Number of Customers'),
    (True, 'Distribution of Purchases per Customer (Log Scale)', 'Number of Customers (Log Scale)'),
)
for use_log_scale, hist_title, y_label in histogram_variants:
    plt.figure(figsize=(12, 6))
    customer_purchase_counts.hist(bins=50, log=use_log_scale)
    plt.title(hist_title)
    plt.xlabel('Number of Purchases')
    plt.ylabel(y_label)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

### 3.4 Product Analysis

Now, we can take a look at product popularity by item and by category. The most popular item is a pair of Jade HW Skinny Denim TRS, and the most popular category is trousers.

In [None]:
# Purchase count per article, most purchased first.
product_popularity = transactions['article_id'].value_counts()

print(f"Total unique products purchased: {len(product_popularity)}")
popularity_stats = (
    ("Average", product_popularity.mean()),
    ("Median", product_popularity.median()),
    ("Min", product_popularity.min()),
    ("Max", product_popularity.max()),
)
for stat_label, stat_value in popularity_stats:
    print(f"{stat_label} purchases per product: {stat_value}")

In [None]:
# Lookup series: article_id -> product name.
article_names = articles.set_index('article_id')['prod_name']

# Same counts as `product_popularity`, re-labelled with product names
# (an id with no matching article keeps the raw id as its label).
product_popularity_named = product_popularity.copy()
product_popularity_named.index = [
    article_names.get(article, article) for article in product_popularity.index
]

# value_counts sorts descending, so position 0 is the best seller.
most_popular_article_id = product_popularity.index[0]
most_popular_article_name = product_popularity_named.index[0]

top_products = product_popularity_named.head(50)

plt.figure(figsize=(12, 6))
top_products.plot(kind='bar')
plt.title('Top 50 Most Popular Products')
plt.xlabel('Product Name')
plt.ylabel('Purchase Count')
plt.grid(True, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Product images are looked up as <images root>/<first 3 characters of the id>/<id>.jpg.
# `img_path` (set in the data-loading cell) is the images root. The file path gets
# its own name so the root directory is not overwritten for later cells.
subfolder = most_popular_article_id[:3]
most_popular_img_file = os.path.join(img_path, subfolder, f"{most_popular_article_id}.jpg")

plt.figure(figsize=(6, 6))
# The context manager closes the underlying image file once the figure is drawn.
with Image.open(most_popular_img_file) as img:
    plt.imshow(img)
    plt.title(f"Most Popular Item: {most_popular_article_name}", fontsize=14)
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Attach the article attributes to every transaction row (left join keeps all transactions).
transactions_with_products = pd.merge(transactions, articles, how='left', on='article_id')

# Purchase counts per product type and per product group.
product_type_counts = transactions_with_products['product_type_name'].value_counts()
product_group_counts = transactions_with_products['product_group_name'].value_counts()

top_product_types = product_type_counts.head(20)

plt.figure(figsize=(14, 8))
top_product_types.plot(kind='barh')
plt.title('Top 20 Product Types')
plt.xlabel('Purchase Count')
plt.ylabel('Product Type')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
def prepare_transaction_sequences(transactions, min_purchases=2, max_seq_length=50):
    """Build one chronological article-id sequence per customer.

    Args:
        transactions: DataFrame with 'customer_id', 't_dat' and 'article_id' columns.
        min_purchases: customers with fewer transaction rows than this are dropped.
        max_seq_length: longer sequences are cut down to their last
            `max_seq_length` items (the most recent purchases).

    Returns:
        A list of article-id lists, one per retained customer.
    """
    print("Preparing transaction sequences...")

    # Ordering by customer, then date, makes each group a chronological history.
    ordered = transactions.sort_values(['customer_id', 't_dat'])

    customer_sequences = []
    for _, purchases in tqdm(ordered.groupby('customer_id')):
        if len(purchases) < min_purchases:
            continue

        items = purchases['article_id'].tolist()
        # Keep only the tail of over-long histories.
        if len(items) > max_seq_length:
            items = items[-max_seq_length:]
        customer_sequences.append(items)

    print(f"Created {len(customer_sequences)} customer sequences")
    return customer_sequences

In [None]:
def create_time_splits(transactions, test_days=7, validation_days=7):
    """Split transactions chronologically into train / validation / test.

    The test window is the last `test_days` calendar days (inclusive of the latest
    date), the validation window is the `validation_days` days before it, and
    everything earlier is training data.

    Args:
        transactions: DataFrame with a datetime 't_dat' column.
        test_days: length of the test window in days.
        validation_days: length of the validation window in days.

    Returns:
        dict with keys 'train', 'validation' and 'test', each a filtered DataFrame.
    """
    print("Creating time splits...")

    dates = transactions['t_dat']
    one_day = timedelta(days=1)

    # Window boundaries, counted back from the latest transaction date.
    max_date = dates.max()
    test_start_date = max_date - timedelta(days=test_days - 1)
    validation_start_date = test_start_date - timedelta(days=validation_days)

    in_train = dates < validation_start_date
    in_validation = (dates >= validation_start_date) & (dates < test_start_date)
    in_test = dates >= test_start_date

    parts = dict(
        train=transactions[in_train],
        validation=transactions[in_validation],
        test=transactions[in_test],
    )

    print(f"Train period: until {validation_start_date - one_day}")
    print(f"Validation period: {validation_start_date} to {test_start_date - one_day}")
    print(f"Test period: {test_start_date} to {max_date}")

    print(f"Train shape: {parts['train'].shape}")
    print(f"Validation shape: {parts['validation'].shape}")
    print(f"Test shape: {parts['test'].shape}")

    return parts

In [None]:
def create_ground_truth(transactions, customer_ids, start_date, days=7):
    """Collect, per customer, the distinct articles bought inside an evaluation window.

    Args:
        transactions: DataFrame with 't_dat', 'customer_id' and 'article_id' columns.
        customer_ids: iterable of customer ids to keep; other customers are ignored.
        start_date: first day of the evaluation window (inclusive).
        days: window length in days; the last included day is
            `start_date + (days - 1)`.

    Returns:
        dict mapping customer_id -> list of unique article ids bought in the window
        (in order of first appearance). Customers without purchases in the window
        are absent.
    """
    print("Creating ground truth...")

    end_date = start_date + timedelta(days=days-1)

    # filter transactions for the evaluation period
    in_window = (transactions['t_dat'] >= start_date) & (transactions['t_dat'] <= end_date)
    eval_transactions = transactions[in_window]

    # Build the lookup once: testing membership in a list for every customer is a
    # linear scan each time, which becomes quadratic for large id lists.
    wanted_customers = set(customer_ids)

    ground_truth = {
        customer_id: group['article_id'].unique().tolist()
        for customer_id, group in eval_transactions.groupby('customer_id')
        if customer_id in wanted_customers
    }

    print(f"Created ground truth for {len(ground_truth)} customers")
    return ground_truth

## 4. Product2Vec Model Implementation

Create a class for the Product2Vec model:

In [None]:
# Product2Vec model for learning product embeddings from purchase sequences using word2vec
class Product2Vec:
    """Item-embedding recommender built on gensim's Word2Vec.

    Purchase sequences are used as sentences and article ids as words. After
    fitting, recommendations are the vocabulary items whose embeddings have the
    highest cosine similarity to a vector summarising a customer's history.
    """

    # Hyperparameters that are forwarded unchanged to Word2Vec.
    _HYPERPARAMETERS = ('vector_size', 'window', 'min_count', 'sg',
                        'workers', 'epochs', 'ns_exponent')

    def __init__(self, vector_size=100, window=5, min_count=5, 
                 sg=1, workers=4, epochs=20, ns_exponent=0.75):
        """Store the Word2Vec hyperparameters; no model is trained yet."""
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg
        self.workers = workers
        self.epochs = epochs
        self.ns_exponent = ns_exponent
        self.model = None            # trained Word2Vec model, set by fit() / load()
        self.word_vectors = None
        self.item_embeddings = None  # dict: item id -> embedding vector
        self.item_biases = None      # stays None unless the model exposes biases

    @staticmethod
    def _extract_embeddings(model):
        """Return {item id: embedding vector} for every item in the model vocabulary."""
        return {item: model.wv[item] for item in model.wv.index_to_key}

    def fit(self, sequences):
        """Train Word2Vec on purchase sequences.

        Args:
            sequences: a list of item-id lists, or a dict whose values are such
                lists (the keys are ignored).

        Returns:
            self, so calls can be chained.
        """
        print(f"Training Product2Vec with {len(sequences)} sequences...")

        # convert sequences dictionary to list if needed
        if isinstance(sequences, dict):
            sequences = list(sequences.values())

        # train
        self.model = Word2Vec(
            sentences=sequences,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            sg=self.sg,
            workers=self.workers,
            epochs=self.epochs,
            ns_exponent=self.ns_exponent
        )

        # store embeddings
        self.item_embeddings = self._extract_embeddings(self.model)

        # store biases
        # NOTE(review): this only runs when the model object exposes
        # `trainables.biases`; otherwise item_biases stays None.
        if hasattr(self.model, 'trainables') and hasattr(self.model.trainables, 'biases'):
            self.item_biases = {item: self.model.trainables.biases[self.model.wv.key_to_index[item]] 
                                for item in self.model.wv.index_to_key}

        print(f"Model trained with {len(self.item_embeddings)} items")

        return self

    def get_embedding(self, item_id):
        """Return the embedding vector of `item_id` from the trained model."""
        return self.model.wv[item_id]

    def get_similar_items(self, item_id, top_n=12):
        """Return the `top_n` most similar items to `item_id` as (item, similarity) pairs."""
        return self.model.wv.most_similar(item_id, topn=top_n)

    def get_recommendations(self, history, top_n=12, strategy='mean', 
                           recency_bias=True, exclude_history=True):
        """Recommend items similar to a customer's purchase history.

        Args:
            history: item ids ordered oldest -> newest. Ids that are not in the
                model vocabulary are ignored.
            top_n: number of recommendations to return.
            strategy: how the history embeddings are combined into one vector:
                'mean' (plain average), 'weighted_mean' (average using the
                recency weights) or 'recent' (last item only).
            recency_bias: when True, weights rise linearly from 0.5 (oldest) to
                1.0 (newest). Only the 'weighted_mean' strategy uses the weights.
            exclude_history: when True, items from the history are never recommended.

        Returns:
            List of item ids, best match first; empty if no history item is in
            the vocabulary.

        Raises:
            ValueError: if `strategy` is not one of the supported names.
        """
        # filter history items that are in the vocabulary
        valid_history = [item for item in history if item in self.model.wv.key_to_index]

        if not valid_history:
            # if no valid items in history, return empty list
            return []

        # apply recency bias if enabled
        if recency_bias and len(valid_history) > 1:
            # more recent items get higher weights
            weights = np.linspace(0.5, 1.0, len(valid_history))
        else:
            weights = np.ones(len(valid_history))

        # Get embeddings for history items
        history_embeddings = [self.get_embedding(item) for item in valid_history]

        # combine embeddings based on strategy
        if strategy == 'mean':
            user_vector = np.mean(history_embeddings, axis=0)
        elif strategy == 'weighted_mean':
            user_vector = np.average(history_embeddings, axis=0, weights=weights)
        elif strategy == 'recent':
            # Use only the most recent item
            user_vector = history_embeddings[-1]
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # find similar items to the user vector
        items_to_exclude = set(valid_history) if exclude_history else set()
        similar_items = self._find_similar_items(user_vector, top_n, items_to_exclude)

        return [item_id for item_id, _ in similar_items]

    def _find_similar_items(self, vector, top_n=12, exclude_items=None):
        """Rank vocabulary items by cosine similarity to `vector`.

        The similarity of every item is computed with one matrix-vector product
        instead of a Python loop over the vocabulary, because this method is
        called once per customer.

        Args:
            vector: query embedding.
            top_n: maximum number of results.
            exclude_items: item ids that must not appear in the result.

        Returns:
            List of (item id, cosine similarity) pairs, most similar first.
            Empty when `top_n` is not positive or the query has zero or
            non-finite length (cosine similarity is undefined in that case).
        """
        if exclude_items is None:
            exclude_items = set()

        query = np.asarray(vector)
        query_norm = np.linalg.norm(query)
        if top_n <= 0 or not np.isfinite(query_norm) or query_norm == 0:
            return []

        wv = self.model.wv
        item_matrix = wv.vectors  # row i is the embedding of wv.index_to_key[i]
        # Cast the unit query to the matrix dtype so the product does not upcast
        # (and copy) the whole embedding matrix.
        unit_query = (query / query_norm).astype(item_matrix.dtype, copy=False)

        item_norms = np.linalg.norm(item_matrix, axis=1)
        with np.errstate(divide='ignore', invalid='ignore'):
            scores = (item_matrix @ unit_query) / item_norms
        # Zero-length item vectors have no defined cosine; rank them last.
        scores = np.where(item_norms > 0, scores, -np.inf)

        # Mask out excluded items by their row position.
        is_candidate = np.ones(len(wv.index_to_key), dtype=bool)
        for item in exclude_items:
            position = wv.key_to_index.get(item)
            if position is not None:
                is_candidate[position] = False
        candidates = np.flatnonzero(is_candidate)

        # Stable descending sort: ties keep vocabulary order.
        best_first = np.argsort(-scores[candidates], kind='stable')[:top_n]
        ranked = candidates[best_first]

        return [(wv.index_to_key[i], float(scores[i])) for i in ranked]

    def save(self, model_path, embeddings_path=None):
        """Save the Word2Vec model and, if a path is given, pickle the embedding dict."""
        # save the model
        self.model.save(model_path)

        # save embeddings
        if embeddings_path:
            with open(embeddings_path, 'wb') as f:
                pickle.dump(self.item_embeddings, f)

    @classmethod
    def load(cls, model_path, embeddings_path=None):
        """Create an instance from files written by save().

        Args:
            model_path: path of the saved Word2Vec model.
            embeddings_path: optional pickle of the embedding dict. If missing or
                not given, embeddings are re-extracted from the model.

        Returns:
            A Product2Vec instance whose hyperparameter attributes mirror the
            loaded model wherever the model exposes them.
        """
        # create a new instance
        instance = cls()

        # load the Word2Vec model
        instance.model = Word2Vec.load(model_path)

        # Without this the attributes would keep the constructor defaults and
        # could disagree with the model that was actually loaded.
        for name in cls._HYPERPARAMETERS:
            if hasattr(instance.model, name):
                setattr(instance, name, getattr(instance.model, name))

        # load embeddings if path provided, otherwise extract from model
        if embeddings_path and os.path.exists(embeddings_path):
            # SECURITY: pickle.load can execute arbitrary code; only load
            # embedding files that this notebook wrote itself.
            with open(embeddings_path, 'rb') as f:
                instance.item_embeddings = pickle.load(f)
        else:
            instance.item_embeddings = cls._extract_embeddings(instance.model)

        return instance

In [None]:
def generate_user_recommendations(model, user_histories, top_n=12, 
                                strategy='weighted_mean', recency_bias=True,
                                popular_items=None):
    """Produce `top_n` recommendations for every user, padding with popular items.

    Args:
        model: object exposing `get_recommendations(history=..., top_n=...,
            strategy=..., recency_bias=...)` that returns a list of item ids.
        user_histories: dict mapping user id -> purchase history (list of item ids).
        top_n: number of items per user.
        strategy: forwarded to `model.get_recommendations`.
        recency_bias: forwarded to `model.get_recommendations`.
        popular_items: fallback item ids, most popular first, used to fill lists
            shorter than `top_n`. If omitted, a notebook-level `popular_items`
            variable is used when one exists; otherwise short lists are left
            unpadded.

    Returns:
        dict mapping user id -> list of at most `top_n` item ids.
    """
    print(f"Generating recommendations for {len(user_histories)} users...")

    if popular_items is None:
        # The padding step used to read a global `popular_items`, which raised
        # NameError when that variable had not been defined. Fall back to it only
        # if it exists.
        popular_items = globals().get('popular_items', [])

    recommendations = {}

    for user_id, history in tqdm(user_histories.items()):
        recs = model.get_recommendations(
            history=history,
            top_n=top_n,
            strategy=strategy,
            recency_bias=recency_bias
        )

        # if not enough recommendations, pad with popular items
        if len(recs) < top_n:
            # do not include already recommended items
            already_recommended = set(recs)
            remaining_popular = [item for item in popular_items
                                 if item not in already_recommended]

            # add popular items until the top_n slots are filled
            needed = top_n - len(recs)
            recs = list(recs) + remaining_popular[:needed]

        recommendations[user_id] = recs[:top_n]

    print(f"Generated recommendations for {len(recommendations)} users")
    return recommendations

In [None]:
def create_item_sequences(transactions, column='article_id', time_column='t_dat', 
                         customer_column='customer_id'):
    """Map every customer to the chronological list of values in `column`.

    Args:
        transactions: DataFrame containing the three named columns.
        column: column whose values form the sequences.
        time_column: column used to order each customer's rows.
        customer_column: column identifying the customer.

    Returns:
        dict mapping customer id -> list of `column` values, oldest first.
    """
    # Order by customer, then time, so each group is already chronological.
    ordered = transactions.sort_values([customer_column, time_column])
    per_customer = ordered.groupby(customer_column)

    return {customer: rows[column].tolist() for customer, rows in tqdm(per_customer)}

In [None]:
base_params = {
    'vector_size': 100,
    'window': 5,
    'min_count': 5,
    'sg': 1,  # Skip-gram
    'workers': 4,
    'epochs': 20,
    'ns_exponent': 0.75  # Negative sampling distribution exponent
}

# Build the training sequences and the customer histories from the TRAIN period
# only. The evaluation below scores recommendations against validation-week
# purchases; if those purchases were also part of the histories they would be
# removed by `exclude_history`, and the model would have been fitted on the data
# it is evaluated on.
model_train_df = create_time_splits(transactions, test_days=7, validation_days=7)['train']

train_sequences = prepare_transaction_sequences(model_train_df, min_purchases=2)
customer_histories = create_item_sequences(model_train_df)
 
# initialize and train the model
base_model = Product2Vec(**base_params)
base_model.fit(train_sequences)

In [None]:
# Nearest neighbours of one example article in embedding space.
article_id = "0110065011"  
similar_items = base_model.get_similar_items(article_id, top_n=5)

report_lines = [f"Similar products to {article_id}:"]
report_lines.extend(
    f"  {neighbour_id} (Similarity: {score})" for neighbour_id, score in similar_items
)
print("\n".join(report_lines))

In [None]:
# Recommendations for one example customer, based on all of their transactions.
customer_id = "00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e"  

is_example_customer = transactions['customer_id'] == customer_id
history = transactions.loc[is_example_customer, 'article_id'].tolist()

demo_settings = {'top_n': 12, 'strategy': 'weighted_mean', 'recency_bias': True}
recommendations = base_model.get_recommendations(history=history, **demo_settings)

print(f"Recommendations for customer {customer_id}:", recommendations, sep="\n")

In [None]:
def display_product_recommendations(article_ids, articles_df=None, title="Product Recommendations",
                                    images_dir=None):
    """Show up to 12 product images in a 3x4 grid.

    Args:
        article_ids: article ids (strings) to display; only the first 12 are used.
        articles_df: accepted for call compatibility; not used by this function.
        title: figure title.
        images_dir: root directory of the product images, laid out as
            <images_dir>/<first 3 characters of the id>/<id>.jpg. Defaults to the
            notebook-level `img_path` set in the data-loading cell.
    """
    if images_dir is None:
        images_dir = img_path

    fig, axes = plt.subplots(3, 4, figsize=(15, 10))
    fig.suptitle(title, fontsize=16)

    axes = axes.flatten()

    # Hide the frame of every panel up front so unused panels (fewer than 12
    # articles, or a missing image) stay blank instead of showing empty axes.
    for ax in axes:
        ax.axis('off')

    for ax, article_id in zip(axes, article_ids[:12]):
        image_file = os.path.join(images_dir, article_id[:3], f"{article_id}.jpg")

        if os.path.exists(image_file):
            # The context manager closes the image file after it has been drawn.
            with Image.open(image_file) as img:
                ax.imshow(img)
            ax.set_title(f"Article: {article_id}", fontsize=10)
        else:
            print(f"No image found for article {article_id}: {image_file}")

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)  # leave room for the suptitle
    plt.show()

In [None]:
def show_recommendations_for_customer(customer_id, model, transactions, articles_df=None):
    """Generate and display 12 recommendations for one customer.

    Args:
        customer_id: id of the customer to look up in `transactions`.
        model: object exposing `get_recommendations(history=..., top_n=...,
            strategy=..., recency_bias=...)`.
        transactions: DataFrame with 'customer_id' and 'article_id' columns.
        articles_df: forwarded to `display_product_recommendations`.

    Returns:
        The list of recommended article ids.
    """
    # get customer purchase history
    history = transactions[transactions['customer_id'] == customer_id]['article_id'].tolist()

    # generate recommendations
    recommendations = model.get_recommendations(
        history=history,
        top_n=12,
        strategy='weighted_mean',
        recency_bias=True
    )

    print(f"Displaying {len(recommendations)} recommendations for customer {customer_id}")
    # The call previously ended with a bare positional `img_path` after the keyword
    # arguments, which is a SyntaxError; the display function resolves the image
    # directory itself.
    display_product_recommendations(recommendations, articles_df=articles_df, 
                                    title=f"Recommendations for Customer {customer_id}")

    return recommendations

## 5. Evaluation

### 5.1 Model Evaluations

In [None]:
def map_at_k_kaggle(actual, predicted, k=12):
    """Mean average precision at k over a set of users.

    Args:
        actual: iterable of per-user collections of relevant item ids.
        predicted: iterable of per-user ranked prediction lists, aligned with `actual`.
        k: only the first `k` predictions of each user are scored.

    Returns:
        The mean of the per-user AP@k values. Users with no relevant items are
        skipped; returns 0 when no user can be scored.
    """
    ap_at_k = []

    for act, pred in zip(actual, predicted):
        relevant = set(act)
        if not relevant:
            continue

        precision_sum = 0.0
        num_correct = 0
        already_seen = set()

        for rank, item in enumerate(pred[:k], start=1):
            # A repeated prediction must not score again; otherwise a list that
            # repeats one correct item could push AP above 1.
            if item in relevant and item not in already_seen:
                num_correct += 1
                precision_sum += num_correct / rank
            already_seen.add(item)

        ap_at_k.append(precision_sum / min(len(relevant), k))

    return np.mean(ap_at_k) if ap_at_k else 0

In [None]:
def evaluate_model(model, user_histories, ground_truth, top_n=12, strategy='weighted_mean', 
                  recency_bias=True):
    """Score a recommender against held-out purchases.

    Only users present in both `user_histories` and `ground_truth` are evaluated,
    and users for whom the model returns no recommendations are left out of the
    MAP computation.

    Args:
        model: object exposing `get_recommendations(history=..., top_n=...,
            strategy=..., recency_bias=...)`.
        user_histories: dict mapping user id -> purchase history.
        ground_truth: dict mapping user id -> list of items actually purchased.
        top_n: number of recommendations per user, also used as k for MAP@k.
        strategy: forwarded to the model.
        recency_bias: forwarded to the model.

    Returns:
        dict with 'map@12' (MAP at `top_n`), 'item_coverage' (share of
        ground-truth items that were recommended to anyone), 'customer_coverage'
        (users with recommendations divided by ground-truth users) and
        'recommendations' (user id -> recommended items).
    """
    print(f"Evaluating model with strategy '{strategy}'...")

    eval_users = list(set(user_histories.keys()) & set(ground_truth.keys()))
    print(f"Evaluating for {len(eval_users)} users")

    # Keep only users for whom the model produced something.
    recommendations = {}
    for user_id in tqdm(eval_users):
        recs = model.get_recommendations(
            history=user_histories[user_id],
            top_n=top_n,
            strategy=strategy,
            recency_bias=recency_bias
        )
        if len(recs) > 0:
            recommendations[user_id] = recs

    # Aligned (actual, predicted) lists in the order users were processed.
    scored_users = list(recommendations)
    actual_lists = [ground_truth[user_id] for user_id in scored_users]
    pred_lists = [recommendations[user_id] for user_id in scored_users]

    map_score = map_at_k_kaggle(actual_lists, pred_lists, k=top_n)

    all_items = {item for items in ground_truth.values() for item in items}
    recommended_items = {item for items in recommendations.values() for item in items}

    if len(all_items) > 0:
        item_coverage = len(recommended_items & all_items) / len(all_items)
    else:
        item_coverage = 0

    if len(ground_truth) > 0:
        customer_coverage = len(recommendations) / len(ground_truth)
    else:
        customer_coverage = 0

    return {
        'map@12': map_score,
        'item_coverage': item_coverage,
        'customer_coverage': customer_coverage,
        'recommendations': recommendations
    }

In [None]:
# Chronological split: last 7 days = test, the 7 days before = validation, rest = train.
splits = create_time_splits(transactions, validation_days=7, test_days=7)
train_df, validation_df, test_df = (
    splits[part] for part in ('train', 'validation', 'test')
)

In [None]:
# Validation-window purchases for every customer that has a stored history.
validation_start = validation_df['t_dat'].min()
validation_ground_truth = create_ground_truth(
    transactions=validation_df,
    customer_ids=list(customer_histories.keys()),
    start_date=validation_start,
    days=7,
)

In [None]:
# Score the base model on the validation window.
eval_settings = {'top_n': 12, 'strategy': 'weighted_mean', 'recency_bias': True}
results = evaluate_model(base_model, customer_histories, validation_ground_truth, **eval_settings)

for metric_label, metric_key in (
    ('MAP@12', 'map@12'),
    ('Item coverage', 'item_coverage'),
    ('Customer coverage', 'customer_coverage'),
):
    print(f"{metric_label}: {results[metric_key]}")

### 5.2 Visualize Embeddings

In [None]:
def visualize_embeddings(model, articles_df, dim_reduction='tsne', n_items=1000, 
                        color_by='product_group_name', figsize=(12, 10),
                        random_state=19):
    """Scatter-plot a 2-D projection of the item embeddings.

    Args:
        model: fitted Product2Vec-like object; the embeddings are read from
            `model.model.wv`.
        articles_df: article metadata with an 'article_id' column and the
            `color_by` column.
        dim_reduction: 'tsne' or 'umap'.
        n_items: maximum number of items to plot; larger vocabularies are sampled.
        color_by: `articles_df` column used to colour the points.
        figsize: matplotlib figure size.
        random_state: seed for both the item sample and the reducer, so the same
            call reproduces the same figure.

    Returns:
        The matplotlib Figure containing the plot.

    Raises:
        ValueError: if `dim_reduction` is not 'tsne' or 'umap'.
    """
    # Validate first: an unknown method used to leave `reducer` unbound and only
    # failed later with an unrelated-looking error.
    if dim_reduction == 'tsne':
        reducer = TSNE(n_components=2, random_state=random_state)
    elif dim_reduction == 'umap':
        reducer = umap.UMAP(random_state=random_state)
    else:
        raise ValueError(
            f"Unknown dim_reduction: {dim_reduction!r} (expected 'tsne' or 'umap')"
        )

    # get embeddings for visualization
    vocab_items = list(model.model.wv.key_to_index.keys())

    # sample items if there are too many; a dedicated seeded generator keeps the
    # sample reproducible without touching the global `random` state
    if len(vocab_items) > n_items:
        sample_items = random.Random(random_state).sample(vocab_items, n_items)
    else:
        sample_items = vocab_items

    item_embeddings = np.array([model.model.wv[item] for item in sample_items])

    reduced_embeddings = reducer.fit_transform(item_embeddings)

    # create df for plotting
    plot_df = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'article_id': sample_items
    })

    # left join: items without metadata are kept, with a missing `color_by` value
    plot_df = plot_df.merge(articles_df, on='article_id', how='left')

    plt.figure(figsize=figsize)
    sns.scatterplot(data=plot_df, x='x', y='y', hue=color_by, alpha=0.7, s=20)
    plt.title(f'Product Embeddings Visualization ({dim_reduction.upper()})')
    plt.xlabel(f'{dim_reduction.upper()} Dimension 1')
    plt.ylabel(f'{dim_reduction.upper()} Dimension 2')

    # place the legend outside the axes so it does not cover the points
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()

    return plt.gcf()

Visualize embeddings using t-SNE:

In [None]:
# t-SNE projection of 2000 sampled items, coloured by product group.
tsne_settings = dict(
    dim_reduction='tsne',
    n_items=2000,
    color_by='product_group_name',
    figsize=(15, 12),
)
tsne_fig = visualize_embeddings(base_model, articles, **tsne_settings)

Visualize embeddings using UMAP:

In [None]:
# UMAP projection of 2000 sampled items, coloured by colour group.
umap_settings = dict(
    dim_reduction='umap',
    n_items=2000,
    color_by='colour_group_name',
    figsize=(15, 12),
)
umap_fig = visualize_embeddings(base_model, articles, **umap_settings)

##