In [105]:
# Data analysis
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import os
import random

# Machine learning
from sklearn.preprocessing import MinMaxScaler
import networkx as nx    
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset 
from collections import defaultdict
from sklearn.model_selection import train_test_split
import torch.multiprocessing as mp
from torch.cuda.amp import autocast, GradScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pd.set_option('display.max_columns', None)

## **Data Description**

This project uses three datasets:

#### `train_data.csv`: 
This dataset contains two years (2022 & 2023) of historical transactions for 100,000 Carrefour customers. It has 10 columns:

* ***date***: Date of the transaction
* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: Product purchased
* ***has_loyality_card***: Flag indicating whether the customer has a loyalty card
* ***store_id***: Store where the purchase was made
* ***is_promo***: Flag indicating whether there was a discount on the product
* ***quantity***: Quantity purchased of the product
* ***format***: Ecommerce activity format (clcv, lex, or DRIVE)
  - clcv : courses livrées chez vous
  - lex : livraison express
  - DRIVE.
* ***orderChannelCode***: Indicates whether the online activity was made through the website or mobile app

#### `products_data.csv`: 
This dataset contains detailed information about the products. The following columns are relevant to this project:

* ***product_id*** : Product name
* ***product_description*** : Product description
* ***department_key***: Department key
* ***class_key***: Class key
* ***subclass_key***: Subclass key
* ***sector***: sector name
* ***brand_key***: Brand name
* ***shelf_level1***: Top-level shelf category
* ***shelf_level2***: Second-level shelf category
* ***shelf_level3***: Third-level shelf category
* ***shelf_level4***: Fourth-level shelf category
* ***sector***: Sector
* ***bio***: Flag indicating whether the product is organic
* ***sugar_free***: Flag indicating whether the product is sugar-free
* ***aspartame_free***: Flag indicating whether the product is aspartame-free
* ***gluten_free***: Flag indicating whether the product is gluten-free
* ***halal***: Flag indicating whether the product is halal
* ***casher***: Flag indicating whether the product is kosher
* ***eco_friendly***: Flag indicating whether the product is eco-friendly
* ***local_french***: Flag indicating whether the product is locally produced in France
* ***artificial_coloring_free***: Flag indicating whether the product is free of artificial coloring
* ***taste_enhancer_free***: Flag indicating whether the product is free of taste enhancers
* ***naturality***: Naturality score
* ***antibiotic_free***: Flag indicating whether the product is antibiotic-free
* ***reduced_sugar***: Flag indicating whether the product has reduced sugar content
* ***vegetarian***: Flag indicating whether the product is vegetarian
* ***pesticide_free***: Flag indicating whether the product is pesticide-free
* ***grain_free***: Flag indicating whether the product is grain-free
* ***no_added_sugar***: Flag indicating whether the product has no added sugar
* ***salt_reduced***: Flag indicating whether the product has reduced salt content
* ***nitrite_free***: Flag indicating whether the product is nitrite-free
* ***fed_without_ogm***: Flag indicating whether the animals were fed without GMOs
* ***no_added_salt***: Flag indicating whether the product has no added salt
* ***no_artificial_flavours***: Flag indicating whether the product has no artificial flavors
* ***porc***: Flag indicating whether the product contains pork
* ***vegan***: Flag indicating whether the product is vegan
* ***frozen***: Flag indicating whether the product is frozen
* ***fat_free***: Flag indicating whether the product is fat-free
* ***reduced_fats***: Flag indicating whether the product has reduced fat content
* ***fresh***: Flag indicating whether the product is fresh
* ***alcool***: Flag indicating whether the product contains alcohol
* ***lactose_free***: Flag indicating whether the product is lactose-free
* ***phenylalanine_free***: Flag indicating whether the product is phenylalanine-free
* ***palm_oil_free***: Flag indicating whether the product is palm oil-free
* ***ecoscore***: Ecoscore
* ***produits_du_monde***: Flag indicating whether the product is an international product
* ***regional_product***: Flag indicating whether the product is a regional product
* ***national_brand***: Flag indicating whether the product is a national brand
* ***first_price_brand***: Flag indicating whether the product is a first-price brand
* ***carrefour_brand***: Flag indicating whether the product is a Carrefour brand

#### `test_data.csv`: 
This dataset contains the actual purchases of the first 80,000 customers in 2024. It has three columns:

* ***transaction_id***: ID of the transaction
* ***customer_id***: Customer ID
* ***product_id***: the id of the purchased product

### **Load data**

* Load *train_data.csv*, *products_data.csv* and *test_data.csv* using pandas.

In [3]:
train_dataframes = []
for i in tqdm(range(1, 11)):
    train_dataframes.append(pd.read_csv(f'data-train/train_data_part_{i}.csv'))
train_data = pd.concat(train_dataframes, ignore_index=True)

# free up memory by deleting the dataframes we no longer need
del train_dataframes

print(train_data.shape)
print(train_data.head(10))

100%|██████████| 10/10 [02:41<00:00, 16.12s/it]


(87037462, 10)
         date       transaction_id     customer_id     product_id  \
0  2023-11-15  Transaction_1730125    Household_39   Product_5362   
1  2022-07-20  Transaction_1560535    Household_39  Product_67174   
2  2022-07-20  Transaction_1560535    Household_39  Product_82254   
3  2023-11-15  Transaction_1730125    Household_39   Product_3895   
4  2022-07-20  Transaction_1560535    Household_39  Product_34014   
5  2023-07-12  Transaction_1479608   Household_167   Product_8327   
6  2023-07-18   Transaction_993002   Household_167   Product_3846   
7  2023-02-16  Transaction_1318448   Household_416  Product_21347   
8  2022-06-13  Transaction_1372043  Household_1264  Product_39217   
9  2022-06-13  Transaction_1372043  Household_1264  Product_36923   

   has_loyality_card store_id  is_promo  quantity format order_channel  
0                  0  Store_2         0       1.0  DRIVE    MOBILE_APP  
1                  0  Store_2         0       2.0  DRIVE       WEBSITE  
2     

In [4]:
# This code reads the data from a CSV file named "products_data.csv"
products_data = pd.read_csv('data-train/products_data.csv')
print(products_data.shape)

(82966, 49)


  products_data = pd.read_csv('data-train/products_data.csv')


In [5]:
# This code reads the data from a CSV file named "test_data.csv"
test_data = pd.read_csv('data-train/test_data.csv')
test_data = pd.DataFrame(test_data)

print(test_data.shape)
print(test_data.head(10))

(1220706, 3)
       transaction_id      customer_id     product_id
0  Transaction_2024_1  Household_16874   Product_9790
1  Transaction_2024_1  Household_16874  Product_68295
2  Transaction_2024_1  Household_16874  Product_19494
3  Transaction_2024_1  Household_16874  Product_11109
4  Transaction_2024_4   Household_9247  Product_57151
5  Transaction_2024_4   Household_9247    Product_379
6  Transaction_2024_4   Household_9247  Product_46331
7  Transaction_2024_4   Household_9247  Product_49682
8  Transaction_2024_5  Household_76806  Product_72217
9  Transaction_2024_5  Household_76806   Product_4897


### **Data transformation**

#### **Training set with relevant product information**

In [6]:
# Aggregate customer purchase data
customer_data = train_data.groupby(['customer_id', 'product_id']).agg({
    'quantity': 'sum'  # Total quantity purchased per product per customer
}).reset_index()

customer_data['quantity'] = customer_data['quantity'].astype(int)
customer_data = customer_data.sort_values(by=['customer_id', 'quantity'], ascending=[True, False])

print(customer_data.shape)
print(customer_data.head(10))

(33485786, 3)
     customer_id     product_id  quantity
347  Household_1  Product_57942        21
421  Household_1  Product_67459        20
116  Household_1  Product_24334        18
504  Household_1   Product_7783        15
51   Household_1  Product_16409        13
390  Household_1  Product_64067        12
6    Household_1   Product_1128        11
541  Household_1    Product_833        10
440  Household_1   Product_7006         9
97   Household_1  Product_21613         7


In [7]:
features_to_keep = [
    'product_id', 'brand_key', 'shelf_level1', 'shelf_level2', 'shelf_level3',
    'bio', 'sugar_free', 'gluten_free', 'halal', 'reduced_sugar', 'vegetarian', 'vegan',
    'pesticide_free', 'no_added_sugar', 'salt_reduced', 'no_added_salt', 'no_artificial_flavours', 
    'porc', 'frozen', 'fat_free', 'reduced_fats', 'fresh', 'alcool', 'lactose_free'
]

# Select only the required columns from the products table for efficiency
products_reduced = products_data[features_to_keep]

# Merge customer_data with the filtered products table on 'product_id'
purchase_data = customer_data.merge(products_reduced, on='product_id', how='inner')

# Convert customer_id and product_id to integer IDs (removing non-numeric characters)
purchase_data['customer_id'] = purchase_data['customer_id'].str.replace('Household_', '').astype(int)
purchase_data['product_id'] = purchase_data['product_id'].str.replace('Product_', '').astype(int)

purchase_data = purchase_data.sort_values(by=['customer_id', 'quantity'], ascending=[True, False])

# Display the result
print(purchase_data.head())

# Check the shape to ensure efficient merging
print(purchase_data.shape)

   customer_id  product_id  quantity   brand_key             shelf_level1  \
0            1       57942        21  CRISTALINE                 Boissons   
1            1       67459        20  FLEURY MIC  Charcuterie et Traiteur   
2            1       24334        18       SIMPL   Entretien et Nettoyage   
3            1        7783        15   CRF CLASS                 Boissons   
4            1       16409        13  ZZZZZZZZZZ        Fruits et Légumes   

                                shelf_level2  \
0                                       Eaux   
1                                Charcuterie   
2  Essuie-tout, Papier toilette et Mouchoirs   
3                   Jus de fruits et légumes   
4                                     Fruits   

                        shelf_level3  bio  sugar_free  gluten_free  halal  \
0                        Eaux plates    0           0            0      0   
1            Jambons blancs et Rôtis    0           0            0      0   
2                

In [8]:
# Define the frequency encoding function
def frequency_encode(df, column):
    freq = df[column].value_counts()  # Calculate frequencies of each category
    return df[column].map(freq)  # Map categories to their corresponding frequencies

# Apply frequency encoding to the specified categorical columns
purchase_data['brand_key_encoded'] = frequency_encode(purchase_data, 'brand_key')
purchase_data['shelf_level1_encoded'] = frequency_encode(purchase_data, 'shelf_level1')
purchase_data['shelf_level2_encoded'] = frequency_encode(purchase_data, 'shelf_level2')
purchase_data['shelf_level3_encoded'] = frequency_encode(purchase_data, 'shelf_level3')

# Drop the original categorical columns
purchase_data.drop(['brand_key', 'shelf_level1', 'shelf_level2', 'shelf_level3'], axis=1, inplace=True)

# List of columns to normalize
numerical_columns = ['quantity', 'brand_key_encoded', 'shelf_level1_encoded', 
                     'shelf_level2_encoded', 'shelf_level3_encoded']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the specified numerical columns
purchase_data[numerical_columns] = scaler.fit_transform(purchase_data[numerical_columns])


# Display the updated dataset
print(purchase_data.head())

   customer_id  product_id  quantity  bio  sugar_free  gluten_free  halal  \
0            1       57942  0.004157    0           0            0      0   
1            1       67459  0.003949    0           0            0      0   
2            1       24334  0.003534    0           0            0      0   
3            1        7783  0.002910    0           0            0      0   
4            1       16409  0.002494    0           0            0      0   

   reduced_sugar  vegetarian  vegan  pesticide_free  no_added_sugar  \
0              0           0      0               0               0   
1              0           0      0               0               0   
2              0           0      0               0               0   
3              0           0      0               0               0   
4              0           0      0               0               0   

   salt_reduced  no_added_salt  no_artificial_flavours  porc  frozen  \
0             0              0        

In [None]:
#purchase_data.to_csv('purchase_data.csv', index=False)

In [None]:
#purchase_data = pd.read_csv('purchase_data.csv')

In [9]:
test_data = test_data.sort_values(by=['customer_id', 'product_id'], ascending=[True, True])
print(test_data.head(40))

                transaction_id   customer_id     product_id
813978  Transaction_2024_17216   Household_1  Product_15693
69458   Transaction_2024_17216   Household_1    Product_170
813974  Transaction_2024_17216   Household_1  Product_20576
813977  Transaction_2024_17216   Household_1  Product_23579
813973  Transaction_2024_17216   Household_1  Product_23625
69459   Transaction_2024_17216   Household_1  Product_24334
813975  Transaction_2024_17216   Household_1  Product_24503
407363  Transaction_2024_17216   Household_1  Product_35730
813979  Transaction_2024_17216   Household_1  Product_39719
69457   Transaction_2024_17216   Household_1  Product_45719
813980  Transaction_2024_17216   Household_1  Product_47925
813972  Transaction_2024_17216   Household_1  Product_49682
407364  Transaction_2024_17216   Household_1  Product_57011
407365  Transaction_2024_17216   Household_1  Product_57942
69460   Transaction_2024_17216   Household_1  Product_64067
813971  Transaction_2024_17216   Househo

#### **Test set**

In [10]:
test_set = test_data.drop(columns=["transaction_id"])

if not pd.api.types.is_numeric_dtype(test_set['customer_id']):
    test_set['customer_id'] = test_set['customer_id'].str.replace('Household_', '').astype(int)

if not pd.api.types.is_numeric_dtype(test_set['product_id']):
    test_set['product_id'] = test_set['product_id'].str.replace('Product_', '').astype(int)

test_set = test_set.sort_values(by=['customer_id', 'product_id'], ascending=[True, True])

print(test_set.head(40))
print(test_set.shape)


        customer_id  product_id
69458             1         170
813978            1       15693
813974            1       20576
813977            1       23579
813973            1       23625
69459             1       24334
813975            1       24503
407363            1       35730
813979            1       39719
69457             1       45719
813980            1       47925
813972            1       49682
407364            1       57011
407365            1       57942
69460             1       64067
813971            1       66850
813976            1       72217
407366            1       73323
338989            2        2182
813983            2        3753
813984            2        4796
813985            2        6493
407367            2        7978
813989            2        8816
813986            2       15970
407369            2       17770
407371            2       28757
407370            2       30477
338990            2       38436
813987            2       42748
338991  

### **Graph structure creation**

In [None]:
customer_nodes = purchase_data['customer_id'].unique().astype(int)
product_nodes = purchase_data['product_id'].unique().astype(int)

B = nx.Graph()

# Add nodes for customers and products with the appropriate types
for _, row in purchase_data.iterrows():
    # Ensure the IDs are integers before creating the nodes
    customer_id = int(row['customer_id'])
    product_id = int(row['product_id'])

    customer_node = f"Customer_{customer_id}"
    product_node = f"Product_{product_id}"

    # Add customer node (if not already in graph)
    if customer_node not in B:
        B.add_node(customer_node, type='customer')
    
    # Add product node (if not already in graph)
    if product_node not in B:
        B.add_node(product_node, type='product')
    
    # Add edge between customer and product (purchase relationship)
    B.add_edge(customer_node, product_node, weight=row['quantity'])

#### Saving graph

In [70]:

# Save nodes with attributes to a file
with open("nodes.pkl", "wb") as f:
    pickle.dump(dict(B.nodes(data=True)), f, protocol=pickle.HIGHEST_PROTOCOL)

chunk_size = 9_000_000
edges = list(B.edges(data=True))  # Get all edges with attributes

# Save edges in chunks
for i in range(0, len(edges), chunk_size):
    chunk = edges[i:i + chunk_size]
    with open(f"edges_{i // chunk_size}.pkl", "wb") as f:
        pickle.dump(chunk, f, protocol=pickle.HIGHEST_PROTOCOL)


In [11]:
with open("nodes.pkl", "rb") as f:
    nodes = pickle.load(f)

B = nx.Graph() 
B.add_nodes_from(nodes.items())

i = 0
while True:
    try:
        with open(f"edges_{i}.pkl", "rb") as f:
            edges = pickle.load(f)
            B.add_edges_from(edges)
        i += 1
    except FileNotFoundError:
        break 


In [31]:
customer_nodes = [n for n in B.nodes if B.nodes[n].get('type') == 'customer']
product_nodes = [n for n in B.nodes if B.nodes[n].get('type') == 'product']

print(f"Customer nodes: {len(customer_nodes)}")  # Should be 100,000
print(f"Product nodes: {len(product_nodes)}")    # Should be 82,815

total_edges = B.number_of_edges()
print(f"Total number of edges: {total_edges}")

Customer nodes: 100000
Product nodes: 82815
Total number of edges: 33485786


### **Embedding Initialization**

In [32]:
# Extract unique product IDs and their features
product_features = purchase_data.drop_duplicates(subset="product_id").set_index("product_id")
product_feature_columns = product_features.columns.drop(["customer_id", "quantity"])
product_features = product_features[product_feature_columns].to_dict(orient="index")

print(list(product_features.items())[:1])

[(57942, {'bio': 0, 'sugar_free': 0, 'gluten_free': 0, 'halal': 0, 'reduced_sugar': 0, 'vegetarian': 0, 'vegan': 0, 'pesticide_free': 0, 'no_added_sugar': 0, 'salt_reduced': 0, 'no_added_salt': 0, 'no_artificial_flavours': 0, 'porc': 0, 'frozen': 0, 'fat_free': 0, 'reduced_fats': 0, 'fresh': 0, 'alcool': 0, 'lactose_free': 0, 'brand_key_encoded': 0.02577269959931473, 'shelf_level1_encoded': 0.39584798240990027, 'shelf_level2_encoded': 0.1804035327943097, 'shelf_level3_encoded': 0.28661262169814977})]


In [33]:
# Set the embedding dimension
embedding_dim = 128

# Function to handle dimensionality mismatch for product embeddings
def get_product_embedding(product_id):
    features = np.array(list(product_features[product_id].values()))
    if len(features) < embedding_dim:
        padding = np.random.rand(embedding_dim - len(features))
        return np.concatenate([features, padding])
    elif len(features) > embedding_dim:
        return features[:embedding_dim]
    return features

# Initialize customer embeddings as random tensors and normalize them
customer_embeddings = torch.randn(len(customer_nodes), embedding_dim)
customer_embeddings = F.normalize(customer_embeddings, p=2, dim=1)  # Normalize to unit length

# Initialize product embeddings using product features, normalize after padding/truncation
product_embeddings = torch.zeros(len(product_nodes), embedding_dim)

for idx, product_id in enumerate(product_nodes):
    product_embedding = get_product_embedding(int(product_id.split('_')[1]))  # Extract product ID from the node
    product_embeddings[idx] = torch.tensor(product_embedding, dtype=torch.float32)

product_embeddings = F.normalize(product_embeddings, p=2, dim=1)  # Normalize product embeddings

# Combine customer and product embeddings into a single tensor (for easier processing later)
embeddings = torch.cat([customer_embeddings, product_embeddings], dim=0)

# Convert node IDs to tensor indices for later use
customer_node_indices = torch.tensor([i for i, n in enumerate(B.nodes) if B.nodes[n].get('type') == 'customer'])
product_node_indices = torch.tensor([i for i, n in enumerate(B.nodes) if B.nodes[n].get('type') == 'product'])

In [34]:
# Print shape of embeddings to verify
print(f"Shape of embeddings tensor: {embeddings.shape}")  # Should be (number of nodes, embedding_dim)

# Example: Print embedding for a specific customer and a specific product
customer_index = 10
product_index = len(customer_nodes)  # Index of the product (it follows customer nodes in the embeddings tensor)

# Print the embeddings
print(f"Customer embedding (index {customer_index}): {customer_embeddings[customer_index]}")
print(f"Product embedding (index {product_index}): {product_embeddings[product_index - len(customer_nodes)]}")

Shape of embeddings tensor: torch.Size([182815, 128])
Customer embedding (index 10): tensor([ 0.0783,  0.1410, -0.0485,  0.0551,  0.0812, -0.0043,  0.0348, -0.0825,
         0.0408,  0.0782, -0.0697,  0.0831, -0.0893, -0.0819, -0.0079, -0.0926,
        -0.1070,  0.0532,  0.0763, -0.1843, -0.0144, -0.1745, -0.0685, -0.1193,
         0.0580, -0.0565,  0.0254, -0.0339,  0.0429, -0.1232,  0.1187,  0.0838,
         0.0879, -0.1233, -0.1376,  0.0362, -0.0118,  0.0968,  0.0216, -0.0442,
         0.0231, -0.0466, -0.0051, -0.1075, -0.0082,  0.1044, -0.1241, -0.0916,
        -0.1195,  0.0070,  0.1671, -0.0364, -0.0341,  0.0088, -0.0421,  0.2043,
        -0.0045, -0.0578,  0.0057, -0.1108, -0.0250, -0.0090,  0.0726,  0.0838,
        -0.0743,  0.0437,  0.0070,  0.1479, -0.1536,  0.1529, -0.0360,  0.1777,
        -0.0656,  0.0684, -0.0538, -0.0521,  0.0021, -0.0711, -0.0144, -0.0978,
         0.1119, -0.0374, -0.0377, -0.0657, -0.0483,  0.0033, -0.0365, -0.1231,
        -0.0474,  0.0780,  0.1065, 

#### **Graph Propagation Layers**

In [35]:
class EmbeddingPropagationLayer(nn.Module):
    def __init__(self, embedding_dim, alpha=0.2):
        super(EmbeddingPropagationLayer, self).__init__()
        # Trainable weight matrices for embedding transformation and interaction
        self.W1 = nn.Parameter(torch.randn(embedding_dim, embedding_dim))
        self.W2 = nn.Parameter(torch.randn(embedding_dim, embedding_dim))
        self.alpha = alpha  # LeakyReLU coefficient

    def forward(self, user_embeddings, item_embeddings, adjacency_matrix):
        """
        Perform one layer of embedding propagation between users and items.
        """
        # Normalize adjacency matrix (graph Laplacian scaling factor)
        row_sum = adjacency_matrix.sum(dim=1, keepdim=True)
        col_sum = adjacency_matrix.sum(dim=0, keepdim=True)
        scaling_factor = torch.sqrt(row_sum * col_sum) + 1e-8
        normalized_adj = adjacency_matrix / scaling_factor

        # Compute messages from items to users
        message_item_to_user = self.compute_message(item_embeddings, user_embeddings, normalized_adj.T)

        # Aggregate messages for users
        updated_user_embeddings = self.aggregate_embeddings(user_embeddings, message_item_to_user)

        # Compute messages from users to items
        message_user_to_item = self.compute_message(user_embeddings, item_embeddings, normalized_adj)

        # Aggregate messages for items
        updated_item_embeddings = self.aggregate_embeddings(item_embeddings, message_user_to_item)

        return updated_user_embeddings, updated_item_embeddings

    def compute_message(self, source_embeddings, target_embeddings, adjacency_matrix):
        """
        Compute messages from source nodes (e.g., items) to target nodes (e.g., users).
        """
        # Interaction term between source and target embeddings (element-wise multiplication)
        interaction_term = source_embeddings.unsqueeze(1) * target_embeddings.unsqueeze(0)

        # Message encoding (Equation 3 in the paper)
        message = torch.matmul(source_embeddings, self.W1) + torch.matmul(interaction_term, self.W2)
        
        # Aggregate the messages with the adjacency matrix
        message = torch.bmm(adjacency_matrix.unsqueeze(2), message)  # Adjust shape for broadcasting

        return message

    def aggregate_embeddings(self, embeddings, message):
        """
        Aggregate messages and update embeddings with LeakyReLU (Equation 4 in the paper).
        """
        aggregated_message = message.sum(dim=1) + embeddings  # Self-connection included
        updated_embeddings = F.leaky_relu(aggregated_message, negative_slope=self.alpha)

        return updated_embeddings

In [36]:
class GraphEmbeddingPropagation(nn.Module):
    def __init__(self, user_embeddings, item_embeddings, adjacency_matrix, embedding_dim=128, num_layers=3, sample_size=50, alpha=0.2):
        super(GraphEmbeddingPropagation, self).__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.sample_size = sample_size  # Number of neighbors to sample
        self.alpha = alpha  # LeakyReLU coefficient

        # Initialize user and item embeddings as trainable parameters
        self.user_embeddings = nn.Parameter(torch.tensor(user_embeddings, dtype=torch.float32))
        self.item_embeddings = nn.Parameter(torch.tensor(item_embeddings, dtype=torch.float32))

        # Store adjacency matrix as sparse for memory efficiency
        self.adjacency_matrix = adjacency_matrix.coalesce()  # Convert to sparse format

        # Stack embedding propagation layers
        self.layers = nn.ModuleList([EmbeddingPropagationLayer(embedding_dim, alpha) for _ in range(num_layers)])

    def sample_neighbors(self, node_idx, num_neighbors):
        """
        Sample a fixed number of neighbors for a given node index.
        """
        neighbors = self.adjacency_matrix.indices()[1][self.adjacency_matrix.indices()[0] == node_idx]
        if len(neighbors) > num_neighbors:
            neighbors = random.sample(neighbors.tolist(), num_neighbors)
        return neighbors

    def create_sampled_adjacency(self, batch_nodes):
        """
        Create a sampled adjacency matrix for the batch nodes.
        """
        indices = []
        values = []

        for node in batch_nodes:
            neighbors = self.sample_neighbors(node, self.sample_size)
            for neighbor in neighbors:
                indices.append([node, neighbor])
                values.append(self.adjacency_matrix[node, neighbor].item())

        # Convert to sparse adjacency matrix
        indices = torch.tensor(indices, dtype=torch.long).t()  # Transpose for PyTorch format
        values = torch.tensor(values, dtype=torch.float32)
        sampled_adj = torch.sparse_coo_tensor(indices, values, self.adjacency_matrix.size())
        return sampled_adj.coalesce()

    def forward(self, batch_nodes):
        """
        Perform multi-layer embedding propagation with neighbor sampling and message passing.
        """
        # Initialize embeddings
        user_embeddings, item_embeddings = self.user_embeddings, self.item_embeddings

        # Create sampled adjacency matrix
        sampled_adj = self.create_sampled_adjacency(batch_nodes)

        # Sequentially apply each embedding propagation layer
        for layer in self.layers:
            user_embeddings, item_embeddings = layer(user_embeddings, item_embeddings, sampled_adj)

        return user_embeddings, item_embeddings

In [37]:
class NGCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=64, num_layers=3, dropout_ratio=0.1, p1=0.1, p2=0.0):
        super(NGCF, self).__init__()

        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        
        # Define the layer transformations
        self.weights = nn.ModuleList([nn.Linear(embedding_size, embedding_size) for _ in range(num_layers)])
        
        # Initialize with Xavier initialization
        self.apply(self.xavier_init)
        
        # Dropout rates
        self.p1 = p1
        self.p2 = p2
        
        # Final linear layer to map concatenated embeddings back to the embedding size
        self.final_user_linear = nn.Linear(embedding_size * (num_layers + 1), embedding_size)
        self.final_item_linear = nn.Linear(embedding_size * (num_layers + 1), embedding_size)

    def xavier_init(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)

    def forward(self, user_indices, item_indices, negative_item_indices):
        user_emb = self.user_embedding(user_indices)
        item_emb = self.item_embedding(item_indices)
        negative_item_emb = self.item_embedding(negative_item_indices)

        all_user_embeddings = [user_emb]
        all_item_embeddings = [item_emb]
        
        # Propagate through layers
        for layer in range(self.num_layers):
            user_emb = F.dropout(user_emb, p=self.p1, training=self.training)
            item_emb = F.dropout(item_emb, p=self.p1, training=self.training)
            
            user_emb = F.relu(self.weights[layer](user_emb))
            item_emb = F.relu(self.weights[layer](item_emb))
            
            all_user_embeddings.append(user_emb)
            all_item_embeddings.append(item_emb)
        
        # Concatenate embeddings
        user_final_emb = torch.cat(all_user_embeddings, dim=-1)  # Shape: (batch_size, embedding_size * (num_layers + 1))
        item_final_emb = torch.cat(all_item_embeddings, dim=-1)  # Shape: (batch_size, embedding_size * (num_layers + 1))

        # Final projection to embedding size
        user_final_emb = self.final_user_linear(user_final_emb)
        item_final_emb = self.final_item_linear(item_final_emb)

        # Apply dropout to final embeddings
        if self.p2 > 0:
            user_final_emb = F.dropout(user_final_emb, p=self.p2, training=self.training)
            item_final_emb = F.dropout(item_final_emb, p=self.p2, training=self.training)

        # Compute predictions: positive items and negative items
        prediction_pos = torch.sum(user_final_emb * item_final_emb, dim=-1)
        prediction_neg = torch.sum(user_final_emb * negative_item_emb, dim=-1)

        return prediction_pos, prediction_neg


#### **Training and test data**

In [131]:
# Initialize mappings
customer_to_int = {}
product_to_int = {}

# Go through the graph and create mappings based on original IDs
for node in B.nodes:
    if B.nodes[node].get('type') == 'customer' and node.startswith('Customer_'):
        customer_id = int(node.split('_')[1])
        customer_to_int[node] = customer_id
    elif B.nodes[node].get('type') == 'product' and node.startswith('Product_'):
        product_id = int(node.split('_')[1])
        product_to_int[node] = product_id

In [152]:
def hard_negative_sampling(user_idx, positive_items, all_items, product_embeddings, num_negatives=5, subset_size=1000):
    """
    Perform hard negative sampling efficiently with subset sampling and optimized cosine similarity.
    """
    # Use GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    product_embeddings = product_embeddings.to(device)

    # Efficiently sample a subset of all_items
    if isinstance(all_items, range):
        subset_indices = torch.randint(0, len(all_items), (min(subset_size, len(all_items)),), device=device)
    else:
        subset_indices = torch.tensor(
            random.sample(all_items, min(subset_size, len(all_items))),
            device=device,
            dtype=torch.long
        )

    # Exclude positive items from the subset
    positive_items_tensor = torch.tensor(positive_items, device=device, dtype=torch.long)
    mask = ~torch.isin(subset_indices, positive_items_tensor)
    filtered_subset = subset_indices[mask]

    # Ensure at least `num_negatives` items are available
    if len(filtered_subset) < num_negatives:
        print(f"Warning: Not enough filtered negatives, reducing to {len(filtered_subset)}")
        num_negatives = len(filtered_subset)

    # Get embeddings for positive items and the filtered subset
    positive_embeddings = product_embeddings[positive_items_tensor]
    subset_embeddings = product_embeddings[filtered_subset]

    # Normalize embeddings for cosine similarity calculation
    positive_embeddings = F.normalize(positive_embeddings, dim=1)
    subset_embeddings = F.normalize(subset_embeddings, dim=1)

    # Compute cosine similarities between subset and positive items
    similarities = torch.mm(subset_embeddings, positive_embeddings.T)  # (|subset|, |positive_items|)

    # Compute mean similarity for each sampled item in the subset
    mean_similarities = similarities.mean(dim=1)  # (|subset|,)

    # Select the top-k hardest negatives (lowest mean similarity)
    hard_negatives_indices = torch.topk(mean_similarities, num_negatives, largest=False).indices

    # Map back to the original item indices
    hard_negatives = filtered_subset[hard_negatives_indices]

    return hard_negatives.tolist()

def generate_negative_samples(user_indices, pos_item_indices, all_item_indices, product_embeddings, num_negatives=10, device='cuda'):
    """
    Generates negative samples for the given users and items using hard_negative_sampling.
    Ensures that negative samples are within valid bounds and handles edge cases gracefully.
    """
    # Ensure all inputs are on the correct device (GPU or CPU)
    user_indices = user_indices.to(device)
    pos_item_indices = pos_item_indices.to(device)
    all_item_indices = all_item_indices.to(device)

    negative_samples = []

    for user, pos_item in zip(user_indices, pos_item_indices):
        try:
            # Adjust product item indices to match embedding index space
            pos_item_adjusted = pos_item + 99999  # Add 99,999 to match the embedding indices for products
            
            # Perform hard negative sampling for this user
            negatives = hard_negative_sampling(
                user_idx=user.item(),
                positive_items=[pos_item_adjusted.item()],  # Use adjusted index for products
                all_items=all_item_indices.tolist(),
                product_embeddings=product_embeddings,
                num_negatives=num_negatives
            )

            # Ensure the output length matches the requested number of negatives
            if len(negatives) < num_negatives:
                # Fill with random negatives if hard sampling didn't return enough
                remaining_negatives = num_negatives - len(negatives)
                additional_negatives = random.sample(
                    list(set(all_item_indices.tolist()) - {pos_item_adjusted.item()}),
                    remaining_negatives
                )
                negatives.extend(additional_negatives)

            negative_samples.append(negatives)

        except Exception as e:
            # Handle any errors during sampling
            print(f"Error generating negatives for user {user.item()}, item {pos_item.item()}: {e}")
            # Add random negatives as a fallback
            fallback_negatives = random.sample(
                list(set(all_item_indices.tolist()) - {pos_item_adjusted.item()}),
                num_negatives
            )
            negative_samples.append(fallback_negatives)

    # Convert the list of negatives into a tensor
    return torch.tensor(negative_samples, device=device)

def parallel_negative_sampling(user_indices, item_indices, all_item_indices, max_neg_samples=5, device='cuda'):
    """
    Generate negative samples efficiently using GPU-based tensor operations.
    Args:
        user_indices (torch.Tensor): The user indices for the batch.
        item_indices (torch.Tensor): The item indices for the batch.
        all_item_indices (torch.Tensor): The unique list of all item indices.
        max_neg_samples (int): The maximum number of negative samples to generate per user-item pair.
        device (str): Device to run the computations ('cuda' or 'cpu').
    Returns:
        torch.Tensor: The negative item indices for each user-item pair.
    """

    # Ensure the data is on the correct device (GPU or CPU)
    user_indices = user_indices.to(device)
    item_indices = item_indices.to(device)
    all_item_indices = all_item_indices.to(device)

    def generate_negatives(user_idx, item_idx):
        # Sample random negatives (ensuring no overlap with the positive item)
        neg_samples = all_item_indices[torch.randint(len(all_item_indices), (max_neg_samples,)).to(device)]
        neg_samples = neg_samples[neg_samples != item_idx]  # Exclude the positive item
        
        # If less than required, keep sampling until we fill max_neg_samples
        while len(neg_samples) < max_neg_samples:
            additional_neg_samples = all_item_indices[torch.randint(len(all_item_indices), (max_neg_samples - len(neg_samples),)).to(device)] 
            neg_samples = torch.cat((neg_samples, additional_neg_samples), dim=0)
            neg_samples = neg_samples[neg_samples != item_idx]
        
        return neg_samples[:max_neg_samples]

    # Parallelizing the negative sampling for each user-item pair using multiprocessing
    def negative_sampling_worker(start_idx, end_idx):
        negatives_batch = []
        for idx in range(start_idx, end_idx):
            user_idx = user_indices[idx]
            item_idx = item_indices[idx]
            neg_samples = generate_negatives(user_idx, item_idx)
            negatives_batch.append(neg_samples)
        return negatives_batch

    num_workers = mp.cpu_count()  # Use all available CPU cores
    chunk_size = len(user_indices) // num_workers
    chunks = [(i * chunk_size, (i + 1) * chunk_size) for i in range(num_workers)]

    # Use multiprocessing to distribute the negative sampling across the workers
    with mp.Pool(num_workers) as pool:
        results = pool.starmap(negative_sampling_worker, chunks)
    
    # Flatten the results
    all_negatives = torch.cat([torch.cat(res, dim=0) for res in results], dim=0)

    return all_negatives

# Training DataLoader
def data_loader(user_indices, pos_item_indices, neg_item_indices, batch_size):
    dataset = TensorDataset(user_indices, pos_item_indices, neg_item_indices)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

In [153]:
def prepare_data(B, batch_size=10000, model=None, item_embeddings=None):
    """
    Prepares the training data with positive and dynamically generated hard negative samples.
    Optimized by caching neighbors and avoiding redundant lookups.
    """
    if item_embeddings is None:
        raise ValueError("Item embeddings are None. Ensure that the embeddings are properly initialized.")
    
    # Map user and item IDs to integers
    user_to_int = {user: idx for idx, user in enumerate(n for n in B.nodes if B.nodes[n].get('type') == 'customer')}
    item_to_int = {item: idx for idx, item in enumerate(n for n in B.nodes if B.nodes[n].get('type') == 'product')}
    
    # Extract users and items
    users = list(user_to_int.keys())
    all_items = torch.tensor(list(item_to_int.values()), dtype=torch.long)  # All item IDs as tensor
    
    # Cache neighbors for all users (assuming these are static across batches)
    user_neighbors = {user: list(B.neighbors(user)) for user in users}

    # Initialize lists to collect results
    positive_users, positive_items = [], []
    negative_users, negative_items = [], []

    # Process users in batches
    for i in range(0, len(users), batch_size):
        batch_users = users[i:i + batch_size]
        
        # Create positive interactions (observed user-item pairs)
        batch_positive_users, batch_positive_items = [], []
        for user in batch_users:
            neighbors = user_neighbors[user]
            for item in neighbors:
                if B[user][item].get('weight', 0) > 0:  # Valid interaction
                    batch_positive_users.append(user_to_int[user])
                    batch_positive_items.append(item_to_int[item])
        
        positive_users.extend(batch_positive_users)
        positive_items.extend(batch_positive_items)
        
        # Efficient negative sampling for the batch
        for user in batch_users:
            # Get positive items for the user (those they've interacted with)
            positive_items_for_user = [item_to_int[item] for item in user_neighbors[user] if B[user][item].get('weight', 0) > 0]
            
            # Get unobserved items efficiently by creating a mask
            positive_items_set = set(positive_items_for_user)
            unobserved_items = all_items[~torch.isin(all_items, torch.tensor(positive_items_for_user, dtype=torch.long))]

            # Adjust the indices for negative sampling to match embeddings
            unobserved_items_adjusted = unobserved_items + 99999  # Add 99,999 to product indices
            
            # Sample hard negatives for the entire batch of users at once
            sampled_negatives = hard_negative_sampling(user_to_int[user], positive_items_for_user, unobserved_items_adjusted.tolist(), item_embeddings)
            
            # Collect the negative samples
            for neg_item in sampled_negatives:
                negative_users.append(user_to_int[user])
                negative_items.append(neg_item)

    # Ensure there are sufficient positive and negative samples
    if not positive_users or not negative_users:
        raise ValueError("Insufficient positive or negative samples generated.")

    # Convert to tensors for use in training
    positive_users_tensor = torch.tensor(positive_users, dtype=torch.long)
    positive_items_tensor = torch.tensor(positive_items, dtype=torch.long)
    negative_users_tensor = torch.tensor(negative_users, dtype=torch.long)
    negative_items_tensor = torch.tensor(negative_items, dtype=torch.long)
    
    # Return the dataset
    return data_loader(positive_users_tensor, positive_items_tensor, negative_items_tensor, batch_size)

In [154]:
def prepare_test_data(B, batch_size=10000):
    # Map user and item IDs to integers
    user_to_int = {user: idx for idx, user in enumerate(n for n in B.nodes if B.nodes[n].get('type') == 'customer')}
    item_to_int = {item: idx for idx, item in enumerate(n for n in B.nodes if B.nodes[n].get('type') == 'product')}

    # Extract users as integers
    users = list(user_to_int.keys())

    # Initialize lists to collect results
    test_users, test_items = [], []

    # Precompute neighbors for all users in the graph to avoid redundant calculations
    user_neighbors = {user: list(B.neighbors(user)) for user in users}
    
    # Process users in batches to save memory
    for i in range(0, len(users), batch_size):
        batch_users = users[i:i + batch_size]
        
        # Create test samples for the current batch
        for user in batch_users:
            neighbors = user_neighbors[user]
            for item in neighbors:
                # Check if the interaction is valid (positive weight)
                if B[user][item].get('weight', 0) > 0:
                    test_users.append(user_to_int[user])
                    test_items.append(item_to_int[item])

    if not test_users:
        raise ValueError("No test samples found in the dataset.")

    # Convert to tensors
    test_user_indices = torch.tensor(test_users, dtype=torch.long)
    test_item_indices = torch.tensor(test_items, dtype=torch.long)
    
    return test_user_indices, test_item_indices

In [155]:
def bpr_loss(y_pred_pos, y_pred_neg, lambda_reg=1e-5):
    """
    BPR loss function
    Args:
        y_pred_pos: predicted score for positive item
        y_pred_neg: predicted score for negative item
        lambda_reg: L2 regularization strength
    """
    loss = -torch.log(torch.sigmoid(y_pred_pos - y_pred_neg)).mean()
    reg_loss = lambda_reg * (y_pred_pos.norm(2) + y_pred_neg.norm(2))
    return loss + reg_loss

#### Evaluation metric

In [156]:
def hit_at_k(predictions, ground_truth, k=10):
    """
    Calculate Hit@K for recommendations.
    
    Args:
        predictions: The predicted scores for items (e.g., user-item pairs).
        ground_truth: The true items that the user interacted with.
        k: The number of top items to consider for the hit calculation.

    Returns:
        hit@k score: The percentage of times the true item is in the top-k recommendations.
    """
    hits = 0
    for i in range(len(predictions)):
        # Ensure that predictions[i] is a 1D tensor (with items to rank)
        if predictions[i].dim() == 1:
            num_items = predictions[i].size(0)
        else:
            # Skip if the prediction is not a tensor with the expected dimensions
            continue
        
        # If the number of items is fewer than k, use all items
        if num_items < k:
            top_k_items = predictions[i].topk(num_items).indices.tolist()
        else:
            top_k_items = predictions[i].topk(k).indices.tolist()

        # Check if the true item is in the top k predictions
        if ground_truth[i] in top_k_items:
            hits += 1

    hit_at_k = hits / len(predictions)
    return hit_at_k

In [157]:
def train(model, train_loader, optimizer, epoch, device):
    model.train()
    total_loss = 0.0
    scaler = GradScaler()  # For mixed precision training
    
    for user_indices, item_indices, negative_item_indices in train_loader:
        # Move data to the device in advance if it's not already there
        user_indices = user_indices.to(device)
        item_indices = item_indices.to(device)
        negative_item_indices = negative_item_indices.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        with autocast():  # Enable mixed precision
            # Forward pass
            positive_preds, negative_preds = model(user_indices, item_indices, negative_item_indices)

            # Compute BPR loss
            loss = bpr_loss(positive_preds, negative_preds, lambda_reg=1e-5)

        # Backpropagate and optimize
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    return total_loss / len(train_loader)

def evaluate(model, test_loader, device, k=10):
    model.eval()
    predictions = []
    ground_truth = []

    with torch.no_grad():
        for user_indices, item_indices, _ in test_loader:
            # Move data to device in advance
            user_indices = user_indices.to(device)
            item_indices = item_indices.to(device)

            # Forward pass
            positive_preds, _ = model(user_indices, item_indices, item_indices)
            predictions.append(positive_preds)

            # Collect ground truth (true items)
            ground_truth.append(item_indices)

    # Convert lists to tensors
    predictions = torch.cat(predictions, dim=0).cpu()  # Move only once after the loop
    ground_truth = torch.cat(ground_truth, dim=0).cpu()

    # Calculate Hit@10
    return hit_at_k(predictions, ground_truth, k)

In [164]:
# Set the device for GPU usage (if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device in use: {device}")

Device in use: cuda


In [165]:
# Define number of users and items
num_users = [n for n in B.nodes if B.nodes[n].get('type') == 'customer']
num_items = [n for n in B.nodes if B.nodes[n].get('type') == 'product']

embedding_size = 64
num_layers = 3 
batch_size = 1024
learning_rate = 0.001
l2_reg = 1e-5
dropout = 0.1
node_dropout = 0.1
epochs = 100
patience = 10  # Early stopping patience

In [166]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # Ensure errors are reported synchronously

# Initialize data preparation and model setup (same as your code)

# Prepare data, assuming prepare_data() and prepare_test_data() functions are properly defined
batch_size_data_prep = 10000
mp.set_start_method('spawn', force=True)

train_user_indices, train_item_indices, train_negative_item_indices = prepare_data(
    B, batch_size=batch_size_data_prep, item_embeddings=product_embeddings
)
test_user_indices, test_item_indices = prepare_test_data(B, batch_size=batch_size_data_prep)

# Downsample for faster experimentation (optional)
sample_fraction = 0.1
num_samples = min(int(sample_fraction * len(train_user_indices)), len(train_user_indices))
train_sample_indices = torch.randperm(len(train_user_indices))[:num_samples]

# Split into train/val sets (same as your code)
train_split_fraction = 0.8
num_train_samples = int(train_split_fraction * num_samples)
permuted_indices = torch.randperm(num_samples)

train_indices = train_sample_indices[permuted_indices[:num_train_samples]]
val_indices = train_sample_indices[permuted_indices[num_train_samples:]]

# Ensure alignment between train and validation sets
train_user_indices = train_user_indices[train_indices]
train_item_indices = train_item_indices[train_indices]
train_negative_item_indices = train_negative_item_indices[train_indices]

val_user_indices = train_user_indices[val_indices]
val_item_indices = train_item_indices[val_indices]
val_negative_item_indices = train_negative_item_indices[val_indices]

# Combine unique item indices for negative sampling
all_item_indices = torch.unique(torch.cat((train_item_indices, train_negative_item_indices)))

# Move product embeddings and data to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
product_embeddings = product_embeddings.to(device)

# Ensure indices are within the range of product embeddings
max_index = product_embeddings.size(0) - 1
assert torch.all(train_user_indices <= max_index), "User indices are out of bounds."
assert torch.all(train_item_indices <= max_index), "Item indices are out of bounds."
assert torch.all(train_negative_item_indices <= max_index), "Negative item indices are out of bounds."

train_user_indices = train_user_indices.to(device)
train_item_indices = train_item_indices.to(device)
train_negative_item_indices = train_negative_item_indices.to(device)

val_user_indices = val_user_indices.to(device)
val_item_indices = val_item_indices.to(device)
val_negative_item_indices = val_negative_item_indices.to(device)

test_user_indices = test_user_indices.to(device)
test_item_indices = test_item_indices.to(device)

# Negative sampling function (assuming it's properly defined)
train_negative_item_indices = parallel_negative_sampling(
    train_user_indices, train_item_indices, all_item_indices, max_neg_samples=5
)
val_negative_item_indices = parallel_negative_sampling(
    val_user_indices, val_item_indices, all_item_indices, max_neg_samples=5
)

# Create DataLoaders for training, validation, and testing
train_loader = data_loader(train_user_indices, train_item_indices, train_negative_item_indices, batch_size_data_prep)
val_loader = data_loader(val_user_indices, val_item_indices, val_negative_item_indices, batch_size_data_prep)

test_dataset = TensorDataset(test_user_indices, test_item_indices)
test_loader = DataLoader(test_dataset, batch_size=batch_size_data_prep, shuffle=False, num_workers=4, pin_memory=True)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Instantiate the model with the fixed hyperparameters
model = NGCF(
    num_users=len(num_users),
    num_items=len(num_items),
    embedding_size=embedding_size,
    num_layers=num_layers,
    dropout_ratio=dropout,
    p1=dropout,
    p2=node_dropout,
).to(device)

# Define optimizer with the fixed learning rate and L2 regularization
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_reg)

# Early stopping parameters
best_val_loss = float('inf')
best_hit_at_10 = 0.0  # For tracking Hit@10 improvement
epochs_since_improvement = 0

# Path to save the model
save_path = "best_ngcf_model.pth"

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    accumulation_steps = 4  # Accumulate gradients over 4 mini-batches

    for batch_idx, (users, pos_items, neg_items) in enumerate(train_loader):
        # Move data to GPU, non-blocking for efficiency
        users = users.to(device, non_blocking=True)
        pos_items = pos_items.to(device, non_blocking=True)
        neg_items = neg_items.to(device, non_blocking=True)
        
        # Zero gradients (before accumulating)
        optimizer.zero_grad()
        
        # Forward pass
        positive_preds, negative_preds = model(users, pos_items, neg_items)
        
        # Compute BPR loss
        loss = bpr_loss(positive_preds, negative_preds, lambda_reg=1e-5)
        
        # Backpropagation (but don't optimize yet)
        loss.backward()

        # Update weights every `accumulation_steps` batches
        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_loader):
            optimizer.step()
            optimizer.zero_grad()  # Reset gradients after step

        epoch_loss += loss.item()
    # Average loss for the epoch
    train_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")
    
    # Validate less frequently (e.g., every 5 epochs)
    if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
        model.eval()
        with torch.no_grad():
            val_loss = evaluate(model, val_loader, device)
            hit_at_10_score = evaluate(model, val_loader, device, k=10)
        print(f"Epoch {epoch + 1}/{epochs}, Validation Loss: {val_loss:.4f}, Hit@10: {hit_at_10_score:.4f}")

        # Save model if improved
        if val_loss < best_val_loss or hit_at_10_score > best_hit_at_10:
            best_val_loss = val_loss
            best_hit_at_10 = hit_at_10_score
            epochs_since_improvement = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_loss': best_val_loss,
                'best_hit_at_10': best_hit_at_10
            }, save_path)
            print(f"Model saved to {save_path}")
        else:
            epochs_since_improvement += 1
            if epochs_since_improvement >= patience:
                print(f"Early stopping triggered at epoch {epoch + 1}.")
                break

# Final Test Evaluation
model.eval()
with torch.no_grad():
    final_hit_at_10 = evaluate(model, test_loader, device, k=10)
print(f"Final Hit@10: {final_hit_at_10:.4f}")

NameError: name 'NGCF' is not defined

### Evaluate frequency model on whole test data using hitrate@10

The aim here is to assess how relevant our 10 product recommendations are. To do this, we calculate the percentage of these recommendations that are actually purchased by the customer.

In order to be precise, we describe here the formula for the computation of the score **hitrate@10**.
Let us introduce two vectors:
+ $(a_i)_{1 \le i \le N}$ with $N \in \mathbb{N}^*$ the products purchased by a customer during an order,
+ $(y_i)_{1 \le i \le 10}$ the $10$ predicted products.

Let us remark that $N$ is not necessarily equal to $10$. Then we have the following formula:


$$HitRate@K(a,y) = \frac{1}{\min(N,10)}\sum_{i=1}^{10} \mathbb{1}_{y_i \in \{a_1,...,a_N\}}$$

Let us provide several remarks:
+ We divide by $\min(N,10)$ in order to have a perfect score $1$ when the customer buys less than $10$ products and all these products are predicted.
+ This definition implicitly assumes that all $y_1,...,y_{10}$ are distinct. If some values are identical, the result can be distorted: for example, if the same product is entered $10$ times and this product is purchased, then the score is $1$. In order to avoid such a case, we ask you to have $10$ different products in your prediction. 
+ For the private and public score on kaggle, the rank is not taken into account. Nevertheless, we may use the score   **hitrate@K** for $K<10$ after the competition to decide between possible very close groups. So we ask you to give for each product a rank between $1$ and $10$ (i.e. a permutation of $\{1,...,10\}$).

In [None]:
# Hitrate@10 evaluation function

def hitrate_at_k(true_data: pd.DataFrame,
                 predicted_data: pd.DataFrame,
                 k: int = 10) -> float:
    """
    This function calculates the hitrate at k for the recommendations.
    It assesses how relevant our 10 product recommendations are.
    In other words, it calculates the proportion of recommended products that are actually purchased by the customer.

    Args:
        true_data: a pandas DataFrame containing the true data
            customer_id: the customer identifier
            product_id: the product identifier that was purchased in the test set
        predicted_data: a pandas DataFrame containing the predicted data
            customer_id: the customer identifier
            product_id: the product identifier that was recommended
            rank: the rank of the recommendation. the rank should be between 1 and 10.
        k: the number of recommendations to consider. k should be between 1 and 10.
    
    Returns:
        The hitrate at k
    """
    
    data = pd.merge(left = true_data, right = predicted_data, how = "left", on = ["customer_id", "product_id"])
    df = data[data["rank"] <= k]
    non_null_counts = df.groupby('customer_id')['rank'].apply(lambda x: x.notna().sum()).reset_index(name='non_null_count')
    distinct_products_per_customer = data.groupby('customer_id')['product_id'].nunique().reset_index(name='distinct_product_count')
    df = pd.merge(left = distinct_products_per_customer, right = non_null_counts, how = "left", on = "customer_id")
    df["denominator"] = [min(df.iloc[i].distinct_product_count,k) for i in range(len(df))]
    df = df.fillna(0)
    return (df["non_null_count"]/df["denominator"]).mean()

In [None]:
# Calculate the hitrate at k for k = 10
frequency_model_hitrate_at_10 = hitrate_at_k(test_data,top_10_recommendations,10)
print(f"Hitrate@10 for the frequency model is {frequency_model_hitrate_at_10:.2f}")

# Create submission file 

The goal of this part is to provide a function that allows you to encode your prediction in a format that is readable by kaggle when you submit it. In particular, this function checks that you have 10 distinct products per customer and that the ranks are some distinct integers between 1 to 10.

In [None]:
# Create submission file for 

# Keep only the top 10 recommendations for Households between 80001 and 100000
prediction = top_10_recommendations[
    top_10_recommendations.customer_id.isin(
            [
                f"Household_{i}" for i in range(80001,100001)
            ]
        )
    ]

# Print the solution
prediction.head()

In [None]:
def process_and_format_prediction(df):

    # Replace invalid characters in column names
    df.columns = df.columns.str.replace('+AF8-', '_', regex=False)
    df = df.replace(r'\+AF8-', '_', regex=True)

    # Clean the 'customer_id', 'product_id', and 'transaction_id' columns
    if 'customer_id' in df.columns and df['customer_id'].dtype == 'object':
        df['customer_id'] = df['customer_id'].str.extract('(\d+)').fillna(11).astype(int)

    if 'product_id' in df.columns and df['product_id'].dtype == 'object':
        df['product_id'] = df['product_id'].str.extract('(\d+)').fillna(11).astype(int)

    if 'transaction_id' in df.columns and df['transaction_id'].dtype == 'object':
        df['transaction_id'] = df['transaction_id'].str.replace(r'\D', '', regex=True).fillna(11).astype(int)

    df['id'] = df.index
    df = df[['id'] + [col for col in df.columns if col != 'id']]

    if 'customer_id' not in df.columns or 'product_id' not in df.columns:
        raise ValueError("true_data must contain 'customer_id' and 'product_id' columns")

    # Group by customer_id and concatenate product and rank values
    prediction_grouped = df.groupby('customer_id').agg({
        'id': 'first',  # Take the first value of 'id'
        'product_id': lambda x: ','.join(map(str, x)),  # Concatenate product_ids into a string
        'rank': lambda x: ','.join(map(str, x))  # Concatenate ranks into a string
    }).reset_index()

    # Drop the 'id' column if it exists
    if 'id' in prediction_grouped.columns:
        prediction_grouped = prediction_grouped.drop(columns=['id'])

    # Filter the data
    prediction_grouped = prediction_grouped[prediction_grouped['customer_id'] != 11]
    prediction_grouped.insert(0, 'id', range(len(prediction_grouped)))
    
    # Verify ranks and duplicates
    for index, row in prediction_grouped.iterrows():

        # Check ranks
        ranks = list(map(int, row['rank'].split(',')))

        if sorted(ranks) != list(range(1, 11)):  # Check that ranks are distinct from 1 to 10
            print("Duplicate detected. Ranks must be distinct (from 1 to 10) for each of the 10 predicted products for a customer.\n")
            return None
        
        # Check for product duplicates
        products = row['product_id'].split(',')
        
        if len(products) != len(set(products)):  # If duplicates are present in the products
            print("Duplicate detected. There must be 10 different products per customer.\n")
            return None

    return prediction_grouped

prediction_grouped = process_and_format_prediction(prediction)
print(prediction_grouped)


In [None]:
# Create a .csv file to submit on kaggle
prediction_grouped.to_csv('submission/submission_list.csv', index=False)