Data Preprocessing

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import implicit

# Load and preprocess data
data = pd.read_parquet('train.parquet')
data['date'] = pd.to_datetime(data['date'])

# Aggregating user-item interactions
user_item_interaction = data.groupby(['userId', 'itemId']).size().reset_index(name='interaction')

# Create a user-item interaction matrix
interaction_matrix = user_item_interaction.pivot(index='userId', columns='itemId', values='interaction').fillna(0)

# Convert to sparse matrix format
interaction_sparse = csr_matrix(interaction_matrix.values)

# Train-test split
train_data, test_data = train_test_split(user_item_interaction, test_size=0.2, random_state=42)

# Create interaction matrices for train and test sets
train_matrix = train_data.pivot(index='userId', columns='itemId', values='interaction').fillna(0)
test_matrix = test_data.pivot(index='userId', columns='itemId', values='interaction').fillna(0)

train_sparse = csr_matrix(train_matrix.values)
test_sparse = csr_matrix(test_matrix.values)

Model Building:

Selected Model: https://benfred.github.io/implicit/api/models/cpu/als.html

The ALS algorithm on the library is based on the paper: "Collaborative Filtering for Implicit Feedback Datasets" with additional optimizations. ALS algorithm is used with datasets where the data is implicit like click count, and it is suitable for large datasets.

In [11]:
# Initialize the ALS model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.1, iterations=20)

# Train the model
model.fit(train_sparse)

def recommend_items_with_descriptions(user_id, model, interaction_matrix, interaction_sparse, data, num_recommendations=5):
    user_index = interaction_matrix.index.get_loc(user_id)
    item_indices, scores = model.recommend(user_index, interaction_sparse[user_index], N=num_recommendations)
    recommended_item_ids = [interaction_matrix.columns[i] for i in item_indices]
    
    # Retrieve item descriptions
    recommended_items = data[data['itemId'].isin(recommended_item_ids)][['itemId', 'category']].drop_duplicates().reset_index(drop=True)
    
    return recommended_items

100%|██████████| 20/20 [00:11<00:00,  1.80it/s]


Demo

In [12]:
# Example: Generate recommendations for a user
example_user_id = interaction_matrix.index[0]
recommended_items_with_desc = recommend_items_with_descriptions(example_user_id, model, interaction_matrix, interaction_sparse, data)
print(f"Recommended items for user {example_user_id}:\n{recommended_items_with_desc}")

Recommended items for user 0001d86ea81e6eef12cebaa1dcbdadc2:
                                              itemId  \
0                   1ebaecedff83b2c752deb1060147e21e   
1                   790901c3b65b96818091880a4c79a8b8   
2                   790901c3b65b96818091880a4c79a8b8   
3                   aae00106ce8ee782b6541b4bbce37fed   
4  ["b2649b0b9daf26a1a4e8ce0cdf700b13","66f120ebc...   
5                   b4a195de86acbc620f75c1a04ca1a9b4   

                                           category  
0                ["kadın ayakkabi","spor ayakkabı"]  
1  ["seyahat american tourister","kabin boy valiz"]  
2            ["seyahat american tourister","çocuk"]  
3              ["erkek ayakkabı","günlük ayakkabı"]  
4                                                []  
5                                                []  
