In [1]:
# Importing necessary libraries
import pandas as pd

# Load the raw session log. Each row pairs a session_id with one product_name;
# the file's leading unnamed column is read as an ordinary column
# ('Unnamed: 0'), as the preview below shows.
session_csv = 'data/session_data.csv'
data = pd.read_csv(session_csv)

# Preview the first five rows as a quick sanity check on the load.
data.head(n=5)

Unnamed: 0,session_id,product_name
0,000ed966131fcb96e0efc4ff2b716a3e,beetroot
1,000ed966131fcb96e0efc4ff2b716a3e,cucumber
2,0013eab657eaf2d82d7f1e13023d95c2,onion
3,0013eab657eaf2d82d7f1e13023d95c2,long shelf life milk
4,0013fabde1e543dd541be925266aadbc,dates


## Feature Engineering

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Map every distinct product name to an integer code. `encoder` keeps the
# fitted mapping so codes can later be translated back into names.
encoder = LabelEncoder()
product_codes = encoder.fit_transform(data['product_name'])
data['product_id'] = product_codes

# Build the session x product interaction matrix: one row per session_id,
# one column per product_id, each cell holding the number of rows in `data`
# for that (session, product) pair and 0 where the pair never occurs.
user_product_matrix = data.pivot_table(
    index='session_id',
    columns='product_id',
    aggfunc='size',
    fill_value=0,
)

# Hold out 20% of the sessions (rows) for testing; the fixed seed keeps the
# split reproducible.
# NOTE(review): the split is by row, so sessions in test_data do not appear
# in train_data at all -- confirm this is the intended evaluation setup.
train_data, test_data = train_test_split(
    user_product_matrix,
    test_size=0.2,
    random_state=42,
)

## Model Training

Collaborative Filtering using Matrix Factorization (SVD)

In [4]:
import numpy as np
from scipy.sparse.linalg import svds

# Dense session x product count matrix that the factorization is fitted on.
R = train_data.to_numpy()

# Centre every row on its own mean so the factorization models deviations
# from a session's average count; the mean is added back after reconstruction.
user_ratings_mean = np.mean(R, axis=1)
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# Truncated SVD with up to 50 latent factors. With its default solver, svds
# requires k to be strictly smaller than both matrix dimensions, so the rank
# is capped to keep the cell running on small samples or product catalogues.
# When both dimensions exceed 50 this is exactly k=50.
n_factors = min(50, min(R_demeaned.shape) - 1)
U, sigma, Vt = svds(R_demeaned, k=n_factors)
# svds returns the singular values as a 1-D array; expand them into a
# diagonal matrix so the three factors can be multiplied directly.
sigma = np.diag(sigma)

# Low-rank reconstruction, shifted back by each row's mean.
predicted_ratings = U @ sigma @ Vt + user_ratings_mean.reshape(-1, 1)
# Columns carry the product_id labels of train_data. Rows keep a default
# positional index: row i corresponds to train_data.index[i].
predicted_ratings = pd.DataFrame(predicted_ratings, columns=train_data.columns)