In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Install the scikit-surprise library
!pip install scikit-surprise



In [2]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [14]:
file_path = '/kaggle/input/amazon-reviews/Amazon_ratings.csv' 

# File doesn't have a header, so we'll name the columns.
df = pd.read_csv(file_path, names=['UserId', 'ProductId', 'Rating', 'Timestamp'])

# --- DATA PREPARATION ---
# We only need the User, Product, and Rating columns
df_recommender = df[['UserId', 'ProductId', 'Rating']].copy()

# Smaller sample of the data
df_sample = df_recommender.sample(n=100000, random_state=42)

# First few rows of the prepared data
print("Data sample head:")
print(df_sample.head())
print(f"\nShape of the sample data: {df_sample.shape}")

  df = pd.read_csv(file_path, names=['UserId', 'ProductId', 'Rating', 'Timestamp'])


Data sample head:
                 UserId   ProductId Rating
1896735  A1DAS5O0WFN018  B00CZ4YCGW    5.0
429761    AI918ELXX2X6O  B000UPRSKA    5.0
1472796  A1Z2W2Q1HC1T4E  B0062RJ3FI    5.0
113959    AK11G3BGYYRXN  B0006L7LNW    5.0
1304501  A2426BZ6LAO7QI  B004UAOOEO    1.0

Shape of the sample data: (100000, 3)


In [8]:
# The Reader object helps Surprise parse the rating scale.
reader = Reader(rating_scale=(1, 5))

# Load the data from the pandas DataFrame into a Surprise dataset.
data = Dataset.load_from_df(df_sample[['UserId', 'ProductId', 'Rating']], reader)

print("Data has been loaded into Surprise's format and is ready for training.")

Data has been loaded into Surprise's format and is ready for training.


In [9]:
# Split the data into 80% for training and 20% for testing
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Initialize the SVD model.
model = SVD(n_factors=50, random_state=42)

# Training
print("Training the SVD model... (This may take a moment)")
model.fit(trainset)
print("Model training complete.")

Training the SVD model... (This may take a moment)
Model training complete.


In [10]:
# Testing on unseen data
predictions = model.test(testset)

# Calculate the RMSE
rmse = accuracy.rmse(predictions)
print(f"Model RMSE on the test set: {rmse:.4f}")

RMSE: 1.2965
Model RMSE on the test set: 1.2965


In [12]:
def get_top_n_recommendations(user_id, n=10):
    """
    Returns the top N product recommendations for a given user based on our trained model.
    """
    # Get a list of all unique product IDs from our sample
    all_product_ids = df_sample['ProductId'].unique()
    
    # Get the list of product IDs that the user has already rated
    products_rated_by_user = df_sample[df_sample['UserId'] == user_id]['ProductId'].unique()
    
    # Create a list of products the user has NOT rated
    products_to_predict = [prod_id for prod_id in all_product_ids if prod_id not in products_rated_by_user]
    
    # Predict the ratings for all the unrated products
    predicted_ratings = []
    for product_id in products_to_predict:
        predicted_rating = model.predict(uid=user_id, iid=product_id).est
        predicted_ratings.append((product_id, predicted_rating))
            
    # Sort the predictions by estimated rating in descending order
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get the top N recommended product IDs
    top_n_recommendations = [prod_id for prod_id, rating in predicted_ratings[:n]]
    
    return top_n_recommendations

# --- Example: Get recommendations for a sample user ---

# First, find a user who has made several ratings in our dataset for a good test
user_review_counts = df_sample['UserId'].value_counts()
sample_user_id = user_review_counts[user_review_counts > 5].index[0]

print(f"✨ Getting top 10 recommendations for User ID: {sample_user_id}\n")

# Get the list of recommended Product IDs
recommendations = get_top_n_recommendations(sample_user_id, n=10)

print("Top 10 Recommended Product IDs:")
for i, prod_id in enumerate(recommendations):
    print(f"{i+1}. {prod_id}")

✨ Getting top 10 recommendations for User ID: A3KEZLJ59C1JVH

Top 10 Recommended Product IDs:
1. B001ET76EY
2. B00004TUBL
3. B006IBM21K
4. B000NNDNYY
5. B001ET77NY
6. B000YJ2SLG
7. B00GJX58PE
8. B00264NW7G
9. B0043OWJG2
10. B000127UUA
