In [1]:
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

# Load datasets
links = pd.read_csv('links.csv')              # contains movieId, imdbId, tmdbId
ratings = pd.read_csv('ratings.csv')          # contains userId, movieId, rating, timestamp
movies_tmdb = pd.read_csv('movies.csv')  # or your 'movies.csv' file from TMDB

# Important: ensure the 'id' column in TMDB is of integer type to match 'tmdbId'
movies_tmdb['id'] = pd.to_numeric(movies_tmdb['id'], errors='coerce')

# Drop rows with missing tmdbId or id to ensure clean join
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)
movies_tmdb = movies_tmdb.dropna(subset=['id'])
movies_tmdb['id'] = movies_tmdb['id'].astype(int)

# Merge datasets: ratings → links → TMDB movies
merged = ratings.merge(links, on='movieId').merge(movies_tmdb, left_on='tmdbId', right_on='id')


In [5]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD, accuracy

# Prepare data
from surprise import Dataset, Reader

# Use only the necessary columns: userId, tmdbId, rating
df_cf = merged[['userId', 'tmdbId', 'rating']]

# Surprise expects a specific rating scale
reader = Reader(rating_scale=(0.5, 5.0))  # adjust if needed

# Load into Surprise format
data = Dataset.load_from_df(df_cf[['userId', 'tmdbId', 'rating']], reader)

In [7]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split

# Assuming you already have your Surprise dataset loaded as `data`
# If not, load your dataset accordingly

# Initialize the SVD model with best parameters
best_svd_model = SVD(n_factors=50, lr_all=0.005, reg_all=0.1)

# Train the model on the full training set
trainset = data.build_full_trainset()
best_svd_model.fit(trainset)

def predict_rating(user_id, movie_id):
    """
    Predict the rating that user_id would give to movie_id
    using the trained SVD model.
    
    Parameters:
    - user_id: str or int, user identifier
    - movie_id: str or int, movie identifier
    
    Returns:
    - estimated rating (float)
    """
    prediction = best_svd_model.predict(str(user_id), str(movie_id))
    return prediction.est


In [15]:
print(f"{predict_rating(6, 100):.2f}")

3.58
