In [1]:
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)
from backend.data_models.save_to_db import get_ratings
from rapidfuzz import process, fuzz
import time

%load_ext autoreload
%autoreload 2

In [2]:
# Load the raw ratings and drop rows with any missing value.
# The original column names (member_id, film_id) are kept on `ratings` because
# every later cell, and SVDRecommender, index the frame by those names.
ratings = pd.read_csv('data/ratings_v6.csv').dropna()


def transform_ratings_and_compute_mean(df, src_range=(1, 10), dst_range=(0.5, 5)):
    """Linearly rescale the ``rating`` column from ``src_range`` to ``dst_range``.

    Despite its name, this function does not compute a mean; it only rescales.
    With the defaults a rating of 1 maps to 0.5 and a rating of 10 maps to 5.

    Args:
        df: DataFrame with a numeric ``rating`` column. It is not modified.
        src_range: ``(min, max)`` of the incoming rating scale.
        dst_range: ``(min, max)`` of the scale to map onto.

    Returns:
        A copy of ``df`` whose ``rating`` column holds the rescaled values.

    Raises:
        ValueError: If ``src_range`` has zero width.
    """
    src_min, src_max = src_range
    dst_min, dst_max = dst_range
    if src_max == src_min:
        raise ValueError("src_range must span a non-zero interval")

    scaled = df.copy()
    # Same arithmetic as the previous hard-coded form: (r - 1) * (4.5 / 9) + 0.5
    scaled['rating'] = (scaled['rating'] - src_min) * ((dst_max - dst_min) / (src_max - src_min)) + dst_min

    return scaled


# The scaled training copy excludes 'ard_s' and carries the user_name / item_id
# column names; the rename is applied here only, so `ratings` stays untouched.
training_data_scaled = transform_ratings_and_compute_mean(
    ratings[ratings.member_id != 'ard_s'].rename(
        columns={'member_id': 'user_name', 'film_id': 'item_id'}
    )
)
screenings = pd.read_csv('data/movies.csv')

KeyboardInterrupt: 

In [403]:
# screenings['title'] = screenings.movie_link.apply(lambda x : x.split('/')[-3])
# Title-case the distinct screening titles. Rows without a title are skipped,
# because a missing value is a float and has no .title() method.
unique_screenings = [title.title() for title in screenings.title.dropna().unique().tolist()]
# NOTE(review): str.title() capitalises every word ("Of", "The"), so the exact
# isin() match against alt_title below may miss some screenings -- confirm.

# Titles that 'ard_s' has actually rated.
my_films = ratings[ratings.member_id == 'ard_s'].dropna(subset=['rating']).alt_title.tolist()
# The 800 titles with the most rows in `ratings`.
top_movies = ratings["alt_title"].value_counts().iloc[:800].index.tolist()

# The three sources overlap; de-duplicate while preserving order so that
# len(film_set) reflects the number of distinct titles.
film_set = list(dict.fromkeys(unique_screenings + my_films + top_movies))

In [375]:
# print(len(film_set))
# Number of distinct alt_title values in `ratings` that belong to the film set.
in_film_set = ratings["alt_title"].isin(film_set)
matched_titles = ratings.loc[in_film_set, "alt_title"].unique()
print(len(matched_titles))

951


In [404]:
# Users with the most rows in `ratings` (top 3000), plus three users that are
# always kept regardless of how many ratings they have.
most_frequent_users = ratings["member_id"].value_counts().nlargest(3000).index
top_users = set(most_frequent_users).union({"ard_s", "miloujane", "abstraxia"})

# Keep only rows from those users on titles in the film set, then drop any row
# that still has a missing value.
user_mask = ratings["member_id"].isin(top_users)
title_mask = ratings["alt_title"].isin(film_set)
filtered_ratings = ratings.loc[user_mask & title_mask].dropna()

In [379]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from surprise import accuracy

# Step 2: Prepare the Data for Surprise
# `filtered_ratings` is built from the unscaled `ratings` frame (1-10), not from
# the 0.5-5 rescaled copy, so the reader must declare the 1-10 scale. Surprise
# clips estimates to the declared scale; with (0.5, 5) the high estimates would
# all be clipped to 5 and the ranking would be meaningless.
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(filtered_ratings[['member_id', 'film_id', 'rating']], reader)

# Step 3: Train-Test Split (80/20, fixed seed for reproducibility)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [380]:
# Step 4: Train the SVD Model
svd_params = {"n_factors": 20, "random_state": 42}  # n_factors = number of latent factors
model = SVD(**svd_params)
model.fit(trainset)

# Step 5: Make Predictions on the held-out split
predictions = model.test(testset)

# Step 6: Evaluate Model Performance (rmse prints the score and returns it)
rmse_score = rmse(predictions)

RMSE: 1.3403


In [381]:
# Step 7: Function to Get Top-N Recommendations for a User
def get_top_n_recommendations(model, user_id, top_n=10):
    """Rank the currently screening films the user has not rated yet.

    Reads the notebook-level frames ``ratings`` and ``filtered_ratings`` and the
    list ``unique_screenings``.

    Args:
        model: Object exposing ``predict(user_id, film_id)`` whose result has
            ``iid`` and ``est`` attributes (e.g. a fitted Surprise algorithm).
        user_id: Value matched against ``ratings["member_id"]``.
        top_n: Maximum number of recommendations to return.

    Returns:
        List of ``(film_id, estimated_rating)`` tuples, highest estimate first.
    """
    # Films the user already has a row for in the full ratings frame
    rated_films = ratings[ratings["member_id"] == user_id].film_id.unique()

    # Films in the training data whose title matches a current screening
    # #TODO also filter out movies that had NaN values!
    screening_rows = filtered_ratings[filtered_ratings["alt_title"].isin(unique_screenings)]
    current_films = screening_rows.film_id.unique()

    current_unseen = set(current_films) - set(rated_films)

    # Predict a rating for every unseen screening
    predictions = [model.predict(user_id, movie) for movie in current_unseen]

    # Highest estimated rating first, truncated to top_n
    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:top_n]

    return [(pred.iid, pred.est) for pred in top_predictions]

In [383]:
# Example: Get recommendations for a specific user
user_id = "ard_s"  # Change this to a valid user_id
top_recommendations = get_top_n_recommendations(model, user_id, top_n=100)

# Map each film_id to the alt_title of its first row in `ratings`, built once
# instead of scanning the whole frame for every recommended film.
title_by_film = ratings.drop_duplicates(subset="film_id").set_index("film_id")["alt_title"]

# Display recommendations; the header reports how many were actually returned.
print(f"Top {len(top_recommendations)} recommendations for user {user_id}:")
for movie_id, score in top_recommendations:
    movie_title = title_by_film.loc[movie_id]
    print(f"Movie: {movie_title} (ID: {movie_id}) - Predicted Rating: {score:.2f}")

Top 10 recommendations for user ard_s:
Movie: Taxi Driver (ID: 51947) - Predicted Rating: 8.56
Movie: The Apartment (ID: 51774) - Predicted Rating: 8.35
Movie: Tokyo Story (ID: 40554) - Predicted Rating: 8.31
Movie: The Elephant Man (ID: 50653) - Predicted Rating: 8.26
Movie: The Deer Hunter (ID: 45017) - Predicted Rating: 8.21
Movie: The Big Lebowski (ID: 51935) - Predicted Rating: 8.20
Movie: Before Sunrise (ID: 51974) - Predicted Rating: 8.11
Movie: The Pianist (ID: 51672) - Predicted Rating: 8.11
Movie: Before Sunset (ID: 51970) - Predicted Rating: 8.08
Movie: Mother (ID: 142453) - Predicted Rating: 7.93
Movie: Winter Sleep (ID: 188418) - Predicted Rating: 7.93
Movie: Secrets & Lies (ID: 45549) - Predicted Rating: 7.88
Movie: Punishment Park (ID: 35264) - Predicted Rating: 7.80
Movie: Mother (ID: 23839) - Predicted Rating: 7.79
Movie: Mother (ID: 177213) - Predicted Rating: 7.76
Movie: Eraserhead (ID: 51184) - Predicted Rating: 7.75
Movie: Interstellar (ID: 117621) - Predicted Rati

In [444]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

class SVDRecommender:
    """Train a Surprise SVD model on a filtered ratings frame and rank films for a user.

    Intended call order: ``prepare_data`` first, then ``train_and_predict``
    (or ``train_model`` followed by ``get_recommendations``).
    """

    # Users that are always kept in the training data when the caller names none.
    DEFAULT_ADDITIONAL_USERS = frozenset({"ard_s", "miloujane", "abstraxia"})

    def __init__(self, ratings, screenings, additional_users=None, n_factors=20, random_state=42):
        """Store the input frames and create an unfitted SVD model.

        Args:
            ratings: Pre-loaded ratings DataFrame; the methods read its
                ``member_id``, ``film_id``, ``alt_title`` and ``rating`` columns.
            screenings: DataFrame with a ``title`` column.
            additional_users: Iterable of member ids that must always be part of
                the training users. ``None`` selects the built-in defaults; an
                explicitly empty collection means "no extra users".
            n_factors: Number of latent factors for the SVD model.
            random_state: Seed used for the model and the train/test split.
        """
        self.ratings = ratings  # Accept pre-loaded ratings DataFrame
        self.screenings = screenings
        if additional_users is None:
            additional_users = self.DEFAULT_ADDITIONAL_USERS
        elif isinstance(additional_users, str):
            # A bare string would otherwise be split into single characters.
            additional_users = [additional_users]
        self.additional_users = set(additional_users)
        self.n_factors = n_factors
        self.random_state = random_state
        self.model = SVD(n_factors=n_factors, random_state=random_state)
        self.filtered_ratings = None

    def _require_prepared(self):
        """Raise a clear error when ``prepare_data`` has not been run yet."""
        if self.filtered_ratings is None:
            raise RuntimeError("No training data available: call prepare_data() first.")

    def prepare_data(self, user_id, n_titles, n_users):
        """Filter ``ratings`` to the chosen users and films and store the result.

        Args:
            user_id: Member whose rated films are always part of the film set.
            n_titles: Number of most-rated titles to include.
            n_users: Number of users with the most ratings to include.
        """
        print('\nfetching unique screenings')
        unique_screenings = self._get_unique_screenings()
        print('constructing set of films to train on')
        film_set = self._get_film_set(unique_screenings, user_id, n_titles)
        print('fetching top users')
        top_users = self._get_top_users(n_users)

        self.filtered_ratings = self.ratings[
            self.ratings["member_id"].isin(top_users) & self.ratings["alt_title"].isin(film_set)
        ].dropna()

        print(f'training set has shape {self.filtered_ratings.shape}')

    def _get_unique_screenings(self):
        """Return the distinct screening titles, title-cased.

        Rows without a title are skipped: a missing value is a float and has no
        ``.title()`` method.
        """
        unique_screenings = self.screenings.title.dropna().unique().tolist()
        return [title.title() for title in unique_screenings]

    def _get_film_set(self, unique_screenings, user_id, n_titles=800):
        """Return screenings + films rated by ``user_id`` + the ``n_titles`` most-rated titles.

        The result is a list and may contain duplicates.
        """
        user_films = self.ratings[self.ratings.member_id == user_id].dropna(subset=['rating']).alt_title.tolist()
        top_movies = self.ratings["alt_title"].value_counts().iloc[:n_titles].index.tolist()
        return unique_screenings + user_films + top_movies

    def _get_top_users(self, n_users=3000):
        """Return the ``n_users`` members with the most rows, plus the additional users."""
        top_users = set(self.ratings["member_id"].value_counts().nlargest(n_users).index)
        return top_users | self.additional_users

    def train_model(self):
        """Fit the SVD model on an 80% split and report RMSE on the remaining 20%.

        Returns:
            The RMSE on the held-out split.

        Raises:
            RuntimeError: If ``prepare_data`` has not been called.
        """
        self._require_prepared()
        reader = Reader(rating_scale=(1, 10))
        data = Dataset.load_from_df(self.filtered_ratings[['member_id', 'film_id', 'rating']], reader)
        trainset, testset = train_test_split(data, test_size=0.2, random_state=self.random_state)

        self.model.fit(trainset)
        predictions = self.model.test(testset)

        # rmse() prints by default; silence it so the score is reported only once.
        score = rmse(predictions, verbose=False)
        print("RMSE:", score)
        return score

    def get_recommendations(self, user_id, top_n=10, only_screenings=False):
        """Generate top-N recommendations for a user.

        Args:
            user_id: Member to predict for.
            top_n: Maximum number of recommendations to return.
            only_screenings: When True, restrict candidates to films whose title
                matches a current screening. The default (False) considers every
                film in the training data.

        Returns:
            List of ``(film_id, estimated_rating)`` tuples, highest estimate first.

        Raises:
            RuntimeError: If ``prepare_data`` has not been called.
        """
        self._require_prepared()
        print(f'\ngenerating predictions for user: {user_id}')
        rated_films = set(self.ratings[self.ratings["member_id"] == user_id].film_id.unique())

        candidates = self.filtered_ratings
        if only_screenings:
            candidates = candidates[candidates["alt_title"].isin(self._get_unique_screenings())]
        eligible = set(candidates.film_id.unique().tolist()) - rated_films

        predictions = [self.model.predict(user_id, movie) for movie in eligible]
        top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:top_n]

        return [(pred.iid, pred.est) for pred in top_predictions]

    def train_and_predict(self, user_id, top_n=10):
        """Train the model, then print and return the top recommendations.

        This does not prepare the data; ``prepare_data`` must be called first.

        Returns:
            List of ``(film_id, estimated_rating)`` tuples, highest estimate first.
        """
        self.train_model()
        top_recommendations = self.get_recommendations(user_id, top_n)

        # Resolve titles with one pass over the ratings frame: the first
        # alt_title listed for each recommended film_id.
        recommended_ids = [movie_id for movie_id, _ in top_recommendations]
        title_rows = self.ratings[self.ratings["film_id"].isin(recommended_ids)]
        title_by_film = title_rows.drop_duplicates(subset="film_id").set_index("film_id")["alt_title"]

        print("Top recommendations:")
        for movie_id, score in top_recommendations:
            movie_title = title_by_film.loc[movie_id]
            print(f"Movie: {movie_title} (ID: {movie_id}) - Predicted Rating: {score:.2f}")

        return top_recommendations

In [447]:
# Build, train and query a fresh recommender for each target user.
target_users = ['ard_s']
for user_id in target_users:
    recommender = SVDRecommender(ratings=ratings, screenings=screenings)
    recommender.prepare_data(user_id, n_titles=5000, n_users=300)
    top_recommendations = recommender.train_and_predict(user_id=user_id, top_n=100)


fetching unique screenings
constructing set of films to train on
fetching top users
training set has shape (756151, 5)
RMSE: 1.4329
RMSE: 1.4328594275462492

generating predictions for user: ard_s
Top recommendations:
Movie: Terminator 2: Judgment Day (ID: 51776) - Predicted Rating: 8.83
Movie: The Thing (ID: 51155) - Predicted Rating: 8.80
Movie: 2001: A Space Odyssey (ID: 51987) - Predicted Rating: 8.79
Movie: Taxi Driver (ID: 51947) - Predicted Rating: 8.79
Movie: Rear Window (ID: 51552) - Predicted Rating: 8.77
Movie: Raiders of the Lost Ark (ID: 51965) - Predicted Rating: 8.76
Movie: Harakiri (ID: 43015) - Predicted Rating: 8.76
Movie: Aliens (ID: 51445) - Predicted Rating: 8.74
Movie: No Country for Old Men (ID: 48140) - Predicted Rating: 8.74
Movie: Citizen Kane (ID: 2702) - Predicted Rating: 8.73
Movie: Vertigo (ID: 51669) - Predicted Rating: 8.72
Movie: Jaws (ID: 51542) - Predicted Rating: 8.71
Movie: Se7en (ID: 51345) - Predicted Rating: 8.70
Movie: Psycho (ID: 51578) - Pred