In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux

In [None]:
# Core Libraries
import pandas as pd
import numpy as np

# For Collaborative Filtering
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, cross_validate
from surprise.accuracy import rmse
from surprise import accuracy
from collections import defaultdict
from scipy.stats import rankdata

# For Content-Based Filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Visualization (optional)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load movie metadata and drop rows that have any missing field.
movies = pd.read_csv('movie.csv').dropna()

# '|' is the regex alternation operator, so force a literal replacement
# instead of relying on the pandas version's default for `regex`.
movies['genres'] = movies['genres'].str.replace('|', ', ', regex=False)

# Load ratings, parsing the timestamp column into datetimes.
ratings = pd.read_csv('rating.csv', parse_dates=['timestamp'])

In [None]:
# Summarize the columns, dtypes and memory footprint of the loaded ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   userId     int64         
 1   movieId    int64         
 2   rating     float64       
 3   timestamp  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 610.4 MB


In [None]:
MIN_INTERACTIONS = 100   # keep only users with at least this many ratings
N_SAMPLED_USERS = 115    # number of users drawn for the working subset
SAMPLE_SEED = 42         # fixed seed so the same users are drawn on every run

user_interaction_counts = ratings.groupby('userId')['rating'].count()

# Filter users with at least MIN_INTERACTIONS interactions
users_to_keep = user_interaction_counts[user_interaction_counts >= MIN_INTERACTIONS].index

# Filter the ratings DataFrame
ratings = ratings[ratings['userId'].isin(users_to_keep)]

# Alternative kept for reference: sample 30% of the eligible users instead of a fixed count.
# rand_userIds = np.random.choice(ratings['userId'].unique(),
#                                 size=int(len(ratings['userId'].unique())*0.3),
#                                 replace=False)

# Seeded generator: without it every run works on a different set of users,
# so no downstream output can be reproduced.
rng = np.random.default_rng(SAMPLE_SEED)
rand_userIds = rng.choice(ratings['userId'].unique(),
                          size=N_SAMPLED_USERS,
                          replace=False)

# .copy() gives an independent frame, so adding columns to `ratings` later
# does not operate on a slice of the original table.
ratings = ratings.loc[ratings['userId'].isin(rand_userIds)].copy()

print('There are {} rows of data from {} users'.format(len(ratings), len(rand_userIds)))
ratings.info()

There are 40715 rows of data from 115 users
<class 'pandas.core.frame.DataFrame'>
Index: 40715 entries, 64761 to 19976572
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   userId     40715 non-null  int64         
 1   movieId    40715 non-null  int64         
 2   rating     40715 non-null  float64       
 3   timestamp  40715 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 1.6 MB


In [None]:
# Rank each interaction within its user from newest (rank 1) to oldest;
# method='first' breaks timestamp ties by order of appearance.
ratings['rank_latest'] = (
    ratings.groupby(['userId'])['timestamp']
           .rank(method='first', ascending=False)
)

# Per-row cutoff: 20% of the highest rank reached by that row's user.
top_percent = ratings.groupby('userId')['rank_latest'].transform('max') * 0.20

# Rows at or below the cutoff are the most recent 20% (test); the rest is train.
is_recent = ratings['rank_latest'] <= top_percent
is_older = ratings['rank_latest'] > top_percent

# Alternative kept for reference: leave-one-out on the single latest interaction.
# train_ratings = ratings[ratings['rank_latest'] != 1]
# test_ratings = ratings[ratings['rank_latest'] == 1]

# Keep only the columns the model needs.
model_columns = ['userId', 'movieId', 'rating']
test_ratings = ratings.loc[is_recent, model_columns]
train_ratings = ratings.loc[is_older, model_columns]

train_ratings.shape, test_ratings.shape

((32619, 3), (8096, 3))

In [None]:
# Display the held-out test interactions (userId, movieId, rating).
test_ratings

Unnamed: 0,userId,movieId,rating
64762,457,5,1.0
64764,457,16,4.0
64778,457,160,2.5
64780,457,163,3.0
64781,457,173,2.5
...,...,...,...
19976568,138307,96417,3.0
19976569,138307,96592,3.0
19976570,138307,97860,2.0
19976571,138307,97866,2.5


In [None]:
def get_user_interaction_counts(user_id, train_data, test_data):
    """Count one user's rows in the train and test frames.

    Args:
        user_id: Value matched against the ``userId`` column.
        train_data: DataFrame with a ``userId`` column.
        test_data: DataFrame with a ``userId`` column.

    Returns:
        Tuple ``(train_count, test_count)`` of plain integers.
    """
    def _rows_for_user(frame):
        # Number of rows whose userId equals the requested user.
        return len(frame.loc[frame['userId'] == user_id])

    return _rows_for_user(train_data), _rows_for_user(test_data)

# Example usage: check the train/test split sizes for the first user that
# appears in `ratings`.
user_id_to_check = ratings['userId'].unique()[0]
train_interactions, test_interactions = get_user_interaction_counts(
    user_id_to_check, train_ratings, test_ratings
)

# Report both counts for that user.
print(f"User {user_id_to_check}:")
print(f"- Train set interactions: {train_interactions}")
print(f"- Test set interactions: {test_interactions}")


User 457:
- Train set interactions: 341
- Test set interactions: 85


In [None]:
# # Select only the required columns
# movies = movies[['movieId', 'title', 'genres']]

# # Replace '|' in genres with spaces for easier processing
# movies['genres'] = movies['genres'].fillna('unknown').str.replace('|', ' ')

# # Preview the updated movies dataset
# movies.head()

# # Merge the ratings dataset with movies
# ratings = ratings.merge(movies, on='movieId', how='left')

# # Preview the merged dataset
# print("Merged Ratings Dataset:")
# ratings.head()

In [None]:
# Peek at a few of the user and movie IDs present in the sampled ratings
print(f"User IDs: {ratings['userId'].unique()[:10]}")  # Print the first 10 user IDs
print(f"Movie IDs: {ratings['movieId'].unique()[:10]}")  # Print the first 10 movie IDs

# Probe one hard-coded user/movie pair and report whichever ID is absent.
# `user_id` and `movie_id` stay defined at notebook scope after this cell.
user_id = 8
movie_id = 2
if user_id not in ratings['userId'].unique():
    print(f"User ID {user_id} does not exist in the dataset.")
if movie_id not in ratings['movieId'].unique():
    print(f"Movie ID {movie_id} does not exist in the dataset.")

User IDs: [  800  2264  4014  6950  7323  7484  7877  8456  8908 11137]
Movie IDs: [ 345  349  442  480  494 1127 1208 1259 1387 1407]
User ID 8 does not exist in the dataset.


In [None]:
# Rating scale handed to Surprise (0.5 to 5.0).
reader = Reader(rating_scale=(0.5, 5.0))

# Training interactions -> Surprise trainset containing every training row.
trainset = Dataset.load_from_df(train_ratings, reader).build_full_trainset()

# Held-out interactions -> Surprise testset, built by loading the test frame
# and converting all of its ratings.
testset = (
    Dataset.load_from_df(test_ratings, reader)
    .build_full_trainset()
    .build_testset()
)

# Report the size of the training data.
for label, value in (
    ('Number of users: ', trainset.n_users),
    ('Number of movies: ', trainset.n_items),
    ('Number of ratings: ', trainset.n_ratings),
):
    print(label, value)

Number of users:  115
Number of movies:  6235
Number of ratings:  32619


In [None]:
# Initialize the SVD model with a fixed seed so the fitted factors
# (and every metric derived from them) are repeatable across runs
svd = SVD(random_state=42)

# Train the model on the training set
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7bb23e9c5050>

In [None]:
# Inspect one (user, movie) pair and compare the model's estimate with simple averages.
first_user_id = ratings['userId'].unique()[0]
user_interactions = ratings[ratings['userId'] == first_user_id]

movie_id = ratings['movieId'].unique()[0]

if movie_id not in user_interactions['movieId'].unique():
    # The movie itself comes from `ratings`, so it exists in the dataset;
    # what this branch detects is that this user has no rating for it.
    print(f"User ID {first_user_id} has not rated Movie ID {movie_id}.")
else:
    print(user_interactions[user_interactions['movieId'] == movie_id].head())
    avg_rating = ratings[ratings['movieId'] == movie_id]['rating'].mean()
    print(f"\nAverage Rating for Movie ID {movie_id}: {avg_rating}")

avg_user_rating = user_interactions['rating'].mean()
print(f"Average Rating Given by User ID {first_user_id}: {avg_user_rating}")

# Predict for the movie inspected above, not a hard-coded item ID.
print("\nPredicted Rating: ", svd.predict(first_user_id, movie_id).est)

       userId  movieId  rating           timestamp  rank_latest
64761     457        2     3.0 2010-10-07 15:11:18        146.0

Average Rating for Movie ID 2: 3.0853658536585367
Average Rating Given by User ID 457: 3.226525821596244

Predicted Rating:  3.2260673138979623


In [None]:
# Movie IDs in `user_interactions`, i.e. the rows of `ratings` for `first_user_id`.
user_interactions['movieId']

Unnamed: 0,movieId
119103,345
119104,349
119105,442
119106,480
119107,494
119108,1127
119109,1208
119110,1259
119111,1387
119112,1407


In [None]:
# Flag cold-start cases: IDs that have no rows in `ratings`.
# Only a message is printed; no fallback rating is computed here.
# `user_id` and `movie_id` are notebook-level variables assigned in earlier cells.
if user_id not in ratings['userId'].unique():
    print(f"User ID {user_id} is new. Use average user ratings.")
if movie_id not in ratings['movieId'].unique():
    print(f"Movie ID {movie_id} is new. Use average movie ratings.")

User ID 8 is new. Use average user ratings.


In [None]:
def recommend_movies(user_id, n=5):
    """Return the top-``n`` recommendations for one user.

    Candidates are every movie ID in the notebook-level ``ratings`` frame that
    the user has not rated in ``train_ratings``. Each candidate is scored with
    the notebook-level ``svd`` model.

    Args:
        user_id: Raw user ID to recommend for.
        n: Number of recommendations to return.

    Returns:
        List of ``(movieId, title, predicted_rating)`` tuples sorted by
        predicted rating, highest first. The rating is rounded to 2 decimals.
        ``title`` is ``'Unknown title'`` when the movie ID has no row in
        ``movies``.
    """
    # Get all unique movie IDs
    all_movie_ids = ratings['movieId'].unique()

    # A set makes each "already rated?" check O(1) instead of scanning an array.
    rated_movies = set(train_ratings.loc[train_ratings['userId'] == user_id, 'movieId'])

    # Filter out movies already rated (candidate order is preserved)
    unrated_movies = [movie for movie in all_movie_ids if movie not in rated_movies]

    # Predict ratings for unrated movies
    predictions = [svd.predict(user_id, movie_id) for movie_id in unrated_movies]

    # Sort predictions by estimated rating
    predictions.sort(key=lambda pred: pred.est, reverse=True)

    # Get top N recommendations
    top_n = predictions[:n]

    # Build the id -> title lookup once. `movies` had rows with missing values
    # dropped, so a rated movie may have no title row; fall back instead of
    # raising IndexError.
    titles = movies.drop_duplicates('movieId').set_index('movieId')['title']

    return [
        (pred.iid, titles.get(pred.iid, 'Unknown title'), round(pred.est, 2))
        for pred in top_n
    ]

# Show the top-10 recommendations for the first user in the sampled ratings.
# The user is taken from the data because the sample is random, so a
# hard-coded ID is not guaranteed to be among the sampled users.
example_user_id = ratings['userId'].unique()[0]
recommendations = recommend_movies(user_id=example_user_id, n=10)

print("Top Recommendations:")
# Loop names avoid shadowing the builtin `id`.
for rec_movie_id, title, predicted_rating in recommendations:
    print(f"ID: {rec_movie_id}, {title}: Predicted Rating {predicted_rating}")


Top Recommendations:
ID: 1136, Monty Python and the Holy Grail (1975): Predicted Rating 4.48
ID: 318, Shawshank Redemption, The (1994): Predicted Rating 4.48
ID: 50, Usual Suspects, The (1995): Predicted Rating 4.47
ID: 904, Rear Window (1954): Predicted Rating 4.38
ID: 593, Silence of the Lambs, The (1991): Predicted Rating 4.32
ID: 4226, Memento (2000): Predicted Rating 4.31
ID: 2959, Fight Club (1999): Predicted Rating 4.29
ID: 912, Casablanca (1942): Predicted Rating 4.29
ID: 1193, One Flew Over the Cuckoo's Nest (1975): Predicted Rating 4.29
ID: 58559, Dark Knight, The (2008): Predicted Rating 4.29


In [None]:
# Look up a single (user, movie) pair in the test set. Element-wise AND on
# boolean Series is `&` (Python has no `&&`), and each comparison needs its
# own parentheses because `&` binds tighter than `==`.
test_ratings[(test_ratings['userId'] == 457) & (test_ratings['movieId'] == 1)]

Unnamed: 0,userId,movieId,rating
64762,457,5,1.0
64764,457,16,4.0
64778,457,160,2.5
64780,457,163,3.0
64781,457,173,2.5
...,...,...,...
65160,457,61323,3.5
65165,457,64839,4.0
65171,457,69481,4.0
65181,457,73462,2.5


### Evaluation

In [None]:
def recommend_movies_for_all_users(n=5):
    """Generate top-``n`` movie recommendations for every user in ``ratings``.

    For each user, candidates are all movie IDs in the notebook-level
    ``ratings`` frame minus those the user rated in ``train_ratings``.
    Candidates are scored with the notebook-level ``svd`` model.

    Args:
        n: Number of recommendations to keep per user.

    Returns:
        ``defaultdict(list)`` mapping each user ID to a list of recommended
        movie IDs, ordered by predicted rating, highest first.
    """
    recommendations = defaultdict(list)

    # Get all unique users
    all_users = ratings['userId'].unique()

    # Get all unique movie IDs
    all_movie_ids = ratings['movieId'].unique()

    for user_id in all_users:
        # A set makes each "already rated?" check O(1) instead of scanning an array.
        rated_movies = set(train_ratings.loc[train_ratings['userId'] == user_id, 'movieId'])

        # Filter out movies already rated (candidate order is preserved)
        unrated_movies = [movie for movie in all_movie_ids if movie not in rated_movies]

        # Predict ratings for unrated movies
        predictions = [svd.predict(user_id, movie_id) for movie_id in unrated_movies]

        # Sort predictions by estimated rating
        predictions.sort(key=lambda pred: pred.est, reverse=True)

        # Keep only the IDs of the top N predictions
        recommendations[user_id] = [pred.iid for pred in predictions[:n]]

    return recommendations

# Generate recommendations for all users.
# n=20 is the largest cut-off evaluated below; smaller K values are taken as
# prefixes of each user's list.
all_user_recommendations = recommend_movies_for_all_users(n=20)

In [None]:

def evaluate_recommendations(recommendations, testset, ks=(5, 10, 20)):
    """Evaluate ranked recommendations with HR@K, Precision@K, Recall@K and NDCG@K.

    Every item a user has in ``testset`` counts as relevant, whatever its
    rating. Metrics are averaged over the users that appear in ``testset``.

    Note: a later cell in this notebook defines another function with this
    same name and a different signature; once that cell has run, this version
    is no longer reachable by name.

    Args:
        recommendations: Mapping ``user_id -> ordered list of movie IDs``,
            best first. Users missing from the mapping score 0 on every metric.
        testset: Iterable of ``(user_id, movie_id, rating)`` triples.
        ks: Iterable of cut-offs K to evaluate.

    Returns:
        pandas DataFrame with one row per K and the columns ``K``, ``HR@K``,
        ``Precision@K``, ``Recall@K`` and ``NDCG@K``. When ``testset`` has no
        users, the metric columns are NaN instead of raising ZeroDivisionError.
    """
    metrics = {"K": [], "HR@K": [], "Precision@K": [], "Recall@K": [], "NDCG@K": []}

    # Convert test set into a dictionary {user_id: set of relevant movieIds}
    actual_ratings = defaultdict(set)
    for user_id, movie_id, _rating in testset:
        actual_ratings[user_id].add(movie_id)

    total_users = len(actual_ratings)

    for k in ks:
        hit_count = 0
        precision_sum = 0
        recall_sum = 0
        ndcg_sum = 0

        for user_id, relevant_movies in actual_ratings.items():
            recommended_movies = recommendations.get(user_id, [])[:k]  # Top-K recommendations

            # The overlap drives HR, precision and recall, so compute it once.
            n_hits = len(set(recommended_movies) & relevant_movies)

            hit_count += n_hits > 0
            precision_sum += n_hits / k
            # relevant_movies is never empty: a user only gets an entry by adding an item.
            recall_sum += n_hits / len(relevant_movies)

            # Binary-relevance DCG, discounted by log2(position + 1) with 1-based positions.
            dcg = sum(
                1 / np.log2(idx + 2)
                for idx, movie in enumerate(recommended_movies)
                if movie in relevant_movies
            )
            # Ideal DCG: all of the first min(|relevant|, k) positions are hits.
            idcg = sum(1 / np.log2(idx + 2) for idx in range(min(len(relevant_movies), k)))
            ndcg_sum += dcg / idcg if idcg > 0 else 0

        # Store results
        metrics["K"].append(k)
        if total_users == 0:
            # No users to average over: the metrics are undefined.
            undefined = float("nan")
            metrics["HR@K"].append(undefined)
            metrics["Precision@K"].append(undefined)
            metrics["Recall@K"].append(undefined)
            metrics["NDCG@K"].append(undefined)
        else:
            metrics["HR@K"].append(hit_count / total_users)
            metrics["Precision@K"].append(precision_sum / total_users)
            metrics["Recall@K"].append(recall_sum / total_users)
            metrics["NDCG@K"].append(ndcg_sum / total_users)

    return pd.DataFrame(metrics)

# Evaluate the per-user recommendation lists against the held-out test
# interactions at K = 5, 10 and 20
evaluation_results = evaluate_recommendations(all_user_recommendations, testset, ks=[5, 10, 20])

# Display the evaluation metrics in a pandas table
evaluation_results

Unnamed: 0,K,HR@K,Precision@K,Recall@K,NDCG@K
0,5,0.408696,0.106087,0.010493,0.108902
1,10,0.565217,0.093913,0.017201,0.099616
2,20,0.66087,0.083913,0.029412,0.090604


In [None]:
def get_top_k_recommendations(model, testset, k=5, relevance_threshold=4.0):
    """Rank each user's test-set items by predicted rating and keep the top ``k``.

    Only items that appear in ``testset`` are scored, so this measures how well
    the model orders a user's held-out items. It does not search the whole catalog.

    Args:
        model: Object whose ``test(testset)`` returns predictions exposing
            ``uid``, ``iid``, ``r_ui`` (true rating) and ``est`` (estimate).
        testset: Test data passed unchanged to ``model.test``.
        k: Number of items kept per user.
        relevance_threshold: Minimum true rating for an item to count as
            relevant. Defaults to 4.0.

    Returns:
        Tuple ``(top_k_recommendations, actual_ratings)`` of two
        ``defaultdict(list)`` objects. The first maps each user to the movie
        IDs of their ``k`` highest-estimated test items. The second maps each
        user to their relevant movie IDs; users with no relevant item have no
        entry.
    """
    predictions = model.test(testset)

    scored_items = defaultdict(list)
    actual_ratings = defaultdict(list)

    for pred in predictions:
        scored_items[pred.uid].append((pred.iid, pred.est))
        if pred.r_ui >= relevance_threshold:
            actual_ratings[pred.uid].append(pred.iid)

    # Sort each user's candidates by estimated rating, highest first. The sort
    # is stable, so ties keep prediction order. Then keep the top-k movie IDs.
    top_k_recommendations = defaultdict(list)
    for user_id, scored in scored_items.items():
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        top_k_recommendations[user_id] = [movie_id for movie_id, _est in ranked[:k]]

    return top_k_recommendations, actual_ratings


def evaluate_recommendations(top_k_recommendations, actual_ratings, k=5):
    """Compute HR@K, Precision@K, Recall@K and NDCG@K averaged over users.

    Only users present in ``actual_ratings`` are evaluated.

    Note: this definition replaces an earlier function of the same name in
    this notebook that took ``(recommendations, testset, ks)``.

    Args:
        top_k_recommendations: Mapping ``user_id -> ordered list of movie IDs``,
            best first. Lists longer than ``k`` are truncated to ``k`` here so
            precision and NDCG cannot exceed 1.
        actual_ratings: Mapping ``user_id -> iterable of relevant movie IDs``.
        k: Cut-off used for truncation, the precision denominator and the
            ideal DCG.

    Returns:
        One-row pandas DataFrame with the columns ``HR@K``, ``Precision@K``,
        ``Recall@K`` and ``NDCG@K``. All values are NaN when
        ``actual_ratings`` is empty.
    """
    hit_rate = []
    precision_at_k = []
    recall_at_k = []
    ndcg_at_k = []

    for user_id in actual_ratings:
        actual_set = set(actual_ratings[user_id])
        # Enforce the cut-off here; do not rely on the caller having truncated.
        recommended_list = top_k_recommendations.get(user_id, [])[:k]

        hits = sum(1 for movie in recommended_list if movie in actual_set)
        hit_rate.append(1 if hits > 0 else 0)

        precision = hits / k
        recall = hits / len(actual_set) if len(actual_set) > 0 else 0

        # DCG with binary relevance, discounted by log2(position + 1) with 1-based positions
        dcg = sum((1 / np.log2(idx + 2)) for idx, movie in enumerate(recommended_list) if movie in actual_set)
        # Ideal DCG: every one of the first min(|relevant|, k) positions is a hit
        idcg = sum((1 / np.log2(idx + 2)) for idx in range(min(len(actual_set), k)))
        ndcg = dcg / idcg if idcg > 0 else 0

        precision_at_k.append(precision)
        recall_at_k.append(recall)
        ndcg_at_k.append(ndcg)

    def _mean(values):
        # Average of a metric list; NaN, with no RuntimeWarning, when there are no users.
        return np.mean(values) if values else float("nan")

    # Compute the average metrics across all users
    results = {
        "HR@K": _mean(hit_rate),
        "Precision@K": _mean(precision_at_k),
        "Recall@K": _mean(recall_at_k),
        "NDCG@K": _mean(ndcg_at_k),
    }

    return pd.DataFrame([results])


# Rank each user's test-set items by predicted rating and keep the top 5
top_k_recommendations, actual_ratings = get_top_k_recommendations(svd, testset, k=5)

# Score those rankings at K=5. `evaluate_recommendations` here is the
# definition from this cell, which takes (top_k_recommendations, actual_ratings, k)
evaluation_results = evaluate_recommendations(top_k_recommendations, actual_ratings, k=5)

# Display results
print("\nEvaluation Metrics (Top-5 Recommendations):")
evaluation_results



Evaluation Metrics (Top-5 Recommendations):


Unnamed: 0,HR@K,Precision@K,Recall@K,NDCG@K
0,0.987686,0.704758,0.53724,0.805306


In [None]:
def generate_metric(model, testset, k=[5, 10, 20]):
    """Build one table of ranking metrics covering several cut-offs.

    For every value in ``k`` the test-set items are ranked with
    ``get_top_k_recommendations`` and scored with ``evaluate_recommendations``.

    Args:
        model: Fitted model forwarded to ``get_top_k_recommendations``.
        testset: Test data forwarded to ``get_top_k_recommendations``.
        k: Iterable of cut-off values.

    Returns:
        pandas DataFrame with one row per cut-off: the metric columns followed
        by a ``K`` column holding the cut-off.
    """
    def _metrics_at(cutoff):
        # One-row metrics frame for a single cut-off, tagged with its K value.
        recs, relevant = get_top_k_recommendations(model, testset, k=cutoff)
        frame = evaluate_recommendations(recs, relevant, k=cutoff)
        frame['K'] = cutoff
        return frame

    return pd.concat([_metrics_at(cutoff) for cutoff in k], ignore_index=True)

# Metrics table for the test-set ranking evaluation at K = 5, 10 and 20.
generate_metric(svd, testset, k=[5, 10, 20])

Unnamed: 0,HR@K,Precision@K,Recall@K,NDCG@K,K
0,0.987686,0.704758,0.53724,0.805306,5
1,0.997409,0.570388,0.726988,0.826302,10
2,0.999358,0.414177,0.86712,0.849272,20


In [None]:
# Re-display whatever `evaluation_results` currently holds (the most recently
# assigned evaluation table).
evaluation_results

Unnamed: 0,HR@K,Precision@K,Recall@K,NDCG@K
0,0.984375,0.752083,0.15756,0.775854


In [None]:
import pickle

# Persist the fitted SVD model so it can be reloaded later without retraining.
# NOTE: only unpickle files you trust; loading a pickle can execute code.
with open('svd_model.pkl', mode='wb') as model_file:
    model_file.write(pickle.dumps(svd))

print("Model saved as svd_model.pkl")

Model saved as svd_model.pkl


In [None]:
from pathlib import Path

# Write the exports to the Kaggle working directory when it exists; otherwise
# fall back to the current directory so the cell also runs outside Kaggle
# (to_csv raises OSError when the target directory is missing).
kaggle_dir = Path('/kaggle/working')
output_dir = kaggle_dir if kaggle_dir.is_dir() else Path('.')

movies.to_csv(output_dir / 'movies.csv', index=False)
ratings.to_csv(output_dir / 'ratings.csv', index=False)

OSError: Cannot save file into a non-existent directory: '/kaggle/working'