# Movie Recommender Project

# Introduction
In this case study, we will analyze data from MovieLens 100K and train a model to give recommendations to users based off their ratings on movies the user has seen. 

To train the model we will be using SVD as our model. 

Based on MovieLens 100K data from here: https://dl.acm.org/doi/10.1145/2827872

In [14]:
# Library
import pandas as pd
import numpy as np
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import KFold
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt


In [13]:
# Define File Paths
Movie_Base = "/Users/alexrodriguez/Desktop/Graduate Classes/Data Mining II/Projects/movielens_100k.base"
Movie_Test = "/Users/alexrodriguez/Desktop/Graduate Classes/Data Mining II/Projects/movielens_100k.test"

# Define column names
column_names = ['user_id', 'movie_id', 'rating', 'timestamp']

# Load the dataset into a Pandas dataframe
Base_data = pd.read_csv(Movie_Base, sep='\t', names=column_names)

# Drop the timestamp column
Base_data = Base_data.drop('timestamp', axis=1)

# Print the first few rows of the dataframe to verify the data has been loaded correctly
print(Base_data.head(50))

    user_id  movie_id  rating
0         1         1       5
1         1         2       3
2         1         3       4
3         1         4       3
4         1         5       3
5         1         7       4
6         1         8       1
7         1         9       5
8         1        11       2
9         1        13       5
10        1        15       5
11        1        16       5
12        1        18       4
13        1        19       5
14        1        21       1
15        1        22       4
16        1        25       4
17        1        26       3
18        1        28       4
19        1        29       1
20        1        30       3
21        1        32       5
22        1        34       2
23        1        35       1
24        1        37       2
25        1        38       3
26        1        40       3
27        1        41       2
28        1        42       5
29        1        43       4
30        1        45       5
31        1        46       4
32        

In [4]:
# Define the reader to parse the rating scale
reader = Reader(rating_scale=(1, 5))

# Load the data into Surprise's Dataset class
Base_data = Dataset.load_from_df(Base_data[['user_id', 'movie_id', 'rating']], reader)

# Split the data into training and testing sets
train_base, test_base = train_test_split(Base_data, test_size=0.2, random_state=50)


In [5]:
# Define the model and fit it to the training data
model = SVD()
model.fit(train_base)

# Generate predictions on the test data
predictions = model.test(test_base)

# Compute RMSE on the test data
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse:.3f}")


RMSE: 0.9472
RMSE: 0.947


In [12]:
# Recommend top 5 movies for a given user
user_id = 1
user_ratings = Base_data.build_full_trainset().ur[user_id]
unrated_movies = [item_id for item_id in Base_data.build_full_trainset().all_items() if item_id not in user_ratings]

movie_ratings = [(item_id, model.predict(user_id, item_id).est) for item_id in unrated_movies]
top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:5]
print(f"Top 5 recommended movies for user {user_id}:")
for movie_id, rating in top_movies:
    print(f"Movie ID {movie_id}", f"predicted rating: {rating:.3f}")
    


Top 5 recommended movies for user 1:
Movie ID 483 predicted rating: 4.745
Movie ID 169 predicted rating: 4.706
Movie ID 318 predicted rating: 4.634
Movie ID 50 predicted rating: 4.601
Movie ID 647 predicted rating: 4.573


In [15]:
Movie_Test = "/Users/alexrodriguez/Desktop/Graduate Classes/Data Mining II/Projects/movielens_100k.test"

# Define column names
column_names = ['user_id', 'movie_id', 'rating', 'timestamp']

# Load the dataset into a Pandas dataframe
Test_data = pd.read_csv(Movie_Test, sep='\t', names=column_names)

# Drop the timestamp column
Test_data = Test_data.drop('timestamp', axis=1)

In [16]:
# Define the reader to parse the rating scale
reader = Reader(rating_scale=(1, 5))

# Load the data into Surprise's Dataset class
Test_data = Dataset.load_from_df(Test_data[['user_id', 'movie_id', 'rating']], reader)

# Split the data into training and testing sets
train_Test, test_Test = train_test_split(Test_data, test_size=0.2, random_state=50)


In [17]:
# Define the model
model = SVD()
model.fit(train_base)
model.fit(train_Test)

predictions = model.test(test_Test)

# Compute RMSE on the test data
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse:.3f}")

RMSE: 0.9785
RMSE: 0.979


In [18]:
# Recommend top 5 movies for a given user
user_id = 10
user_ratings = Test_data.build_full_trainset().ur[user_id]
unrated_movies = [item_id for item_id in Test_data.build_full_trainset().all_items() if item_id not in user_ratings]

movie_ratings = [(item_id, model.predict(user_id, item_id).est) for item_id in unrated_movies]
top_movies = sorted(movie_ratings, key=lambda x: x[1], reverse=True)[:5]
print(f"Top 5 recommended movies for user {user_id}:")
for movie_id, rating in top_movies:
    print(f"Movie ID {movie_id}", f"predicted rating: {rating:.3f}")
    


Top 5 recommended movies for user 10:
Movie ID 483 predicted rating: 5.000
Movie ID 127 predicted rating: 4.906
Movie ID 174 predicted rating: 4.859
Movie ID 96 predicted rating: 4.844
Movie ID 114 predicted rating: 4.835


In [19]:
# Define the cross-validation iterator
kf = KFold(n_splits=5, random_state=1)

# Perform cross-validation and compute RMSE for each fold
for i, (trainset, testset) in enumerate (kf.split(Test_data)):
    print(f"Fold {i+1}:")
    # Fit the model to the training set for this fold
    model.fit(trainset)
    # Generate predictions on the test set for this fold
    predictions = model.test(testset)
    # Compute RMSE on the test set for this fold
    rmse = accuracy.rmse(predictions)
    print(f"RMSE: {rmse:.3f}")


Fold 1:
RMSE: 1.0017
RMSE: 1.002
Fold 2:
RMSE: 0.9764
RMSE: 0.976
Fold 3:
RMSE: 1.0129
RMSE: 1.013
Fold 4:
RMSE: 0.9958
RMSE: 0.996
Fold 5:
RMSE: 0.9848
RMSE: 0.985
