# **Movie Recommendation System using SVD**

**Importing dependencies**

In [85]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict

**---- INITIAL DATA ANALYSIS & PRE PROCESSING ----**

**Loading & Parsing the dat files**

In [None]:
movies_df  = pd.read_csv('movies.dat',sep='::', engine='python', names=['movieID', 'title', 'genres'], encoding='latin-1')

rating_df = pd.read_csv('ratings.dat', sep='::', engine='python', names=['userID', 'movieID', 'rating', 'timestamp'], encoding='latin-1')

In [87]:
print(movies_df.head())

   movieID                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [None]:
movies_df.isnull().sum() #finding null values

movieID    0
title      0
genres     0
dtype: int64

In [None]:
movies_df.shape #dimensions of the dataset

(3883, 3)

In [None]:
movies_df.info() #information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieID  3883 non-null   int64 
 1   title    3883 non-null   object
 2   genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [None]:
rating_df.head() #top 5 rows of the dataset

Unnamed: 0,userID,movieID,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
rating_df.shape #dimensions of the dataset

(1000209, 4)

In [None]:
rating_df.info() #information about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userID     1000209 non-null  int64
 1   movieID    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


**Loading ratings data into Surprise dataset format**

In [None]:
reader = Reader(line_format='user item rating timestamp', sep='::', rating_scale=(1, 5))
data = Dataset.load_from_file('ratings.dat', reader=reader)
print("✅ Data loaded into scikit-surprise successfully!")

✅ Data loaded into scikit-surprise successfully!


In [95]:
print(data)

<surprise.dataset.DatasetAutoFolds object at 0x00000263D20A85F0>


**---- MODEL BUILDING ----**

**Splitting the data for train and test**

In [None]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=54) #20% for test, 80% for train

**Intializing the model and training it**

In [None]:
svd = SVD(n_factors=10, n_epochs=50, lr_all=0.006)
svd.fit(trainset) #training the model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x263f6ab94c0>

**Prediction from the model**

In [99]:
prediction = svd.test(testset)

**Analysis of the above prediction**

In [100]:
for index, (user, item, rating, est, extra) in enumerate(prediction):
    if index == 50:
        break
    print(index, user, item, rating, est, extra)

0 53 745 5.0 5 {'was_impossible': False}
1 3993 1079 4.0 3.3231126017154753 {'was_impossible': False}
2 5036 2827 2.0 2.526373343990747 {'was_impossible': False}
3 1291 2424 4.0 3.3033438482896633 {'was_impossible': False}
4 1238 233 5.0 3.7990496638525526 {'was_impossible': False}
5 4318 2822 4.0 3.944992805376482 {'was_impossible': False}
6 4979 3760 4.0 3.2141926365276254 {'was_impossible': False}
7 1329 870 1.0 2.0624781199434006 {'was_impossible': False}
8 4957 62 3.0 3.439126037780584 {'was_impossible': False}
9 1632 2362 1.0 2.9388321757150715 {'was_impossible': False}
10 5070 1078 2.0 4.19331916459395 {'was_impossible': False}
11 4058 2423 2.0 3.7124028783946237 {'was_impossible': False}
12 351 457 4.0 4.162966033037638 {'was_impossible': False}
13 587 260 4.0 3.344912095418415 {'was_impossible': False}
14 6007 2455 4.0 2.931383388222886 {'was_impossible': False}
15 1051 3397 4.0 4.047903368048303 {'was_impossible': False}
16 2092 3368 3.0 3.671969379313864 {'was_impossible': F

**---- MODEL EVALUATION ----**

**Manual checkup of the prediction**

In [None]:
user_id = str(2) #Userid as string if using surprise
movie_id = str(2236) #movieid as string if using surprise
result = svd.predict(user_id, movie_id) #passing them for prediction
print(f'predicted rating for User {user_id} on Movie {movie_id}: {result.est:.2f}')

predicted rating for User 2 on Movie 2236: 3.84


**Accuracy scores of the model**

In [None]:
MAE = accuracy.mae(prediction) #Mean Absolute Error
MSE = accuracy.mse(prediction) #Mean Squared Error
RMSE = accuracy.rmse(prediction) #Root Mean Squared Error


MAE:  0.6705
MSE: 0.7375
RMSE: 0.8588


**---- RECOMMEND TOP N MOVIES TO THE USERS ----**

**Finding top 5 recommendations**

In [None]:
def get_top_n(full_predictions, n=5):
    top_n = defaultdict(list) #Default dictionary to append data

    for uid, iid, true_r, pred_r, _ in full_predictions:
        top_n[uid].append((iid, pred_r)) #append movie, predicted rating into uid

    for uid, user_ratings in top_n.items():
        top_n[uid] = user_ratings[:n] #slicing top n data 

    return top_n

**Building the anti movies data**

In [None]:
full_trainset = data.build_full_trainset() #All training ratings into full trainset
svd.fit(full_trainset)
full_testset = full_trainset.build_anti_testset() #All user-item pairs for movies where the user has not rated yet
full_predictions = svd.test(full_testset)

**Analysis of above operation**

In [113]:
for index, (user_id, movie_id, user_rating, pred_r, _) in enumerate(full_predictions):
    if index == 20:
        break
    print(user_id, movie_id, user_rating, pred_r)

1 1357 3.581564453029317 3.664124036877412
1 3068 3.581564453029317 4.381818125079591
1 1537 3.581564453029317 4.391631333423155
1 647 3.581564453029317 4.178081538470979
1 2194 3.581564453029317 4.302198859426854
1 648 3.581564453029317 3.6747912911082548
1 2268 3.581564453029317 4.439272761867488
1 2628 3.581564453029317 3.4736391614926783
1 1103 3.581564453029317 4.150203342079834
1 2916 3.581564453029317 3.834726618266035
1 3468 3.581564453029317 4.469981708360451
1 1210 3.581564453029317 3.8178727217713875
1 1792 3.581564453029317 3.8601440835440046
1 1687 3.581564453029317 3.5087446017933512
1 1213 3.581564453029317 4.347310618737607
1 3578 3.581564453029317 4.272579122165872
1 2881 3.581564453029317 3.992797907057434
1 3030 3.581564453029317 3.8557543599690063
1 1217 3.581564453029317 3.953880116869925
1 434 3.581564453029317 3.7369647270866286


In [114]:
print(len(full_predictions))

21384031


**Finding top recommendations out of full data**

In [138]:
top_n_recommendations = get_top_n(full_predictions, n=5)
print(len(top_n_recommendations))

6040


**Finding top 5 recommendations for user 1**

In [None]:
# getting recommendations for the first available user
sample_user = list(top_n_recommendations.keys())[0]
print("Sample user ID:", sample_user)
print("Recommendations:", top_n_recommendations[sample_user])

Sample user ID: 1
Recommendations: [('1357', 3.664124036877412), ('3068', 4.381818125079591), ('1537', 4.391631333423155), ('647', 4.178081538470979), ('2194', 4.302198859426854)]


**Fetching names to the movieIDs and printing them**

In [130]:
sample_user = '1'

movies_df['movieID'] = movies_df['movieID'].astype(str)

recommended_movies = [
    (movies_df.loc[movies_df['movieID'] == movie_id, 'title'].values[0], score) for movie_id, score in top_n_recommendations[sample_user]
]

print("Top recommendations for user", sample_user)
print('                                 ')
for title, score in recommended_movies:
    print(f"{title} : (Predicted rating: {score:.2f})")


Top recommendations for user 1
                                 
Shine (1996) : (Predicted rating: 3.66)
Verdict, The (1982) : (Predicted rating: 4.38)
Shall We Dance? (Shall We Dansu?) (1996) : (Predicted rating: 4.39)
Courage Under Fire (1996) : (Predicted rating: 4.18)
Untouchables, The (1987) : (Predicted rating: 4.30)


# **Thank You!**