In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
! pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from surprise import Reader, Dataset, SVD
from surprise import accuracy
from surprise.model_selection import train_test_split

**Loading the "Netflix_Dataset_Rating" dataset**

In [4]:
df = pd.read_csv("/content/drive/MyDrive/data sets/Netflix_Dataset_Rating.csv")
df.head()

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3


In [5]:
df.shape

(17337458, 3)

In [6]:
df.dtypes

User_ID     int64
Rating      int64
Movie_ID    int64
dtype: object

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17337458 entries, 0 to 17337457
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   User_ID   int64
 1   Rating    int64
 2   Movie_ID  int64
dtypes: int64(3)
memory usage: 396.8 MB


In [8]:
df.isnull().sum()

User_ID     0
Rating      0
Movie_ID    0
dtype: int64

In [9]:
df["Rating"].describe().astype("int")

count    17337458
mean            3
std             1
min             1
25%             3
50%             4
75%             4
max             5
Name: Rating, dtype: int64

In [10]:
print("Unique values: /n", df.nunique())

Unique values: /n User_ID     143458
Rating           5
Movie_ID      1350
dtype: int64


**Loading the "Netflix_Dataset_Movie" Dataset**

In [11]:
df_title = pd.read_csv("/content/drive/MyDrive/data sets/Netflix_Dataset_Movie.csv")
df_title.head()

Unnamed: 0,Movie_ID,Year,Name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [12]:
df_title.shape

(17770, 3)

In [13]:
df_title.dtypes

Movie_ID     int64
Year         int64
Name        object
dtype: object

In [14]:
df_title.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Movie_ID  17770 non-null  int64 
 1   Year      17770 non-null  int64 
 2   Name      17770 non-null  object
dtypes: int64(2), object(1)
memory usage: 416.6+ KB


In [15]:
df_title.isnull().sum()

Movie_ID    0
Year        0
Name        0
dtype: int64

In [16]:
df_title["Year"].describe().astype("int")

count    17770
mean      1990
std         16
min       1915
25%       1985
50%       1997
75%       2002
max       2005
Name: Year, dtype: int64

In [17]:
print("Unique values: \n", df_title.nunique())

Unique values: 
 Movie_ID    17770
Year           91
Name        17297
dtype: int64


**Analysing the Data:**

In [18]:
no_of_rated_products_per_user = df.groupby("User_ID")["Rating"].count().sort_values(ascending = False)
no_of_rated_products_per_user = no_of_rated_products_per_user.reset_index()
no_of_rated_products_per_user

Unnamed: 0,User_ID,Rating
0,305344,1344
1,387418,1339
2,2439493,1324
3,2118461,1305
4,1664010,1257
...,...,...
143453,1009839,11
143454,553082,10
143455,964881,9
143456,1445218,9


In [19]:
no_of_rated_products_per_user.describe()
# the mean here indicates --> on an average a user have rated 120 out of 1350 movies. 

Unnamed: 0,User_ID,Rating
count,143458.0,143458.0
mean,1320490.0,120.853895
std,765296.6,79.783702
min,6.0,5.0
25%,657900.8,67.0
50%,1317248.0,95.0
75%,1982464.0,147.0
max,2649429.0,1344.0


In [20]:
no_of_rated_products_per_movies = df.groupby("Movie_ID")["Rating"].count().sort_values(ascending = False)
no_of_rated_products_per_movies = no_of_rated_products_per_movies.reset_index()
no_of_rated_products_per_movies

Unnamed: 0,Movie_ID,Rating
0,1905,117075
1,2452,102721
2,4306,102376
3,571,101450
4,3860,98545
...,...,...
1345,4161,1215
1346,1375,1213
1347,717,1212
1348,2870,1092


In [21]:
no_of_rated_products_per_movies.describe()

Unnamed: 0,Movie_ID,Rating
count,1350.0,1350.0
mean,2227.1,12842.561481
std,1308.988364,17805.334719
min,3.0,1042.0
25%,1074.25,2607.75
50%,2214.0,5229.0
75%,3383.5,14792.0
max,4496.0,117075.0


From the entire dataset we will recommend only those movies to the users that satisfies a perticular benchmark or criteria.

In [22]:
f = ["count", "mean"]
df_movie_summary = df.groupby("Movie_ID")["Rating"].agg(f)
df_movie_summary.index = df_movie_summary.index.map(int)
print(df_movie_summary)
print("*" *30)
movie_benchmark = round(df_movie_summary["count"].quantile(0.70),0)
print(movie_benchmark)
print("*" *30)
drop_movie_list = df_movie_summary[df_movie_summary["count"] < movie_benchmark].index
print(drop_movie_list)
print("Number of elements in list:", len(drop_movie_list))
print("*" *30)

          count      mean
Movie_ID                 
3          1524  3.621391
8          9379  3.136795
16         2517  3.081843
17         4165  2.909964
18         9419  3.767597
...         ...       ...
4488      27997  3.540236
4490       7507  3.161849
4492       7511  2.652510
4493       4743  3.164875
4496       7835  3.765795

[1350 rows x 2 columns]
******************************
11472.0
******************************
Int64Index([   3,    8,   16,   17,   18,   26,   32,   33,   44,   45,
            ...
            4460, 4463, 4465, 4474, 4478, 4485, 4490, 4492, 4493, 4496],
           dtype='int64', name='Movie_ID', length=945)
Number of elements in list: 945
******************************


In [23]:
df_title = df_title.set_index("Movie_ID")
df_title

# we performed this step because the dataframe "df_movie_summary" also has "Movie_ID" as its index.
# Now the unnecessary movies that we got from "df_movie_summary["count"] < movie_benchmark" criteria are stored in "drop_movie_list" List.
# Finally we will drop those movies from the original dataset (i.e. deleting those movies from "df_title" dataframe).

Unnamed: 0_level_0,Year,Name
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003,Dinosaur Planet
2,2004,Isle of Man TT 2004 Review
3,1997,Character
4,1994,Paula Abdul's Get Up & Dance
5,2004,The Rise and Fall of ECW
...,...,...
17766,2002,Where the Wild Things Are and Other Maurice Se...
17767,2004,Fidel Castro: American Experience
17768,2000,Epoch
17769,2003,The Company


# **Building the Recommendation Model**

In [24]:
# initializing the "SVD" model as an object named "model"
model = SVD()

data = Dataset.load_from_df(df[["User_ID", "Movie_ID", "Rating"]], Reader())
# This line loads the user-item-rating data from a pandas DataFrame named "df" and creates a Surprise Dataset object using the "load_from_df" method.
# The method takes the DataFrame, selects the columns for "User_ID", "Movie_ID", and "Rating", and passes them to a Reader object that will interpret the ratings.

trainset, testset = train_test_split(data, test_size = 0.30, random_state = 10)

trainset = data.build_full_trainset()
# The "build_full_trainset()" method is used to build a training set 
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f6c60fe1dc0>

In [25]:
predictions = model.test(testset)
accuracy.rmse(predictions)

RMSE: 0.7362


0.7362403902285758

In [33]:
def Recommendation(given_user_id,n_movies):
    given_user = df_title.copy()
    given_user = given_user.reset_index()
    given_user = given_user[~given_user['Movie_ID'].isin(drop_movie_list)]


    given_user['Estimated_Rating'] = given_user['Movie_ID'].apply(lambda x: model.predict(given_user_id, x).est)

    given_user = given_user.drop('Movie_ID', axis = 1)

    given_user = given_user.sort_values('Estimated_Rating', ascending=False)
    given_user.reset_index(inplace=True,drop=True)
    return given_user.head(n_movies)

**Movie Recommendation for user-712664**

In [34]:
Recommendation(712664, 10)

Unnamed: 0,Year,Name,Estimated_Rating
0,1954,Seven Samurai,5.0
1,1992,Reservoir Dogs,5.0
2,1986,Aliens: Collector's Edition,4.987529
3,1979,Alien: Collector's Edition,4.926286
4,1961,The Hustler,4.850613
5,1974,The Godfather,4.809105
6,1994,The Professional,4.80617
7,1964,A Fistful of Dollars,4.707469
8,1959,North by Northwest,4.694685
9,2002,Spirited Away,4.687222


**Movie Recommendation for user-2643029**

In [35]:
Recommendation(2643029, 5)

Unnamed: 0,Year,Name,Estimated_Rating
0,2002,The Pianist,4.650157
1,1999,American Beauty,4.646383
2,1949,The Third Man,4.604773
3,1995,Pride and Prejudice,4.59554
4,2004,Sideways,4.562622
