In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

In [2]:
drive.mount('/content/drive')

# List contents of Google Drive root directory
folder_path = '/content/drive/My Drive/Recommendation System/Data/Data2/'
contents = os.listdir(folder_path)
print(contents)

Mounted at /content/drive
['movies_metadata.csv', 'links.csv', 'keywords.csv', 'links_small.csv', 'ratings.csv', 'ratings_small.csv', 'credits.csv']


In [3]:
user_ratings_df = pd.read_csv(folder_path + 'ratings_small.csv')
user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
user_ratings_df.shape

(100004, 4)

In [5]:
nan_count_user = user_ratings_df.isnull().sum()
nan_count_user

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [6]:
movie_metadata = pd.read_csv(folder_path + 'movies_metadata.csv')
movie_metadata.head(1)

  movie_metadata = pd.read_csv(folder_path + 'movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [7]:
movie_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
movie_metadata.shape

(45466, 24)

In [9]:
nan_count_column_a = movie_metadata['overview'].isnull().sum()
nan_count_column_a

954

In [10]:
nan_count = movie_metadata.isnull().sum()
nan_count

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [11]:
# Filter rows where both 'title' and 'overview' columns are NaN
nan_rows = movie_metadata[movie_metadata['title'].isnull() & movie_metadata['overview'].isnull()]

nan_rows

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count


In [12]:
# Drop rows where 'overview' column is NaN
movie_metadata = movie_metadata.dropna(subset=['overview'])

movie_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [13]:
movie_metadata.shape

(44512, 24)

### **Collaborative-filtering using KNN**

In [14]:
user_item_matrix = user_ratings_df.pivot(index=['userId'], columns=['movieId'], values='rating').fillna(0)
user_item_matrix = user_item_matrix.to_numpy()
user_item_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [15]:
from sklearn.neighbors import NearestNeighbors

# Define a KNN model on cosine similarity
cf_knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)

# Fitting the model on our matrix
cf_knn_model.fit(user_item_matrix)

In [18]:
! pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.9.1


In [16]:
movie_names = movie_metadata[['title', 'genres']]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
movie_names.head()

Unnamed: 0,title,genres
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]"


In [19]:
from rapidfuzz import process

def collaborative_filtering_recommender(movie_name, matrix, n_recs=10):
    # Extract input movie ID
    movie_id = process.extractOne(movie_name, movie_names['title'])[2]

    print(movie_id)
    # Calculate neighbour distances
    distances, indices = cf_knn_model.kneighbors(matrix[movie_id].reshape(1, -1), n_neighbors=n_recs)
    movie_rec_ids = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]

    # List to store recommendations
    cf_recs = []
    for i in movie_rec_ids:
        cf_recs.append({'Title': movie_names['title'][i[0]]})

    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index=range(1, n_recs))

    return df

In [20]:
n_recs = 11
recommendations = collaborative_filtering_recommender('Jumanji', user_item_matrix, n_recs)
recommendations

1


Unnamed: 0,Title
1,Pinocchio
2,The Amazing Panda Adventure
3,Nell
4,Billy's Holiday
5,Desperado
6,Colonel Chabert
7,Brother Minister: The Assassination of Malcolm X
8,Batman Forever
9,Red Rock West
10,Muriel's Wedding


A list of the top 10 recommended movies for someone who is a fan of Batman

### **Advantages of Collaborative Filtering:**

**Personalized Recommendations:** Offers tailored suggestions based on user behavior, leading to highly customized experiences.

**Diverse Content Discovery:** Capable of recommending a wide range of items, helping users discover content they might not find on their own. It gives diverse content discovery the edge over content-based filtering.

**Community Wisdom:** Leverages the collective preferences of users, often leading to more accurate recommendations than individual or content-based analysis alone.

**Dynamic Adaptation**: The model continuously gets updated with user interactions, keeping the recommendations relevant and up-to-date.

### **Limitations of Collaborative Filtering:**


**Cold start problem:** This happens when new movies or users are added to the system. The system struggles to make accurate recommendations since there's not enough data on these new entries.

**Popularity bias:** Popular movies get recommended a lot, overshadowing lesser-known gems. There are also scalability issues that come with managing such a large dataset.

### **Content-based filtering**

In [21]:
movie_metadata = movie_metadata.head(30000)

In [22]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
movie_metadata['overview'] = movie_metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(movie_metadata['overview'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(30000, 58989)

In [23]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(movie_metadata.index, index=movie_metadata['title']).drop_duplicates()

In [25]:
def content_based_recommender(movie_name, cosine_sim=cosine_sim, indices=indices, n_recs=10):
    # Get the index of the movie that matches the movie_name
    idx = process.extractOne(movie_name, movie_metadata['title'])[2]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n_recs+1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies as a DataFrame
    recommendations = movie_metadata['title'].iloc[movie_indices]
    return pd.DataFrame(recommendations)

In [26]:
recommendations =content_based_recommender('Jumanji')
recommendations

Unnamed: 0,title
21633,Table No. 21
6166,Brainscan
8801,Quintet
17223,The Dark Angel
9503,Word Wars
16843,DeVour
8079,Masques
13601,The Mindscape of Alan Moore
6055,Poolhall Junkies
13711,Rhinoceros


### **Hybrid Technique**