In [None]:
#https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?resource=download&select=movies_metadata.csv
#alternative links
#https://www.kaggle.com/code/rohitshirudkar/movie-recommendation-system
#

# I - Content based filtering 

In [None]:
import pandas as pd
from ast import literal_eval
from nltk.stem import PorterStemmer
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# pandas display: show all columns, use a 500-character wide repr that is not
# wrapped across lines, and print floats with five decimals.
pd.set_option(
    "display.max_columns", None,
    "display.width", 500,
    "display.float_format", lambda x: "%.5f" % x,
    'display.expand_frame_repr', False,
)

## 1) Fetching dataset

In [None]:
# Read the five CSV files from the working directory, in this order.
# low_memory=False makes pandas parse each file in one pass.
credits, keywords, links, ratings, movies = (
    pd.read_csv(csv_name, low_memory=False)
    for csv_name in (
        'credits.csv',
        'keywords.csv',
        'links.csv',
        'ratings.csv',
        'movies_metadata.csv',
    )
)

In [None]:
# Turn the movie id into an integer merge key.
# Ids that are not plain numbers are dropped rather than reduced to their first
# run of digits (which would silently turn a malformed id into a different,
# valid-looking one), and missing ids no longer make the integer cast fail.
movies = (
    movies
    .assign(id=pd.to_numeric(movies['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': int})
)

# Attach cast/crew and keywords. Each merge is skipped when its columns are
# already present, so re-running this cell does not create *_x / *_y copies.
credits.columns = ['cast','crew', 'id']
if 'cast' not in movies.columns:
    movies = movies.merge(credits, on='id')

keywords.columns = ['id', 'keywords']
if 'keywords' not in movies.columns:
    movies = movies.merge(keywords, on='id')



In [None]:
# Display the movies frame after the credits and keywords merges.
movies


In [None]:
# Print the (rows, columns) shape, then the per-column dtype / non-null summary.
print("Number of movies and number of information per movie (movie, feature) : ",movies.shape,"\n")
movies.info()

In [None]:
# Count the missing values in each column.
movies.isnull().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

In [None]:
# Count the rows whose title repeats the title of an earlier row.
movies['title'].duplicated().sum()

## 3) Preprocessing data 

In [None]:
# Remove rows that are exact duplicates of an earlier row (modifies movies in place).
movies.drop_duplicates(inplace=True)

In [None]:
# NOTE(review): this calls drop_duplicates on the Series returned by
# movies['title'] and nothing is assigned back, so it looks like `movies`
# itself is left unchanged — confirm whether rows were meant to be dropped.
movies['title'].drop_duplicates(inplace=True)

In [None]:
# Replace every missing value with ''. This applies to all columns, so a
# numeric column that had gaps now holds '' in those cells, and 'tagline' is
# already covered (the separate tagline fill that used to follow was a no-op).
movies.fillna('', inplace=True)


## 4) Cleaning data

In [None]:
# Drop the unnecessary columns, then keep only the first row for each
# (original_title, release_date) pair.
data = (
    movies
    .drop(
        columns=[
            "homepage", "belongs_to_collection", "imdb_id", "poster_path",
            "status", "video", "spoken_languages", "title",
        ]
    )
    .drop_duplicates(subset=["original_title", "release_date"])
)


In [None]:
stemmer = PorterStemmer()


def _parse_list(raw):
    """Return the list encoded by ``raw``.

    ``raw`` is normally the string form of a list of dicts. A value that is
    already a list is returned unchanged; a blank or non-string value gives
    an empty list instead of making ``literal_eval`` raise.
    """
    if isinstance(raw, list):
        return raw
    if not isinstance(raw, str) or not raw.strip():
        return []
    return literal_eval(raw)


def _names(raw):
    """Return the 'name' field of every record encoded in ``raw``."""
    return [record['name'] for record in _parse_list(raw)]


# Genre names, de-duplicated. They are sorted because the iteration order of a
# set of strings can differ between interpreter sessions, which would change
# the word order (and therefore the bigrams) of the combined text built later.
data['genres'] = data['genres'].apply(lambda raw: sorted(set(_names(raw))))

# Keywords: stem, remove inner spaces, lower-case, de-duplicate, sort.
data['keywords'] = data['keywords'].apply(
    lambda raw: sorted({stemmer.stem(name).replace(" ", "").lower() for name in _names(raw)})
)

data['cast'] = data['cast'].apply(_parse_list)
data['crew'] = data['crew'].apply(_parse_list)
# Despite the column name, 'top_crew' holds the names of the first two cast entries.
data['top_crew'] = data['cast'].apply(lambda cast: [member['name'] for member in cast[:2]])

def get_director(x):
    """Return the 'name' of the first entry of ``x`` whose 'job' is 'Director'.

    ``x`` is an iterable of mappings that each have 'job' and 'name' keys.
    An empty string is returned when no entry has the job 'Director'.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, "")
data['director'] = data['crew'].apply(get_director)
# Columns selected for the content-based feature set.
imp_cols = ['tagline', 'genres' ,'original_language' ,'keywords' ,'top_crew','director']

#-------------------------------------------------------------#
def _blank_to_zero(column):
    """Return ``column`` as floats, with '' cells replaced by 0.0."""
    return pd.to_numeric(column.mask(column == '', 0.0)).astype(float)


# Budget: keep the first run of digits of each value. Values without any digit
# become 0 instead of making the integer cast fail on NaN.
data['budget'] = (
    pd.to_numeric(
        data['budget'].astype(str).str.extract(r'(\d+)', expand=False),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

# These columns hold '' where the value was missing; make them float columns.
for numeric_col in ('vote_count', 'vote_average', 'runtime'):
    data[numeric_col] = _blank_to_zero(data[numeric_col])

# The parsed cast/crew lists are not needed once top_crew and director exist.
data = data.drop(["cast", "crew"], axis=1)

### Display basic information 

In [None]:
# Per-column dtype and non-null summary of the cleaned frame.
data.info()

In [None]:
# Display the cleaned frame.
data

In [None]:
# Write the cleaned frame to CSV in the working directory, including the row index.
data.to_csv('movies_complete_with_index.csv', index=True)

In [None]:
# Number of movies per genre.
# Each movie's genre list is de-duplicated before exploding, so a genre is
# counted once per movie; this replaces a rescan of every movie per genre.
genre_counts = (
    data['genres']
    .apply(lambda genre_list: list(set(genre_list)))
    .explode()
    .dropna()
    .value_counts()
)

unique_genre = genre_counts.index.tolist()
movies_count = [[genre, int(n_movies)] for genre, n_movies in genre_counts.items()]

data_genre = pd.DataFrame(data=movies_count,columns=['genre_name','count'])

In [None]:
# Bar chart of the number of movies per genre, most frequent genre first.
data_genre = data_genre.sort_values(by='count', ascending=False)

plt.figure(figsize=(15, 5), dpi=250)
sns.barplot(data=data_genre, x='genre_name', y='count')
plt.title('Top genres with most numbers of movies')
# Vertical tick labels so the genre names do not overlap.
plt.xticks(rotation=90)
plt.xlabel("Genre", fontsize=13)
plt.ylabel("Movies Count", fontsize=11)

In [None]:
# The ten movies with the highest vote_count.
data[['original_title', 'vote_count']].sort_values(by='vote_count', ascending=False).head(10)

In [None]:
# Build one text field per movie from the selected feature columns.
# An explicit copy is taken so the assignments below modify an independent
# frame; this replaces switching off pandas' chained-assignment check globally.
cleaned_data = data[imp_cols].copy()

# Wrap the scalar columns in one-element lists so they can be concatenated
# with the list-valued columns.
for text_col in ('tagline', 'original_language', 'director'):
    cleaned_data[text_col] = cleaned_data[text_col].apply(lambda value: [value])

# NOTE(review): 'tagline' is wrapped above but is not part of 'combine' —
# confirm whether it was meant to be included.
cleaned_data['combine'] = (
    cleaned_data['genres']
    + cleaned_data['original_language']
    + cleaned_data['keywords']
    + cleaned_data['top_crew']
    + cleaned_data['director']
).apply(' '.join)
cleaned_data

In [None]:
# Keep only the first 20000 rows; the vectoriser in a later cell is fitted on
# this subset rather than on the whole of cleaned_data.
part_data = cleaned_data.head(20000)
part_data

In [None]:
# Turn the combined text into unigram + bigram counts.
# min_df is 1 (not 0): recent scikit-learn releases reject an integer min_df
# below 1, and every term in the vocabulary occurs in at least one document,
# so the resulting matrix is the same.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(part_data['combine'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of count_matrix
# (with a single argument the rows are compared with themselves).
cosine_sim = cosine_similarity(count_matrix)

In [None]:
# Move the current index into an 'index' column and renumber the rows 0..n-1.
# Guarded so that re-running this cell does not stack another index column.
if 'index' not in data.columns:
    data = data.reset_index()
titles = data['original_title']
# original_title -> row number lookup; a title that occurs more than once
# appears as a repeated label.
indices = pd.Series(data.index, index=data['original_title'])



In [None]:
# Two-column table pairing the 'index' column with the movie id.
index_movie_id = data.loc[:, ['index', 'id']]

In [None]:
def get_content_recommendations(title, top_n=10):
    """Return the movies most similar to ``title`` according to ``cosine_sim``.

    Parameters
    ----------
    title : str
        An ``original_title`` present in the ``indices`` lookup.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        ``original_title``, ``vote_count`` and ``vote_average`` of the
        recommended rows of ``data``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not in ``indices``.
    IndexError
        If the row of ``title`` lies outside ``cosine_sim``.
    """
    if title not in indices.index:
        raise KeyError(f"Unknown movie title: {title!r}")

    idx = indices[title]
    # A title that occurs several times yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    if idx >= cosine_sim.shape[0]:
        raise IndexError(
            f"{title!r} is row {idx}, but the similarity matrix only covers "
            f"{cosine_sim.shape[0]} rows"
        )

    # Drop the query movie by position instead of assuming it sorts first:
    # another movie with identical features ties with it at the top.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    movie_indices = [position for position, _ in sim_scores[:top_n]]

    recommended = data.iloc[movie_indices][['original_title', 'vote_count', 'vote_average']]

    print("\nRecommended movies for <<", title, ">> are : ")

    return recommended

In [None]:
# Example: content-based recommendations for one title.
get_content_recommendations('Halo Legends')

# II - Collaborative filtering

In [None]:
#https://www.kaggle.com/code/padmanabhanporaiyar/imdb-movies-all-types-of-recommender-system

In [None]:
from surprise import Dataset, Reader
from surprise.prediction_algorithms.knns import KNNBasic
from surprise import accuracy
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm
import random

In [None]:
# Minimum number of votes a movie needs to be kept.
MIN_VOTE_COUNT = 100

# vote_count may hold '' where the value was missing (see the earlier
# fillna('')), so coerce it to numbers first; a direct `> 100` comparison
# would raise TypeError on those cells. Non-numeric cells compare as False.
vote_counts = pd.to_numeric(movies['vote_count'], errors='coerce')

# Movies with more than MIN_VOTE_COUNT votes.
movie_md = movies.loc[vote_counts > MIN_VOTE_COUNT, ['id', 'original_title']]
# Ids of those movies.
movie_ids = [int(x) for x in movie_md['id'].values]

# Keep only the ratings whose movieId is one of those ids.
# NOTE(review): this matches ratings['movieId'] against movies['id'] directly,
# and `links` is loaded but never used — confirm both columns use the same id
# scheme.
ratings = ratings[ratings['movieId'].isin(movie_ids)].reset_index(drop=True)

ratings

In [None]:
# Initialize a surprise reader object with ratings on a 0 to 5 scale
# NOTE(review): the ratings come from a DataFrame below, so line_format, sep and
# skip_lines are presumably unused here — confirm against the Surprise docs.
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

# Load the (userId, movieId, rating) columns of the filtered ratings frame
data_md = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)

# Build a trainset from every rating (no ratings are held out for testing)
trainset = data_md.build_full_trainset()

In [None]:
# Similarity options: cosine similarity computed between items
# (user_based=False), not between users.
sim_options = {'name': 'cosine',
               'user_based': False}

# KNN algorithm is used to find similar items
# NOTE(review): random_state is passed as an extra keyword; verify that
# KNNBasic actually uses it.
sim_item = KNNBasic(sim_options=sim_options, verbose=False, random_state=33)

# Train the algorithm on the full trainset (no predictions are made in this cell)
sim_item.fit(trainset)

In [None]:
# Predict the rating for two sample (user, movie) pairs; r_ui is the known
# rating passed along for comparison. Both predictions are collected in a
# list so that both are displayed (only a cell's last expression is shown).
sample_predictions = [
    sim_item.predict(uid=2, iid=17, r_ui=5.0),
    sim_item.predict(uid=671, iid=4011, r_ui=4.0),
]
sample_predictions

In [None]:
def get_collaborative_recommendations(user_id=1, top_n=5):
    """Recommend the movies with the highest predicted rating for a user.

    Parameters
    ----------
    user_id : int, default 1
        A ``userId`` that has at least one row in ``ratings``.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of tuple
        ``(original_title, predicted_rating)`` pairs, highest prediction
        first. The movie id is used in place of the title when the id is not
        found in ``movie_md``.

    Raises
    ------
    KeyError
        If ``user_id`` has no ratings in ``ratings``.
    """
    # Movies this user has already rated.
    rated_by_user = set(ratings.loc[ratings['userId'] == user_id, 'movieId'].tolist())
    if not rated_by_user:
        raise KeyError(f"user_id {user_id!r} has no ratings in `ratings`")

    # Every other rated movie is a candidate. A set difference replaces the
    # full user x movie pivot table that was rebuilt on each call.
    all_movie_ids = set(ratings['movieId'].unique().tolist())
    non_interacted_movies = sorted(all_movie_ids - rated_by_user)

    # id -> title lookup, keeping the first title of a repeated id. Ids are
    # compared as integers (comparing them with str(item_id) never matches
    # integer ids).
    unique_movies = movie_md.drop_duplicates(subset='id')
    title_by_id = dict(zip(unique_movies['id'].astype(int), unique_movies['original_title']))

    recommendations = []
    for item_id in non_interacted_movies:
        # Predicted rating of this movie for this user.
        est = sim_item.predict(user_id, item_id).est
        movie_name = title_by_id.get(int(item_id), item_id)
        recommendations.append((movie_name, est))

    # Highest predicted rating first.
    recommendations.sort(key=lambda pair: pair[1], reverse=True)

    return recommendations[:top_n]

In [None]:
# Pick the demo user among the users that still have ratings after filtering
# (an arbitrary number in a fixed range may have none). A locally seeded
# generator keeps the choice repeatable without touching the global random state.
sample_user_id = random.Random(33).choice(sorted(ratings['userId'].unique().tolist()))
get_collaborative_recommendations(sample_user_id)