# RECOMMENDATION SYSTEM USING IMDB MOVIE DATASET

Import Relevant Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import autoreload
import os
import sys

from collections import defaultdict, Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

sb.set_style("darkgrid")
sb.set()

 ## A. Simple Recommender System 

In [2]:
df = pd.read_csv("D:\Open Classroom\Datasets\The Movies Dataset\movies_metadata.csv", low_memory=False)
df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [3]:
df.shape

(45466, 24)

#### Create a Clone for the IMDB Top 250 Datasate
#### Calculating the weighted rating using 
#### WeightedRating(WR)=(v/v+m⋅R)+(m/v+m⋅C)

In the above equation,

1. v is the number of votes for the movie;

2. m is the minimum votes required to be listed in the chart;

3. R is the average rating of the movie;

4. C is the mean vote across the whole report.

In [4]:
# calculate the mean of the vote average series

c = df["vote_average"].mean()
c

5.618207215134185

In [5]:
# Calculate the number of votes (m) reacieved by a movie in the 90th percentile

m = df["vote_count"].quantile(0.9)
m

160.0

In [6]:
filt_df = df.copy().loc[df["vote_count"] >= m]
filt_df.head(2)
print(filt_df.shape)

(4555, 24)


In [7]:
# Calculate the weighted rating for each qualified movie

def movie_rating(x, m = m, c = c):
    v = x["vote_count"]
    r = x["vote_average"]
    
    # using the imdb average score formular
    return (v/(v+m)*r) + (m/(v+m)*c)

In [8]:
filt_df["score"] = filt_df.apply(movie_rating, axis = 1)
filt_df = filt_df.sort_values(by = "score", ascending = False)
filt_df[['title', 'vote_count', 'vote_average', 'score']].head()

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385


## B. Content Based Recommender System

### Using the Sigmoid Kernel to make recommendation to user based on content watched 

In [9]:
df = pd.read_csv("D:\Open Classroom\Datasets\The Movies Dataset\movies_metadata.csv", low_memory=False)
df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [10]:
tfidf = TfidfVectorizer(min_df=3, max_features=None, stop_words='english', strip_accents='unicode', ngram_range=(1, 3), analyzer='word')

# Fill missing values in the overview column will empty string
df['overview'] = df['overview'].fillna('')

# Fitting and Transforming the TF-IDF instance
ovw_matrix = tfidf.fit_transform(df['overview'])
ovw_matrix

<45466x75878 sparse matrix of type '<class 'numpy.float64'>'
	with 1453128 stored elements in Compressed Sparse Row format>

In [11]:
sig = sigmoid_kernel()

TypeError: sigmoid_kernel() missing 1 required positional argument: 'X'

In [None]:
# Reverse indexing
indices = pd.Series(df.index, df['original_title']).drop_duplicates()
indices