### Content Based Recommendation System

### Read the Dataset `movies_metadata.csv`

# Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
movies = pd.read_csv("movies_metadata-1.csv")
movies.shape

  interactivity=interactivity, compiler=compiler, result=result)


(45466, 24)

In [3]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
id                       45466 non-null object
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null objec

### Create a new column with name 'description' combining `'overview' and 'tagline'` columns in the given dataset

In [5]:
movies['tagline'] = movies['tagline'].fillna('')
movies['description'] = movies['overview'] + movies['tagline']

In [6]:
movies.head()

movies.shape

(45466, 25)

### Lets drop the null values in `description` column

In [7]:
movies['description'].isnull().sum()

954

In [8]:
movies.dropna(subset=['description'], inplace=True)
movies['description'].isnull().sum()

0

In [9]:
movies.shape

(44512, 25)

In [10]:
movies.description.head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: description, dtype: object

### Keep the first occurance and drop duplicates of each title in column `title`

In [11]:
## printing the duplicate movies
duplicateMovies = movies.title.duplicated()

title = duplicateMovies[duplicateMovies == True]

title

888      True
930      True
1296     True
1465     True
1644     True
         ... 
45421    True
45453    True
45454    True
45460    True
45463    True
Name: title, Length: 3140, dtype: bool

In [12]:
movies= movies.drop_duplicates('title')

In [13]:
movies.shape

(41372, 25)

### As we might have dropped a few rows with duplicate `title` in above step, just reset the index [make sure you are not adding any new column to the dataframe while doing reset index]

In [14]:
movies.reset_index(inplace=True,drop=True)
movies.shape

(41372, 25)

### Generate tf-idf matrix using the column `description`. Consider till 3-grams, with minimum document frequency as 0.

In [15]:
tf_idf_vect = TfidfVectorizer(analyzer='word',ngram_range=(1,3),stop_words='english', min_df = 0)
tfidf_matrix = tf_idf_vect.fit_transform(movies["description"])
print(tfidf_matrix)

  (0, 1126726)	0.11233766834628615
  (0, 643575)	0.11233766834628615
  (0, 581976)	0.11233766834628615
  (0, 1442903)	0.11233766834628615
  (0, 2185713)	0.11233766834628615
  (0, 263399)	0.11233766834628615
  (0, 1757708)	0.11233766834628615
  (0, 337272)	0.11233766834628615
  (0, 263359)	0.11233766834628615
  (0, 1516369)	0.11233766834628615
  (0, 2185717)	0.11233766834628615
  (0, 896905)	0.11233766834628615
  (0, 87094)	0.11233766834628615
  (0, 1502102)	0.11233766834628615
  (0, 1194138)	0.11233766834628615
  (0, 51985)	0.11233766834628615
  (0, 1722784)	0.11233766834628615
  (0, 1156858)	0.11233766834628615
  (0, 263384)	0.11233766834628615
  (0, 239892)	0.11233766834628615
  (0, 198821)	0.11233766834628615
  (0, 87007)	0.11233766834628615
  (0, 1689112)	0.11233766834628615
  (0, 880561)	0.11233766834628615
  (0, 1168574)	0.11233766834628615
  :	:
  (41371, 802865)	0.1414267623753692
  (41371, 192000)	0.13211877826835347
  (41371, 1303580)	0.10843628334018819
  (41371, 451336)	0.1

### create cosine similarity matrix

In [16]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim.shape

(41372, 41372)

### Write a function with name `recommend` which takes `title` as argument and returns a list of 10 recommended title names in the output based on the above cosine similarities

In [17]:
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

In [18]:
def recommend(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

### Give the recommendations from above functions for movies `The Godfather` and `The Dark Knight Rises`

In [19]:
recommend('The Godfather')

40138    The Godfather Trilogy: 1972-1990
1162               The Godfather: Part II
29583                    Honor Thy Father
21784                          Blood Ties
11574          The Cave of the Yellow Dog
34854            A Mother Should Be Loved
17426                     The Outside Man
10925                    Household Saints
4259                                 Made
29                         Shanghai Triad
Name: title, dtype: object

In [20]:
recommend('The Dark Knight Rises')

12041                                      The Dark Knight
149                                         Batman Forever
1311                                        Batman Returns
3042                          Batman: Mask of the Phantasm
583                                                 Batman
20234                      Batman: Mystery of the Batwoman
14858                           Batman: Under the Red Hood
8966                    Batman Beyond: Return of the Joker
23685                                    Batman vs Dracula
20042    Batman Unmasked: The Psychology of the Dark Kn...
Name: title, dtype: object