# Movie Recommendation System
### Content based Recommendation

In [1]:
# importing the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# ignoring harmless warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing the file
movie=pd.read_csv('tmdb_5000_movies.csv')


credit=pd.read_csv('tmdb_5000_credits.csv')


In [3]:
# Number of Rows and columns

print(movie.shape)
print(credit.shape)

(4803, 20)
(4803, 4)


In [4]:
print(movie.columns)
print(credit.columns)

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
Index(['movie_id', 'title', 'cast', 'crew'], dtype='object')


In [5]:
print(movie['title'].nunique())
print(credit['title'].nunique())

4800
4800


In [6]:
# merging two different dataframe
df=movie.merge(credit,on='title')
df.shape

(4809, 23)

In [7]:
df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [8]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'movie_id', 'cast', 'crew'],
      dtype='object')

In [9]:
# selecting only the required columns
final_df=df[['title','genres','keywords','overview','cast','crew',]]

In [10]:
final_df.head(2)

Unnamed: 0,title,genres,keywords,overview,cast,crew
0,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


# Data Cleaning

In [11]:
# checking for null values
final_df.isna().mean()*100

Unnamed: 0,0
title,0.0
genres,0.0
keywords,0.0
overview,0.062383
cast,0.0
crew,0.0


In [12]:
final_df['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [13]:
final_df['genres'][20]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}]'

In [14]:
import ast

# extracting only the names
def genres_name(genres):
  li=[]
  for value in ast.literal_eval(genres):
    li.append(value['name'])
  return li

In [15]:
final_df['genres']=final_df['genres'].apply(genres_name)

In [16]:
final_df['keywords']=final_df['keywords'].apply(genres_name)

In [17]:
# extracting the main character of the movie
def main_cast(text):
  cast_li=[]
  count=0
  for value in ast.literal_eval(text):
    if count<5:
      cast_li.append(value['name'])
      count+=1
  return cast_li

In [18]:
final_df['cast']=final_df['cast'].apply(main_cast)

In [19]:
# extracting only the director of the movie
def main_director(text):
  main_dir=[]
  for value in ast.literal_eval(text):
    if value['job']=='Director':
      main_dir.append(value['name'])
  return main_dir

In [20]:
final_df['crew']=final_df['crew'].apply(main_director)

In [21]:
# dropping the null values
final_df.isna().sum()

Unnamed: 0,0
title,0
genres,0
keywords,0
overview,3
cast,0
crew,0


In [22]:
# dropping null values
final_df=final_df.dropna()

In [23]:
final_df.shape

(4806, 6)

In [24]:
final_df['cast'][1]

['Johnny Depp',
 'Orlando Bloom',
 'Keira Knightley',
 'Stellan Skarsgård',
 'Chow Yun-fat']

In [25]:
# removing spaces in the names - bcoz different tokens mights be created making it complicatedd
def collapse(text):
  collapsed_li=[]
  for value in text:
    collapsed_li.append(value.replace(" ",""))
  return collapsed_li

In [26]:
for col in ['genres','keywords','cast','crew']:
  final_df[col]=final_df[col].apply(collapse)

In [27]:
# converting the overview into the list of words
final_df['overview']=final_df['overview'].apply(lambda x:x.split())

In [28]:
final_df.head(5)

Unnamed: 0,title,genres,keywords,overview,cast,crew
0,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[In, the, 22nd, century,, a, paraplegic, Marin...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[Captain, Barbossa,, long, believed, to, be, d...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,Spectre,"[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[A, cryptic, message, from, Bond’s, past, send...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[Following, the, death, of, District, Attorney...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,John Carter,"[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[John, Carter, is, a, war-weary,, former, mili...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [29]:
# creating the feature
final_df['context']=final_df['genres']+final_df['keywords']+final_df['overview']+final_df['cast']+final_df['crew']

In [30]:
# dropping the other features
final_df.drop(['genres', 'keywords', 'overview', 'cast', 'crew'],axis=1,inplace=True)

In [31]:
final_df.columns

Index(['title', 'context'], dtype='object')

In [32]:
# converting the list into string
final_df['context']=final_df['context'].apply(lambda x:" ".join(x))

In [33]:
final_df['context'][0]

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. SamWorthington ZoeSaldana SigourneyWeaver StephenLang MichelleRodriguez JamesCameron'

# Feature Engineering/Extraction Method - Bag of Words

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorize=TfidfVectorizer(stop_words='english')

vector=vectorize.fit_transform(final_df['context']).toarray()

In [35]:
vector.shape

(4806, 39222)

# Building Recommendation System similarity matrix - Cosine Similarity


In [36]:
# Building Recommendation System
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(vector)
similarity

array([[1.        , 0.01370074, 0.00742948, ..., 0.00306254, 0.00330481,
        0.        ],
       [0.01370074, 1.        , 0.00818174, ..., 0.01054095, 0.        ,
        0.        ],
       [0.00742948, 0.00818174, 1.        , ..., 0.00965227, 0.        ,
        0.        ],
       ...,
       [0.00306254, 0.01054095, 0.00965227, ..., 1.        , 0.01538426,
        0.01765574],
       [0.00330481, 0.        , 0.        , ..., 0.01538426, 1.        ,
        0.0088266 ],
       [0.        , 0.        , 0.        , ..., 0.01765574, 0.0088266 ,
        1.        ]])

In [37]:
# building the recommendation system
import difflib

def recommendation(movie):

  # list of movie title
  list_of_title=final_df['title'].tolist()
  # finding the close match
  movie=difflib.get_close_matches(movie,list_of_title)[0]

  index=final_df[final_df['title']==movie].index[0]
  # finding the highest similarity score
  similarity_score=list(enumerate(similarity[index]))
  # sorted order of highest similarity
  sorted_similarity=sorted(similarity_score,key=lambda x:x[1],reverse=True)

  # Top 5 similar movies
  for i in sorted_similarity[1:6]:
    print(final_df.iloc[i[0]]['title'])

In [38]:
recommendation('Avatar')

Aliens
Battle: Los Angeles
Falcon Rising
Star Trek Into Darkness
Apollo 18


# Downloading the required code


In [39]:
import pickle

In [41]:
#Serializing the movie names

pickle.dump(final_df['title'],open('movie_list.pkl','wb'))

In [42]:
# Serializing the cosine similarity

pickle.dump(similarity,open('cosine_similarity.pkl','wb'))