# Content based Movie Recommendation Engine

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import ast
import nltk
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

### Import Datasets

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

### Merging the above two data sets on basis of 'title' column

In [3]:
movies = movies.merge(credits, on='title')

### Keeping only necessary data columns

In [4]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew','vote_average']]

### Changing 'vote_average' to 'ratings'

In [5]:
movies = movies.rename(columns={'vote_average':'ratings'})

### Removing vacant columns and checking duplicate data

In [6]:
movies.dropna(inplace=True)
movies.duplicated().sum()

0

## Creating a functions to extract specific data from df

In [7]:
def convert(obj):
    l = []
    for i in ast.literal_eval(obj):
        l.append(i['name']) 
    return l 

In [8]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [9]:
def convert_cast(obj):
    l = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter !=5 :
            l.append(i['name'])
            counter+=1
        else:
            break
    return l 

In [10]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [11]:
def get_director(obj):
    l = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            l.append(i['name'])
    return l 

In [12]:
movies['crew'] = movies['crew'].apply(get_director)

### Handling data to generate tags

In [13]:
movies['new_overview'] = movies['overview'].apply(lambda x:x.split())
movies['new_cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['new_crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])
movies['new_genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

## Generate Tags

In [14]:
movies['tags'] = movies['new_overview'] + movies['new_cast'] + movies['new_crew'] + movies['new_genres'] + movies['keywords']

## Forming a new DataFrame

In [15]:
new_df = movies[['movie_id','title','overview','genres','cast','crew','ratings','tags']]

In [16]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


### Performing stemming before vectorisation to generate discrete vectors

In [17]:
ps = PorterStemmer()

def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [18]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


## Performing vectorisation on tags

In [19]:
cnt_vec = CountVectorizer(max_features=5000,stop_words='english')

### Converting the generated vectors into a numPy array

In [20]:
vectors = cnt_vec.fit_transform(new_df['tags']).toarray()

## Calculating similarity scores using cosine similarity

In [21]:
similarity_score = cosine_similarity(vectors)

## Recommendations based on similarity scores

In [22]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    movies_list = sorted(list(enumerate(similarity_score[movie_index])),reverse=True,key = lambda x: x[1])[1:11]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [23]:
recommend('Avatar')

Aliens vs Predator: Requiem
Independence Day
Falcon Rising
Battle: Los Angeles
Titan A.E.
Aliens
Small Soldiers
Meet Dave
Jupiter Ascending
Lifeforce


In [24]:
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))
pickle.dump(similarity_score,open('similarity.pkl','wb'))