In [2]:
import numpy as np
import pandas as pd
import pickle
import ast

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies = movies.merge(credits, on='title')

In [5]:
movies = movies[['movie_id', 'title','overview','genres','keywords','cast','crew']]

In [6]:
def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name'])
    return L

In [7]:
movies.dropna(inplace=True)

In [8]:
movies['genres'] = movies['genres'].apply(convert)

In [9]:
ast.literal_eval(
    '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [10]:
movies['cast'] = movies['cast'].apply(convert)

In [11]:
movies['cast'] = movies['cast'].apply(lambda x: x[0:3])

In [12]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [13]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [14]:
def collapse(L):
    L1 = []
    for i in L:
        L1.append(i.replace(" ", ""))
    return L1

In [15]:
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)

In [16]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [17]:
movies['tags'] = movies['overview'] + movies['genres'] + \
    movies['keywords'] + movies['cast'] + movies['crew']

In [18]:
new = movies.drop(columns=['overview', 'genres','keywords','cast','crew'])

In [19]:
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [21]:
vector = cv.fit_transform(new['tags']).toarray()

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
similarity = cosine_similarity(vector)

In [24]:
new[new['title'] == 'The Lego Movie'].index[0]

744

In [25]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(
        list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [26]:
recommend('Gandhi')

Gandhi, My Father
A Passage to India
Ramanujan
Chariots of Fire
Mr. Turner


In [27]:
new['title'].values

array(['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre',
       ..., 'Signed, Sealed, Delivered', 'Shanghai Calling',
       'My Date with Drew'], dtype=object)

In [28]:
pickle.dump(new.to_dict(), open('movies_data.pkl','wb'))

In [29]:
pickle.dump(similarity, open('similarity.pkl','wb'))