In [3]:
import numpy as np
import pandas as pd

In [4]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [5]:
movies_credits = movies.merge(credits, on='title')

In [6]:
movies_credits = movies_credits[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [7]:
movies_credits.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [8]:
movies_credits.dropna(inplace=True)

In [9]:
movies_credits.duplicated().sum()

0

In [10]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [11]:
import ast
def convert(obj):
    res = []
    for i in ast.literal_eval(obj):
        res.append(i['name'])
    return res

In [12]:
movies_credits['genres'] = movies_credits['genres'].apply(convert)

In [13]:
movies_credits['keywords'] = movies_credits['keywords'].apply(convert)

In [14]:
import ast
def convert3(obj):
    res = []
    count = 0
    for i in ast.literal_eval(obj):
        if count != 3:
            res.append(i['name'])
            count += 1
        else:
            break
    return res

In [15]:
movies_credits['cast'] = movies_credits['cast'].apply(convert3)

In [16]:
import ast
def fetchDirector(obj):
    res = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            res.append(i['name'])
            break
    return res

In [17]:
movies_credits['crew'] = movies_credits['crew'].apply(fetchDirector)

In [18]:
movies_credits['overview'] = movies_credits['overview'].apply(lambda x: x.split())

In [19]:
movies_credits['genres'] = movies_credits['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_credits['keywords'] = movies_credits['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_credits['cast'] = movies_credits['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_credits['crew'] = movies_credits['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [21]:
movies_credits['tags'] = movies_credits['overview'] + movies_credits['genres'] + movies_credits['keywords'] + movies_credits['cast'] + movies_credits['crew']

In [22]:
new_df = movies_credits[['movie_id', 'title', 'tags']]

In [23]:
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))


In [24]:
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [25]:
import nltk

In [26]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [27]:
def stem(text):
    res = []
    for i in text.split():
        res.append(ps.stem(i))
    return " ".join(res)

In [28]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [30]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

In [33]:
similarity = cosine_similarity(vectors)

In [41]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [45]:
recommend('Iron Man')

Iron Man 3
Iron Man 2
Avengers: Age of Ultron
The Avengers
Captain America: Civil War


In [46]:
import pickle

In [None]:
pickle.dump(new_df, open('movies.pkl', 'wb'))