# model_testing.ipynb (Training the model)

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings

In [2]:
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' SettingWithCopyWarning -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Load the two TMDB CSV files from the local datasets/ directory.
credits = pd.read_csv('datasets/tmdb_5000_credits.csv')
movies = pd.read_csv('datasets/tmdb_5000_movies.csv')

In [3]:
# Preview the first two rows of the raw movies table.
movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [4]:
# Preview the first two rows of the raw credits table.
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [5]:
# Join on the TMDB id instead of the title: titles are not guaranteed to be
# unique, and a title join pairs every movie with every credits row that
# shares its title (duplicated / mismatched rows). The credits copy of
# `title` is dropped so the result keeps a single `title` column.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# Columns kept for the per-movie details table. `.copy()` makes this an
# independent frame, so the column assignments in later cells modify
# movies_data itself instead of a possible view of `movies`.
movies_data = movies[['id', 'production_countries', 'cast', 'crew', 'release_date', 'genres',
                      'runtime', 'tagline', 'vote_average', 'vote_count']].copy()

In [7]:
# Preview the first two rows of the details table.
movies_data.head(2)

Unnamed: 0,id,production_countries,cast,crew,release_date,genres,runtime,tagline,vote_average,vote_count
0,19995,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",2009-12-10,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",162.0,Enter the World of Pandora.,7.2,11800
1,285,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",2007-05-19,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",169.0,"At the end of the world, the adventure begins.",6.9,4500


In [8]:
# Keep only the columns used to build the recommendation tags. `.copy()`
# detaches the result from the merged frame so the later in-place edits
# (dropna, column assignments) act on a real, independent DataFrame.
movies = movies[['id','title','overview','genres','keywords','cast','crew']].copy()
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [9]:
# Count missing values per column.
movies.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
# Drop rows with missing values and rebuild a contiguous 0..n-1 index.
# Without the reset, index labels no longer equal row positions, so a label
# taken from `.index` would address the wrong row of the similarity matrix.
movies = movies.dropna().reset_index(drop=True)

In [11]:
# Re-check the per-column missing-value counts after the drop.
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [12]:
# Count rows that are exact duplicates of an earlier row.
movies.duplicated().sum()

0

In [13]:
# Inspect the raw genres value of the first row.
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [14]:
def convert(obj):
    """Parse a stringified list of dicts and return each entry's 'name'.

    `obj` is evaluated with ast.literal_eval; the 'name' values are returned
    as a list in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [15]:
# Parse the stringified columns: genres become lists of names, while the
# details table keeps the full parsed cast entries.
# NOTE: not re-runnable as-is -- once a column holds parsed objects,
# ast.literal_eval rejects them because they are no longer strings.
movies_data['genres'] = movies_data['genres'].apply(convert)
movies_data['cast'] = movies_data['cast'].apply(ast.literal_eval)
movies['genres'] = movies['genres'].apply(convert)

In [16]:
def fetchCrew(obj):
    """Extract selected crew roles from a stringified crew list.

    `obj` is evaluated with ast.literal_eval. Every entry whose 'job' is one
    of the roles below yields a single-item dict ``{job: name}``; the results
    are returned in their original order.
    """
    wanted_jobs = ('Director', 'Screenplay', 'Writing', 'Characters', 'Sound', 'Story')
    return [
        {member['job']: member['name']}
        for member in ast.literal_eval(obj)
        if member['job'] in wanted_jobs
    ]

In [17]:
# Replace the raw crew string with the list of {job: name} dicts.
movies_data['crew'] = movies_data['crew'].apply(fetchCrew)

In [18]:
# Preview the details table after parsing cast, crew and genres.
movies_data.head(2)

Unnamed: 0,id,production_countries,cast,crew,release_date,genres,runtime,tagline,vote_average,vote_count
0,19995,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{'cast_id': 242, 'character': 'Jake Sully', '...","[{'Director': 'James Cameron'}, {'Screenplay':...",2009-12-10,"[Action, Adventure, Fantasy, Science Fiction]",162.0,Enter the World of Pandora.,7.2,11800
1,285,"[{""iso_3166_1"": ""US"", ""name"": ""United States o...","[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{'Director': 'Gore Verbinski'}, {'Screenplay'...",2007-05-19,"[Adventure, Fantasy, Action]",169.0,"At the end of the world, the adventure begins.",6.9,4500


In [19]:
# Turn the stringified keywords into a list of keyword names.
movies['keywords'] = movies['keywords'].apply(convert)
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [20]:
def convert_actors(obj):
    """Return the 'name' of at most the first five entries of a cast string.

    `obj` is evaluated with ast.literal_eval; entries beyond the fifth are
    never inspected.
    """
    # zip() stops as soon as range(5) is exhausted, which caps the result.
    return [member['name'] for _, member in zip(range(5), ast.literal_eval(obj))]

In [21]:
# Keep only the names of the first five cast entries per movie.
movies['cast'] = movies['cast'].apply(convert_actors)
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [22]:
def fetchDirector(obj):
    """Return a one-element list with the first director's name.

    `obj` is evaluated with ast.literal_eval. The first entry whose 'job' is
    'Director' wins; an empty list is returned when there is none.
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        return [member['name']]
    return []

In [23]:
# For the tags only the director is needed.
movies['crew'] = movies['crew'].apply(fetchDirector)
# Display order of the roles in the details table. 'Writing' is listed
# because fetchCrew can emit it; with it missing, key_order.index() would
# raise ValueError for any movie that has such a crew entry.
key_order = ['Director', 'Screenplay', 'Writing', 'Characters', 'Sound', 'Story']
key_rank = {job: rank for rank, job in enumerate(key_order)}
# Each crew item is a single-key dict {job: name}; sort by the job's rank.
# Roles missing from key_order sort last instead of raising.
movies_data['crew'] = movies_data['crew'].apply(
    lambda x: sorted(x, key=lambda d: key_rank.get(next(iter(d)), len(key_rank))))

In [24]:
# Preview the table now that crew holds only the director's name.
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]


In [25]:
# Split each overview on whitespace into a list of tokens.
movies['overview'] = movies['overview'].apply(str.split)
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]


In [26]:
# Remove the spaces inside every entry so a multi-word value stays a single
# token; the same transformation is applied to each list column.
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda entries: [entry.replace(" ","") for entry in entries])

In [27]:
# Preview the table after spaces were removed from the list entries.
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]


In [28]:
# Concatenate the per-movie token lists into a single 'tags' list.
movies['tags'] = movies['keywords'] + movies['overview'] + movies['genres'] + movies['cast'] + movies['crew']
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[ocean, drugabuse, exoticisland, eastindiatrad..."


In [29]:
# Frame used for vectorisation. `.copy()` detaches it from `movies`, so the
# later assignments to data['tags'] neither act on a possible view nor
# touch the `movies` frame.
data = movies[['id', 'title', 'tags']].copy()
data.head(2)

Unnamed: 0,id,title,tags
0,19995,Avatar,"[cultureclash, future, spacewar, spacecolony, ..."
1,285,Pirates of the Caribbean: At World's End,"[ocean, drugabuse, exoticisland, eastindiatrad..."


In [30]:
# Collapse each list of tags into one space-separated string.
data['tags'] = data['tags'].apply(" ".join)

In [31]:
# Lower-case the tag strings.
data['tags'] = data['tags'].apply(str.lower)

In [32]:
# Preview the final id / title / tags table.
data.head(2)

Unnamed: 0,id,title,tags
0,19995,Avatar,cultureclash future spacewar spacecolony socie...
1,285,Pirates of the Caribbean: At World's End,ocean drugabuse exoticisland eastindiatradingc...


In [33]:
# Bag-of-words vectorizer capped at 5000 features, using scikit-learn's
# built-in English stop-word list.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [34]:
# Fit the vectorizer on the tags and convert the result to a dense array
# with one row per movie.
vectors = cv.fit_transform(data['tags']).toarray()

In [35]:
# Pairwise cosine similarity between all rows of `vectors`.
similarity = cosine_similarity(vectors)

In [36]:
def recommend(movie, top_n=9):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``data['title']``.
    top_n : int, default 9
        Number of recommendations to print (the previous hard-coded slice
        ``[1:10]`` produced nine).

    Notes
    -----
    ``similarity`` is addressed by row *position*, so the query row is
    located positionally. An index label from ``data`` only equals the
    position while the frame has an untouched 0..n-1 index, which stops
    being true once rows are dropped without resetting the index.
    If the title is unknown a message is printed and nothing else happens.
    """
    positions = np.flatnonzero((data['title'] == movie).to_numpy())
    if positions.size == 0:
        print(f"Movie {movie!r} was not found in the dataset.")
        return
    movie_index = int(positions[0])
    distances = similarity[movie_index]

    # Exclude the query row explicitly instead of assuming it sorts first:
    # another row with an identical score could otherwise take its place.
    ranked = sorted(
        ((idx, score) for idx, score in enumerate(distances) if idx != movie_index),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Map the ranked row positions back to titles.
    for idx, _ in ranked[:top_n]:
        print(data.iloc[idx].title)
        
# Smoke-test the recommender with a known title.
recommend('Iron Man')

Iron Man 2
Iron Man 3
Avengers: Age of Ultron
Captain America: Civil War
Ant-Man
The Avengers
X-Men
X-Men: The Last Stand
Thor: The Dark World


In [39]:
# Persist the artifacts. Each file is opened in a `with` block so the handle
# is flushed and closed deterministically instead of being left to the
# garbage collector.
# NOTE(review): these files are written to the working directory, while the
# loading cell reads from models/ -- confirm which location is intended.
with open('similarity.pkl','wb') as fh:
    pickle.dump(similarity, fh)
with open('movies.pkl','wb') as fh:
    pickle.dump(movies, fh)
with open('movies_data.pkl','wb') as fh:
    pickle.dump(movies_data, fh)

In [38]:
# Reload the saved artifacts. `with` closes each file handle right after
# reading.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
# NOTE(review): this reads from models/, whereas the saving cell writes to
# the working directory -- confirm the files are moved/copied there.
with open('models/movies.pkl', 'rb') as fh:
    movies4 = pickle.load(fh)
with open('models/movies_data.pkl', 'rb') as fh:
    movies_data4 = pickle.load(fh)
with open('models/similarity.pkl', 'rb') as fh:
    similarity4 = pickle.load(fh)