In [4]:
import numpy as np
import pandas as pd

In [5]:
movies = pd.read_csv('datasets/tmdb_5000_movies.csv')
credits = pd.read_csv('datasets/tmdb_5000_credits.csv')

In [6]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [7]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
##Data Preprocessing

In [9]:
data = movies.merge(credits, on='title')

In [10]:
data.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [11]:
# Useful features
# budget
# homepage
# id
# original_language
# original_title
# popularity
# production_comapny
# production_countries
# release-date(not sure)

In [12]:
data = data[['movie_id','title','overview','genres','keywords','cast','crew']]

In [13]:
data.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
data.dropna(inplace=True)

In [15]:
data.duplicated().sum()

0

In [16]:
import ast
def format(dict, total=float('inf')):

    if not isinstance(dict, str):
        return dict
    L = []
    for i, item in enumerate(ast.literal_eval(dict)):
        L.append(item['name']) 
        if i>=total:
            break
    return L 

In [17]:
data['genres'] = data['genres'].apply(format)
data.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [18]:
data['keywords'] = data['keywords'].apply(format)

In [19]:
data.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [20]:
#keep first 3 cast
data['cast'] = data['cast'].apply(lambda x: format(x, total=3))
data.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [21]:
def fetch_director(text):
    if not isinstance(text, str):
        return text
    L = []
    for item in ast.literal_eval(text):
        if item['job'] == 'Director':
            L.append(item['name'])
    return L 

In [22]:
data['crew'] = data['crew'].apply(fetch_director)
data.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]


In [23]:
data['overview'] = data['overview'].apply(lambda x:x.split())

In [24]:
data['genres'] = data['genres'].apply(lambda x:[ w.replace(" ", "") for w in x ])
data['keywords'] = data['keywords'].apply(lambda x:[ w.replace(" ", "") for w in x ])
data['cast'] = data['cast'].apply(lambda x:[ w.replace(" ", "") for w in x ])
data['crew'] = data['crew'].apply(lambda x:[ w.replace(" ", "") for w in x ])

In [25]:
data.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]


In [26]:
data['tags'] = data['overview'] + data['genres'] + data['keywords'] + data['cast'] + data['crew']


In [27]:
new_data = data[['movie_id','title', 'tags']].copy()

In [28]:
new_data['tags'] = new_data['tags'].apply(lambda x: " ".join(x))
new_data['tags'] = new_data['tags'].apply(lambda x: x.lower())

In [29]:
new_data['tags']

0       in the 22nd century, a paraplegic marine is di...
1       captain barbossa, long believed to be dead, ha...
2       a cryptic message from bond’s past sends him o...
3       following the death of district attorney harve...
4       john carter is a war-weary, former military ca...
                              ...                        
4804    el mariachi just wants to play his guitar and ...
4805    a newlywed couple's honeymoon is upended by th...
4806    "signed, sealed, delivered" introduces a dedic...
4807    when ambitious new york attorney sam is sent t...
4808    ever since the second grade when he first saw ...
Name: tags, Length: 4806, dtype: object

In [30]:
##Vectorization

In [31]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [32]:
def stem(text):
    y = []
    for t in text.split():
        y.append(ps.stem(t))
    return " ".join(y)             



In [33]:
stem('loves loved')

'love love'

In [34]:
new_data['tags'] = new_data['tags'].apply(stem)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [36]:
vector = cv.fit_transform(new_data['tags']).toarray()

In [37]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
similarity = cosine_similarity(vector)

In [41]:
similarity.shape


(4806, 4806)

In [49]:
def get_similar_movies(movie_name):
    index = new_data[new_data['title'] == movie_name].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for movie in distances[1:6]:
        print(new_data.iloc[movie[0]].title)

In [54]:
get_similar_movies('Avatar')

Aliens vs Predator: Requiem
Falcon Rising
Independence Day
Titan A.E.
Battle: Los Angeles


In [51]:
import pickle


In [53]:
pickle.dump(new_data,open('movie_list.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))