In [1]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [2]:
## Import two datafile and merge into one file 
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')
merge = movies.merge(credits, on='title')

In [3]:
## Extract usefull information from the data frame
merge = merge[['movie_id', 'title', 'overview', 'genres', 'keywords','cast','crew']]
merge.info()
merge.dropna(inplace=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4809 non-null   int64 
 1   title     4809 non-null   object
 2   overview  4806 non-null   object
 3   genres    4809 non-null   object
 4   keywords  4809 non-null   object
 5   cast      4809 non-null   object
 6   crew      4809 non-null   object
dtypes: int64(1), object(6)
memory usage: 300.6+ KB


In [4]:
merge.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [5]:
merge.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [6]:
## Print out the genres format
merge.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [7]:
## Print out the keywords format
merge.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [8]:
## Convert {"id": 1463, "name": "culture clash"} => culture clash
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [9]:
## Apply convert into genres and keywords
merge['genres'] = merge['genres'].apply(convert)
merge['keywords'] = merge['keywords'].apply(convert)

In [10]:
# Get the top-3 actor in the movie 
def convert_top_3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

# Get the director of the movie
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
                L.append(i['name'])
                break
    return L

In [11]:
merge['cast'] = merge['cast'].apply(convert_top_3)
merge['crew'] = merge['crew'].apply(fetch_director)

In [12]:
merge['overview'] = merge['overview'].apply(lambda x:x.split())
merge['genres'] = merge['genres'].apply(lambda x:[i.replace(" ","") for i in x])
merge['keywords'] = merge['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
merge['cast'] = merge['cast'].apply(lambda x:[i.replace(" ","") for i in x])
merge['crew'] = merge['crew'].apply(lambda x:[i.replace(" ","") for i in x])


In [13]:
merge['tags'] = merge['overview'] + merge['genres'] + merge['keywords'] + merge['cast'] + merge['crew'] 
new_df = merge[['movie_id','title','tags']]

In [14]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [15]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [16]:
counter = CountVectorizer(max_features=5000,stop_words='english')
vectors = counter.fit_transform(new_df['tags']).toarray()


In [17]:
counter.fit_transform(new_df['tags']).toarray().shape

(4806, 5000)

In [18]:
# counter.get_feature_names()

In [19]:
# Creating a PorterStemmer
ps = PorterStemmer()
# Function for stemming
def stem(text):
    y= []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

new_df['tags'] = new_df['tags'].apply(stem) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [20]:
vectors = counter.fit_transform(new_df['tags']).toarray()
similarity = cosine_similarity(vectors)

In [21]:
similarity.shape

(4806, 4806)

In [22]:
# sorted(similarity[0], reverse=True)

In [23]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:6]

[(1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [24]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]

    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movie_list:

        print(new_df.iloc[i[0]].title)

In [25]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [26]:
def recommend3(movies):
    movie_list = []
    for i in movies:
        movie_index = new_df[new_df['title'] == i].index[0]
        distances = similarity[movie_index]
        recommended = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:8]
        for j in recommended:
            movie_list.append(j)
            print(new_df.iloc[j[0]].title)

In [27]:
recommend3(["Avatar","Spectre","The Dark Knight Rises"])

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.
Battle: Los Angeles
Predators
Quantum of Solace
Skyfall
Never Say Never Again
From Russia with Love
Octopussy
Diamonds Are Forever
Thunderball
The Dark Knight
Batman Returns
Batman
Batman Forever
Batman Begins
Batman
Batman & Robin


In [28]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl','wb'))

In [29]:
# new_df.to_dict()

In [30]:
pickle.dump(similarity,open('similarity.pkl','wb'))