In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the two raw TMDB 5000 tables; both CSV files are read from the working directory.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies= movies.merge(credits,on='title')

In [5]:
# all columns
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [6]:
# Keep only the columns that feed the recommendation tags (plus the id and title).
to_keep = ['genres', 'movie_id', 'keywords', 'title', 'overview', 'release_date', 'cast', 'crew']
movies = movies[to_keep]

In [7]:
movies.head(1)

Unnamed: 0,genres,movie_id,keywords,title,overview,release_date,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [8]:
# count missing values per column (this cell only reports them; rows are dropped in a later cell)
movies.isnull().sum()

genres          0
movie_id        0
keywords        0
title           0
overview        3
release_date    1
cast            0
crew            0
dtype: int64

In [9]:
movies.dropna(inplace=True)

In [10]:
# simplifying the genres, keywords, cast and crew columns

In [11]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
# helper function
import ast
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval; each parsed entry must support
    entry['name']. The names are returned in their original order.
    """
    parsed_entries = ast.literal_eval(obj)
    return [entry['name'] for entry in parsed_entries]

In [13]:
movies['genres']= movies['genres'].apply(convert)

In [14]:
movies['keywords']= movies['keywords'].apply(convert)

In [15]:
def convert3(obj, limit=3):
    """Return the 'name' of the first `limit` entries in a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text that ast.literal_eval can parse into an iterable of mappings,
        each supporting entry['name'].
    limit : int, default 3
        Maximum number of names to return. The default keeps the original
        behaviour (first three entries); 0 yields an empty list.

    Returns
    -------
    list
        Up to `limit` names, in their original order.
    """
    names = []
    for entry in ast.literal_eval(obj):
        # Stop before reading further entries once enough names are collected.
        if len(names) >= limit:
            break
        names.append(entry['name'])
    return names

In [16]:
movies['cast']= movies['cast'].apply(convert3)

In [17]:
def fetch_director(obj):
    """Return the names of all crew entries whose 'job' is exactly 'Director'.

    `obj` is parsed with ast.literal_eval; every parsed entry must have a
    'job' key, and matching entries must also have a 'name' key.
    """
    crew_members = ast.literal_eval(obj)
    return [member['name'] for member in crew_members if member['job'] == 'Director']

In [18]:
movies['crew']= movies['crew'].apply(fetch_director)

In [19]:
movies['overview']= movies['overview'].apply(lambda x: x.split())

In [20]:
# removing spaces for unique tags
movies['genres']= movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords']= movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['overview']= movies['overview'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast']= movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [21]:
movies['crew']= movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [22]:
# merging multiple columns into a tag column
movies['tags']= movies['overview']+ movies['genres']+ movies['keywords']+ movies['cast']+ movies['crew']

In [23]:
# converting list to string
movies['tags']= movies['tags'].apply(lambda x: " ".join(x))

In [24]:
movies['tags']= movies['tags'].apply(lambda x: x.lower())

In [25]:
new_df= movies[['movie_id','title','tags']]

In [26]:
new_df

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


In [27]:
# stem the tags column so that different inflections of a word are treated as the same token
import nltk
from nltk.stem.porter import PorterStemmer
# single shared stemmer instance; used by the stem() helper defined below
ps= PorterStemmer()

In [28]:
def stem(text):
    """Stem every whitespace-separated token of `text` with the module-level `ps`.

    The stemmed tokens are joined back together with single spaces, so runs of
    whitespace in the input collapse to one space in the result.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [29]:
new_df['tags']= new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']= new_df['tags'].apply(stem)


In [30]:
# using the CountVectorizer class from scikit-learn to count occurrences of the most common words in each movie's tags (bag of words)
# text vectorization: keep at most 5000 features and drop English stop words
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features= 5000,stop_words='english')

In [31]:
vectors= cv.fit_transform(new_df['tags']).toarray() 

In [32]:
vectors.shape

(4805, 5000)

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
similarity_matrix= cosine_similarity(vectors)

In [35]:
# holds the cosine similarity between each movie and every other movie
# the smaller the cosine distance, the greater the similarity (between 0 and 1 here, since count vectors are non-negative)
similarity_matrix.shape

(4805, 4805)

In [36]:
def recommend(movie, top_n=5):
    """Print the titles of the `top_n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the module-level `new_df['title']` column.
        If several rows share the title, the first one is used.
    top_n : int, default 5
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of `new_df` has the given title.
    """
    # Look the movie up by row POSITION, not by index label. new_df inherits its
    # index from a frame that had rows dropped, so labels can have gaps, while
    # similarity_matrix is addressed purely by position. Using the label here
    # would read the wrong row or run past the end of the matrix.
    matches = (new_df['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title not found in new_df: {movie!r}")
    movie_pos = int(matches[0])

    distances = similarity_matrix[movie_pos]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])

    # Exclude the query movie explicitly rather than assuming it sorts first:
    # another row with an equal top score could otherwise take its place.
    similar_mov_list = [pair for pair in ranked if pair[0] != movie_pos][:top_n]

    for position, _score in similar_mov_list:
        print(new_df.iloc[position].title)
        

In [38]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [39]:
import pickle

In [40]:
pickle.dump(new_df,open('movies.pkl','wb'))

In [42]:
pickle.dump(similarity_matrix,open('similarity_matrix.pkl','wb'))