Imports 

In [1]:
import numpy as np
import pandas as pd
import ast
import pickle
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Pre-processing

In [3]:
# Load the two CSV files: movie metadata and the matching credits.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join the two tables on the title, keep only the columns used downstream,
# then drop rows with missing values and exact duplicate rows.
# NOTE(review): joining on 'title' pairs unrelated rows whenever two different
# movies share a title -- confirm whether an id-based key is available.
movies = (
    movies.merge(credits, on='title')[
        ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
    ]
    .dropna()
    .drop_duplicates()
)

# Helpers that turn the stringified list-of-dict columns into lists of names.
def convert(obj):
    """Return the 'name' value of every entry in a stringified list of dicts.

    `obj` is parsed with ast.literal_eval, so it must be the text of a Python
    literal such as "[{'id': 1, 'name': 'Action'}]".
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

def convertCrew(obj):
    """Return the names of all crew entries whose 'job' is 'Director'.

    `obj` is the text of a Python-literal list of dicts, each carrying at
    least the keys 'job' and 'name'; it is parsed with ast.literal_eval.
    """
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member['job'] == 'Director'
    ]

def convertCast(obj, limit=3):
    """Return the names of the first `limit` entries of a stringified cast list.

    Parameters:
        obj: text of a Python-literal list of dicts, each with a 'name' key;
            parsed with ast.literal_eval.
        limit: how many leading entries to keep. Defaults to 3, the value that
            was previously hard-coded; pass None to keep every entry.

    Returns:
        A list of names in their original order (shorter than `limit` when the
        parsed list has fewer entries).
    """
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]


# Parse the stringified list-of-dict columns into plain lists of names.
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convertCast)
movies['crew'] = movies['crew'].apply(convertCrew)

# Split the overview on whitespace so it can be concatenated with the lists.
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Remove the spaces inside each name so a multi-word name stays a single
# token instead of being broken apart when the tags are joined with spaces.
for column in ['genres', 'keywords', 'cast', 'crew']:
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ", "") for name in names]
    )

# Concatenate all token lists into one 'tags' list per movie.
movies['tags'] = (
    movies['overview'] + movies['genres'] + movies['keywords']
    + movies['cast'] + movies['crew']
)

# Keep only the columns needed from here on.
# reset_index(drop=True) matters: rows were removed earlier by
# dropna/drop_duplicates, leaving gaps in the index labels, while everything
# built from new_df['tags'] afterwards is addressed by row position. With a
# fresh 0..n-1 index, labels and positions agree again. It also returns a new
# frame, so the assignment below does not write into a slice of `movies`.
new_df = movies[['movie_id', 'title', 'tags']].reset_index(drop=True)

# Join the tokens into one space-separated, lower-cased string per movie.
new_df['tags'] = new_df['tags'].apply(lambda tokens: " ".join(tokens).lower())

# Stem the words with NLTK's Porter stemmer.

stemmer = PorterStemmer()
def stem(text):
    """Return `text` with every whitespace-separated word stemmed.

    The stemmed words are joined back with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in text.split())


# Replace each tag string with its stemmed version, then preview the frame.
new_df.loc[:, 'tags'] = new_df['tags'].map(stem)

new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


Text Vectorization

In [4]:
# Bag-of-words counts over the tag strings: English stop words are ignored
# and the vocabulary is capped at 5000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=5000,
)
# Convert the sparse result to a dense array, one row per movie.
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Calculating Cosine Similarity 

In [5]:
# Pairwise cosine similarity between all rows of `vectors`:
# similarity[i][j] compares row i with row j.
similarity = cosine_similarity(X=vectors)
# Preview the scores of the second row against every row.
similarity[1]

array([0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
       0.02615329])

Main function for recommendation
    This function does the following:
        1. Find index: locate the row of the input movie title in the DataFrame.
        2. Retrieve similarity: look up the similarity scores for the input movie.
        3. Sort similarity: sort the scores in descending order, excluding the input movie itself.
        4. Extract top similar: take the indices of the 5 most similar movies.
        5. Print recommendations: print the titles of the recommended movies.

In [6]:
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to `movie`.

    Parameters:
        movie: title to look up in new_df['title'] (exact match). If several
            rows share the title, the first one is used.
        top_n: number of recommendations to print. Defaults to 5, the value
            that was previously hard-coded.

    Raises:
        IndexError: if no row of `new_df` has that title. This is the same
            exception type as before, now with an explanatory message.
    """
    # Work with row positions, not index labels: `similarity` is indexed by
    # position, and new_df's labels are not guaranteed to be 0..n-1 after rows
    # have been dropped.
    matches = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie title not found: {movie!r}")
    movie_pos = int(matches[0])

    scores = similarity[movie_pos]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

    # Exclude the query movie by position rather than assuming it sorts first;
    # another row with an identical vector ties at the top score and can
    # precede it.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_n]
    for pos in neighbours:
        print(new_df['title'].iloc[pos])

Exporting the model

In [7]:
# Persist the movie table and the similarity matrix. The context managers make
# sure each file is flushed and closed as soon as it has been written.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new_df, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)