In [1]:
import pandas as pd
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer
# tool for preprocessing text data and converting it into a numerical representation that can be used for machine learning.

from sklearn.metrics.pairwise import cosine_similarity
# used to compute the cosine similarity between two or more vectors.

import pickle
#used  to convert Python objects into a stream of bytes that can be written to a file or sent over a network.

In [2]:

# Colab-only step: mount Google Drive at /content/drive so the dataset stored
# there can be read through an ordinary file path in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the preprocessed movie table from the mounted Drive folder and preview
# its first rows.
dataset_path = "/content/drive/MyDrive/dataset/Movies/Processed Dataset.csv"
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,century paraplegic marine dispatched pandora u...
1,285,Pirates of the Caribbean: At World's End,captain barbossa believed dead headed earth tu...
2,206647,Spectre,cryptic message bonds sends trail uncover sini...
3,49026,The Dark Knight Rises,following death district attorney harvey dent ...
4,49529,John Carter,carter warweary former military captain whos i...


In [4]:
# Bag-of-words encoder: English stop words are dropped and the vocabulary is
# capped at 2000 terms.
cv = CountVectorizer(
    stop_words='english',
    max_features=2000,
)

In [5]:
# Fit the vectorizer on the tag text and turn each movie's tags into a dense
# row of term counts.
tags_column = data['tags']
vector = cv.fit_transform(tags_column).toarray()

Next we measure how far apart the movie vectors are: the greater the distance between two vectors, the smaller their similarity.
There are multiple ways to calculate distance, such as the Euclidean distance (the point-to-point distance, $\sqrt{(x_2-x_1)^2 + (y_2-y_1)^2}$ in two dimensions),
but we will use the cosine distance (based on the angle between the vectors) because our data is very high-dimensional.

Cosine distance is inversely related to similarity (cosine distance = 1 − cosine similarity), so a smaller distance means more similar movies.
We will use the sklearn library, from which we import `cosine_similarity`.

In [6]:
# Store the pairwise cosine similarity between all movie vectors in the
# variable `similarity` (row i holds movie i's similarity to every movie).
similarity = cosine_similarity(vector)

# Using `enumerate` for recommendations

In [7]:
# Take the movie at index 0 as the example and list its five largest
# similarity scores (non-increasing order). Only the scores survive the sort;
# the movie indices they belong to are lost.
sorted(similarity[0], reverse=True)[:5]

[1.0,
 0.3930042298310422,
 0.36305409909144715,
 0.3478327964999673,
 0.33333333333333337]

In [8]:
# To keep the movie indices, pair every score with its position via enumerate
# before sorting. Without a key, the (index, score) tuples are compared by
# index first, so this still does not rank by similarity.
sorted(enumerate(similarity[0]), reverse=True)[:5]

[(4799, 0.0),
 (4798, 0.04667600280093366),
 (4797, 0.03928371006591931),
 (4796, 0.08606629658238704),
 (4795, 0.04536092116265145)]

In [9]:
# Sort the (index, score) pairs by their second element, the score, using a
# lambda as the sort key, so the ranking reflects similarity.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[:5]

[(0, 1.0),
 (1213, 0.3930042298310422),
 (507, 0.36305409909144715),
 (1191, 0.3478327964999673),
 (83, 0.33333333333333337)]

# Declaring model

In [10]:
def recommend(movie, top_n=5, movies=None, scores=None):
    """Return the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 5
        Number of recommendations to return.
    movies : pandas.DataFrame, optional
        Table with a ``title`` column. Defaults to the notebook-level ``data``.
    scores : 2-D array-like, optional
        Pairwise similarity matrix whose row order matches the row order of
        ``movies``. Defaults to the notebook-level ``similarity``.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first. The queried movie itself
        is never included.

    Raises
    ------
    IndexError
        If no row has the requested title (same exception type as before,
        now with a descriptive message).
    """
    if movies is None:
        movies = data
    if scores is None:
        scores = similarity

    # The similarity matrix is addressed by row *position*, so find the
    # position of the first matching title instead of its index label; the two
    # differ whenever the frame does not carry a default 0..n-1 index.
    title_matches = (movies['title'] == movie).to_numpy()
    if not title_matches.any():
        raise IndexError(f"movie {movie!r} was not found in the 'title' column")
    movie_index = int(title_matches.argmax())

    distances = scores[movie_index]
    # enumerate keeps each score paired with its row position through the sort.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # Drop the query by position rather than assuming it sorts first: another
    # movie can tie with it at the top score.
    nearest = [index for index, _ in ranked if index != movie_index][:top_n]
    return movies['title'].iloc[nearest].tolist()

# Predictions

In [11]:
# Spot-check the recommender on a title that contains an apostrophe.
recommend("Pirates of the Caribbean: At World's End")

["Pirates of the Caribbean: Dead Man's Chest",
 'Pirates of the Caribbean: On Stranger Tides',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Life of Pi',
 'The Pirates! In an Adventure with Scientists!']

In [26]:
# Second spot-check of the recommender with a different title.
recommend('Batman Begins')

['The Dark Knight',
 "Amidst the Devil's Wings",
 'Batman',
 'The Dark Knight Rises',
 'Dead Man Down']

# Exporting Data using pickle

In [27]:
import pickle


In [28]:
# Export the whole movie table (all columns of `data`) as a dictionary, in
# binary pickle format. The `with` block closes the file handle so the pickle
# is fully flushed to disk instead of leaving an open handle behind.
with open('movies_dictionary.pkl', 'wb') as movies_file:
    pickle.dump(data.to_dict(), movies_file)

In [29]:
# Export the similarity matrix in binary pickle format. The `with` block
# closes the file handle so the data is fully flushed to disk instead of
# leaving an open handle behind.
with open('similarity_matrix.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)