In [1]:
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [2]:
# Single Porter stemmer instance, reused by stem() for every token.
ps = PorterStemmer()

def stem(text):
    """Return ``text`` with each whitespace-separated token stemmed.

    Tokens come from ``str.split()`` (any run of whitespace), are passed one
    by one to the module-level ``ps`` stemmer, and are re-joined with single
    spaces. Punctuation attached to a token is passed to the stemmer as-is.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [3]:
# Load the movie table from final_data.csv; later cells read its 'title' and 'tags' columns.
data = pd.read_csv("final_data.csv")

In [4]:
# Replace each tag string with its stemmed form. This overwrites the column,
# so re-running the cell stems text that has already been stemmed.
data['tags'] = data['tags'].map(stem)

In [5]:
# Show the stemmed tags of the row labelled 0.
data.loc[0, 'tags']

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [6]:
# Converting the tags into vectors.
# max_features=5000 keeps only the 5000 most frequent terms across the corpus;
# stop_words='english' removes words on scikit-learn's built-in English stop-word list.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [7]:
# Learn the vocabulary from the tags and build the per-row term-count matrix;
# .toarray() turns the sparse result into a dense NumPy array.
vectors = cv.fit_transform(data['tags']).toarray()

In [8]:
# Inspect the dense count matrix (one row per entry of data['tags']).
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [9]:
# Preview the first 20 vocabulary terms learned by the vectorizer.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. The hasattr
# fallback keeps the cell working on older releases, and str() keeps the
# displayed list made of plain Python strings in both cases.
[
    str(name)
    for name in (
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    )[:20]
]

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '17th',
 '18',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940']

In [10]:
# Checking cosine similarity of movies with each other.
# Called with a single matrix, cosine_similarity compares every row with every
# row, so similarity[i][j] is the score between rows i and j of `vectors`.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [11]:
# (row position, score) pairs for row 0, best first; the slice skips the
# top-ranked entry and keeps the next five.
sorted(enumerate(similarity[0]), key=lambda pair: pair[1], reverse=True)[1:6]

[(1216, 0.28676966733820225),
 (2409, 0.26901379342448517),
 (3730, 0.2605130246476754),
 (507, 0.255608593705383),
 (539, 0.25038669783359574)]

In [28]:
def recommend_movie(movie):
    """Print the titles of the five movies most similar to ``movie``.

    The title is matched case-insensitively against ``data['title']`` so the
    lookup works whether or not the titles have been lower-cased. When several
    rows share the title, the first one is used.

    Parameters
    ----------
    movie : str
        Title of the movie to find recommendations for.

    Raises
    ------
    IndexError
        If no row of ``data`` has a matching title (same exception type the
        plain ``.index[0]`` lookup would raise, but with a readable message).
    """
    wanted = str(movie).lower()

    # Work with the row *position* (not the index label) because both
    # `similarity` and `.iloc` are positional.
    movie_pos = next(
        (
            pos
            for pos, title in enumerate(data['title'])
            if isinstance(title, str) and title.lower() == wanted
        ),
        None,
    )
    if movie_pos is None:
        raise IndexError("movie {!r} not found in data['title']".format(movie))

    distances = similarity[movie_pos]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie explicitly instead of assuming it sorts first:
    # a duplicate entry with an equal score can be ranked ahead of it.
    top_positions = [pos for pos, _ in ranked if pos != movie_pos][:5]

    for pos in top_positions:
        print(data['title'].iloc[pos])
         
    

In [29]:
# Example query: print the recommendations for 'Avatar'.
recommend_movie('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [38]:
# Lower-case every title in place. From here on, case-sensitive comparisons
# against data['title'] only match lower-case text (e.g. 'batman', not 'Batman').
data['title'] = data['title'].str.lower()

In [67]:
# Preview the first ten rows of the movie table.
data.head(10)

Unnamed: 0,movie_id,title,tags
0,19995,avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,pirates of the caribbean: at world's end,"captain barbossa, long believ to be dead, ha c..."
2,206647,spectre,a cryptic messag from bond’ past send him on a...
3,49026,the dark knight rises,follow the death of district attorney harvey d...
4,49529,john carter,"john carter is a war-weary, former militari ca..."
5,559,spider-man 3,the seemingli invinc spider-man goe up against...
6,38757,tangled,when the kingdom' most wanted-and most charmin...
7,99861,avengers: age of ultron,when toni stark tri to jumpstart a dormant pea...
8,767,harry potter and the half-blood prince,"as harri begin hi sixth year at hogwarts, he d..."
9,209112,batman v superman: dawn of justice,fear the action of a god-lik super hero left u...


In [40]:
import pickle

# Persist the movie table as a plain dict. The `with` block guarantees the
# file is flushed and closed even if pickling fails; a bare open() inside the
# call leaves the handle open until garbage collection.
with open('movies_dict.pkl', 'wb') as movies_file:
    pickle.dump(data.to_dict(), movies_file)

In [41]:
# Persist the similarity matrix. The `with` block guarantees the file is
# flushed and closed even if pickling fails; a bare open() inside the call
# leaves the handle open until garbage collection.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [66]:
# movie_id of the first row whose title is exactly 'batman'.
data.loc[data['title'].eq('batman')].iloc[0]['movie_id']

268

In [57]:
# Index labels of every row whose title is exactly 'batman'.
data.loc[data['title'].eq('batman')].index

Int64Index([1362, 1363, 1364, 1365], dtype='int64')