In [56]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
df = pd.read_csv('Dataset/final_dataset.csv')

In [58]:
df.shape

(4806, 4)

In [59]:
df.tail()

Unnamed: 0.1,Unnamed: 0,movie_id,title,tags
4801,4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4802,4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4803,4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4804,4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...
4805,4808,25975,My Date with Drew,ever since the second grade when he first saw ...


In [37]:
df['title'].unique

<bound method Series.unique of 0                                         Avatar
1       Pirates of the Caribbean: At World's End
2                                        Spectre
3                          The Dark Knight Rises
4                                    John Carter
                          ...                   
4801                                 El Mariachi
4802                                   Newlyweds
4803                   Signed, Sealed, Delivered
4804                            Shanghai Calling
4805                           My Date with Drew
Name: title, Length: 4806, dtype: object>

In [8]:

"""
Need to apply stemming
dancing , dance, danc

"""
import nltk

In [9]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [10]:
def stemming(text):
    
    y = []
    
    for i in text.split():
        y.append(ps.stem(i))
        
    return " ".join(y)

In [11]:
df['tags'] = df['tags'].apply(stemming)

In [12]:
# used Bag of words technique using Countvectorisor
cv = CountVectorizer(max_features=5000, stop_words='english')

In [13]:
# fit_transform gives you a sparse matrix
sparse_matrix = cv.fit_transform(df['tags'])
# convert to numpy array
vectors = sparse_matrix.toarray()

In [14]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [15]:
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1940',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1970',
 '1970s',
 '1980',
 '1985',
 '1990',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '70',
 '80',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'abandon',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'aboard',
 'abov',
 'abus',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'act',
 'action',
 'actionhero',
 'activ',
 'activist',
 'activities',
 'actor',
 'actress',
 'actual',
 'adam',
 'adambrodi',
 'adamsandl',
 'adamscott',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrienbrodi',
 'adult',
 'adultanim',
 'adulteri',
 'adulthood',
 'advanc',
 'adventur',

In [16]:
# We have to calculate the cosin distance, not the eucleadian distance when we deal with high dimensionality
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(vectors)

In [17]:
similarity_matrix[0]

array([1.        , 0.08006408, 0.08492078, ..., 0.04441156, 0.        ,
       0.        ])

In [18]:
m_list = sorted(list(enumerate(similarity_matrix[0])),key = lambda x: x[1], reverse=True)[1:6]
for i in m_list:
    print(df['title'][i[0]])

Aliens vs Predator: Requiem
Independence Day
Falcon Rising
Battle: Los Angeles
Titan A.E.


### Start Recommendation 

In [47]:
movie_index = df[df['title'] == "My Date with Drew"].index[0]
print(movie_index)

4805


In [20]:
def recommend(movie):
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity_matrix[movie_index]
    movies_list = sorted(list(enumerate(distances)),key = lambda x: x[1], reverse=True)[1:6]
    
    for i in movies_list:
        print(df['title'][i[0]])

In [38]:
recommend('My Date with Drew')

Bad Grandpa
How to Fall in Love
The R.M.
Time Changer
After Earth
