In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [36]:
df_movie = pd.read_csv('/home/Machine Learning/projects/movie/dataset/n_movies.csv')

In [37]:
train_data, test_data = train_test_split(df_movie, test_size=0.25)
print('Number of training set: ', len(train_data))
print('Number of test set: ', len(test_data))
print(train_data.head(1))

Number of training set:  7467
Number of test set:  2490
                                title year certificate duration   genre  \
6512  My Wife and I Bought A Ranch...  NaN         NaN      NaN  Horror   

      rating                                        description stars votes  
6512     NaN  A young married couple moves onto a secluded r...    []   NaN  


In [38]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(train_data['description'])

In [39]:
tfidf_matrix.shape

(7467, 105654)

In [40]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [42]:
cosine_sim[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [45]:
df_movie = df_movie.reset_index()
titles = df_movie['title']
indices = pd.Series(df_movie.index, index=df_movie['title'])

In [75]:
print(indices.shape, cosine_sim.shape)

(9957,) (7467, 7467)


In [67]:
#get recommendations from cosine similartiy matrix
def get_recommendations(title):
    #get index of movie that matches title
    idx = indices[title]
    #get pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    #sort movies based on similarity scores
    sim_scores = sorted(sim_scores, reverse=True)
    #get scores of 10 most similar movies
    sim_scores = sim_scores[1:11]
    #get movie indices
    movie_indices = [i[0] for i in sim_scores]
    #return top 10 most similar movies
    return df_movie['title'].iloc[movie_indices]

In [79]:
get_recommendations('The Secret Life of My Secretary').head(20)

7465                                         K.O. 3an Guo
7464                                  Your Life Is a Joke
7463                    Cuba's Long Shadow of Remembrance
7462    What 16 Movies Looked Like Behind the Scenes i...
7461                                  Amy and the Orphans
7460                                        Stunt Science
7459                         River, El Más Grande Siempre
7458              One Piece: Entering into the Grand Line
7457                         Bigflo & Oli: Hip Hop Frenzy
7456                                Fary Is the New Black
Name: title, dtype: object