In [13]:
import pandas as pd
from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# load the IMDB data set and keep only the two columns the recommender uses
df = pd.read_csv('data/imdb_movie_data.csv')[['title', 'description']]

df.head()

Unnamed: 0,title,description
0,Guardians of the Galaxy,A group of intergalactic criminals are forced ...
1,Prometheus,"Following clues to the origin of mankind, a te..."
2,Split,Three girls are kidnapped by a man with a diag...
3,Sing,"In a city of humanoid animals, a hustling thea..."
4,Suicide Squad,A secret government agency recruits some of th...


In [14]:
def extract_key_words(text):
    """Return the RAKE key words found in ``text`` as a list of strings.

    A fresh ``Rake`` instance is created for every call, so no state is
    shared between descriptions. With its default settings Rake uses the
    English stop words from NLTK and discards punctuation characters.

    Parameters
    ----------
    text : str
        The plot description to analyse.

    Returns
    -------
    list
        The keys of Rake's word-degree dictionary, i.e. every key word of
        ``text``. An empty list is returned when ``text`` is not a string
        (for example NaN for a missing description), so such rows simply
        contribute no key words instead of aborting the whole run.
    """
    if not isinstance(text, str):
        return []

    rake = Rake()
    rake.extract_keywords_from_text(text)

    # get_word_degrees() maps each key word to its score; only the words are kept
    return list(rake.get_word_degrees().keys())


# Build the column in a single assignment. Writing to `row` inside
# df.iterrows() is not guaranteed to reach df, because iterrows may hand
# back copies of the rows rather than views.
df['Key_words'] = df['description'].apply(extract_key_words)

df.head()

Unnamed: 0,title,description,Key_words
0,Guardians of the Galaxy,A group of intergalactic criminals are forced ...,"[group, forced, fanatical, warrior, work, toge..."
1,Prometheus,"Following clues to the origin of mankind, a te...","[team, finds, soon, realize, alone, following,..."
2,Split,Three girls are kidnapped by a man with a diag...,"[frightful, new, 24th, diagnosed, 23, distinct..."
3,Sing,"In a city of humanoid animals, a hustling thea...","[lives, attempt, hustling, theater, impresario..."
4,Suicide Squad,A secret government agency recruits some of th...,"[villains, first, mission, secret, government,..."


In [15]:
# The vectorizer is given an identity analyzer: every entry of Key_words is
# already a list of tokens, so each list is passed through unchanged.
count = CountVectorizer(analyzer=lambda tokens: tokens)

# preview of the token lists that will be counted
df['Key_words']

0      [group, forced, fanatical, warrior, work, toge...
1      [team, finds, soon, realize, alone, following,...
2      [frightful, new, 24th, diagnosed, 23, distinct...
3      [lives, attempt, hustling, theater, impresario...
4      [villains, first, mission, secret, government,...
                             ...                        
995    [one, teenage, daughters, along, knit, team, s...
996    [three, american, college, students, studying,...
997    [arts, two, dance, students, maryland, school,...
998    [reunite, going, woman, pal, marry, pair, frie...
999    [trapped, inside, stuffy, businessman, finds, ...
Name: Key_words, Length: 1000, dtype: object

In [18]:
# learn the vocabulary from the key-word lists and count the words of every movie
count_matrix = count.fit_transform(df['Key_words'])

In [19]:
# pairwise cosine similarity of every movie with every other movie
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(count_matrix)

In [20]:
# Row labels of df in positional order: entry i of this Series belongs to
# row i of cosine_sim, which is how a query is translated into a matrix row.
indices = pd.Series(df.index)


def recommendations(title, cosine_sim = cosine_sim):
    """Return the 10 movies most similar to the requested movie.

    Parameters
    ----------
    title : row label or str
        Either a row label of ``df`` (the lookup this function has always
        performed) or a movie title from ``df['title']``. Row labels are
        tried first; the title column is only consulted when no row label
        matches. If several rows match, the first one is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``df``. The default is bound when this cell is run, so
        re-run the cell after recomputing the global ``cosine_sim``.

    Returns
    -------
    list
        Up to 10 entries ordered from most to least similar: row labels
        when the movie was found by row label, titles when it was found
        by title. The queried movie itself is never included.

    Raises
    ------
    IndexError
        If ``title`` matches neither a row label nor a movie title.
    """
    # first interpretation: the query is a row label of df
    label_hits = indices[indices == title]
    if not label_hits.empty:
        idx = label_hits.index[0]
        names = list(df.index)
    else:
        # second interpretation: the query is a movie title; the values are
        # re-wrapped so the Series index is the row position, whatever
        # labels df.index holds
        all_titles = pd.Series(list(df['title']))
        title_hits = all_titles[all_titles == title]
        if title_hits.empty:
            raise IndexError(
                "no movie with row label or title {!r}".format(title)
            )
        idx = title_hits.index[0]
        names = list(all_titles)

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # Remove the queried movie explicitly instead of assuming it sorted to
    # the top: on ties (or an all-zero key-word vector) it may not be first.
    top_10_indexes = list(score_series.drop(idx).iloc[:10].index)

    # translating the matrix positions back into row labels / titles
    return [names[i] for i in top_10_indexes]

In [21]:
# the argument is matched against the row labels of df, so 200 selects the row labelled 200
recommendations(200)

[699, 922, 120, 115, 483, 521, 52, 616, 398, 849]