## Recommender: Overview of Movie Synopsis

This notebook uses the 'overview' column, which is concise description of the movie synopsis, to make a recommender. It uses a TfidfVectorizer to capture infrequently used words and weigh them more heavily (in contrast to CountVectorizer).

In [89]:
###############
### IMPORTS ###
###############

import pandas as pd
import numpy as np

import matplotlib as plt
import seaborn as sns


import re
import string


from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split


import nltk
from nltk import word_tokenize
import nltk
from nltk import word_tokenize
from sklearn import svm
from nltk.tag import StanfordNERTagger
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words
from nltk.tag import pos_tag
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [106]:
# Reference: http://zwmiller.com/projects/nlp_pipeline.html
# Reference: https://github.com/ZWMiller/nlp_pipe_manager/blob/master/nlp_pipeline_manager/nlp_preprocessor.py
# Reference: https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

class nlp_pipe:
    
    # Initialize the class
    def __init__(self, vectorizer, stemmer, lemmatizer, tokenizer, dataframe, column='Title'):
        self.vectorizer = vectorizer
        self.tokenizer = tokenizer
        self.lemmatizer = lemmatizer
        self.stemmer = stemmer
        self.dataframe = dataframe
        self.column = column
        self.dataframe[self.column] = self.dataframe[self.column].apply(str)
    
    ######################################################################
    
    # Create a cleaning method (aka fit) that will use several functions in order
    def cleaner(self):
        self.vader_sentiment()
        self.dataframe = self._remove_numbers(self.dataframe, self.column)
        self.dataframe = self._punctuation(self.dataframe, self.column)
        self.dataframe = self._dropduplicates(self.dataframe, self.column)
        self.real_words() # Check if it's a real word and then remove if not
        #self.remove_single_letter() # Remove single letter words
        #self.autocorrect() # Takes a very long time to run
        self.tokenize_words()
        self.remove_short_headlines() # Remove headline if only one word
        #self.lemmatize_words()
        #self.stem_words()
        #self.named_entities()
        self.dataframe = self._join_words(self.dataframe, self.column)
        #self.dataframe[self.column] = self.dataframe[self.column].replace('', np.nan,)
        #self.dataframe.dropna(subset=[self.column], inplace=True)
    
    ########## Functions that 'cleaner' will call ##########
    @staticmethod
    def _remove_numbers(dataframe, column):       
        # Removes all words containing numbers
        remove_numbers = lambda x: re.sub('\w*\d\w*', '', x)
        dataframe[column] = dataframe[column].map(remove_numbers)
        return dataframe
        
    @staticmethod
    def _punctuation(dataframe, column):
        # Removes punctuation marks
        punc_lower = lambda x: re.sub('[^A-Za-z0-9]+', ' ', x)
        dataframe[column] = dataframe[column].map(punc_lower)
        return dataframe
        
    @staticmethod
    def _dropduplicates(dataframe, column):
        # Drop rows that have duplicate 'Titles'
        dataframe.drop_duplicates(subset=column, keep='first', inplace=True)
        return dataframe
    
    @staticmethod
    def _join_words(dataframe, column):
        # Joins words together with space (' ')--used after tokenization
        join_words = lambda x: ' '.join(x)
        dataframe[column] = dataframe[column].map(join_words)
        return dataframe
    
    def vader_sentiment(self):
        analyzer = SentimentIntensityAnalyzer()
        self.dataframe['Positive_Sentiment'] = self.dataframe.apply(lambda x: analyzer.polarity_scores(x[self.column])['pos'], axis=1)
        self.dataframe['Negative_Sentiment'] = self.dataframe.apply(lambda x: analyzer.polarity_scores(x[self.column])['neg'], axis=1)
        self.dataframe['Neutral_Sentiment'] = self.dataframe.apply(lambda x: analyzer.polarity_scores(x[self.column])['neu'], axis=1)
        self.dataframe['Compound_Sentiment'] = self.dataframe.apply(lambda x: analyzer.polarity_scores(x[self.column])['compound'], axis=1)
        
    def tokenize_words(self):
        self.dataframe[self.column] = self.dataframe.apply(lambda x: self.tokenizer(x[self.column]), axis=1)
    
    def stem_words(self):
        self.dataframe[self.column] = self.dataframe.apply(lambda x: [self.stemmer.stem(word) for word in x[self.column]], axis=1)
                                                           
    def lemmatize_words(self):
        self.dataframe[self.column] = self.dataframe.apply(lambda x: [self.lemmatizer.lemmatize(word) for word in x[self.column]], axis=1)
        
    def named_entities(self):
        st = StanfordNERTagger('/usr/share/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                               '/usr/share/stanford-ner/stanford-ner.jar',
                               encoding='utf-8')
        self.dataframe[self.column] = self.dataframe.apply(lambda x: st.tag(x[self.column]), axis=1)
        
    def real_words(self):
        # Removes words that are not within the nltk.corpus library
        words = set(nltk.corpus.words.words())
        self.dataframe[self.column] = self.dataframe.apply(lambda x: \
        " ".join(w for w in nltk.wordpunct_tokenize(x[self.column]) if w.lower() in words or not w.isalpha()), axis=1)
        
    def remove_single_letter(self):
        # Removes words that are 1 letter
        self.dataframe[self.column] = self.dataframe.apply(lambda x: ' '.join([w for w in x[self.column].split() if len(w)>2]), axis=1)
        
    def remove_short_headlines(self, min_length=3):
        # Removes headlines that are less than 3 words
        self.dataframe['headline_length'] = self.dataframe.apply(lambda x: len(x[self.column]), axis=1)
        self.dataframe = self.dataframe[self.dataframe['headline_length'] > min_length]
        self.dataframe = self.dataframe.drop(columns='headline_length')
        self.dataframe.reset_index(drop=True)
        
    def remove_headlines_specific_words(self):
        self.dataframe = self.dataframe[~self.dataframe[self.column].str.contains('onion')]
        self.dataframe = self.dataframe[~self.dataframe[self.column].str.contains('Onion')]
        
    def autocorrect(self):
        # Autocorrects words based on Levenshtein distance (takes __ minutes to run)
        self.dataframe[self.column] = self.dataframe.apply(lambda x: ''.join(TextBlob(x[self.column]).correct()), axis=1)

In [107]:
df_all = pd.read_csv('data/dataframe_merged.csv')

In [108]:
print('Shape of dataframe: ', df_all.shape)
print('Columns of dataframe: ', df_all.columns)

Shape of dataframe:  (46628, 28)
Columns of dataframe:  Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'director'],
      dtype='object')


In [109]:
df = pd.read_csv('data/dataframe_merged_small.csv', usecols=['id', 'title', 'overview', 'tagline'])

In [110]:
print('Shape of dataframe: ', df.shape)
print('Columns of dataframe: ', df.columns)

Shape of dataframe:  (10876, 4)
Columns of dataframe:  Index(['id', 'overview', 'tagline', 'title'], dtype='object')


In [111]:
df

Unnamed: 0,id,overview,tagline,title
0,862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story
1,8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji
2,15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,949,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,Heat
4,11860,An ugly duckling having undergone a remarkable...,You are cordially invited to the most surprisi...,Sabrina
...,...,...,...,...
10871,19307,Sid and Bernie keep having their amorous inten...,Fun and games in the great outdoors!,Carry On Camping
10872,18098,Scheherezade puts herself in danger to save Su...,"When Night Falls, the Adventure Begins!",Arabian Nights
10873,52103,Little pocket thief Wu never got away from the...,,Pickpocket
10874,455661,A closeted boy runs the risk of being outed by...,The Heart Wants What The Heart Wants,In a Heartbeat


In [112]:
# Replace NaN with empty strings
df['overview'] = df['overview'].replace(np.nan, '', regex=True)
df['tagline'] = df['tagline'].replace(np.nan, '', regex=True)

In [113]:
# Join [overview] and [keywords] together
# These two columns are synopsis-associated and it's sensible to join them together
df['overview_and_tagline'] = df['overview'] + df['tagline']

In [114]:
df['overview_and_tagline'].loc[0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

### Clean the text using NLP class functions

In [101]:
# Clean the text using nlp_pipelines class
nlp = nlp_pipe(dataframe = df,
               column = 'overview_and_tagline',
               tokenizer = nltk.word_tokenize,
               vectorizer = TfidfVectorizer(stop_words='english'),
               stemmer = SnowballStemmer("english"),
               lemmatizer = WordNetLemmatizer())

In [102]:
# nlp.cleaner()

In [105]:
df

Unnamed: 0,id,overview,tagline,title,overview_and_tagline,Positive_Sentiment,Negative_Sentiment,Neutral_Sentiment,Compound_Sentiment,headline_length
0,862,"Led by Woody, Andy's toys live happily in his ...",,Toy Story,"[Led, Woody, live, happily, his, room, until, ...",0.091,0.034,0.875,0.4767,35
1,8844,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji,"[When, and, Peter, discover, board, game, that...",0.161,0.144,0.695,0.1260,55
2,15602,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[family, wedding, the, ancient, feud, between,...",0.157,0.132,0.710,0.7105,45
3,949,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga,Heat,"[Obsessive, master, thief, top, notch, crew, v...",0.068,0.207,0.725,-0.8555,43
4,11860,An ugly duckling having undergone a remarkable...,You are cordially invited to the most surprisi...,Sabrina,"[ugly, duckling, remarkable, change, still, fo...",0.158,0.076,0.766,0.5844,28
...,...,...,...,...,...,...,...,...,...,...
10871,19307,Sid and Bernie keep having their amorous inten...,Fun and games in the great outdoors!,Carry On Camping,"[and, keep, their, amorous, snubbed, their, an...",0.217,0.048,0.735,0.9472,46
10872,18098,Scheherezade puts herself in danger to save Su...,"When Night Falls, the Adventure Begins!",Arabian Nights,"[herself, danger, save, Sultan, her, childhood...",0.171,0.277,0.551,-0.7500,24
10873,52103,Little pocket thief Wu never got away from the...,,Pickpocket,"[Little, pocket, thief, never, got, away, from...",0.145,0.191,0.664,-0.2882,37
10874,455661,A closeted boy runs the risk of being outed by...,The Heart Wants What The Heart Wants,In a Heartbeat,"[boy, the, risk, being, outed, his, own, heart...",0.282,0.047,0.671,0.9100,22


### Make the word2vec model and train on df['overview_and_tagline']

In [115]:
train, test = train_test_split(df, test_size=.20, random_state=42)

In [117]:
#Tag train set
tagged_tr = [TaggedDocument(words=word_tokenize(_d.lower()),
tags = [str(i)]) for i, _d in enumerate(train.overview_and_tagline)]

In [121]:
#Tag test set
tagged_test = [TaggedDocument(words=word_tokenize(_d.lower()),
tags=[str(i)]) for i, _d in enumerate(test.overview_and_tagline)]

In [122]:
model = Doc2Vec(vector_size=100,
                window=5, 
                alpha=.025, 
                min_alpha=0.00025, 
                min_count=2, 
                dm=1, 
                workers=8)

In [123]:
model.build_vocab(tagged_tr)

In [126]:
epochs = range(100)
for epoch in epochs:
    print(f'Epoch {epoch+1}')
    model.train(tagged_tr,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # decrease the learning rate
    model.alpha -= 0.00025
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
     
model.save('data/doc2vec.model')

Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99
Epoch 100


In [127]:
X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))])

y_train = train['id']

In [130]:
y_train

8491     250124
7178     119431
2423       9026
10612    283591
10791     34933
          ...  
5734      15801
5191       2009
5390      12473
860         887
7270      77117
Name: id, Length: 8700, dtype: int64

In [134]:
print(model.most_similar('warrior'))

[('lady', 0.4496452808380127), ('prisoners', 0.41911080479621887), ('woman', 0.4177355468273163), ('boy', 0.38012969493865967), ('ensures', 0.3707790672779083), ('doctor', 0.363141804933548), ('dickinson', 0.36013805866241455), ('servicing', 0.35690784454345703), ('soldier', 0.3567981719970703), ('mankind', 0.35655438899993896)]


  print(model.most_similar('warrior'))


In [136]:
# Find movies similar to query word
vec = model['warrior']
print(model.docvecs.most_similar([vec]))

[('5275', 0.3733769953250885), ('6734', 0.3408944606781006), ('2676', 0.31792134046554565), ('7098', 0.3138238787651062), ('6990', 0.3135916590690613), ('7619', 0.31078362464904785), ('4977', 0.3083052635192871), ('2337', 0.3060140907764435), ('7395', 0.2977234423160553), ('5165', 0.2973651885986328)]


In [137]:
new_vector = model.infer_vector(tokens)
model.docvecs.most_similar([new_vector]) #gives you top 10 document tags and their cosine similarity

TypeError: 'method' object is not subscriptable

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(df['overview_and_keywords'])

In [13]:
# Compute the cosine similarity matrix from doc_word
cosine_sim = cosine_similarity(doc_word, doc_word)

In [14]:
# Look at the cosine_sim matrix
pd.DataFrame(cosine_sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10866,10867,10868,10869,10870,10871,10872,10873,10874,10875
0,1.000000,0.013389,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.010031,0.046185,0.000000
1,0.013389,1.000000,0.040562,0.052569,0.000000,0.000000,0.009913,0.0,0.005114,0.000000,...,0.0,0.000000,0.000000,0.000000,0.003447,0.000000,0.000000,0.047648,0.000000,0.012675
2,0.000000,0.040562,1.000000,0.000000,0.000000,0.000000,0.010711,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.016470,0.000000,0.016014,0.050754,0.000000,0.014073
3,0.000000,0.052569,0.000000,1.000000,0.000000,0.000000,0.000000,0.0,0.018957,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.028662,0.032332,0.000000
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.009021,0.016196,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10871,0.000000,0.000000,0.000000,0.000000,0.016196,0.000000,0.000000,0.0,0.000000,0.040168,...,0.0,0.031422,0.000000,0.000000,0.007882,1.000000,0.000000,0.026808,0.000000,0.000000
10872,0.000000,0.000000,0.016014,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000
10873,0.010031,0.047648,0.050754,0.028662,0.000000,0.000000,0.008306,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.000000,0.005253,0.026808,0.000000,1.000000,0.013248,0.004446
10874,0.046185,0.000000,0.000000,0.032332,0.000000,0.000000,0.009476,0.0,0.000000,0.000000,...,0.0,0.000000,0.034507,0.062217,0.000000,0.000000,0.000000,0.013248,1.000000,0.013403


In [19]:
# # Save cosine_sim array to use in hybrid recommendation system
np.save('similarity_matrix/cos_overview_small.npy', cosine_sim)

In [20]:
# Reset index of our dataframe and construct reverse mapping as before
indices = pd.Series(df.index, index=df['title'])

In [21]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [22]:
get_recommendations('The Dark Knight Rises')

5365                                       The Dark Knight
274                                                 Batman
667                                         Batman Returns
6209                            Batman: Under the Red Hood
2707                                                 Q & A
4411                                         Batman Begins
4080                    Batman Beyond: Return of the Joker
7450     Batman Unmasked: The Psychology of the Dark Kn...
10350    LEGO DC Comics Super Heroes: Justice League - ...
6752                                      Batman: Year One
Name: title, dtype: object

In [23]:
get_recommendations('Toy Story')

6179               Toy Story 3
1458               Toy Story 2
4470    The 40 Year Old Virgin
8125                 Small Fry
485      Rebel Without a Cause
5007              Factory Girl
2616             Class of 1984
7273             A Simple Life
1486           Man on the Moon
1528      White Men Can't Jump
Name: title, dtype: object