In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import pickle

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Load the Criterion catalogue DataFrame from the local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a criterion.pkl
# that comes from a trusted source.
with open("criterion.pkl", "rb") as pkl_file:  # context manager closes the handle
    df = pickle.load(pkl_file)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,title,director,country,year,links,summaries,summary
0,2 or 3 Things I Know About Her,Jean-Luc Godard,"France,",1967,https://www.criterionchannel.com/2-or-3-things...,Directed by Jean-Luc Godard • 1967 • France\nS...,Directed by Jean-Luc Godard • 1967 • France St...
1,3:10 to Yuma,Delmer Daves,"United States,",1957,https://www.criterionchannel.com/3-10-to-yuma,Directed by Delmer Daves • 1957 • United State...,Directed by Delmer Daves • 1957 • United State...
2,3 Faces,Jafar Panahi,"Iran,",2018,https://www.criterionchannel.com/3-faces,Directed by Jafar Panahi • 2018 • Iran\nStarri...,Directed by Jafar Panahi • 2018 • Iran Starrin...
3,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,"Romania,",2007,https://www.criterionchannel.com/4-months-3-we...,Criterion Collection Edition #958\r\n\r\nRoman...,Criterion Collection Edition #958 Romanian ...
4,5 Against the House,Phil Karlson,"United States,",1955,https://www.criterionchannel.com/5-against-the...,Directed by Phil Karlson • 1955 • United State...,Directed by Phil Karlson • 1955 • United State...


In [4]:
# Tokens are maximal runs of word characters (\w+); punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(pattern=r'\w+')
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stopword_set = {*stopwords.words('english')}

In [5]:
def tokenize(input_data):
    """Lower-case ``input_data`` and split it with the notebook-level ``tokenizer``.

    The result is whatever ``tokenizer.tokenize`` returns for the lower-cased
    text. Stop words are not removed here (``stopword_set`` is not consulted).
    """
    return tokenizer.tokenize(input_data.lower())

In [6]:
# One token list per film summary, in the same order as the rows of df.
all_tokens = list(map(tokenize, df["summary"].values))

In [7]:
# Wrap every token list in a TaggedDocument whose only tag is its position in
# all_tokens (as a string), so a similarity hit can be mapped back to a row.
tagged = [
    TaggedDocument(words=doc_tokens, tags=[str(position)])
    for position, doc_tokens in enumerate(all_tokens)
]

In [10]:
# MODEL PARAMETERS
# -- reproducibility
seed = 0            # seed handed to Doc2Vec's random number generator

# -- architecture
dm = 0              # 1 = distributed memory (gensim default); 0 = distributed bag of words (DBOW)
size = 300          # dimensionality of the learned vectors
context_window = 20 # passed to Doc2Vec as `window`

# -- vocabulary
min_count = 10      # words occurring fewer times than this are ignored

# -- optimisation
alpha = 0.01        # initial learning rate
max_iter = 200      # passed to Doc2Vec as `epochs`

In [11]:
# BUILD MODEL
# Train Doc2Vec on the tagged summaries with the values from the
# MODEL PARAMETERS cell.
crit_model = Doc2Vec(
    documents=tagged,
    # architecture
    dm=dm,                  # training algorithm: 1 = PV-DM, otherwise PV-DBOW
    vector_size=size,       # dimensionality of the feature vectors
    window=context_window,  # maximum distance between the current and the predicted word
    # vocabulary
    min_count=min_count,    # ignore words with total frequency below min_count
    max_vocab_size=None,    # None = no cap on vocabulary size while it is being built
    sample=1e-4,            # threshold for randomly down-sampling high-frequency words
    # optimisation
    negative=5,             # number of "noise words" drawn per step for negative sampling
    alpha=alpha,            # initial learning rate
    epochs=max_iter,        # number of passes over the corpus
    seed=seed,              # seed for the random number generator
)

In [12]:
# crit_model = pickle.load(open("crit_model.pkl", "rb"))
# crit = pickle.dump(df, open( "criterion.pkl", "wb" ))

In [37]:
# Free-text probe query. The hoped-for top match was 'The Seventh Seal',
# but the model did not return it.
test = tokenize('chess death')

In [38]:
# Infer an embedding for the probe tokens using the trained model.
# NOTE(review): infer_vector performs its own training passes, so repeated calls
# may return slightly different vectors — confirm if exact reproducibility matters.
sev = crit_model.infer_vector(test)

In [39]:
# Rank the stored document vectors by similarity to the probe vector; keep the top 300.
# NOTE(review): `docvecs` was renamed `dv` in gensim 4 — confirm the installed
# version before upgrading gensim.
sim = crit_model.docvecs.most_similar([sev], topn = 300)

In [40]:
# Each hit is a (tag, similarity) pair; tags were created as str(position),
# so turn them back into ints for positional row lookup.
tagsim = [int(tag) for tag, _similarity in sim]

In [41]:
# Keep (title, summary) for every hit whose summary tokenizes to more than
# 40 tokens, preserving the similarity ranking in tagsim.
best = [
    (row.title, row.summary)
    for row in (df.iloc[position] for position in tagsim)
    if len(tokenize(row.summary)) > 40
]

In [43]:
# Show the ten highest-ranked (title, summary) pairs that passed the length filter.
best[:10]

[('The Beaning',
  'Directed by Sean McCoy • 2017 • United States  Devils in the outfield! Steeped in occult dread, this experimental documentary uncovers a sinister conspiracy theory surrounding the death of Cleveland baseball player Ray Chapman, who was killed by a ball thrown by Yankees pitcher Carl Mays in 1920.'),
 ('Tidy Up',
  'Directed by Satsuki Okawa • 2011 • Japan, United States  Following the death of his hoarder mother, a young man eagerly looks forward to cleaning out her junk-strewn home—but first he’ll have to contend with his sister, who is intent on preserving the house just as it is—in this tender look at grief and letting go.'),
 ('Straits of Magellan: Pan 4',
  'Directed by Hollis Frampton • 1974 • United States  Magellan is an unfinished cycle of films by Hollis Frampton, to be shown over 371 days (the Magellan Calendar). STRAITS OF MAGELLAN was one of three phases of the calendar, and Pans are one-minute films interspersed throughout this phase.'),
 ('Straits of 