In [8]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Load the pickled DataFrame. The context manager closes the file handle as
# soon as loading finishes instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code while deserialising, so only
# load a criterion.pkl that comes from a trusted source.
with open("criterion.pkl", "rb") as pickle_file:
    df = pickle.load(pickle_file)

In [3]:
# Regex tokenizer: each run of word characters becomes one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# NLTK's English stop-word list as a set.
# NOTE(review): no cell visible in this notebook reads stopword_set.
stopword_set = {*stopwords.words('english')}

In [4]:
def tokenize(input_data):
    """Return the lower-cased word tokens of ``input_data``.

    The string is lower-cased first and then split by the notebook-level
    ``tokenizer`` object; the resulting list of tokens is returned.
    """
    return tokenizer.tokenize(input_data.lower())

In [13]:
# One token list per row of the summary column, in row order.
all_tokens = list(map(tokenize, df.summary.values))

In [14]:
# Wrap every token list in a TaggedDocument whose only tag is the document's
# position in all_tokens, stored as a string.
tagged = [
    TaggedDocument(words=doc_tokens, tags=[str(doc_idx)])
    for doc_idx, doc_tokens in enumerate(all_tokens)
]

In [15]:
# MODEL PARAMETERS
# Architecture
dm = 0               # training algorithm: 0 = DBOW, 1 = distributed memory (library default)
size = 300           # dimensionality of the feature vectors
context_window = 20  # number of words before and after used as context

# Vocabulary
min_count = 10       # words with a lower frequency are ignored

# Optimisation
alpha = 0.01         # initial learning rate
max_iter = 200       # number of training epochs
seed = 0             # random seed handed to the model

In [16]:
# BUILD MODEL
# Supplying `documents` to the constructor builds the vocabulary and trains the
# model right away, so this is the expensive cell of the notebook.
# NOTE(review): gensim documents that `seed` alone does not make a run fully
# reproducible unless training is limited to one worker thread (workers=1);
# confirm whether exact reproducibility matters here.
# NOTE(review): with dm = 0 and no dbow_words, `window` may have no effect on
# training; verify against the gensim documentation.
crit_model = Doc2Vec(
    documents=tagged,
    # --- architecture ---
    dm=dm,
    vector_size=size,        # dimensionality of the feature vectors
    window=context_window,   # number of words before and after used as context
    # --- vocabulary ---
    min_count=min_count,     # ignore words with frequency below min_count
    max_vocab_size=None,     # None: no cap on the vocabulary while it is built
    sample=1e-4,             # threshold for randomly downsampling high-frequency words
    # --- optimisation ---
    alpha=alpha,             # initial learning rate
    negative=5,              # number of negative-sampling noise words (not a core count)
    epochs=max_iter,         # number of passes over the corpus
    seed=seed,
)

In [17]:
# crit_model = pickle.load(open("crit_model.pkl", "rb"))
# crit = pickle.dump(df, open( "criterion.pkl", "wb" ))

In [24]:
# Query tokens for a free-text search. The author expected this query to
# surface 'The Seventh Seal', but it did not.
# NOTE(review): the model is built with min_count = 10, so rare tokens may be
# missing from its vocabulary; possibly related, unverified.
test = tokenize('ingmar bergman')

In [25]:
# Infer a vector for the query tokens with the trained model.
sev = crit_model.infer_vector(doc_words=test)

In [26]:
# The 300 trained document vectors most similar to the query vector.
# NOTE(review): `docvecs` is the pre-4.0 gensim attribute name (renamed `dv`
# in 4.x); confirm against the installed version.
sim = crit_model.docvecs.most_similar(positive=[sev], topn=300)

In [27]:
# Keep only the first field of every result (the document tag) and turn it
# back into an integer row position.
tagsim = [int(doc_tag) for doc_tag, *_ in sim]

In [28]:
# Collect (title, summary) for every similar document whose summary has more
# than 40 tokens, preserving the similarity ranking.
ranked_rows = (df.iloc[row_pos] for row_pos in tagsim)
best = [
    (row.title, row.summary)
    for row in ranked_rows
    if len(tokenize(row.summary)) > 40
]

In [5]:
# Display the loaded DataFrame (pandas elides the middle rows in the repr).
df

Unnamed: 0,title,director,country,year,links,summaries,summary
0,2 or 3 Things I Know About Her,Jean-Luc Godard,"France,",1967,https://www.criterionchannel.com/2-or-3-things...,Directed by Jean-Luc Godard • 1967 • France\nS...,Directed by Jean-Luc Godard • 1967 • France St...
1,3:10 to Yuma,Delmer Daves,"United States,",1957,https://www.criterionchannel.com/3-10-to-yuma,Directed by Delmer Daves • 1957 • United State...,Directed by Delmer Daves • 1957 • United State...
2,3 Faces,Jafar Panahi,"Iran,",2018,https://www.criterionchannel.com/3-faces,Directed by Jafar Panahi • 2018 • Iran\nStarri...,Directed by Jafar Panahi • 2018 • Iran Starrin...
3,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,"Romania,",2007,https://www.criterionchannel.com/4-months-3-we...,Criterion Collection Edition #958\r\n\r\nRoman...,Criterion Collection Edition #958 Romanian ...
4,5 Against the House,Phil Karlson,"United States,",1955,https://www.criterionchannel.com/5-against-the...,Directed by Phil Karlson • 1955 • United State...,Directed by Phil Karlson • 1955 • United State...
...,...,...,...,...,...,...,...
2095,Zéro de conduite,Jean Vigo,"France,",1933,https://www.criterionchannel.com/zero-de-conduite,Directed by Jean Vigo • 1933 • France\nStarrin...,Directed by Jean Vigo • 1933 • France Starring...
2096,Zero Focus,Yoshitaro Nomura,"Japan,",1961,https://www.criterionchannel.com/zero-focus,Directed by Yoshitaro Nomura • 1961 • Japan\n\...,Directed by Yoshitaro Nomura • 1961 • Japan A...
2097,Zora Neale Hurston Fieldwork Footage (excerpt),Zora Neale Hurston,"United States,",1928,https://www.criterionchannel.com/zora-neale-hu...,Directed by Zora Neale Hurston • 1928 • United...,Directed by Zora Neale Hurston • 1928 • United...
2098,Zorgon: The H-Bomb Beast from Hell,Kevin Fernan,"United States,",1972,https://www.criterionchannel.com/zorgon-the-h-...,Directed by Kevin Fernan • 1972 • United State...,Directed by Kevin Fernan • 1972 • United State...


In [14]:
# TF-IDF document-term matrix over the raw summary strings, built with the
# vectorizer's default settings.
tfidf = TfidfVectorizer().fit_transform(raw_documents=df.summary.values)

In [16]:
# Show the sparse-matrix repr: shape, dtype and number of stored elements.
tfidf

<2100x19404 sparse matrix of type '<class 'numpy.float64'>'
	with 130614 stored elements in Compressed Sparse Row format>

In [18]:
from sklearn.metrics.pairwise import linear_kernel

In [19]:
# Dot product of row 0 against every row, flattened to one score per document.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# L2-normalised (the TfidfVectorizer default); confirm if settings change.
query_row = tfidf[:1]
cosine_similarities = linear_kernel(query_row, tfidf).flatten()

In [20]:
# Positions of the four highest scores, best first. The recorded output shows
# the query row itself (index 0) in first place.
related_docs_indices = cosine_similarities.argsort()[::-1][:4]

In [21]:
# Row positions of the top matches, most similar first.
related_docs_indices

array([   0,   51, 1412, 1146])

In [22]:
# Similarity scores of those rows, in the same order.
cosine_similarities[related_docs_indices]

array([1.        , 0.23508807, 0.22305452, 0.19359243])

In [24]:
# Summary text of row 0, the document used as the similarity query.
df.iloc[0].summary

'Directed by Jean-Luc Godard • 1967 • France Starring Marina Vlady, Anny Duperey, Joseph Gerhard  In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOSES QUE JE SAIS D’ELLE), Jean-Luc Godard beckons us ever closer, whispering in our ears as narrator. About what? Money, sex, fashion, the city, love, language, war: in a word, everything. Among the legendary French filmmaker’s finest achievements, the film takes as its ostensible subject the daily life of Juliette Janson (Marina Vlady), a housewife from the Paris suburbs who prostitutes herself for extra money. Yet this is only a template for Godard to spin off into provocative philosophical tangents and gorgeous images. 2 OR 3 THINGS I KNOW ABOUT HER is perhaps Godard’s most revelatory look at consumer culture, shot in ravishing widescreen color by Raoul Coutard.'

In [25]:
# Summary text of row 51, the second entry in the recorded related_docs_indices output.
df.iloc[51].summary

'Directed by Jean-Luc Godard • 1957 • France  A man makes dates with two women on the same day without realizing that they are best friends. Directed by Jean-Luc Godard.'

In [28]:
# Print each related summary followed by two blank lines.
for doc_idx in related_docs_indices:
    print(df.iloc[doc_idx].summary, end='\n\n\n')

Directed by Jean-Luc Godard • 1967 • France Starring Marina Vlady, Anny Duperey, Joseph Gerhard  In 2 OR 3 THINGS I KNOW ABOUT HER (2 OU 3 CHOSES QUE JE SAIS D’ELLE), Jean-Luc Godard beckons us ever closer, whispering in our ears as narrator. About what? Money, sex, fashion, the city, love, language, war: in a word, everything. Among the legendary French filmmaker’s finest achievements, the film takes as its ostensible subject the daily life of Juliette Janson (Marina Vlady), a housewife from the Paris suburbs who prostitutes herself for extra money. Yet this is only a template for Godard to spin off into provocative philosophical tangents and gorgeous images. 2 OR 3 THINGS I KNOW ABOUT HER is perhaps Godard’s most revelatory look at consumer culture, shot in ravishing widescreen color by Raoul Coutard.


Directed by Jean-Luc Godard • 1957 • France  A man makes dates with two women on the same day without realizing that they are best friends. Directed by Jean-Luc Godard.


Directed by 