In [96]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import time
import pickle
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# The Current Criterion DataFrame

In [97]:
# Load the previously saved Criterion DataFrame.
# NOTE: unpickling can execute arbitrary code, so only load a "criterion.pkl"
# that comes from a trusted source.
# The `with` block closes the file handle once loading finishes.
with open("criterion.pkl", "rb") as criterion_file:
    df = pickle.load(criterion_file)

# Functions for Doc2Vec Model

In [3]:
# Tokenizer that emits every maximal run of \w characters as one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# English stop words from the NLTK corpus, collected into a set.
english_stopwords = stopwords.words('english')
stopword_set = set(english_stopwords)

In [4]:
def tokenize(input_data):
    """Split a piece of text into lower-cased tokens.

    Args:
        input_data: The string to tokenize.

    Returns:
        The tokens produced by the module-level ``tokenizer`` when it is
        applied to the lower-cased text.
    """
    return tokenizer.tokenize(input_data.lower())

In [13]:
# Tokenize every film summary; the list order follows the rows of `df`.
all_tokens = list(map(tokenize, df.summary.values))

In [14]:
# Wrap each token list in a TaggedDocument whose single tag is the document's
# position in `all_tokens`, rendered as a string.
tagged = [
    TaggedDocument(words=doc_tokens, tags=[str(position)])
    for position, doc_tokens in enumerate(all_tokens)
]

In [15]:
# MODEL PARAMETERS
# Training algorithm switch: 1 = distributed memory (the default),
# 0 = distributed bag of words (DBOW).
dm = 0
size = 300           # handed to Doc2Vec as `vector_size`
context_window = 20  # handed to Doc2Vec as `window`
min_count = 10       # handed to Doc2Vec as `min_count`
alpha = 0.01         # handed to Doc2Vec as `alpha`
max_iter = 200       # handed to Doc2Vec as `epochs`
seed = 0             # handed to Doc2Vec as `seed`

In [16]:
# BUILD MODEL
# Supplying `documents` to the constructor builds the vocabulary and trains
# the model as part of construction.
crit_model = Doc2Vec(
    documents=tagged,
    dm=dm,                  # training algorithm (0 selects DBOW)
    vector_size=size,       # dimensionality of the feature vectors
    window=context_window,  # max distance between the current and predicted word
    min_count=min_count,    # ignore words whose total frequency is below this
    max_vocab_size=None,    # no cap on the vocabulary size
    sample=1e-4,            # threshold for down-sampling high-frequency words
    negative=5,             # number of "noise words" drawn for negative sampling
    alpha=alpha,            # initial learning rate
    epochs=max_iter,        # number of passes over the corpus
    seed=seed,              # seed for the random number generator
)

In [17]:
# Disabled persistence snippets, kept for reference:
#   1. reload a pickled model from "crit_model.pkl" instead of retraining;
#   2. write the current DataFrame back to "criterion.pkl".
# NOTE: pickle.dump returns None, so `crit` in the second line would be None.
# crit_model = pickle.load(open("crit_model.pkl", "rb"))
# crit = pickle.dump(df, open( "criterion.pkl", "wb" ))

In [24]:
# Tokenized query for the Doc2Vec model. It was expected to produce
# 'The Seventh Seal', which failed.
test = tokenize("ingmar bergman")

In [25]:
# Infer a vector for the tokenized query with the trained model.
sev = crit_model.infer_vector(test)

In [26]:
# Fetch the 300 trained document vectors closest to the inferred query vector.
# NOTE(review): gensim 4 renamed `docvecs` to `dv` and deprecated the old
# name — confirm against the installed gensim version.
sim = crit_model.docvecs.most_similar([sev], topn = 300)

In [27]:
# Keep only the document tag of each (tag, similarity) pair and convert it
# back to an integer.
tagsim = [int(tag) for tag, _similarity in sim]

In [28]:
# Collect (title, summary) for every match whose summary tokenizes to more
# than 40 tokens, preserving the similarity ranking order of `tagsim`.
best = []
for row_position in tagsim:
    film = df.iloc[row_position]
    if len(tokenize(film.summary)) > 40:
        best.append((film.title, film.summary))

In [5]:
# Show the DataFrame; the rendered table below elides the middle rows.
df

Unnamed: 0,title,director,country,year,links,summaries,summary
0,2 or 3 Things I Know About Her,Jean-Luc Godard,"France,",1967,https://www.criterionchannel.com/2-or-3-things...,Directed by Jean-Luc Godard • 1967 • France\nS...,Directed by Jean-Luc Godard • 1967 • France St...
1,3:10 to Yuma,Delmer Daves,"United States,",1957,https://www.criterionchannel.com/3-10-to-yuma,Directed by Delmer Daves • 1957 • United State...,Directed by Delmer Daves • 1957 • United State...
2,3 Faces,Jafar Panahi,"Iran,",2018,https://www.criterionchannel.com/3-faces,Directed by Jafar Panahi • 2018 • Iran\nStarri...,Directed by Jafar Panahi • 2018 • Iran Starrin...
3,"4 Months, 3 Weeks and 2 Days",Cristian Mungiu,"Romania,",2007,https://www.criterionchannel.com/4-months-3-we...,Criterion Collection Edition #958\r\n\r\nRoman...,Criterion Collection Edition #958 Romanian ...
4,5 Against the House,Phil Karlson,"United States,",1955,https://www.criterionchannel.com/5-against-the...,Directed by Phil Karlson • 1955 • United State...,Directed by Phil Karlson • 1955 • United State...
...,...,...,...,...,...,...,...
2095,Zéro de conduite,Jean Vigo,"France,",1933,https://www.criterionchannel.com/zero-de-conduite,Directed by Jean Vigo • 1933 • France\nStarrin...,Directed by Jean Vigo • 1933 • France Starring...
2096,Zero Focus,Yoshitaro Nomura,"Japan,",1961,https://www.criterionchannel.com/zero-focus,Directed by Yoshitaro Nomura • 1961 • Japan\n\...,Directed by Yoshitaro Nomura • 1961 • Japan A...
2097,Zora Neale Hurston Fieldwork Footage (excerpt),Zora Neale Hurston,"United States,",1928,https://www.criterionchannel.com/zora-neale-hu...,Directed by Zora Neale Hurston • 1928 • United...,Directed by Zora Neale Hurston • 1928 • United...
2098,Zorgon: The H-Bomb Beast from Hell,Kevin Fernan,"United States,",1972,https://www.criterionchannel.com/zorgon-the-h-...,Directed by Kevin Fernan • 1972 • United State...,Directed by Kevin Fernan • 1972 • United State...


In [110]:
# Inspect the summary text of the row at position 3.
df["summary"].iloc[3]

'Criterion Collection Edition #958    Romanian filmmaker Cristian Mungiu shot to international prominence with this rigorously realistic Palme d’Or–winning second feature. In 1987, during the dictatorship of Nicolae Ceaușescu, college roommates Otilia (Anamaria Marinca) and Găbița (Laura Vasiliu) seek an illegal abortion for Găbița. In unflinching but empathetic detail, the film recounts the events of twenty-four perilous hours in their lives, culminating in their encounter with a manipulative and menacing abortionist (Vlad Ivanov). With powerful performances that accentuate the characters’ flawed humanity, 4 MONTHS, 3 WEEKS AND 2 DAYS is a gutting account of the impossible choices women face when taking control of their bodies means breaking the law.'

# Functions for TF-IDF Model (currently working better than Doc2Vec, which suffers from inconsistent doc lengths)

In [98]:
# Array of every film summary, in DataFrame row order.
summaries = df["summary"].values

In [99]:
# Free-text query that the summaries will be ranked against.
test = "death chess"

In [100]:
# Put the query at index 0 so it is vectorized together with the summaries;
# summary k therefore ends up at index k + 1.
array_with_query = np.insert(arr=summaries, obj=0, values=test)

In [89]:
# Fit the TF-IDF vocabulary on query + summaries and vectorize them in one step.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(array_with_query)

In [91]:
# Row 0 of `tfidf` is the query. TfidfVectorizer L2-normalizes rows by
# default, so the linear kernel (dot product) equals cosine similarity.
query_vector = tfidf[0]
cosine_similarities = linear_kernel(query_vector, tfidf).flatten()

In [92]:
# Indices of (up to) the 19 highest similarity scores, best match first.
related_docs_indices = np.argsort(cosine_similarities)[::-1][:19]

In [114]:
# Print each match. The first index is skipped because it is expected to be
# the query itself, which sits at position 0 of `array_with_query`.
for i in related_docs_indices[1:]:
    # Split on the bullet separator so that, for summaries shaped like
    # "Directed by X • YEAR • Country ...", the three parts print on
    # separate lines.
    summ = array_with_query[i].split(' • ')
    if len(summ) >= 3:
        print(summ[0], '\n', summ[1], '\n', summ[2])
    else:
        # Fewer than three bullet-separated parts: print the text unchanged.
        # An explicit length check replaces the former bare `except:`, which
        # also swallowed unrelated errors and KeyboardInterrupt.
        print(array_with_query[i])
    print('\n')

Directed by Ingmar Bergman 
 1957 
 Sweden    Disillusioned and exhausted after a decade of battling in the Crusades, a knight (Max von Sydow) encounters Death on a desolate beach and challenges him to a fateful game of chess. Much studied, imitated, even parodied, but never outdone, Bergman's stunning allegory of man's search for meaning, The Seventh Seal (Det sjunde inseglet), was one of the benchmark foreign imports of America's 1950s art-house heyday, pushing cinema's boundaries and ushering in a new era of moviegoing.


Directed by Céline Sciamma 
 2007 
 France Starring Pauline Acquart, Louise Blachère, Adèle Haenel  During a summer in Paris, a love triangle develops between three girls in this provocative and perceptive portrait of teen angst and nascent sexuality. The awkward Anne, the bad girl Floriane, and the gawky Marie play an intense game of emotional chess as they wrestle with love, friendship, and their desire for one another.


ZATOICHI #12 Directed by Kenji Misumi 
 1