In [None]:
import requests
from bs4 import BeautifulSoup
import os
import time
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve
import xml.sax
from sklearn import svm
import subprocess
import mwparserfromhell
import json
from collections import Counter
from itertools import chain
import numpy as np
import random
from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers.merge import Dot
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import NearestNeighbors
import pickle
import gensim
from sklearn.decomposition import TruncatedSVD
import psycopg2

In [None]:
# The input is newline-delimited JSON: every line decodes to one movie record.
with open('../data/wp_movies_10k.ndjson') as ndjson_file:
    movies = list(map(json.loads, ndjson_file))

In [None]:
# movie[0] is used as the movie's name and movie[2] as its list of outgoing links.
# Tally every link occurrence across all movies.
link_counts = Counter(chain.from_iterable(movie[2] for movie in movies))

# Only links that occur at least three times get an index (and later an embedding).
top_links = [link for link, count in link_counts.items() if count >= 3]
link_to_idx = dict(zip(top_links, range(len(top_links))))

movie_to_idx = {}
for position, movie in enumerate(movies):
    movie_to_idx[movie[0]] = position

# Positive training examples: one (link index, movie index) tuple per retained link.
pairs = [
    (link_to_idx[link], movie_to_idx[movie[0]])
    for movie in movies
    for link in movie[2]
    if link in link_to_idx
]
# Set copy for constant-time membership checks when sampling negatives.
pairs_set = set(pairs)
len(pairs), len(top_links), len(movie_to_idx)

In [None]:
def movie_embedding_model(embedding_size=30):
    """Build and compile the link/movie embedding model.

    The model takes one link index and one movie index, looks each up in its
    own embedding table and outputs the normalized dot product of the two
    vectors, reshaped to a single value per sample.

    Args:
        embedding_size: Width of both embedding tables.

    Returns:
        A compiled Keras ``Model`` with inputs named ``'link'`` and ``'movie'``,
        trained with the ``nadam`` optimizer on mean squared error.
    """
    link_input = Input(name='link', shape=(1,))
    movie_input = Input(name='movie', shape=(1,))

    # Table sizes come from the module-level vocabularies.
    embed_link = Embedding(name='link_embedding',
                           input_dim=len(top_links),
                           output_dim=embedding_size)
    embed_movie = Embedding(name='movie_embedding',
                            input_dim=len(movie_to_idx),
                            output_dim=embedding_size)

    similarity = Dot(name='dot_product', normalize=True, axes=2)(
        [embed_link(link_input), embed_movie(movie_input)])
    output = Reshape((1,))(similarity)

    model = Model(inputs=[link_input, movie_input], outputs=[output])
    model.compile(optimizer='nadam', loss='mse')
    return model


model = movie_embedding_model()

In [None]:
import random
# NOTE: this seeds only the stdlib generator; np.random.shuffle below draws from
# NumPy's separate global generator, which is not seeded here.
random.seed(5)

def batchifier(pairs, positive_samples=50, negative_ratio=5):
    """Endlessly yield training batches of positive and negative (link, movie) pairs.

    Each batch holds ``positive_samples`` pairs drawn from ``pairs`` (label 1) and
    ``positive_samples * negative_ratio`` randomly drawn index pairs that are not
    in the module-level ``pairs_set`` (label -1), shuffled together.

    Args:
        pairs: Sequence of ``(link_idx, movie_idx)`` positive examples. Must hold
            at least ``positive_samples`` items, otherwise ``random.sample`` raises
            ``ValueError``.
        positive_samples: Number of positive examples per batch.
        negative_ratio: Number of negative examples generated per positive one.

    Yields:
        ``({'link': link_ids, 'movie': movie_ids}, labels)`` where all three are
        1-D float arrays of length ``positive_samples * (1 + negative_ratio)``.
    """
    batch_size = positive_samples * (1 + negative_ratio)
    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are views
        # into it, so reusing one buffer would overwrite batches that a consumer
        # is still holding (for example in a prefetch queue) while the
        # generator runs ahead.
        batch = np.zeros((batch_size, 3))
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs, positive_samples)):
            batch[idx, :] = (link_id, movie_id, 1)
        idx = positive_samples
        # Rejection-sample negatives: keep drawing random index pairs until
        # enough of them are not known positives.
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if (link_id, movie_id) not in pairs_set:
                batch[idx, :] = (link_id, movie_id, -1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link': batch[:, 0], 'movie': batch[:, 1]}, batch[:, 2]

next(batchifier(pairs, positive_samples=3, negative_ratio=2))

In [None]:
positive_samples_per_batch = 256

# One epoch draws roughly as many positive samples as there are positive pairs.
steps_per_epoch = len(pairs) // positive_samples_per_batch
training_batches = batchifier(
    pairs,
    positive_samples=positive_samples_per_batch,
    negative_ratio=5,
)

model.fit_generator(
    training_batches,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    verbose=2,
)

In [None]:
# Pull the trained movie embedding matrix out of the model and scale every row
# to unit length, so that a dot product between two rows is their cosine similarity.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T


def similar_movies(movie):
    """Print the ten movies whose embeddings are most similar to ``movie``.

    Args:
        movie: Movie name; must be a key of ``movie_to_idx`` (``KeyError`` otherwise).

    Prints one line per result, best match first: row index, movie name, similarity.
    """
    query_vector = normalized_movies[movie_to_idx[movie]]
    dists = normalized_movies.dot(query_vector)
    # argsort is ascending, so the best matches are the last ten entries.
    best_first = np.argsort(dists)[-10:][::-1]
    for c in best_first:
        print(c, movies[c][0], dists[c])


similar_movies('Rogue One')

In [None]:
# Recompute the unit-length movie vectors from the trained model.
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lengths = np.linalg.norm(movie_weights, axis=1)
normalized_movies = (movie_weights.T / movie_lengths).T

# Build a ball-tree index over the vectors, configured for 10-neighbour queries.
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')
nbrs.fit(normalized_movies)

# Persist the index, the vectors and the name lookup together.
saved_model = {
    'nbrs': nbrs,
    'normalized_movies': normalized_movies,
    'movie_to_idx': movie_to_idx,
}
with open('../data/movie_model.pkl', 'wb') as fout:
    pickle.dump(saved_model, fout)

In [None]:
# Reload the pickled model and run one neighbour query against it.
# NOTE: pickle.load can execute arbitrary code; only load files you wrote yourself.
with open('../data/movie_model.pkl', 'rb') as fin:
    m = pickle.load(fin)
# Build the index -> name list from the *loaded* mapping so this check exercises
# the persisted artifact rather than whatever happens to be in kernel memory.
movie_names = [name for name, _ in sorted(m['movie_to_idx'].items(), key=lambda t: t[1])]
distances, indices = m['nbrs'].kneighbors(
    [m['normalized_movies'][m['movie_to_idx']['Rogue One']]])
for idx in indices[0]:
    print(movie_names[idx])

In [None]:
# Connection settings come from the environment (the standard libpq variable
# names) so that no credential is stored in the notebook.
DB_NAME = os.environ.get('PGDATABASE', 'douwe')
USER = os.environ.get('PGUSER', 'djangosite')
# Deliberately no default: a missing password should fail loudly (KeyError)
# instead of falling back to a secret kept in source control.
PWD = os.environ['PGPASSWORD']
HOST = os.environ.get('PGHOST', '127.0.0.1')
# Keyword arguments avoid hand-building a DSN string, which breaks when a value
# contains a quote or backslash.
conn = psycopg2.connect(dbname=DB_NAME, user=USER, password=PWD, host=HOST)

In [None]:
# Smoke test: insert a single movie row.
# `with conn` commits when the block succeeds and rolls back when it raises, so
# a failed statement does not leave the connection in an aborted transaction.
with conn:
    with conn.cursor() as cursor:
        cursor.execute('INSERT INTO movie (movie_name, embedding) VALUES (%s, %s)',
                       (movie_names[0], normalized_movies[0].tolist()))

In [None]:
# Empty the movie table before the bulk load.
# `with conn` commits on success and rolls back if the statement raises.
with conn:
    with conn.cursor() as cursor:
        cursor.execute('DELETE FROM movie;')

In [None]:
# Bulk load: one row per movie, pairing each record with the embedding row at
# the same position.
# `with conn` makes the load all-or-nothing: it commits once every insert has
# succeeded and rolls the whole batch back if any of them raises.
with conn:
    with conn.cursor() as cursor:
        for movie, embedding in zip(movies, normalized_movies):
            cursor.execute('INSERT INTO movie (movie_name, embedding)'
                           ' VALUES (%s, %s)',
                           (movie[0], embedding.tolist()))

In [None]:
# Discard the connection's current transaction.
# NOTE(review): presumably kept around to recover the connection after a failed
# statement in one of the cells above - confirm it is still needed.
conn.rollback()

In [None]:
def recommend_movies(conn, q):
    """Return the five stored movies closest to the first movie matching ``q``.

    Args:
        conn: Open database connection whose cursors are context managers
            (psycopg2 in this notebook). The database needs a
            ``movie(movie_name, embedding)`` table and the ``cube`` /
            ``cube_distance`` SQL functions.
        q: Case-insensitive substring to look for in ``movie_name``. ``%`` and
            ``_`` inside ``q`` are not escaped and act as LIKE wildcards.

    Returns:
        A list of up to five ``(movie_name, distance)`` rows ordered by ascending
        distance, or an empty list when ``q`` is blank or matches no movie.
    """
    # A blank query would become the pattern '%%', which matches every row and
    # would silently recommend neighbours of an arbitrary movie.
    if not q or not q.strip():
        return []
    with conn.cursor() as cursor:
        # There is no ORDER BY, so when several titles match, which one is used
        # is up to the database.
        cursor.execute('SELECT movie_name, embedding FROM movie'
                       '    WHERE lower(movie_name) LIKE %s'
                       '    LIMIT 1',
                       ('%' + q.lower() + '%',))
        # Decide on the fetched row rather than cursor.rowcount, which is not
        # populated by every cursor type.
        match = cursor.fetchone()
        if match is None:
            return []
        _, embedding = match
        cursor.execute('SELECT movie_name, '
                       '       cube_distance(cube(embedding), '
                       '                     cube(%s)) as distance '
                       '    FROM movie'
                       '    ORDER BY distance'
                       '    LIMIT 5',
                       (embedding,))
        return list(cursor.fetchall())
    
# Example lookup; the cell displays the returned list of rows.
recommend_movies(conn, 'The Force Awakens')

In [None]:
# Stand-alone version of the neighbour query used in recommend_movies.
# `emb` was never assigned anywhere in this notebook, so the cell failed with a
# NameError on a fresh kernel. The notebook's running example movie is used
# here as the query vector.
# NOTE(review): assumption - swap in whichever embedding was originally intended.
emb = normalized_movies[movie_to_idx['Rogue One']].tolist()
with conn.cursor() as cursor:
    cursor.execute('SELECT movie_name, cube_distance(cube(embedding), cube(%s)) as distance '
                   '    FROM movie'
                   '    ORDER BY distance'
                   '    LIMIT 5',
                   (emb,))
    x = list(cursor)
x

In [None]:
# Show the first movie record to inspect its structure.
movies[0]

In [None]:
# Load word vectors stored in binary word2vec format from the working directory.
# NOTE: this rebinds `model`, which until here referred to the Keras model
# returned by movie_embedding_model(); cells above need that earlier object.
MODEL = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(MODEL, binary=True)

In [None]:
# Query the loaded vectors for the words most similar to 'espresso'.
model.most_similar(positive=['espresso'])

In [None]:
def most_similar(norm, positive, n=10):
    """Return the ``n`` words whose rows in ``norm`` score highest against ``positive``.

    Word lookup goes through the module-level ``model`` (its ``vocab`` and
    ``index2word``); ``norm`` supplies the vectors, indexed by the same word
    indices, so alternative representations of the vocabulary can be compared.

    Args:
        norm: 2-D array with one row per word.
            # assumes rows are unit length, so the dot product is a cosine
            # similarity - the function itself does not normalize.
        positive: Query word; must be present in ``model.vocab``.
        n: Number of results to return (default 10).

    Returns:
        A list of ``(word, score)`` tuples sorted by descending score.
    """
    if n <= 0:
        return []
    vec = norm[model.vocab[positive].index]
    dists = np.dot(norm, vec)
    if n < len(dists):
        # argpartition finds the n best in linear time but leaves them unordered.
        most_extreme = np.argpartition(-dists, n)[:n]
    else:
        # Fewer rows than requested: argpartition would raise, so keep them all.
        most_extreme = np.arange(len(dists))
    res = ((model.index2word[idx], dists[idx]) for idx in most_extreme)
    return sorted(res, key=lambda t: t[1], reverse=True)

for word, score in most_similar(model.syn0norm, 'espresso'):
    print(word, score)

In [None]:
# Reduce the word vectors to 100 components with truncated SVD; random_state
# is fixed so the decomposition is repeatable.
svd = TruncatedSVD(n_components=100, random_state=42, n_iter=40)
reduced = svd.fit_transform(model.syn0norm)

In [None]:
# Rescale every reduced vector to unit length so that dot products between rows
# are cosine similarities; the cell displays the resulting shape.
reduced_lengths = np.linalg.norm(reduced, axis=1)
normalized_reduced = (reduced.T / reduced_lengths).T
normalized_reduced.shape

In [None]:
# Same 'espresso' query as before, now against the SVD-reduced vectors.
for word, score in most_similar(normalized_reduced, 'espresso'):
    print(word, score)

In [None]:
# `most_extreme` and `dists` exist only as locals inside most_similar(), so this
# cell raised NameError on a fresh kernel. Compute them here explicitly.
# NOTE(review): assumes the reduced vectors and the 'espresso' query were the
# intended inputs - confirm.
vec = normalized_reduced[model.vocab['espresso'].index]
dists = np.dot(normalized_reduced, vec)
# Top ten by score; argpartition leaves them in no particular order.
most_extreme = np.argpartition(-dists, 10)[:10]
for idx in most_extreme:
    print(model.index2word[idx], dists[idx])