In [4]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding,Input,Reshape
from keras.layers.merging import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

In [5]:
# Load the movie records: every non-blank line of the NDJSON file is one
# JSON document. The encoding is pinned to UTF-8 so the result does not depend
# on the platform's locale default, and blank lines (e.g. a trailing newline
# at the end of the file) are skipped instead of crashing json.loads.
with open('data/wp_movies_second.ndjson', encoding='utf-8') as fin:
    movies = [json.loads(line) for line in fin if line.strip()]

In [6]:
# Tally how often each entry of a record's third field (its links, judging by
# how it is used below) appears across all movie records.
link_counts = Counter()
for movie in movies:
    links = movie[2]
    link_counts.update(links)

# Display the ten most frequent links together with their counts.
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [8]:
# Keep only the links that were counted at least three times.
top_links = [name for name, count in link_counts.items() if count >= 3]

# Map each kept link, and each movie's first field, to a dense integer index.
link_to_idx = {name: position for position, name in enumerate(top_links)}
movie_to_idx = {record[0]: position for position, record in enumerate(movies)}

# Collect one (link index, movie index) pair for every kept link a movie has.
pairs = []
for movie in movies:
    pairs += [
        (link_to_idx[link], movie_to_idx[movie[0]])
        for link in movie[2]
        if link in link_to_idx
    ]

# Set view of the same pairs for fast membership tests.
pairs_set = set(pairs)

# Sizes: number of pairs, number of kept links, number of indexed movies.
len(pairs), len(top_links), len(movie_to_idx)

(949544, 66913, 10000)

In [9]:
def movie_embedding_model(embedding_size=50):
    """Build and compile the link/movie embedding model.

    The model takes two single-id inputs named 'link' and 'movie', looks each
    one up in its own embedding table, and outputs the normalized dot product
    of the two embedding vectors as one value per sample. The table sizes come
    from the notebook-level ``top_links`` and ``movie_to_idx``.

    Args:
        embedding_size: Output dimension of both embedding tables.

    Returns:
        A Keras ``Model`` with inputs ``[link, movie]`` and a single output,
        compiled with the 'nadam' optimizer and 'mse' loss.
    """
    link_input = Input(shape=(1,), name='link')
    movie_input = Input(shape=(1,), name='movie')

    link_vectors = Embedding(
        input_dim=len(top_links),
        output_dim=embedding_size,
        name="link_embedding",
    )(link_input)
    movie_vectors = Embedding(
        input_dim=len(movie_to_idx),
        output_dim=embedding_size,
        name="movie_embedding",
    )(movie_input)

    # With normalize=True the vectors are L2-normalized along the dot-product
    # axis first, so the result is their cosine similarity.
    similarity = Dot(axes=2, normalize=True, name='dot_product')(
        [link_vectors, movie_vectors]
    )
    # Collapse the dot-product output to a single value per sample.
    score = Reshape((1,))(similarity)

    model = Model(inputs=[link_input, movie_input], outputs=[score])
    model.compile(loss='mse', optimizer='nadam')
    return model

In [10]:
# Build the embedding model with the default embedding size (50).
model = movie_embedding_model()
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        3345650     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        500000      ['movie[0][0]']                  
                                                                                              