In [3]:
import json
from collections import Counter
from keras.models import Model
from keras.layers import Embedding,Input,Reshape
from keras.layers import Dot
from sklearn.linear_model import LinearRegression
import numpy as np
import random
from sklearn import svm

In [4]:
with open('data/wp_movies_10k.ndjson') as fin:
    movies = [json.loads(l) for l in fin]

link_counts = Counter()
for movie in movies:
    link_counts.update(movie[2])
link_counts.most_common(10)

[('Rotten Tomatoes', 9393),
 ('Category:English-language films', 5882),
 ('Category:American films', 5867),
 ('Variety (magazine)', 5450),
 ('Metacritic', 5112),
 ('Box Office Mojo', 4186),
 ('The New York Times', 3818),
 ('The Hollywood Reporter', 3553),
 ('Roger Ebert', 2707),
 ('Los Angeles Times', 2454)]

In [3]:
top_links = [link for link, c in link_counts.items() if c >=3]
link_to_idx = {link: idx for idx, link in enumerate(top_links)}
movie_to_idx = {movie[0]: idx for idx, movie in enumerate(movies)}
pairs = []
for movie in movies:
    pairs.extend((link_to_idx[link], movie_to_idx[movie[0]]) for link in movie[2] if link in link_to_idx)
pairs_set = set(pairs)
len(pairs), len(top_links),len(movie_to_idx)

(949544, 66913, 10000)

In [4]:
def movie_embedding_model(embedding_size = 50):
    link = Input(name='link', shape=(1,))
    movie = Input(name='movie',shape=(1,))
    link_embedding = Embedding(name='link_embedding',
                               input_dim=len(top_links),
                               output_dim=embedding_size)(link)
    movie_embedding = Embedding(name='movie_embedding',
                                input_dim=len(movie_to_idx),
                                output_dim=embedding_size)(movie)
    dot = Dot(name='dot_product', normalize=True,axes=2)([link_embedding,movie_embedding])
    merged = Reshape((1,))(dot)
    model = Model(inputs=[link,movie], outputs=[merged])
    model.compile(optimizer='nadam', loss='mse')
    return model

model = movie_embedding_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 link (InputLayer)              [(None, 1)]          0           []                               
                                                                                                  
 movie (InputLayer)             [(None, 1)]          0           []                               
                                                                                                  
 link_embedding (Embedding)     (None, 1, 50)        3345650     ['link[0][0]']                   
                                                                                                  
 movie_embedding (Embedding)    (None, 1, 50)        500000      ['movie[0][0]']                  
                                                                                              

In [4]:
random.seed(5)

def batchfire(pairs,positive_samples=50, negative_ratio = 10):
    batch_size = positive_samples *(1+negative_ratio)
    batch = np.zeros((batch_size,3))
    while True:
        for idx, (link_id, movie_id) in enumerate(random.sample(pairs,positive_samples)):
            batch[idx,:] = (link_id, movie_id,1)
        idx = positive_samples
        while idx < batch_size:
            movie_id = random.randrange(len(movie_to_idx))
            link_id = random.randrange(len(top_links))
            if not (link_id,movie_id) in pairs_set:
                batch[idx,:] = (link_id, movie_id,-1)
                idx += 1
        np.random.shuffle(batch)
        yield {'link':batch[:,0], 'movie':batch[:,1]}, batch[:, 2]

next(batchfire(pairs, positive_samples=3, negative_ratio=2))

({'link': array([31254., 13365., 32643., 22418.,  3801.,  1313., 48731., 32318.,
         20558.]),
  'movie': array([5530., 6238., 7628., 1529., 5874., 7236., 1854., 7685.,  849.])},
 array([ 1., -1., -1.,  1., -1.,  1., -1., -1., -1.]))

In [9]:
positive_samples_per_batch = 512


model.fit_generator(
    batchfire(pairs,positive_samples=positive_samples_per_batch, negative_ratio=10),
    epochs=15,
    steps_per_epoch=len(pairs)//positive_samples_per_batch,
    verbose=2
)

Epoch 1/15
1854/1854 - 112s - loss: 0.4609 - 112s/epoch - 61ms/step
Epoch 2/15
1854/1854 - 113s - loss: 0.2289 - 113s/epoch - 61ms/step
Epoch 3/15
1854/1854 - 272s - loss: 0.2216 - 272s/epoch - 146ms/step
Epoch 4/15
1854/1854 - 380s - loss: 0.2189 - 380s/epoch - 205ms/step
Epoch 5/15
1854/1854 - 318s - loss: 0.2172 - 318s/epoch - 171ms/step
Epoch 6/15
1854/1854 - 306s - loss: 0.2161 - 306s/epoch - 165ms/step
Epoch 7/15
1854/1854 - 306s - loss: 0.2154 - 306s/epoch - 165ms/step
Epoch 8/15
1854/1854 - 306s - loss: 0.2148 - 306s/epoch - 165ms/step
Epoch 9/15
1854/1854 - 305s - loss: 0.2144 - 305s/epoch - 165ms/step
Epoch 10/15
1854/1854 - 305s - loss: 0.2140 - 305s/epoch - 165ms/step
Epoch 11/15
1854/1854 - 387s - loss: 0.2135 - 387s/epoch - 209ms/step
Epoch 12/15
1854/1854 - 428s - loss: 0.2136 - 428s/epoch - 231ms/step
Epoch 13/15
1854/1854 - 369s - loss: 0.2132 - 369s/epoch - 199ms/step
Epoch 14/15
1854/1854 - 375s - loss: 0.2130 - 375s/epoch - 202ms/step
Epoch 15/15
1854/1854 - 486s - 

  model.fit_generator(


<keras.callbacks.History at 0x7fe7bac1a3a0>

In [12]:
movie = model.get_layer('movie_embedding')
movie_weights = movie.get_weights()[0]
movie_lenghts = np.linalg.norm(movie_weights,axis=1)
normalized_movies = (movie_weights.T/movie_lenghts).T

def similar_movies(movie):
    dists = np.dot(normalized_movies,normalized_movies[movie_to_idx[movie]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c,movies[c][0],dists[c])
        
similar_movies('Rogue One')

29 Rogue One 1.0
19 Interstellar (film) 0.97960407
25 Star Wars sequel trilogy 0.97259885
245 Gravity (film) 0.96838254
659 Rise of the Planet of the Apes 0.9676329
3349 Star Wars: The Force Awakens 0.96297324
86 Tomorrowland (film) 0.96037847
181 Pacific Rim (film) 0.9594417
372 The Amazing Spider-Man (2012 film) 0.9587234
37 Avatar (2009 film) 0.9559336


In [14]:
link = model.get_layer('link_embedding')
link_weights = link.get_weights()[0]
link_lenghts = np.linalg.norm(link_weights,axis=1)
normalized_links = (link_weights.T/link_lenghts).T

def similar_links(link):
    dists = np.dot(normalized_links,normalized_links[link_to_idx[link]])
    closest = np.argsort(dists)[-10:]
    for c in reversed(closest):
        print(c,top_links[c],dists[c])

similar_links('George Lucas')

127 George Lucas 1.0
2707 Star Wars 0.9455315
4830 widescreen 0.9358544
3176 Star Wars (film) 0.9270115
976 Hugo Award for Best Dramatic Presentation 0.92455924
2931 LaserDisc 0.9020158
4051 novelization 0.8943769
2778 Lucasfilm 0.8883475
2829 storyboard 0.8839333
2860 Steven Spielberg 0.8815658


In [18]:
best = ['Star Wars: The Force Awakens','The Martian (film)','Tangerine (film)', 'Straight Outta Compton (film)',
        'Brooklyn (film)', 'Carol (film)', 'Spotlight (film)']

worst = ['American Ultra','The Cobbler (2014 film)','Entourage (film)','Fantastic Four (2015 film)',
         'Get Hard', 'Hot Pursuit (2015 film)','Mortdecai (film)','Serena (2014 film)', 'Vacation (2015 film)']

y = np.asarray([1 for _ in best] +[0 for _ in worst])
x = np.asarray([normalized_movies[movie_to_idx[movie]] for movie in best + worst])   

x.shape

(16, 50)

In [19]:
clf = svm.SVC(kernel='linear')
clf.fit(x,y)

In [21]:
estimated_movie_ratings = clf.decision_function(normalized_movies)
best = np.argsort(estimated_movie_ratings)
print("Najlepsze: ")
for c in reversed(best[-5:]):
    print(c,movies[c][0],estimated_movie_ratings[c])

print("Najgorsze: ")
for c in best[:5]:
    print(c,movies[c][0],estimated_movie_ratings[c])

Najlepsze: 
481 The Devil Wears Prada (film) 1.3588949943163744
66 Skyfall 1.3304521440989938
307 Les Misérables (2012 film) 1.1721145189915019
458 Hugo (film) 1.1497101761975705
939 Changeling (film) 1.0884455723615067
Najgorsze: 
1782 Scooby-Doo! WrestleMania Mystery -1.5851093884750933
5097 Ready to Rumble -1.5676092350495647
9595 Speed Zone -1.5644144257080987
1878 The Little Rascals (film) -1.5339818433244488
7889 The Comebacks -1.5155122010768474


In [22]:
rotten_y = np.asarray([float(movie[-2][:-1])/100 for movie in movies if movie[-2]])
rotten_x = np.asarray([normalized_movies[movie_to_idx[movie[0]]] for movie in movies if movie[-2]])

In [23]:
TRAINING_CUT_OFF = int(len(rotten_x)*0.8)
regr = LinearRegression()
regr.fit(rotten_x[:TRAINING_CUT_OFF], rotten_y[:TRAINING_CUT_OFF])

In [24]:
error = (regr.predict(rotten_x[TRAINING_CUT_OFF:])-rotten_y[TRAINING_CUT_OFF:])
'błąd średniokwadratowy %2.2f' %np.mean(error**2)

'błąd średniokwadratowy 0.06'

In [26]:
error = (np.mean(rotten_y[:TRAINING_CUT_OFF])-rotten_y[TRAINING_CUT_OFF:])
'błąd średniokwadratowy %2.2f' %np.mean(error**2)

'błąd średniokwadratowy 0.09'

In [30]:
def gross(movie):
    v = movie[1].get('gross')
    if not v or not ' ' in v:
        return None
    v, unit = v.split(' ',1)
    unit = unit.lower()
    if not unit in ('million','billion'):
        return None
    if not v.startswith('$'):
        return None
    try:
        v = float(v[1:])
    except ValueError:
        return None
    if unit == 'billion':
        v*=1000
    return v

movie_gross = [gross(m) for m in movies]
movie_gross = np.asarray([gr for gr in movie_gross if gr is not None])
highest = np.argsort(movie_gross)[-10:]
for c in reversed(highest):
    print(c,movies[c][0],movie_gross[c])

6 The Martian (film) 10900.0
7 List of Marvel Cinematic Universe films 4300.0
49 Back to the Future 3900.0
71 The Conjuring 2932.0
162 Thor (film) 2464.0
36 Furious 7 2340.0
30 Finding Dory 2187.0
1906 Jane Eyre (2011 film) 2068.0
19 Interstellar (film) 1670.0
2251 An American Werewolf in London 1655.0
