In [1]:
import keras
import numpy as np
import datetime
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import keras.backend as K

from keras.models import Sequential, Model, Input
from keras.layers import Dense, BatchNormalization, Activation, \
                                LeakyReLU, Dropout, Embedding, \
                                multiply, Flatten, add, dot
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.utils.vis_utils import model_to_dot

from sklearn.manifold import TSNE, LocallyLinearEmbedding, MDS                           
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from keras_tqdm import TQDMNotebookCallback
from IPython.display import SVG
from tqdm import tqdm_notebook
from collections import namedtuple

Using TensorFlow backend.


In [2]:
# Report which Keras backend is active.
K.backend()

'tensorflow'

In [132]:
import re
from pathlib import Path  # Path is used below but was never imported

# Markers that delimit the joke text inside each HTML page.
c1 = '<!--begin of joke -->'
c2 = '<!--end of joke -->'

# Maps jokeId (0-based) to the cleaned joke text.
d = {}

for jokeId in range(100):
    # Files on disk are numbered from 1, joke ids from 0.
    jokefile = Path(f'C://Users//ADAM//Downloads/jokes/init{jokeId+1}.html')
    # Reset the marker state for every file so that a page without an end
    # marker cannot leak its "inside the joke" state into the next page.
    c1hit = False
    parts = []
    with open(jokefile, 'r') as fi:
        for line in fi:
            line = line.strip()
            # Collect lines strictly between the begin and end markers.
            if c1hit and line != c2:
                parts.append(line)
            if line == c1:
                c1hit = True
            if line == c2:
                c1hit = False
    text = ' '.join(parts)
    # Drop the handful of HTML tags that occur inside the joke bodies.
    for tag in ('<P>', '<p>', '<BR>', '<i>', '</i>'):
        text = text.replace(tag, '')
    # Tabs become spaces so the text can later be written to a TSV file.
    d[jokeId] = text.replace('\t', ' ').strip()

In [45]:
# Load the three Jester spreadsheets and stack them into one frame. Every
# sheet gets the same labels: a leading 'number' column followed by one
# column per joke, labelled 0..99.
# NOTE(review): read_excel is called with `names` but without header=None, so
# pandas may consume the first row of each sheet as a header before applying
# the names. Confirm the first user of every file is not being dropped
# (USER_DIM is hard-coded to the current row count, so both must change together).
ratings = pd.concat(
    [
        pd.read_excel(f'C://Users//ADAM//Downloads/jester-data-{i}.xls', names=['number'] + list(range(100)))
        for i in (1, 2, 3)
    ],
    ignore_index=True,
)

# Drop the 'number' column, turn the 99.0 placeholder into NaN (presumably the
# dataset's "not rated" marker) and expose the row position as userId.
df = (
    ratings
    .drop(columns='number')
    .replace({99.0: np.nan})
    .reset_index()
    .rename(columns={'index': 'userId'})
)
df.head()

Unnamed: 0,userId,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
1,1,,,,,9.03,9.27,9.03,9.27,,...,,,,9.08,,,,,,
2,2,,8.35,,,1.8,8.16,-2.82,6.21,,...,,,,0.53,,,,,,
3,3,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6
4,4,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87,-6.65,...,-3.54,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45


In [47]:
# Make sure the user ids are plain integers.
df['userId'] = df['userId'].astype(int)

In [74]:
# Sizes of the id vocabularies handed to the Embedding layers.
# NOTE(review): 73418 looks like the row count of `ratings` — confirm it still
# matches whenever the input files or the loading code change.
JOKE_DIM = 100
USER_DIM = 73_418

In [49]:
# Preview the wide ratings frame (one row per user, one column per joke).
df.head()

Unnamed: 0,userId,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,0,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
1,1,,,,,9.03,9.27,9.03,9.27,,...,,,,9.08,,,,,,
2,2,,8.35,,,1.8,8.16,-2.82,6.21,,...,,,,0.53,,,,,,
3,3,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6
4,4,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87,-6.65,...,-3.54,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45


In [69]:
# Reshape from one column per joke to one row per (userId, jokeId, rating) and
# drop the pairs that have no rating (NaN).
# jokeId is taken from the column labels, which shared an index with the
# string 'userId' and may therefore arrive as object dtype; cast it explicitly
# so the joke Embedding always receives integer indices.
df = (
    df.melt(id_vars='userId', value_name='rating', var_name='jokeId')
      .dropna()
      .astype({'jokeId': int})
)
df.head()

Unnamed: 0,userId,jokeId,rating
0,0,0,4.08
3,3,0,8.5
4,4,0,-6.17
6,6,0,6.84
7,7,0,-3.79


In [81]:
# Hold out 10% of the ratings for evaluation. A fixed random_state keeps the
# split (and therefore the validation numbers) reproducible across kernel restarts.
df_train, df_eval = train_test_split(df, test_size=0.1, shuffle=True, random_state=42)

def make_data(df):
    """Return the userId, jokeId and rating columns of ``df`` as a tuple of arrays."""
    return tuple(df[column].values for column in ('userId', 'jokeId', 'rating'))

# Split each partition into parallel arrays: U = user ids, M = joke ids, R = ratings.
(U_train, M_train, R_train), (U_eval, M_eval, R_eval) = (
    make_data(df_train),
    make_data(df_eval),
)

In [137]:
# Width of the user/joke embedding vectors; the bias embeddings reuse the same width.
EMBEDDINGS_DIM = 30
BIAS_EMBEDDINGS_DIM = EMBEDDINGS_DIM
# Number of samples per gradient step (2**18 = 262144).
BATCH_SIZE = 1 << 18

def build_model():
    """Build the two-input rating model.

    The model takes a user id and a joke id (one int32 each). Both ids are
    looked up in a main embedding ('user' / 'joke') and in a second embedding
    ('user_bias' / 'joke_bias'), each followed by 20% dropout. The dot product
    of the two main embeddings is added to both bias embeddings, and the sum
    is passed through a small dense head (256 tanh units, dropout, one linear
    output).

    Returns:
        An uncompiled ``Model`` with inputs ``[user_id, joke_id]`` and a
        single output.
    """
    def embed_with_dropout(ids, vocab_size, width, layer_name):
        # Named embedding lookup followed by dropout on the looked-up vectors.
        vectors = Embedding(vocab_size, width, name=layer_name)(ids)
        return Dropout(0.2)(vectors)

    user_ids = Input(shape=(1,), dtype='int32')
    joke_ids = Input(shape=(1,), dtype='int32')

    # Layers are created in a fixed order so auto-generated layer names stay stable.
    user_vec = embed_with_dropout(user_ids, USER_DIM, EMBEDDINGS_DIM, 'user')
    joke_vec = embed_with_dropout(joke_ids, JOKE_DIM, EMBEDDINGS_DIM, 'joke')
    user_offset = embed_with_dropout(user_ids, USER_DIM, BIAS_EMBEDDINGS_DIM, 'user_bias')
    joke_offset = embed_with_dropout(joke_ids, JOKE_DIM, BIAS_EMBEDDINGS_DIM, 'joke_bias')

    # User/joke interaction term.
    interaction = dot([user_vec, joke_vec], axes=-1)
    interaction = Flatten()(interaction)

    # NOTE(review): the flattened interaction has a different rank than the
    # bias embeddings; this presumably relies on the merge layer broadcasting
    # its inputs — confirm against the model summary.
    combined = add([user_offset, interaction, joke_offset])

    head = Sequential([
        Flatten(),
        Dense(256),
        Activation('tanh'),
        Dropout(0.2),
        Dense(1),
    ])
    return Model([user_ids, joke_ids], head(combined))

In [138]:
def rmse(y_true, y_pred):
    """Root-mean-squared error over the whole batch.

    The mean is taken over every element before the square root. Reducing only
    over the last axis (which has size 1 for this single-output model) would
    take the square root of each sample's own squared error, i.e. its absolute
    error, and the reported metric would silently be MAE instead of RMSE.

    Args:
        y_true: tensor of target ratings.
        y_pred: tensor of predicted ratings, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the batch RMSE.
    """
    return K.sqrt(K.mean(K.square(y_pred - y_true)))

# Build a fresh network and train it on mean absolute error, tracking the
# custom rmse metric alongside the loss.
model = build_model()
model.compile(optimizer='adam', loss='mae', metrics=[rmse])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
user (Embedding)                (None, 1, 30)        2202540     input_7[0][0]                    
__________________________________________________________________________________________________
joke (Embedding)                (None, 1, 30)        3000        input_8[0][0]                    
__________________________________________________________________________________________________
dropout_8 

In [87]:
# Run counter; it is interpolated into the TensorBoard log directory name.
n = 40

In [139]:
# Advance the run counter. Not idempotent: every execution of this cell bumps n again.
n += 1

In [140]:
# Parent directory for the TensorBoard event files; each run logs to LOGDIR/<n>.
LOGDIR = '/tmp/runs'

tensorboard = TensorBoard(log_dir=f'{LOGDIR}/{n}', update_freq='batch')

In [141]:
# Features are the (user id, joke id) pairs; the target is the rating.
inputs, outputs = [U_train, M_train], R_train
inputs_eval, outputs_eval = [U_eval, M_eval], R_eval

print(f'Run: {n} Batch Size: {BATCH_SIZE}')
# Keras' own progress output is switched off; TQDMNotebookCallback draws the bars.
model.fit(
    inputs,
    outputs,
    batch_size=BATCH_SIZE,
    epochs=200,
    verbose=False,
    validation_data=(inputs_eval, outputs_eval),
    callbacks=[TQDMNotebookCallback(), tensorboard],
)

Run: 42 Batch Size: 262144


HBox(children=(IntProgress(value=0, description='Training', max=200, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 1', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 2', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 3', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 4', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 5', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 6', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 7', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 8', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 9', max=3722589, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='Epoch 10', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 11', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 12', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 13', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 14', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 15', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 16', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 17', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 18', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 19', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 20', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 21', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 22', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 23', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 24', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 25', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 26', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 27', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 28', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 29', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 30', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 31', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 32', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 33', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 34', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 35', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 36', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 37', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 38', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 39', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 40', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 41', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 42', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 43', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 44', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 45', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 46', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 47', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 48', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 49', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 50', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 51', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 52', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 53', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 54', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 55', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 56', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 57', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 58', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 59', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 60', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 61', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 62', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 63', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 64', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 65', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 66', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 67', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 68', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 69', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 70', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 71', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 72', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 73', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 74', max=3722589, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=0, description='Epoch 75', max=3722589, style=ProgressStyle(description_width…

KeyboardInterrupt: 

In [121]:
# Pull the learned weight matrices out of the two named embedding layers.
embedding_u, embedding_m = (
    model.get_layer(layer_name).get_weights()[0] for layer_name in ("user", "joke")
)
results = embedding_m
embedding_u.shape, embedding_m.shape

((73418, 30), (100, 30))

In [150]:
# Bias embeddings. The layer names must match the ones assigned in
# build_model ('user_bias' and 'joke_bias'); there is no 'movie_bias' layer.
embedding_ub = model.get_layer("user_bias").get_weights()[0]
embedding_mb = model.get_layer("joke_bias").get_weights()[0]

In [133]:
# Write the joke embedding matrix as tab-separated values, one matrix row per line.
with open('tensors-jokes.tsv', 'w') as tsv_file:
    np.savetxt(fname=tsv_file, X=embedding_m, delimiter='\t')

In [155]:
# NOTE(review): this cell looks like a leftover from a different (movie) dataset.
# The `df` built earlier in this notebook only has userId/jokeId/rating columns,
# so 'movieId', 'title' and 'genres' would raise KeyError here unless `df` has
# been rebound to another frame — confirm, and drop the cell if it is stale.
labels_m = df.drop_duplicates(subset='movieId').sort_values(by='movieId', ascending=True)[['title', 'genres']]
labels_m.to_csv('metadata.tsv', sep='\t', index=0)

In [136]:
# Write one joke text per line, in the insertion order of `d`, with neither an
# index column nor a header line. header=False is passed explicitly because the
# default of Series.to_csv differs between pandas versions, and an extra header
# line would shift every label by one row relative to tensors-jokes.tsv.
pd.DataFrame(dict(jokes=list(d.values()))).jokes.to_csv(
    'metadata-jokes.tsv', sep='\t', index=False, header=False
)