<a href="https://colab.research.google.com/github/a-agmon/anomaly_det/blob/master/engagers_trans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only import) so that the files
# under /content/drive/MyDrive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!ls '/content/drive/MyDrive/engagers'

engag_content_oct.csv  tmp


In [None]:
#!gunzip '/content/drive/MyDrive/engagers/engag_content_oct.csv.gz'

In [None]:
!ls '/content/drive/MyDrive/engagers'

engag_content_oct.csv  tmp


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

#from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Conv1D
#from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#from keras.models import Model
#from keras import backend as K
from sklearn.model_selection import train_test_split

#from tensorflow.keras.layers import Layer

In [None]:
# Load the CSV from the mounted Drive folder into a DataFrame (default
# integer index, since no index column is requested).
df = pd.read_csv('/content/drive/MyDrive/engagers/engag_content_oct.csv')


In [None]:
# Overwrite the column labels positionally; this assignment only succeeds if
# the frame has exactly six columns.
df.columns = ['hh', 'content', 'duration_hours', 'active_days', 'count' , 'linear_duration_freq']
df.head()

Unnamed: 0,hh,content,duration_hours,active_days,count,linear_duration_freq
0,1001654,"{[ השושלת [B] (V)],[ חמש עם רפי רשף (L)],[ א...",335.188611,14,125,0.488763
1,1002762,"{[ מלכת היופי של ירושלים [B] (V)],[ Xמן הפני...",23.245,10,28,0.419314
2,1002771,"{[ מאסטר שף האגדות חדש [B] (L)],[ מטבחי הגי...",31.168611,14,37,0.847567
3,1003423,"{[ שישי עם אילה חסון (L)],[ המקור [B] (L)],[...",2.132778,2,3,1.0
4,1007760,"{[ עבודה חלומית (V)],[ המאסטרים של בתי העץ [...",132.660556,11,101,0.478843


In [None]:
# Peek at the first five comma-separated entries of one row's `content`
# string (row position 158, column position 1).
df.iloc[158, 1].split(',')[:5]

['{[ הרעשנים   [B]  (V)]',
 '[ ישיר מ תא  הכוכב האדום (L)]',
 '[ חדשות סוף השבוע עם דנה ויס (L)]',
 '[ פגוש את העיתונות (L)]',
 '[ הרעשנים   [B]  (V)]}']

In [None]:
# Build one list of entries per household (HH): drop the first and last
# character of the raw `content` string (the wrapping braces), then split the
# remainder on commas. Order follows the rows of df.
progs_per_HH = [raw_history[1:-1].split(',') for raw_history in df['content']]

In [None]:
# Median number of entries per household history.
np.median([len(history) for history in progs_per_HH])

86.0

In [None]:
# Hold out 10% of the rows; the fixed random_state keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.10, random_state=123)

In [None]:
embed_size = 256  # width of the token/position embeddings and of the encoder block
max_features = 10000 # vocab size: tokenizer num_words and token-embedding input_dim
maxlen = 85 # every tokenized household history is padded/truncated to this many entries
num_heads = 2  # number of attention heads in the TransformerEncoder
dense_dim = 32  # hidden width of the encoder's feed-forward projection

In [None]:
# Only curly braces are filtered out and text is split on commas; the
# vocabulary is capped through num_words=max_features.
# NOTE(review): the inputs passed to this tokenizer below are already lists of
# entries, so `split` presumably only matters for raw strings -- confirm.
tokenizer = Tokenizer(num_words=max_features, filters='{}', split=',')

In [None]:
# Fit the vocabulary on the training households only (rows selected by the
# index values of train_df).
train_histories = np.array(progs_per_HH, dtype=object)[train_df.index.values]
tokenizer.fit_on_texts(train_histories)

In [None]:
# Convert each household history into a list of integer token ids, using one
# shared object array of histories for both splits.
histories = np.array(progs_per_HH, dtype=object)
X_train_raw = tokenizer.texts_to_sequences(histories[train_df.index.values])
X_test_raw = tokenizer.texts_to_sequences(histories[test_df.index.values])

In [None]:
# Bring every id sequence to exactly `maxlen` entries.
# NOTE(review): with no padding/truncating arguments the Keras defaults apply
# (documented as padding and truncating at the front) -- confirm for the
# installed version.
X_train = pad_sequences(X_train_raw, maxlen=maxlen)
X_test = pad_sequences(X_test_raw, maxlen=maxlen)

In [None]:
# Raw duration_hours column as a NumPy array; the source of every target
# variant (`y`) defined in the following cells.
durations = df.duration_hours.values

In [None]:
# Natural log of the durations.
# NOTE(review): `y` is reassigned by several later cells (binned, one-hot,
# z-scored), so this is not the target used for training when the notebook is
# run top to bottom.
y = np.log(durations)

In [None]:
# Summary statistics of the current target `y`.
print('mean:', np.mean(y))
# Previously this line printed np.mean(y) under the "median" label.
print('median:', np.median(y))
print('std:', np.std(y))

mean: 3.6817992172575322
median: 3.6817992172575322
std: 1.4373146839839472


In [None]:

# Bin edges are 0 followed by the three quartiles of the durations;
# np.digitize then maps every duration to the index of its bin.
quartile_edges = np.quantile(durations, [.25, .50, .75])
bins = np.concatenate(([0.0], quartile_edges))
y = np.digitize(durations, bins)

In [None]:
# if we want to make multiclass: one-hot encode the integer bin labels.
# Uses the `keras` module already imported from tensorflow at the top of the
# notebook instead of a mid-notebook import from the legacy
# `keras.utils.np_utils` path, which newer Keras releases no longer provide.
y = keras.utils.to_categorical(y)

In [None]:
# here we will make this task a regression task
# Standardize the raw durations (NaNs are ignored when computing mean/std).
# This is the last assignment to `y` before the train/validation targets are
# sliced, so it is the target the model is fit on in a top-to-bottom run.

y = stats.zscore(durations, nan_policy='omit')

In [None]:
# Pick targets by the index labels of each split. `y` is a plain array, so
# this relies on df keeping its default 0..n-1 index (labels == positions).
# y_val holds the targets of test_df, i.e. it pairs with X_test.
y_train = y[train_df.index.values]
y_val = y[test_df.index.values]

In [None]:
class TransformerEncoder(layers.Layer):
    """One Transformer encoder block.

    Applies multi-head self-attention to the input and then a two-layer
    feed-forward projection; each stage is followed by a residual addition
    and layer normalization.

    Args:
        embed_dim: attention `key_dim` and output width of the projection.
            It has to equal the last dimension of the input, because the
            projection output is added back onto it.
        dense_dim: hidden width of the feed-forward projection.
        num_heads: number of attention heads.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept on the instance so get_config() can serialize them.
        self.embed_dim, self.dense_dim, self.num_heads = embed_dim, dense_dim, num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        feed_forward = [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
        self.dense_proj = keras.Sequential(feed_forward)
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the block on `inputs`; `mask`, if given, restricts attention."""
        # Insert an axis after the batch axis so the mask can broadcast across
        # the query positions of the attention scores.
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(inputs, inputs, attention_mask=attention_mask)
        residual = self.layernorm_1(inputs + attended)
        return self.layernorm_2(residual + self.dense_proj(residual))

    def get_config(self):
        """Return the base layer config extended with this block's hyperparameters."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        sequence_length: number of rows in the position-embedding table.
        input_dim: number of rows in the token-embedding table.
        output_dim: width of both embeddings.
        **kwargs: forwarded to `layers.Layer`.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # The token table is created before the position table; keep this order.
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)
        # Kept on the instance so get_config() can serialize them.
        self.sequence_length, self.input_dim, self.output_dim = (
            sequence_length, input_dim, output_dim)

    def call(self, inputs):
        """Embed the token ids and add the embedding of each position index."""
        seq_len = tf.shape(inputs)[-1]
        position_ids = tf.range(seq_len)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Mark every position whose token id is non-zero as valid."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

In [None]:
# Build the network: integer id sequences of unspecified length -> token +
# position embeddings -> one Transformer encoder block.
inputs = keras.Input(shape=(None,), dtype="int64")
x = PositionalEmbedding(maxlen, max_features, embed_size)(inputs)
x = TransformerEncoder(embed_size, dense_dim, num_heads)(x)
# Collapse the sequence axis by taking the per-feature maximum.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
# A single output unit with no activation, trained with an MSE loss.
outputs = layers.Dense(1)(x)
model = keras.Model(inputs, outputs)
# NOTE(review): the "mse" metric repeats the loss, so it reports the same quantity.
model.compile(optimizer="adam",
              loss="mse",
              metrics=["mse"])
model.summary()

# With save_best_only=True the checkpoint file is only rewritten when the
# monitored quantity improves (no `monitor` is given, so the Keras default
# applies -- presumably the validation loss; confirm).
callbacks = [
    keras.callbacks.ModelCheckpoint("full_transformer_encoder_reglog.keras",
                                    save_best_only=True)
]
# NOTE(review): (X_test, y_val) drives checkpoint selection here and is also
# the data the final R2/RMSE are computed on further down.
model.fit(X_train, y_train, validation_data=(X_test, y_val), epochs=4, callbacks=callbacks)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
positional_embedding (Positi (None, None, 256)         2581760   
_________________________________________________________________
transformer_encoder (Transfo (None, None, 256)         543776    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 3,125,793
Trainable params: 3,125,793
Non-trainable params: 0
___________________________________________________

<keras.callbacks.History at 0x7f652870bc90>

In [None]:
# Reload the checkpoint written during training. The two custom layer classes
# are passed in by name so the saved model can be deserialized.
custom_layers = {
    "TransformerEncoder": TransformerEncoder,
    "PositionalEmbedding": PositionalEmbedding,
}
model = keras.models.load_model(
    "full_transformer_encoder_reglog.keras", custom_objects=custom_layers)

In [None]:
# Predictions of the reloaded model on the held-out sequences.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the held-out predictions against y_val.
print('R2:', r2_score(y_val, y_pred))
# Take the root explicitly: the `squared=False` flag is deprecated and later
# removed in recent scikit-learn releases, while np.sqrt(MSE) gives the same
# RMSE on every version.
print('RMSE:', np.sqrt(mean_squared_error(y_val, y_pred)))

R2: 0.6410393961822789
RMSE: 0.6045031135732493


In [None]:
!ls

drive				       sample_data
full_transformer_encoder.keras	       transformer_encoder.keras
full_transformer_encoder_reglog.keras


In [None]:
# First weight array of the PositionalEmbedding layer, looked up by its
# auto-generated layer name. Its shape (max_features, embed_size) matches the
# token-embedding table, which is the first sub-layer PositionalEmbedding creates.
embedding = model.get_layer('positional_embedding').get_weights()[0]

In [None]:
embedding.shape

(10000, 256)

In [None]:
def get_clf_ready_vec(token_list):  # ['A', 'B']
    """Encode one list of entries into a padded id array the model accepts.

    Args:
        token_list: list of entry strings, e.g. ['A', 'B'].

    Returns:
        The output of pad_sequences for this single sequence, padded or
        truncated to `maxlen` (one row).
    """
    sequences = tokenizer.texts_to_sequences([token_list])
    return pad_sequences([sequences[0]], maxlen=maxlen)

In [None]:
import random

test_prog = '[ סרוגים   [B]  (V)]'
# Draw one household position uniformly at random and encode its history the
# same way the training data was encoded.
user_id = random.randrange(len(progs_per_HH))
prog_list = progs_per_HH[user_id]
(prog_tokenized,) = tokenizer.texts_to_sequences([prog_list])
prog_padded = pad_sequences([prog_tokenized], maxlen=maxlen)

In [None]:
model.predict(prog_padded)

array([[0.47396564]], dtype=float32)

In [None]:
y[user_id]

0.7055796763529029

In [None]:
# we add a prog
# Build a new list instead of appending in place: prog_list is the very list
# object stored in progs_per_HH, so .append() would alter the source data and
# add one more copy of test_prog every time this cell is re-run.
prog_list = progs_per_HH[user_id] + [test_prog]

In [None]:
prog_tokenized = tokenizer.texts_to_sequences([prog_list])[0]
prog_padded = pad_sequences([prog_tokenized], maxlen=maxlen)
# The model is fit on y = stats.zscore(durations, nan_policy='omit'), so its
# output is a z-score. Undo that standardization (NaN-ignoring mean and
# population std) to express the prediction in hours; np.exp would only be
# the right inverse for a log-transformed target.
model.predict(prog_padded) * np.nanstd(durations) + np.nanmean(durations)

array([[3.6430902]], dtype=float32)

In [None]:

# `prog_names` is not defined in any visible cell, so the original line raised
# NameError on a fresh kernel. Tokenize the current household history
# (`prog_list`) instead.
# NOTE(review): confirm this is the list that was meant.
tokenized = tokenizer.texts_to_sequences([prog_list])[0]


In [None]:
# Encode a synthetic history made of ten copies of test_prog.
vec = get_clf_ready_vec([test_prog] * 10)
# The model is fit on y = stats.zscore(durations, nan_policy='omit'), so its
# output is a z-score. Undo that standardization (NaN-ignoring mean and
# population std) to express the prediction in hours; np.exp would only be
# the right inverse for a log-transformed target.
model.predict(vec) * np.nanstd(durations) + np.nanmean(durations)

array([[6.1134233]], dtype=float32)

In [None]:
prog_list

['[ באה בקלות (V)]',
 '[ תחנה    [B]  (V)]',
 '[ NCIS   [B]  (V)]',
 '[ ניו אמסטרדם   [B]  (V)]',
 '[ חוק וסדר פשע מאורגן  [B]  (V)]',
 '[ אףביאיי המבוקשים  [B]  (V)]',
 '[ חדשות הערב עם רומי נוימרק (L)]',
 '[ NCIS   [B] סיום [B] (V)]',
 '[ מהדורה מיוחדת עם אלמוג בוקר (L)]',
 '[ האנטומיה של גריי   [B]  (V)]',
 '[ החומה הגדולה (L)]',
 '[ באה בקלות (L)]',
 '[ האנטומיה של גריי   [B]  (V)]',
 '[ חדשות הערב  המהדורה המרכזית (L)]',
 '[ האנטומיה של גריי   [B]  (V)]',
 '[ ניו אמסטרדם   [B]  (V)]',
 '[ המגזין עם נגה ניר נאמן (L)]',
 '[ פותחים יום  שי לי ואלעד (L)]',
 '[ חדשות הערב עם רומי נוימרק (L)]',
 '[ שישי עם אילה חסון (L)]',
 '[ תכנית חיסכון (L)]',
 '[ משחקי השף (L)]',
 '[ יום מהדורה מרכזית (L)]',
 '[ המטה המרכזי עם אילה חסון (L)]',
 '[ יום מהדורה מרכזית (L)]',
 '[ מרדף בשידור חי (L)]',
 '[ משבת עד שבת (L)]',
 '[ NCIS   [B]  (V)]',
 '[ חוק וסדר פשע מאורגן  [B]  (V)]',
 '[ אףביאיי המבוקשים  [B]  (V)]',
 '[ אףביאיי המבוקשים  [B]  (V)]',
 '[ האנטומיה של גריי   [B]  (V)]',
 '[ החומה הגדולה (V

In [None]:
#for each content cluster

(5000, 64)