In [0]:
import pandas as pd
import tensorflow as tf
from sklearn.utils import shuffle

In [0]:
# Mount Google Drive inside the Colab VM and move into the project folder.
from google.colab import drive
drive.mount('/content/drive')
import os
# Absolute path under the mount point: the previous relative path
# ('drive/My Drive/recsys') only resolved on the first run, so re-running this
# cell after the working directory had already changed raised an error.
os.chdir('/content/drive/My Drive/recsys')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Folder holding the CSV inputs, relative to the working directory set by the
# os.chdir call in the Drive-mount cell.
PATH = 'data/'
# Raw interaction log; one row per session step (see the preview below).
train = pd.read_csv(os.path.join(PATH, 'train.csv'))
# Display the first rows as a sanity check of the schema.
train.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [0]:
# Item catalogue: one row per item_id with a '|'-separated properties string
# (see the preview below). Its item_id column is used to fit the item tokenizer.
item_metadata = pd.read_csv(os.path.join(PATH, 'item_metadata.csv'))
# Display the first rows as a sanity check of the schema.
item_metadata.head()

Unnamed: 0,item_id,properties
0,5101,Satellite TV|Golf Course|Airport Shuttle|Cosme...
1,5416,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
2,5834,Satellite TV|Cosmetic Mirror|Safe (Hotel)|Tele...
3,5910,Satellite TV|Sailing|Cosmetic Mirror|Telephone...
4,6066,Satellite TV|Sailing|Diving|Cosmetic Mirror|Sa...


In [0]:
class Tokenizer:
  """
  Maps arbitrary hashable ids to consecutive positive integers.

  Index 0 is reserved for ids that were not seen by `fit`, so known ids are
  numbered from 1 to len(vocabulary) inclusive.
  """

  def fit(self, data):
    """
    Builds `vocabulary` (set of distinct ids) and `mapping` (id -> int).

    Ids are numbered in order of first appearance in `data`, so fitting the
    same data always yields the same mapping. Returns `self` for chaining.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; numbering the
    # elements of a plain set instead would depend on hash iteration order,
    # which is not stable across interpreter sessions for string ids.
    first_seen = dict.fromkeys(data)
    self.mapping = {token: index for index, token in enumerate(first_seen, start=1)}
    self.vocabulary = set(self.mapping)
    return self

  def transform(self, data):
    """
    Translates a single id, or a list of ids, into integer indices.

    A list input returns a list of the same length; any other input is treated
    as a single id. Unknown ids map to 0.
    """
    if isinstance(data, list):
      return [self.mapping.get(d, 0) for d in data]
    return self.mapping.get(data, 0)
    

class ItemTokenizer(Tokenizer):
  """
  Tokenizer for item ids that coerces its input to int before the lookup.
  """

  def transform(self, data):
    """
    Converts `data` (one value, or a list of values) with int() and then
    delegates to Tokenizer.transform.
    """
    as_ints = [int(value) for value in data] if isinstance(data, list) else int(data)
    return super().transform(as_ints)

# Fit the vocabulary on the item_id column of the item metadata.
item_tokenizer = ItemTokenizer().fit(item_metadata['item_id'])


## Getting only the clickout item actions!

In [0]:
# Keep only the clickout rows and the two columns the embedding models use.
# The guard makes the cell safe to re-run: after the first run `train` no
# longer has an 'action_type' column, and filtering on it again would raise a
# KeyError.
if 'action_type' in train.columns:
  train = train.loc[train['action_type'] == 'clickout item',
                    ['reference', 'impressions']]

![alt text](https://cdn-images-1.medium.com/max/800/1*yiH5sZI-IBxDSQMKhvbcHw.png)

## Skip-gram for the items
The item embeddings are learned only from the impressions each clicked item appears with.

### Dataset

In [0]:
class InputGenerator:
  """
  Produces skip-gram training pairs (clicked item, one co-displayed item).
  """

  def __init__(self, item_tokenizer):
    self.item_tokenizer = item_tokenizer

  def input_generator_gen(self, df):
    """
    Returns `(gen, dtypes)` for tf.data.Dataset.from_generator.

    `gen` walks `df` row by row, tokenizes the 'reference' value and the
    '|'-separated 'impressions' string, and yields `(reference, [impression])`
    for every impression whose token differs from the reference token.
    """
    def gen():
      for _, row in df.iterrows():
        target = self.item_tokenizer.transform(row['reference'])
        context = self.item_tokenizer.transform(row['impressions'].split('|'))
        # The clicked item itself is not emitted as its own context.
        yield from ((target, [item]) for item in context if item != target)

    dtypes = (tf.int32, tf.int32)
    return gen, dtypes

i = InputGenerator(item_tokenizer)

In [0]:
params = {
    'batch_size': 4096,
    # Token ids run from 0 (unknown item) up to len(vocabulary) inclusive, so
    # the embedding table and the output layer need len(vocabulary) + 1 slots;
    # without the +1 the highest item id is out of range.
    'items_vocabulary': len(item_tokenizer.vocabulary) + 1,
    'items_embedding': 100,
}
# Generator of (reference, [impression]) pairs plus the matching dtypes.
gen, types = i.input_generator_gen(train)
dataset_train = tf.data.Dataset.from_generator(gen, types)
dataset_train = dataset_train.repeat().batch(params['batch_size'])
# batch = dataset_train.make_one_shot_iterator().get_next()

# NOTE(review): this counts clickout rows, while the generator emits one pair
# per (row, impression), so an actual pass over the pairs takes more steps
# than this value suggests.
steps_per_epoch = len(train) // params['batch_size']

In [0]:
# Emit INFO-level TensorFlow log messages (the Estimator log lines shown in the
# training cell outputs are INFO-level).
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
# Deletes the skip-gram-model directory, i.e. the model_dir the skip-gram
# Estimator below writes its checkpoints to.
!rm -r skip-gram-model

In [0]:
# It can only be trained!
def skip_gram_items(
   features, # This is batch_features from input_fn
   labels,   # This is batch_labels from input_fn
   mode,     # An instance of tf.estimator.ModeKeys, see below
   params):  # Additional configuration
  """
  Estimator model_fn for a skip-gram item model (training only).

  Args:
    features: int tensor of reference (clicked) item ids.
    labels: int tensor of impression item ids to predict, one per reference.
    mode: tf.estimator.ModeKeys value; forwarded to the EstimatorSpec as-is.
    params: dict with 'items_vocabulary' (number of distinct ids, i.e. the
      embedding rows / output classes) and 'items_embedding' (embedding size).

  Returns:
    A tf.estimator.EstimatorSpec carrying the loss and the training op.
  """
  # Flatten both tensors so the function accepts a single example as well as a
  # batch, and so labels shaped [batch, 1] line up with [batch, vocab] logits.
  reference = tf.reshape(features, [-1])
  impression = tf.reshape(labels, [-1])

  # Encoding
  items_embeddings = tf.keras.layers.Embedding(input_dim=params['items_vocabulary'],
                                               output_dim=params['items_embedding'])

  # Reverse encoding
  W2 = tf.Variable(tf.random_normal([params['items_embedding'],
                                     params['items_vocabulary']]))
  b2 = tf.Variable(tf.random_normal([params['items_vocabulary']]))

  embeddings = tf.reshape(items_embeddings(reference), [-1, params['items_embedding']])

  # Keep the raw scores: the loss below applies softmax itself, so feeding it
  # already-normalised probabilities would squash the gradients.
  logits = tf.matmul(embeddings, W2) + b2

  # tf.losses.* already reduces to a scalar; no extra reduce_mean is needed.
  loss = tf.losses.sparse_softmax_cross_entropy(labels=impression,
                                                logits=logits)
  optimizer = tf.train.AdamOptimizer(learning_rate=0.1)

  train_op = optimizer.minimize(
    loss=loss,
    global_step=tf.train.get_global_step()
  )

  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


run_config = tf.estimator.RunConfig(log_step_count_steps=100,
                                    # save_summary_steps=20,
                                    save_checkpoints_steps=100,
                                    keep_checkpoint_max=3)

skipgram_estimator = tf.estimator.Estimator(
  model_fn=skip_gram_items,
  model_dir='./skip-gram-model',
  params=params,
  config=run_config
)


def get_skip_gram_dataset():
  """
  input_fn for the skip-gram Estimator.

  The dataset is built inside the function so it is created in the graph the
  Estimator owns. It is repeated and batched with params['batch_size'];
  previously the raw generator dataset was passed straight through, so each
  training step saw a single unbatched pair and the stream ended after one pass.
  """
  # NOTE(review): the model's logits are [batch_size, items_vocabulary]; lower
  # params['batch_size'] if this exhausts memory.
  pairs = tf.data.Dataset.from_generator(gen, types)
  return pairs.repeat().batch(params['batch_size'])


skipgram_estimator.train(input_fn=get_skip_gram_dataset,
                         steps=1000 * steps_per_epoch)

I0621 18:52:41.403447 140585171875712 estimator.py:209] Using config: {'_model_dir': './skip-gram-model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdbf0610438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
I0621 18:52:41.445034 140585171875712 estimator.py:1145] Calling model_fn.
I0621 18:52:41.639

KeyboardInterrupt: ignored

## CBOW for the items

In [0]:
class InputGeneratorCBOW:
  """
  Produces CBOW training examples (displayed items -> clicked item).
  """

  def __init__(self, item_tokenizer):
    self.item_tokenizer = item_tokenizer

  def input_generator_gen(self, df):
    """
    Returns `(gen, dtypes, shapes)` describing the CBOW examples.

    `gen` walks `df` row by row and yields `(impressions, [reference])`, where
    `impressions` is the tokenized '|'-separated impression list with the first
    occurrence of the reference token removed, if it is present. Both
    components are declared as variable-length int32 vectors.
    """
    def gen():
      for _, row in df.iterrows():
        target = self.item_tokenizer.transform(row['reference'])
        context = self.item_tokenizer.transform(row['impressions'].split('|'))

        # The clicked item must not be part of its own context.
        if target in context:
          context.remove(target)
        yield context, [target]

    dtypes = (tf.int32, tf.int32)
    shapes = (tf.TensorShape([None]), tf.TensorShape([None]))
    return gen, dtypes, shapes
  
i_cbow = InputGeneratorCBOW(item_tokenizer)
# Hyper-parameters of the CBOW model. This rebinds the notebook-level `params`
# name, which the CBOW dataset and estimator read.
params = {
    'batch_size': 256,
    # Token ids run from 0 (unknown item / padding) up to len(vocabulary)
    # inclusive, so the embedding table and the output layer need
    # len(vocabulary) + 1 slots; without the +1 the highest item id is out of
    # range.
    'items_vocabulary': len(item_tokenizer.vocabulary) + 1,
    'items_embedding': 50,
}

def get_cbow_dataset():
  """
  Builds the CBOW input pipeline from the notebook-level `train` frame.

  The generator's examples are repeated indefinitely and grouped with
  padded_batch into batches of params['batch_size'], using the shapes reported
  by the input generator as padded shapes.
  """
  generator_fn, dtypes, padded_shapes = i_cbow.input_generator_gen(train)
  examples = tf.data.Dataset.from_generator(generator_fn, dtypes)
  return examples.repeat().padded_batch(params['batch_size'], padded_shapes)



In [0]:
# Sanity check: pull one batch out of the CBOW pipeline and look at its shape.
batch = get_cbow_dataset().make_one_shot_iterator().get_next()

with tf.Session() as sess:
  sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
  # b is the (features, labels) pair for one batch.
  b = sess.run(batch)
# This bare expression is not the last line of the cell, so it displays nothing.
b
# Shape of the labels array; the recorded output is (256, 1).
b[1].shape

(256, 1)

In [0]:
# It can only be trained!
def cbow_items(
   features, # This is batch_features from input_fn
   labels,   # This is batch_labels from input_fn
   mode,     # An instance of tf.estimator.ModeKeys, see below
   params):  # Additional configuration
  """
  Estimator model_fn for a CBOW item model (training only).

  Args:
    features: int tensor [batch, context] of impression item ids, padded with 0.
    labels: int tensor of reference (clicked) item ids, one per example.
    mode: tf.estimator.ModeKeys value; forwarded to the EstimatorSpec as-is.
    params: dict with 'items_vocabulary' (number of distinct ids, i.e. the
      embedding rows / output classes) and 'items_embedding' (embedding size).

  Returns:
    A tf.estimator.EstimatorSpec carrying the loss and the training op.
  """
  # In CBOW we predict the target REFERENCE from the context IMPRESSIONS.
  reference = labels
  impressions = features

  # Encoding
  items_embeddings = tf.keras.layers.Embedding(input_dim=params['items_vocabulary'],
                                               output_dim=params['items_embedding'])

  # Reverse encoding
  W2 = tf.Variable(tf.random_normal([params['items_embedding'],
                                     params['items_vocabulary']]))
  b2 = tf.Variable(tf.random_normal([params['items_vocabulary']]))

  # Average only over real context items. Id 0 marks padding (and items the
  # tokenizer does not know); a plain reduce_mean over the padded axis would
  # blend those positions into the context vector, more so for short lists.
  embedded = items_embeddings(impressions)
  mask = tf.cast(tf.not_equal(impressions, 0), embedded.dtype)
  summed = tf.reduce_sum(embedded * tf.expand_dims(mask, -1), axis=1)
  # Clamp the count to 1 so an all-padding context yields a zero vector
  # instead of a division by zero.
  counts = tf.maximum(tf.reduce_sum(mask, axis=1, keepdims=True), 1.0)
  embeddings = summed / counts

  # Raw scores; the loss applies softmax itself.
  layer = tf.matmul(embeddings, W2) + b2

  # tf.losses.* already reduces to a scalar; no extra reduce_mean is needed.
  loss = tf.losses.sparse_softmax_cross_entropy(labels=tf.reshape(reference, [-1]),
                                                logits=layer)
  optimizer = tf.train.AdamOptimizer(learning_rate=0.0000001)

  train_op = optimizer.minimize(
    loss=loss,
    global_step=tf.train.get_global_step()
  )

  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


run_config = tf.estimator.RunConfig(log_step_count_steps=30,
                                    # save_summary_steps=20,
                                    save_checkpoints_steps=100,
                                    keep_checkpoint_max=3)

cbow_estimator = tf.estimator.Estimator(
  model_fn=cbow_items,
  model_dir='./cbow-model',
  params=params,
  config=run_config
)

# The CBOW generator emits one example per row of `train`, so one pass takes
# len(train) // batch_size steps with the CBOW batch size. The step count is
# computed here rather than reusing the value from the skip-gram cell, which
# was derived with that model's batch size. max(1, ...) keeps the step count
# positive when `train` is smaller than one batch.
cbow_steps_per_epoch = max(1, len(train) // params['batch_size'])

cbow_estimator.train(input_fn=get_cbow_dataset,
                     steps=1000 * cbow_steps_per_epoch)

I0621 19:41:33.493898 140585171875712 estimator.py:209] Using config: {'_model_dir': './cbow-model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 30, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fdc06221c50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
I0621 19:41:33.549085 140585171875712 estimator.py:1145] Calling model_fn.
I0621 19:41:34.047994 14

KeyboardInterrupt: ignored

In [0]:
# !rm -r cbow-model