In [0]:
import pandas as pd
import tensorflow as tf
import os
from src import dataset_gen, tokenizers
from collections import defaultdict
import numpy as np
from tensorflow.keras import layers


In [0]:
# Directory that holds the CSV inputs read by this notebook.
PATH = 'data/'

# Read both tables in one pass; the tuple order decides which frame is which.
train, item_metadata = (
  pd.read_csv(os.path.join(PATH, csv_name))
  for csv_name in ('train.csv', 'item_metadata.csv')
)


In [0]:
# Count the occurrences of each individual filter token in 'current_filters'.
# Tokens are tallied independently of the others in their row, which the
# original author flagged as a known-incorrect simplification.
filters_statistics = defaultdict(int)


# NOTE: the loop variable `filters` stays bound at module level once this loop
# finishes, and a later cell passes it to get_all_tokenizers.
for filters in train['current_filters'].dropna():
  for current_filter in filters.split('|'):
    filters_statistics[current_filter] += 1


In [0]:
# Build the tokenizers that are later handed to the input generators.
# NOTE(review): `filters` is the loop variable left over from the
# filter-counting loop (the last row's raw 'current_filters' string), not the
# `filters_statistics` dict. Presumably the statistics were intended -- confirm
# against tokenizers.Tokenizer.get_all_tokenizers before changing.
tokenizers_all = tokenizers.Tokenizer.get_all_tokenizers(train, item_metadata, filters)

In [0]:
# Index the metadata by item id. The guard keeps the cell re-runnable: once
# 'item_id' has become the index it is no longer a column, and calling
# set_index again would raise KeyError.
if item_metadata.index.name != 'item_id':
  item_metadata.set_index('item_id', inplace=True)

In [0]:
# Collect every distinct property token that appears in the item metadata.
# Iterating the column directly avoids building a Series per row as
# iterrows() does; dropna() skips items whose 'properties' cell is missing,
# which would otherwise fail on .split().
properties = set()

for item_properties in item_metadata['properties'].dropna():
  properties.update(item_properties.split('|'))

In [0]:
# Map each property token to a column index. The tokens are sorted first so
# the mapping is identical on every kernel start: iteration order of a set of
# strings changes between processes (string hashing is randomised), which
# would silently reshuffle the feature columns relative to a saved checkpoint.
# Re-running this cell on the resulting dict yields the same mapping.
properties = {name: index for index, name in enumerate(sorted(properties))}

In [0]:
# The first 80% of the rows are used for training, the remainder for validation.
train_length = int(0.8 * len(train))
train_split = train.iloc[:train_length]
val_split = train.iloc[train_length:]
len(train_split), len(val_split)
# Only the two splits are referenced from here on, so drop the full frame's name.
del train

In [16]:
# DATASET
# Settings passed to the Estimator as `params`; 'batch_size' also drives the
# input pipeline. The key 'size_items_vectos' is spelled exactly as the model
# function reads it.
params = dict(
  batch_size=100,
  size_items_vectos=len(properties),
  num_impressions=25,  # Always.. if data is consistent
  layer_1_units=1000,
  layer_2_units=600,
  layer_3_units=200,
)
batch_size = params['batch_size']

# Feature fields requested from the input generator.
fields = ['items', 'impressions_filters', 'prices', 'timestamp']
i = dataset_gen.InputGenerator(tokenizers_all, fields)

def _make_batch(frame, repeat):
  """Build the next-batch tensors for `frame`, shared by train and eval.

  Args:
    frame: DataFrame split handed to the input generator.
    repeat: when True the dataset loops forever (training); when False it is
      exhausted after one pass (evaluation).

  Returns:
    The `get_next()` output of a one-shot iterator over padded batches of
    `batch_size` examples.
  """
  gen, types, shapes = i.input_generator_gen(frame,
                                             return_items_filters=True,
                                             item_metadata=item_metadata,
                                             properties=properties,
                                             return_idx=True)
  dataset = tf.data.Dataset.from_generator(gen, types)
  if repeat:
    # Repeat before batching, as the training pipeline originally did.
    dataset = dataset.repeat()
  dataset = dataset.padded_batch(batch_size, padded_shapes=shapes)
  return dataset.make_one_shot_iterator().get_next()


def get_dataset_train():
  """input_fn for training: endlessly repeating batches over `train_split`."""
  return _make_batch(train_split, repeat=True)


def get_dataset_eval():
  """input_fn for evaluation: a single pass of batches over `val_split`."""
  return _make_batch(val_split, repeat=False)


# Number of full batches in each split. Training length is expressed in steps
# (batches) rather than epochs because the input comes from a tf.data pipeline.
steps_per_epoch, validation_steps = (
  len(split) // batch_size for split in (train_split, val_split)
)


['items', 'impressions_filters', 'prices', 'timestamp']


In [0]:
# Show INFO-level TensorFlow log lines (config dump, train/eval progress).
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
def small_nn_model_fn(
   features, # This is batch_features from input_fn
   labels,   # This is batch_labels from input_fn
   mode,     # An instance of tf.estimator.ModeKeys, see below
   params):  # Additional configuration
  """Estimator model_fn: a small feed-forward net with one output per impression slot.

  Args:
    features: 4-tuple from the input_fn. The first element is ignored; the
      remaining three are used as the impression/filter vectors, the prices
      and the timestamp (presumably matching the `fields` order -- confirm
      against the input generator).
    labels: integer class id per example, scored against the
      `params['num_impressions']` outputs. Not used in PREDICT mode.
    mode: a tf.estimator.ModeKeys value.
    params: dict providing 'num_impressions', 'size_items_vectos',
      'layer_1_units', 'layer_2_units' and 'layer_3_units'.

  Returns:
    A tf.estimator.EstimatorSpec for the requested mode. PREDICT exposes
    'class_ids', 'probabilities' and 'timestamp'; EVAL reports 'accuracy' and
    'mrr' and logs the per-batch MRR.
  """
  # With this model the order of the items is preserved, so the first ones
  # will carry more weight.
  _, input_impressions_filters, input_prices, input_timestamp = features

  input_impressions_filters = tf.cast(input_impressions_filters, dtype=tf.float32)
  input_layer = tf.concat([input_impressions_filters, input_prices], axis=1)

  # The first layer is built by hand with an explicit input width.
  # NOTE(review): unit-variance normal initialisation on a layer this wide
  # yields very large pre-activations; a scaled initializer is worth trying.
  # Left as is so existing checkpoints keep loading.
  W1 = tf.Variable(tf.random_normal([params['num_impressions'] * (params['size_items_vectos'] + 1),
                                     params['layer_1_units']]))
  b1 = tf.Variable(tf.random_normal([params['layer_1_units']]))

  first_layer = tf.nn.relu(tf.matmul(input_layer, W1) + b1)

  second_layer = tf.layers.dense(inputs=first_layer,
                                 units=params['layer_2_units'],
                                 activation=tf.keras.activations.relu)
  third_layer = tf.layers.dense(inputs=second_layer,
                                units=params['layer_3_units'],
                                activation=tf.keras.activations.relu)
  # The output layer stays linear: sparse_softmax_cross_entropy applies the
  # softmax itself, so feeding it already-normalised probabilities would
  # squash the gradients (softmax applied twice).
  logits = tf.layers.dense(inputs=third_layer,
                           units=params['num_impressions'],
                           activation=None)
  probabilities = tf.nn.softmax(logits)

  predicted_classes = tf.argmax(logits, 1)
  predictions = {
      'class_ids': predicted_classes,
      'probabilities': probabilities,
      'timestamp': input_timestamp
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Already a scalar (mean over the batch), so no further reduction is needed.
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                logits=logits)

  if mode == tf.estimator.ModeKeys.TRAIN:
    tf.summary.scalar('loss-train', loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    train_op = optimizer.minimize(
        loss=loss,
        global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

  # Mean reciprocal rank: sort the slots best-first, find where the true
  # label landed (zero-based), and average 1 / (position + 1).
  ranking = tf.argsort(logits, axis=1, direction='DESCENDING')
  label_ids = tf.cast(labels, ranking.dtype)
  # https://stackoverflow.com/questions/47759777/tensorflow-get-indices-of-values-in-a-tensor
  positions = tf.where(tf.equal(ranking, tf.expand_dims(label_ids, 1)))[:, -1]
  reciprocal_rank = 1.0 / tf.cast(positions + 1, tf.float32)
  mrr = tf.reduce_mean(reciprocal_rank)

  # Per-batch value in the log; the streaming metric below aggregates it over
  # the whole evaluation run.
  hook = tf.train.LoggingTensorHook({"mrr:": mrr},
                                     every_n_iter=1)
  eval_metric_ops = {
      "accuracy": tf.metrics.accuracy(
          labels=labels, predictions=predicted_classes),
      "mrr": tf.metrics.mean(reciprocal_rank)}
  return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops, evaluation_hooks=[hook])


In [0]:
# Log throughput every 20 steps, checkpoint every 100 steps and keep only the
# most recent checkpoint on disk.
run_config = tf.estimator.RunConfig(
  keep_checkpoint_max=1,
  save_checkpoints_steps=100,
  log_step_count_steps=20,
)

# Checkpoints and summaries are written under ./model_small_nn.
small_nn_estimator = tf.estimator.Estimator(
  model_fn=small_nn_model_fn,
  params=params,
  config=run_config,
  model_dir='./model_small_nn',
)

# One pass over the training split, counted in batches.
train_spec = tf.estimator.TrainSpec(
  input_fn=get_dataset_train,
  max_steps=steps_per_epoch,
)
# Wait at least 30 seconds between two evaluations.
eval_spec = tf.estimator.EvalSpec(
  input_fn=get_dataset_eval,
  throttle_secs=30,
)

tf.estimator.train_and_evaluate(small_nn_estimator, train_spec, eval_spec)

INFO:tensorflow:Using config: {'_model_dir': './model_small_nn', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 100, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 20, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f49a030c908>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The eval

In [0]:
!mkdir model_small_nn
!mv /tmp/tmp9k77hrcy/* model_small_nn/

In [0]:
# Drop the names of the training splits before the test set is read.
del train_split
del val_split

test = pd.read_csv(os.path.join(PATH, 'test.csv'))
# Keep only the clickout rows whose reference is missing. Both conditions are
# applied in one mask and the result is copied so the assignment below writes
# to `test` itself rather than to a view of the full frame.
test = test[(test['action_type'] == 'clickout item')
            & test['reference'].isnull()].copy()
# Assign the filled column back instead of fillna(..., inplace=True) on a
# column selection: that form is chained assignment, which warns on a filtered
# frame and does not update `test` at all under pandas Copy-on-Write.
test['reference'] = test['reference'].fillna(0)
# TODO: Use all of the test rows!!!

In [0]:
# `InputGenerator` is only reachable through the imported `dataset_gen`
# module, and the tokenizers built earlier live in `tokenizers_all`
# (`tokenizers` is the module itself) -- same call shape as the training
# generator.
# NOTE(review): this field list differs from the four `fields` the model was
# trained on, while small_nn_model_fn unpacks exactly four features. It
# presumably has to match `fields` -- confirm before running prediction.
inputgen_test = dataset_gen.InputGenerator(tokenizers_all,
                  ['user_id', 'session_id', 'platform', 'device',
                   'interacted_items', 'filters', 'items', 'timestamp'])

# Columns copied unchanged from each test row into the result row.
same_keys = ['user_id', 'session_id', 'timestamp', 'step']

def get_dataset_test():
  """input_fn for prediction: padded batches of 500 examples over `test`.

  Returns:
    The tf.data.Dataset itself. An Estimator input_fn may return a Dataset and
    the Estimator then creates its own iterator, so the one-shot iterator that
    used to be built here (and was never used) is gone.
  """
  gen_test, types, shapes = inputgen_test.input_generator_gen(test)
  return tf.data.Dataset.from_generator(gen_test, types).padded_batch(500, shapes)

# Inverse lookup: (mapped value - 1) -> original key of the tokenizer mapping.
# NOTE(review): `item_tokenizer` is not defined anywhere in the visible
# notebook -- presumably it should be taken from `tokenizers_all`; confirm.
inv_map = {(v - 1): k for k, v in item_tokenizer.mapping.items()}


# Re-create the estimator for prediction. It must use the model function that
# was trained and the directory the checkpoints were written to
# ('./model_small_nn'). The former `yt_rec_model_fn` is not defined in this
# notebook, and './small_nn_model_fn' holds no checkpoint to restore.
small_nn_estimator = tf.estimator.Estimator(
  model_fn=small_nn_model_fn,
  model_dir='./model_small_nn',
  params=params,
  #config=run_config
)

def save_preds(filename, l_results):
  """Write `l_results` as CSV under the Drive folder and upload that file.

  Args:
    filename: path relative to '/content/drive/My Drive/recsys/'; also used as
      the title of the uploaded file. May contain a sub-directory.
    l_results: list of per-row result dicts, turned into a DataFrame.

  The CSV is written to the same path that is uploaded. Previously it was
  written relative to the working directory but uploaded from the Drive
  folder, which only agreed when the two happened to coincide.
  """
  local_path = os.path.join('/content/drive/My Drive/recsys/', filename)
  # Create the target folder so to_csv cannot fail on a missing directory.
  os.makedirs(os.path.dirname(local_path), exist_ok=True)
  pd.DataFrame(l_results).to_csv(local_path)
  # NOTE(review): `drive` is not defined in the visible notebook -- presumably
  # an authenticated Drive client created elsewhere; confirm.
  uploaded = drive.CreateFile({'title': filename})
  uploaded.SetContentFile(local_path)
  uploaded.Upload()
  print('Uploaded file with ID {}'.format(uploaded.get('id')))

# State for the prediction loop in the next cell.
all_count = len(test)
results = []
# Predict with the estimator defined in this notebook; the former
# `yt_rec_estimator` does not exist here. predict() returns a lazy generator
# that is consumed in step with `test_iter`.
predictions = small_nn_estimator.predict(input_fn=get_dataset_test)
test_iter = test.iterrows()
counter = 0
print('All rows: {}'.format(all_count))

In [0]:
# Turn every prediction into a result row and save the rows in chunks.
for (index, row), prediction in zip(test_iter, predictions):
  result_row = {key: row[key] for key in same_keys}

  # Guard against the two iterators drifting out of step.
  assert row['timestamp'] ==  prediction['timestamp']

  # `item_recommendations` was never defined, so the loop stopped with a
  # NameError on its first row. Rank the row's impressions by probability.
  # NOTE(review): assumes test.csv has a pipe-separated 'impressions' column
  # whose order matches the model's output slots -- confirm against the input
  # generator before submitting these rankings.
  impressions = str(row['impressions']).split('|')
  scores = np.asarray(prediction['probabilities'])[:len(impressions)]
  # Stable sort on negated scores: best first, ties keep their display order.
  ranked = np.argsort(-scores, kind='stable')
  item_recommendations = [impressions[slot] for slot in ranked]
  # Impressions beyond the model's output width keep their original order.
  item_recommendations.extend(impressions[len(scores):])

  # TODO: add separate score for price and position!

  result_row['item_recommendations'] = ' '.join(item_recommendations)

  results.append(result_row)
  counter += 1

  if counter % 1000 == 0:
    print(counter/all_count)
    print('PREDICTIONS: {}'.format(counter))

  # Progress is already printed by the branch above (50_000 is a multiple of
  # 1000), so this branch only saves and starts a fresh chunk.
  if counter % 50_000 == 0:
    save_preds('results2/predictions-{}.csv'.format(counter), results)
    results = []

# Save the final partial chunk; without this the rows after the last multiple
# of 50_000 were never written.
if results:
  save_preds('results2/predictions-{}.csv'.format(counter), results)
  results = []