In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr

from utils import read_data, get_ndcg

import numpy as np
from tqdm import tqdm

from tensorflow_serving.apis import input_pb2

In [2]:
# Load every split of Fold1. The train and validation splits are required by the
# create_records(...) cells further down; with these loads commented out, those
# cells raise NameError on a fresh kernel (Restart & Run All).
df_train, y_train, group_size_train, query_id_train = read_data('Fold1', 'train')
df_vali, y_vali, group_size_vali, query_id_vali = read_data('Fold1', 'vali')
df_test, y_test, group_size_test, query_id_test = read_data('Fold1', 'test')

# X_columns = df_train.columns

In [3]:
def _float_feature(value):
    """Wrap one float / double in a ``tf.train.Feature`` carrying a single-element float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap one bool / enum / int / uint in a ``tf.train.Feature`` carrying a single-element int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def create_records(df, y, query_id, labels, name):
    """Serialize a ranking dataset to ``./{name}.record`` as ExampleListWithContext protos.

    Consecutive rows that share a query id are collected into one
    ``ExampleListWithContext`` (ELWC) message; a new message is started every
    time the query id changes. Rows must therefore already be ordered so that
    all documents of a query are contiguous.

    Args:
        df: DataFrame with one row per document. Each value is stored as a
            float feature.
        y: Iterable of relevance labels aligned with the rows of ``df``; each
            label is cast to ``int`` and stored under ``relevance_label``.
        query_id: Iterable of query ids aligned with the rows of ``df``.
        labels: Iterable of feature names aligned with the columns of ``df``;
            feature ``x`` is stored under the key ``f"f{x}"``.
        name: Base name of the output file (``./{name}.record``).

    Returns:
        None. The records are written to disk.
    """
    # The output is deliberately left uncompressed. The previous version built
    # GZIP TFRecordOptions but never passed them to the writer, and the training
    # pipeline in this notebook reads the files with the default dataset reader.
    # The context manager guarantees the file is flushed and closed even if
    # serialization fails half-way.
    with tf.io.TFRecordWriter(f"./{name}.record") as writer:
        elwc = input_pb2.ExampleListWithContext()
        last_query_id = None

        # df.to_numpy() yields the same per-row values as iterrows() without
        # building a Series for every row.
        rows = zip(query_id, y, df.to_numpy())
        for qid, relevance, feature_values in tqdm(rows, total=df.shape[0]):
            example_dict = {
                f'f{feat_name}': _float_feature(feat_val)
                for feat_name, feat_val in zip(labels, feature_values)
            }
            example_dict['relevance_label'] = _int64_feature(int(relevance))

            example_ = tf.train.Example(features=tf.train.Features(feature=example_dict))

            if qid != last_query_id:
                # Query boundary: flush the finished list (if any) and start a new one.
                if last_query_id is not None:
                    writer.write(elwc.SerializeToString())
                last_query_id = qid
                elwc = input_pb2.ExampleListWithContext()
            elwc.examples.append(example_)

        # Flush the last query's list; skip it when the input had no rows so an
        # empty record is not written.
        if last_query_id is not None:
            writer.write(elwc.SerializeToString())

In [3]:
# Feature column names. They become the feature keys f"f{name}" both when
# writing the TFRecords (create_records) and when building example_spec_.
labels = df_test.columns

In [None]:
# Write the training split to ./train.record.
# NOTE(review): requires df_train, y_train and query_id_train to have been loaded
# with read_data('Fold1', 'train').
create_records(df_train, y_train, query_id_train, labels, "train") 

In [None]:
# Write the test split to ./test.record.
create_records(df_test, y_test, query_id_test, labels, "test")

In [None]:
# Write the validation split to ./vali.record.
# NOTE(review): requires df_vali, y_vali and query_id_vali to have been loaded
# with read_data('Fold1', 'vali').
create_records(df_vali, y_vali, query_id_vali, labels, "vali")

In [5]:
# No query-level (context) features are declared.
context_spec_ = {}

# One scalar float32 feature per column, keyed f"f{column}", defaulting to 0.0.
example_spec_ = {
    f'f{feat}': tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=0.0)
    for feat in labels
}

# (name, spec) pair for the int64 relevance label, defaulting to -1.
label_spec_ = (
    'relevance_label',
    tf.io.FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=-1),
)

# Define Input Creator
input_creator = tfr.keras.model.FeatureSpecInputCreator(
    context_feature_spec={},
    example_feature_spec=example_spec_,
)

In [6]:
def _signed_log1p(t):
    """Return ``log1p(t * sign(t)) * sign(t)`` for tensor ``t`` (a sign-preserving log transform)."""
    sign = tf.sign(t)
    return tf.math.log1p(t * sign) * sign


# Every example feature is run through the same transform.
preprocess_spec = {name: _signed_log1p for name in example_spec_}

In [7]:
# Wrap the per-feature transform functions from preprocess_spec in a TFR preprocessor.
preprocessor = tfr.keras.model.PreprocessorWithSpec(preprocess_spec)
# Alternative tried without a spec:
# preprocessor = tfr.keras.model.PreprocessorWithSpec()

In [8]:
# Feed-forward scorer: hidden layers of 1024, 512 and 256 units with ReLU
# activations, batch normalization and 50% dropout, ending in a single output unit.
scorer = tfr.keras.model.DNNScorer(
    hidden_layer_dims=[1024,512,256],
    output_units=1,
    activation=tf.nn.relu,
    use_batch_norm=True,
    batch_norm_moment=0.999,
    dropout=0.5
)

In [9]:
# Combine the input creator, preprocessor and scorer into a single model builder.
model_builder = tfr.keras.model.ModelBuilder(
    input_creator=input_creator,
    preprocessor=preprocessor,
    scorer=scorer,
    # TODO: what is this? Presumably the name of the boolean input that marks real
    # (non-padded) list entries; the prediction cell builds "example_list_mask"
    # that way. Confirm against the TFR docs.
    mask_feature_name="example_list_mask",
    name="model_builder"
)

In [10]:
# Dataset settings for training: the TFRecord files produced by create_records
# ("train" and "vali"), the batch sizes and the fixed per-query list length.
dataset_hparams = tfr.keras.pipeline.DatasetHparams(
    train_input_pattern="./train.record",
    valid_input_pattern="./vali.record",
    train_batch_size=128,
    valid_batch_size=128,
    # TODO: What happens if a query has more than 600 documents? Presumably TFR
    # truncates longer lists and pads shorter ones to this size; confirm.
    list_size=600,
    # NOTE(review): this passes the class-level default reader explicitly;
    # presumably a plain (uncompressed) TFRecord reader. Confirm.
    dataset_reader=tfr.keras.pipeline.DatasetHparams.dataset_reader
)

In [11]:
# Build the train/validation datasets from the feature and label specs.
# The first positional argument is an empty dict; presumably the context feature
# spec (no query-level features are declared). TODO confirm.
# mask_feature_name matches the name given to the model builder.
dataset_builder = tfr.keras.pipeline.SimpleDatasetBuilder(
    {},
    example_spec_,
    mask_feature_name="example_list_mask",
    label_spec=label_spec_,
    hparams=dataset_hparams,
    sample_weight_spec=None
)

In [15]:
# Training settings: 18 epochs of 5000 steps each, 125 validation steps,
# Adagrad with learning rate 0.05, optimizing the approximate-NDCG loss.
pipeline_hparams = tfr.keras.pipeline.PipelineHparams(
    # The model_dir argument is the path where TFR will save the model and
    # training data, including Tensorboard files for training visualization.
    model_dir="./model/",
    num_epochs=18,
    steps_per_epoch=5000,
    validation_steps=125,
    learning_rate=0.05,
    loss='approx_ndcg_loss',
    optimizer='adagrad'
)

In [16]:
# Tie the model builder, dataset builder and training hyperparameters together.
ranking_pipeline = tfr.keras.pipeline.SimplePipeline(
    model_builder,
    dataset_builder=dataset_builder,
    hparams=pipeline_hparams
) 

In [17]:
# Train and validate the model. According to the log output below, the exported
# model is written to ./model/export/latest_model.
ranking_pipeline.train_and_validate(verbose=1)





Epoch 18/18
INFO:tensorflow:Assets written to: ./model/export/latest_model/assets


INFO:tensorflow:Assets written to: ./model/export/latest_model/assets


# Load model and predict

TODO: The prediction code below needs to be cleaned up; it is a mess at the moment.

In [4]:
# Load the model exported by the training pipeline (same path as in the
# "Assets written to" log lines above).
loaded_model = tf.keras.models.load_model("./model/export/latest_model")

In [5]:
# raw_dataset = tf.data.TFRecordDataset(["test.record"])
# raw_dataset

In [6]:
# for raw_record in raw_dataset:
#     rr = raw_record
#     example = input_pb2.ExampleListWithContext()
#     example.ParseFromString(raw_record.numpy())

In [7]:
# Copy of the test features with an extra query_id column, so rows can be grouped per query.
df_test_copy = df_test.assign(query_id=query_id_test)

In [None]:
# Score the test set one query at a time. Each query's documents are padded to
# the fixed list length and passed to the model together with a boolean mask
# that flags the real (non-padded) entries; the scores of the real entries are
# written back to df_test["predicted_scores"].
LIST_SIZE = 600  # fixed list length; must match list_size in dataset_hparams

# Columns that are not model inputs. "predicted_scores" is excluded so that
# re-running the notebook after a previous prediction pass does not feed old
# scores to the model as a feature.
NON_FEATURE_COLUMNS = {"query_id", "predicted_scores"}
feature_columns = [col for col in df_test_copy.columns if col not in NON_FEATURE_COLUMNS]

groups = df_test_copy.groupby("query_id")
for _, group in tqdm(groups, total=len(groups)):
    n_docs = group.shape[0]
    if n_docs > LIST_SIZE:
        # np.pad would otherwise fail with an obscure negative-pad-width error.
        raise ValueError(
            f"Query has {n_docs} documents, more than the supported list size of {LIST_SIZE}"
        )
    n_pad = LIST_SIZE - n_docs

    pred_dict = {}
    for column in feature_columns:
        # float32 matches the dtype declared for every feature in example_spec_.
        values = np.pad(group[column].to_numpy(dtype=np.float32), (0, n_pad))
        pred_dict[f"f{column}"] = tf.constant(values.reshape(1, LIST_SIZE, 1))

    # True for real documents, False for padding. The builtin bool replaces the
    # np.bool alias, which was removed in NumPy 1.24.
    mask = np.zeros((1, LIST_SIZE), dtype=bool)
    mask[0, :n_docs] = True
    pred_dict["example_list_mask"] = tf.constant(mask)

    # verbose=0: one predict call per query would otherwise flood the output
    # with Keras progress bars on top of the tqdm bar.
    scores = loaded_model.predict(pred_dict, verbose=0)
    df_test.loc[group.index, "predicted_scores"] = list(scores[0][:n_docs])

In [14]:
import pandas as pd
from sklearn.metrics import ndcg_score

import numpy as np

In [15]:
def get_ndcg_2(predicted_score, true_score, query_id, k=5):
    """Return the mean NDCG@k over all queries that have more than one document.

    Args:
        predicted_score: Model scores, one per row.
        true_score: True relevance labels, one per row.
        query_id: Query id of each row; rows are grouped by this value.
        k: Cut-off rank of the NDCG metric.

    Returns:
        The mean of the per-query ``ndcg_score`` values. Queries with a single
        document are skipped. Returns NaN when no query has more than one
        document.
    """
    ndcg_df = pd.DataFrame({
        'query_id': query_id,
        'true_score': true_score,
        'predicted_score': predicted_score,
    })

    # Group once and compute the metric per query; single-document queries are
    # excluded from the average.
    per_query_ndcg = [
        ndcg_score(
            [query_rows['true_score'].tolist()],
            [query_rows['predicted_score'].tolist()],
            k=k,
        )
        for _, query_rows in ndcg_df.groupby('query_id')
        if len(query_rows) > 1
    ]

    if not per_query_ndcg:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')
    return np.mean(per_query_ndcg)

In [16]:
# NDCG@10 on the test split, using the scores stored in df_test["predicted_scores"]
# by the prediction loop.
get_ndcg_2(df_test.predicted_scores, y_test, query_id_test, k=10)

0.5308703162842248

In [None]:
# 0.527
# print(get_ndcg(model=model, df=df_test, true_score=y_test, query_id=query_id_test, k=k))