In [1]:
#%pip install -r requirements_tensorflow.txt

In [1]:
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import umap

In [2]:
DATA_DIR = 'data'

# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {item['_id']: item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {item['_id']: item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task2_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task2_test.tsv', delimiter='\t')

In [3]:
# Create df from corpus_data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text']).reset_index()
# rename index to document_id
corpus_df = corpus_df.rename(columns={'index': 'document_id'})

In [4]:
# Create df from queries_data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text']).reset_index()
# rename index to query_id
queries_df = queries_df.rename(columns={'index': 'query_id'})

In [5]:
# Apply tf-idf to corpus_df and queries_df and append to respective dfs
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
corpus_tfidf = tfidf.fit_transform(corpus_df['text'])
corpus_tfidf_df = pd.DataFrame.sparse.from_spmatrix(corpus_tfidf)
corpus_tfidf_df['document_id'] = corpus_df['document_id'].astype('int64')
# make document_id the index
corpus_tfidf_df = corpus_tfidf_df.set_index('document_id')
corpus_tfidf_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,597614,597615,597616,597617,597618,597619,597620,597621,597622,597623
document_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1867825,0.0,0.10364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
419610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4614226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4108603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3744854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Do the same to queries_df
queries_tfidf = tfidf.transform(queries_df['text'])
queries_tfidf_df = pd.DataFrame.sparse.from_spmatrix(queries_tfidf)
queries_tfidf_df['query_id'] = queries_df['query_id'].astype('int64')
# make query_id the index
queries_tfidf_df = queries_tfidf_df.set_index('query_id')
queries_tfidf_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,597614,597615,597616,597617,597618,597619,597620,597621,597622,597623
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1185869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1185868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
597651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
403613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1183785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# transform queries_tfidf_df using the same reducer
queries_tfidf_reduced = reducer.transform(queries_tfidf_df)

In [8]:
train_data['corpus-id'] = train_data.apply(lambda x: eval(x['corpus-id']), axis=1)
train_data['score'] = train_data.apply(lambda x: eval(x['score']), axis=1)

In [9]:
# train_data is currently in the form of (query_id, list(corpus-id	), list(score))
# We want to convert it to (query_id, corpus-id	, score)
# Where each row is a query-document pair
train_data = train_data.explode(['corpus-id', 'score'])

In [10]:
# split data into train and validation
train, validation = train_test_split(train_data, test_size=0.2, random_state=42)

In [11]:
# Create a machine learning model using tensorflow
import tensorflow as tf
from tensorflow import keras

# Create a model, but since we want to customize the fit function, we need to create a custom model
class RankModel(keras.Model):
    def __init__(self):
        super().__init__()
        # input will be 2 ids, 1 for query and 1 for document
        self.inputlayer = keras.layers.Input(shape=(3,))
        self.dense1 = keras.layers.Dense(128, activation='relu')
        self.dense2 = keras.layers.Dense(64, activation='relu')
        self.dense3 = keras.layers.Dense(32, activation='relu')
        self.dense4 = keras.layers.Dense(1, activation='relu') # will return a score should be positive so we use relu

    def call(self, inputs):
        x = self.inputlayer(inputs)
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        return self.dense4(x)
    
    def train_step(self, data):
        # data is of shape [batch_size, 3]
        # where each batch contains 2 ids and 1 score
        # first id is the query id
        # second id is the document id
        # we'll get the features from the dataframe using the ids

        # get the query ids
        query_ids = data[0][:, 0]
        # get the document ids
        document_ids = data[0][:, 1]
        #print("Got data", data)
        #print("Got query_ids", query_ids)
        #print("Got document_ids", document_ids)
        print("(RANK) Starting to gather")
        # get the tfidf features for the query ids
        query_features = tf.gather(queries_tfidf_df, query_ids)
        # get the tfidf features for the document ids
        document_features = tf.gather(corpus_tfidf_df, document_ids)
        print("(RANK) Starting to concatenate")
        # concatenate the features
        features = tf.concat([query_features, document_features], axis=1)
        # get the scores
        scores = data[1]
        print("(RANK) Starting to tape")
        # get the predictions
        with tf.GradientTape() as tape:
            predictions = self(features)
            # calculate the loss
            loss = keras.losses.mean_squared_error(scores, predictions)
        print("(RANK) Starting to gradients")
        # get the gradients
        gradients = tape.gradient(loss, self.trainable_variables)
        # update the weights
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        # update the metrics
        self.compiled_metrics.update_state(scores, predictions)
        # return the metrics
        return {m.name: m.result() for m in self.metrics}

# Create a model
model = RankModel()

In [12]:
# Compile the model
model.compile(
    optimizer=keras.optimizers.legacy.Adam(learning_rate=0.001),
    loss=keras.losses.MeanSquaredError(),
    metrics=[keras.metrics.MeanSquaredError()],
)

In [13]:
# Create a tf dataset from the train data
train_dataset = tf.data.Dataset.from_tensor_slices((train[['query-id', 'corpus-id']].values.tolist(), train['score'].values.tolist()))

# Create a tf dataset from the validation data
validation_dataset = tf.data.Dataset.from_tensor_slices((validation[['query-id', 'corpus-id']].values.tolist(), validation['score'].values.tolist()))


In [14]:
# batch the data
train_dataset = train_dataset.batch(32)
validation_dataset = validation_dataset.batch(32)

In [15]:
# for batch in train_dataset.take(1):
#     # get the query ids
#     query_ids = batch[0][:, 0]
#     # get the document ids
#     document_ids = batch[0][:, 1]
#     print("Got batch", batch)
#     print("Got query_ids", query_ids)
#     print("Got document_ids", document_ids)

In [16]:
# Fit the model
model.fit(train_dataset, epochs=10, validation_data=validation_dataset)

Epoch 1/10


(RANK) Starting to gather


: 