In [89]:
#%pip install tensorflow-datasets numpy pandas scikit-learn matplotlib seaborn jsonlines pydot tqdm jupyter ipywidgets widgetsnbextension pandas-profiling tensorflow

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import pandas as pd
import json
import tensorflow_datasets as tfds

In [2]:
import pandas as pd
import json

DATA_DIR = 'data'

# Load the data from files
with open(f'{DATA_DIR}/corpus.jsonl', 'r') as f:
    corpus_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

with open(f'{DATA_DIR}/queries.jsonl', 'r') as f:
    queries_data = {int(item['_id']): item['text'] for item in (json.loads(line) for line in f)}

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Rename corpus-id to document_id and query-id to query_id in both train and test data
train_data = train_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
test_data = test_data.rename(columns={'corpus-id': 'document_id', 'query-id': 'query_id'})
# Make sure that the document_id and query_id are int64
train_data['document_id'] = train_data['document_id'].astype('int64')
train_data['query_id'] = train_data['query_id'].astype('int64')

In [3]:
# Create a df from the corpus data
corpus_df = pd.DataFrame.from_dict(corpus_data, orient='index', columns=['text'])
# Create a df from the queries data
queries_df = pd.DataFrame.from_dict(queries_data, orient='index', columns=['text'])

In [4]:
# Apply vectorizer to the corpus and queries
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus_vectors = vectorizer.fit_transform(corpus_df['text'])
queries_vectors = vectorizer.transform(queries_df['text'])

In [5]:
corpus_vectors_df = pd.DataFrame.sparse.from_spmatrix(corpus_vectors, index=corpus_df.index, columns=vectorizer.get_feature_names_out())
queries_vectors_df = pd.DataFrame.sparse.from_spmatrix(queries_vectors, index=queries_df.index, columns=vectorizer.get_feature_names_out())

In [6]:
# Merge the corpus_df and queries_df into the train_data
train_data = train_data.merge(corpus_vectors_df, left_on='document_id', right_index=True)
train_data = train_data.merge(queries_vectors_df, left_on='query_id', right_index=True)

MergeError: Passing 'suffixes' which cause duplicate columns {'document_id_x'} is not allowed.

In [121]:
# train-test split on train_data
from sklearn.model_selection import train_test_split
# scikit-learn vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


X_train, X_test, y_train, y_test = train_test_split(train_data[['query-id', 'corpus-id', 'query', 'document']], train_data['score'], test_size=0.2, random_state=42)
# Get vocabulary from the train set
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train['query'])
vectorizer.fit(X_train['document'])
vocab = vectorizer.vocabulary_

In [123]:
# create list from vocab keys
vocab_list = list(vocab.keys())

In [107]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(426200, 4) (106551, 4) (426200,) (106551,)


In [130]:
class RankingModel(tf.keras.Model):
    def __init__(self, vectorizer):
        super().__init__()
        self.vectorizer = vectorizer

        self.document_embeddings = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=vocab_list
            ),
            tf.keras.layers.Embedding(len(vectorizer.get_feature_names_out()), 128)
        ])

        self.query_embeddings = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.TextVectorization(
                vocabulary=vocab_list
            ),
            tf.keras.layers.Embedding(len(vectorizer.get_feature_names_out()), 128)
        ])

        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1, activation='sigmoid')
        ])
    
    def call(self, inputs):
        print('Got input', inputs)
        query_id = inputs[0]
        doc_id = inputs[1]
        query = inputs[2]
        doc = inputs[3]
        query = self.query_embeddings(query)
        doc = self.document_embeddings(doc)
        print('After embedding', query.shape, doc.shape)
        # Concatenate the query and document
        query_doc = tf.concat([query, doc], axis=1)
        print('After concat', query_doc.shape)
        # Get the ratings
        ratings = self.ratings(query_doc)
        print('After ratings', ratings.shape)
        return ratings


In [131]:
# Train the model
model = RankingModel(vectorizer=vectorizer)
model.compile(optimizer='adam', loss='mse', metrics=['rmse'])

model.fit([X_train['query-id'], X_train['corpus-id'], X_train['query'], X_train['document']], y_train, epochs=10, batch_size=32, validation_split=0.2)



Epoch 1/10
Got input (<tf.Tensor: shape=(32, 1), dtype=int64, numpy=
array([[ 920745],
       [ 205412],
       [ 610026],
       [ 642671],
       [ 562283],
       [ 429114],
       [ 193753],
       [1153755],
       [1168336],
       [ 488003],
       [ 511189],
       [1034887],
       [ 544980],
       [ 118279],
       [  28205],
       [ 491605],
       [ 738429],
       [ 694836],
       [ 793372],
       [ 219834],
       [ 237897],
       [ 283116],
       [1017041],
       [ 295744],
       [  35627],
       [ 285267],
       [ 676520],
       [ 953025],
       [ 779996],
       [ 261230],
       [ 676352],
       [ 619674]])>, <tf.Tensor: shape=(32, 1), dtype=int64, numpy=
array([[3485949],
       [2069779],
       [1602971],
       [1365480],
       [2234478],
       [6196770],
       [1513697],
       [1230190],
       [5605405],
       [ 807342],
       [8205585],
       [3591252],
       [1513887],
       [4931545],
       [4793318],
       [1336589],
       [3367898],

TypeError: 'str' object is not callable