In [9]:
import ast
import json
import math
import multiprocessing as mp
import os
from functools import partial

import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

import helpers
DATA_DIR = 'data'

stemmer = PorterStemmer()


def _load_jsonl_texts(path):
    """Read a JSON Lines file into a dict mapping int(record['_id']) -> record['text'].

    The file is decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default locale encoding. Blank lines (e.g. a trailing newline at the
    end of the file) are skipped instead of crashing json.loads.
    """
    with open(path, 'r', encoding='utf-8') as f:
        records = (json.loads(line) for line in f if line.strip())
        return {int(item['_id']): item['text'] for item in records}


# Load the data from files: both dicts are keyed by the integer id of the record.
corpus_data = _load_jsonl_texts(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_jsonl_texts(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Make sure that the document_id and query_id are int64
# (only train_data is cast here; test_data keeps whatever dtypes read_csv inferred).
train_data['corpus-id'] = train_data['corpus-id'].astype('int64')
train_data['query-id'] = train_data['query-id'].astype('int64')

# Text-cleaning pipeline, applied in list order: lowercase, strip tags, strip
# punctuation, strip_short, collapse whitespace, remove stopwords.
# NOTE(review): strip_short is called with minsize=1, which presumably drops no
# real tokens -- confirm whether a larger minimum length was intended.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, lambda x: strip_short(s=x,minsize=1), strip_multiple_whitespaces, remove_stopwords]

In [10]:
# Turn each id -> text mapping into a single-column ('text') DataFrame indexed by id:
# first the corpus documents, then the queries.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(id_to_text, orient='index', columns=['text'])
    for id_to_text in (corpus_data, queries_data)
)

In [11]:
# Task 2 training data: the 'corpus-id' and 'score' cells are read as strings that
# hold serialized lists (one list of documents and one list of scores per query row).
train_data_2 = pd.read_csv(f'{DATA_DIR}/task2_train.tsv', delimiter='\t')
# Parse the serialized lists with ast.literal_eval instead of eval: it only accepts
# Python literals, so a malformed or malicious cell raises an error rather than
# executing arbitrary code read from the file.
train_data_2['corpus-id'] = train_data_2['corpus-id'].apply(ast.literal_eval)
train_data_2['query-id'] = train_data_2['query-id'].astype('int64')
train_data_2['score'] = train_data_2['score'].apply(ast.literal_eval)
train_data_2.head(10)

Unnamed: 0,query-id,corpus-id,score
0,915593,"[1396701, 1396704, 1396705, 1396707, 1396708, ...","[0, 0, 1, 0, 2, 0, 3, 0, 0, 0, 2, 1, 2, 0, 0, ..."
1,146187,"[1028971, 1028972, 1131101, 1138801, 1230566, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1114646,"[1002453, 1216492, 1316103, 1316109, 1342262, ...","[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, ..."
3,1129237,"[1020793, 1128332, 1138726, 1169301, 120308, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 3, 0, ..."
4,573724,"[1005338, 104856, 1053303, 1165128, 1165129, 1...","[1, 1, 0, 0, 1, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, ..."
5,148538,"[1299824, 1299830, 1311202, 1311204, 1311206, ...","[2, 1, 2, 1, 0, 1, 1, 2, 1, 2, 2, 2, 0, 1, 1, ..."
6,527433,"[1000485, 1101462, 1187918, 1212778, 1212782, ...","[3, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 3, ..."
7,130510,"[1046258, 1110766, 1156210, 1159414, 1211365, ...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 2, ..."
8,405717,"[1111371, 1111372, 1111375, 1538943, 1538949, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
9,1106007,"[1020463, 1040867, 1195441, 1334328, 1334330, ...","[1, 0, 0, 2, 3, 2, 0, 2, 3, 3, 1, 0, 0, 0, 0, ..."


In [12]:
# Flatten to one row per (query-id, corpus-id, score). Both list columns are exploded
# together, so the i-th document of a query stays paired with the i-th score.
# explode() leaves the exploded columns with object dtype; cast them back to int64 so
# they match the int64 ids in train_data when the two frames are combined later.
# This assumes integer scores and no empty lists (an empty list explodes to NaN,
# which makes the cast raise instead of passing silently into the training data).
train_data_2 = (
    train_data_2
    .explode(['corpus-id', 'score'])
    .astype({'corpus-id': 'int64', 'score': 'int64'})
)

In [13]:
# Quick look at the flattened Task 2 frame: its (rows, columns) shape, the largest
# relevance score it contains, and the first ten rows.
display(train_data_2.shape)
print('Max score:', train_data_2.score.max())
train_data_2.iloc[:10]

(1543, 3)

Max score: 3


Unnamed: 0,query-id,corpus-id,score
0,915593,1396701,0
0,915593,1396704,0
0,915593,1396705,1
0,915593,1396707,0
0,915593,1396708,2
0,915593,1453630,0
0,915593,1605506,3
0,915593,1652605,0
0,915593,1772930,0
0,915593,1772932,0


In [14]:
# Persist the Task 2 training rows on their own (no Task 1 rows in this file).
# The frame's row index is written as the first CSV column (pandas default, made explicit).
train_data_2.to_csv(path_or_buf=f'{DATA_DIR}/my_custom_train_data2.csv', index=True)

In [15]:
# Give every Task 1 row the same constant score of 3.
# NOTE(review): presumably 3 is used because it is the top grade in the Task 2
# scores -- confirm that Task 1 pairs should all count as maximally relevant.
train_data = train_data.assign(score=3)
train_data.iloc[:10]

Unnamed: 0,query-id,corpus-id,score
0,1185869,0,3
1,1185868,16,3
2,597651,49,3
3,403613,60,3
4,1183785,389,3
5,312651,616,3
6,80385,723,3
7,645590,944,3
8,645337,1054,3
9,186154,1160,3


In [16]:
# Stack the Task 1 rows on top of the Task 2 rows. This step only appends: repeated
# (query-id, corpus-id) pairs are still present in the result at this point.
# Re-running this cell appends train_data_2 again, so run it once per kernel session.
train_data = pd.concat(objs=[train_data, train_data_2], axis='index')
train_data.shape

(534294, 3)

In [18]:
# Collapse repeated (query-id, corpus-id) pairs, retaining the first occurrence of each.
# NOTE(review): Task 1 rows precede Task 2 rows in the concatenated frame, so for a
# pair present in both sources the Task 1 row is the one kept -- confirm this is intended.
train_data = train_data.drop_duplicates(subset=['query-id', 'corpus-id'], keep='first')

In [19]:
# Preview the first ten rows of the de-duplicated training frame.
train_data.iloc[:10]

Unnamed: 0,query-id,corpus-id,score
0,1185869,0,3
1,1185868,16,3
2,597651,49,3
3,403613,60,3
4,1183785,389,3
5,312651,616,3
6,80385,723,3
7,645590,944,3
8,645337,1054,3
9,186154,1160,3


In [20]:
# Persist the combined training set (Task 1 rows plus Task 2 rows) for later use.
# The frame's row index is written as the first CSV column (pandas default, made explicit).
train_data.to_csv(path_or_buf=f'{DATA_DIR}/cross_encoder_train.csv', index=True)