In [9]:
import ast
import json
import math
import multiprocessing as mp
import os
from functools import partial

import numpy as np
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric, remove_stopwords, strip_short, stem_text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

import helpers
DATA_DIR = 'data'

stemmer = PorterStemmer()


def _load_id_to_text(path):
    """Read a JSON Lines file and return ``{int(_id): text}`` for its records.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding. Blank lines are skipped.
    """
    with open(path, 'r', encoding='utf-8') as f:
        records = (json.loads(line) for line in f if line.strip())
        return {int(item['_id']): item['text'] for item in records}


# Load the data from files
corpus_data = _load_id_to_text(f'{DATA_DIR}/corpus.jsonl')
queries_data = _load_id_to_text(f'{DATA_DIR}/queries.jsonl')

train_data = pd.read_csv(f'{DATA_DIR}/task1_train.tsv', delimiter='\t')
test_data = pd.read_csv(f'{DATA_DIR}/task1_test.tsv', delimiter='\t')

# Make sure that the document_id and query_id are int64
train_data['corpus-id'] = train_data['corpus-id'].astype('int64')
train_data['query-id'] = train_data['query-id'].astype('int64')
# gensim preprocessing pipeline; only referenced by a commented-out cell in
# the visible part of this notebook.
# NOTE(review): strip_short with minsize=1 presumably removes nothing - confirm intent.
CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation, lambda x: strip_short(s=x,minsize=1), strip_multiple_whitespaces, remove_stopwords]

In [10]:
# Build one single-column ('text') DataFrame per id -> text mapping, indexed
# by the integer id: first the corpus, then the queries.
corpus_df, queries_df = (
    pd.DataFrame.from_dict(texts_by_id, orient='index', columns=['text'])
    for texts_by_id in (corpus_data, queries_data)
)

In [11]:
# Task-2 training data: one row per query, with the candidate corpus ids and
# their scores stored as stringified Python lists.
train_data_2 = pd.read_csv(f'{DATA_DIR}/task2_train.tsv', delimiter='\t')
# Parse the list literals with ast.literal_eval instead of eval(): it accepts
# only Python literals, so content read from the file can never execute code.
train_data_2['corpus-id'] = train_data_2['corpus-id'].apply(ast.literal_eval)
train_data_2['query-id'] = train_data_2['query-id'].astype('int64')
train_data_2['score'] = train_data_2['score'].apply(ast.literal_eval)
train_data_2.head(10)

Unnamed: 0,query-id,corpus-id,score
0,915593,"[1396701, 1396704, 1396705, 1396707, 1396708, ...","[0, 0, 1, 0, 2, 0, 3, 0, 0, 0, 2, 1, 2, 0, 0, ..."
1,146187,"[1028971, 1028972, 1131101, 1138801, 1230566, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1114646,"[1002453, 1216492, 1316103, 1316109, 1342262, ...","[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, ..."
3,1129237,"[1020793, 1128332, 1138726, 1169301, 120308, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 3, 0, ..."
4,573724,"[1005338, 104856, 1053303, 1165128, 1165129, 1...","[1, 1, 0, 0, 1, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, ..."
5,148538,"[1299824, 1299830, 1311202, 1311204, 1311206, ...","[2, 1, 2, 1, 0, 1, 1, 2, 1, 2, 2, 2, 0, 1, 1, ..."
6,527433,"[1000485, 1101462, 1187918, 1212778, 1212782, ...","[3, 0, 0, 2, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 3, ..."
7,130510,"[1046258, 1110766, 1156210, 1159414, 1211365, ...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 2, ..."
8,405717,"[1111371, 1111372, 1111375, 1538943, 1538949, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
9,1106007,"[1020463, 1040867, 1195441, 1334328, 1334330, ...","[1, 0, 0, 2, 3, 2, 0, 2, 3, 3, 1, 0, 0, 0, 0, ..."


In [12]:
# Flatten to one row per (query-id, corpus-id, score) triple. explode() leaves
# the exploded columns with object dtype, so let pandas re-infer proper
# numeric dtypes to stay consistent with the int64 id columns of train_data.
train_data_2 = train_data_2.explode(['corpus-id', 'score']).infer_objects()

In [13]:
# Sanity check of the exploded frame: its shape, the largest score, and a preview.
display(train_data_2.shape)
max_score = train_data_2['score'].max()
print('Max score:', max_score)
train_data_2.head(10)

(1543, 3)

Max score: 3


Unnamed: 0,query-id,corpus-id,score
0,915593,1396701,0
0,915593,1396704,0
0,915593,1396705,1
0,915593,1396707,0
0,915593,1396708,2
0,915593,1453630,0
0,915593,1605506,3
0,915593,1652605,0
0,915593,1772930,0
0,915593,1772932,0


In [14]:
# Persist the exploded task-2 rows (index included) as CSV
custom_train_path = f'{DATA_DIR}/my_custom_train_data2.csv'
train_data_2.to_csv(custom_train_path)

In [15]:
# Give every task-1 pair the score 3 (the maximum score printed for the
# task-2 data above), replacing whatever the column held before.
train_data = train_data.assign(score=3)
train_data.head(10)

Unnamed: 0,query-id,corpus-id,score
0,1185869,0,3
1,1185868,16,3
2,597651,49,3
3,403613,60,3
4,1183785,389,3
5,312651,616,3
6,80385,723,3
7,645590,944,3
8,645337,1054,3
9,186154,1160,3


In [16]:
# Stack the task-1 rows on top of the task-2 rows; duplicate
# (query-id, corpus-id) pairs are dropped in a separate cell further down.
train_data = pd.concat((train_data, train_data_2), axis='index')
train_data.shape

(534294, 3)

In [18]:
# Keep the first occurrence of each (query-id, corpus-id) pair.
train_data = train_data.drop_duplicates(subset=['query-id', 'corpus-id'], keep='first')

In [19]:
# Preview the first 10 rows of train_data.
train_data.head(10)

Unnamed: 0,query-id,corpus-id,score
0,1185869,0,3
1,1185868,16,3
2,597651,49,3
3,403613,60,3
4,1183785,389,3
5,312651,616,3
6,80385,723,3
7,645590,944,3
8,645337,1054,3
9,186154,1160,3


In [20]:
# Persist the merged training pairs (index included) as CSV
cross_encoder_train_path = f'{DATA_DIR}/cross_encoder_train.csv'
train_data.to_csv(cross_encoder_train_path)

In [8]:
print(train_data.shape)
# Binary relevance flag: 1 when the score is at least 1, otherwise 0.
is_relevant = train_data['score'].ge(1)
train_data['relevant'] = is_relevant.astype('int64')
train_data.head(10)

(534294, 3)


Unnamed: 0,query-id,corpus-id,score,relevant
0,1185869,0,3,1
1,1185868,16,3,1
2,597651,49,3,1
3,403613,60,3,1
4,1183785,389,3,1
5,312651,616,3,1
6,80385,723,3,1
7,645590,944,3,1
8,645337,1054,3,1
9,186154,1160,3,1


In [11]:
# Display the corpus frame: a single 'text' column indexed by document id.
corpus_df

Unnamed: 0,text
1867825,"After the invention of the cotton gin, cotton ..."
419610,"Timer has separate night and day outlets, whic..."
4614226,The rose-buying public still encounters a wide...
4108603,Map of Wendover (Aut) Airport. A detailed map ...
3744854,And as the poems Reapers and Cotton Song indic...
...,...
7962609,Top 10 facts about the world. Oxycodone is an ...
7864307,One of the benefits of Vitex Chasteberry Tree ...
7667700,1 The frequency of the recessive allele. 2 An...
4620277,Queen of the mountains. The Rigi mountain is p...


In [28]:
def create_corpus(result, original_df, id='corpus-id'):
    """Select the rows of `original_df` whose index value occurs in `result[id]`.

    Args:
        result: DataFrame holding the id column named by `id`.
        original_df: DataFrame indexed by those ids.
        id: name of the id column in `result` ('corpus-id' by default).

    Returns:
        The matching rows of `original_df`, in their original order, with the
        index replaced by a fresh 0..n-1 range (the ids are not kept).
        The number of selected rows is printed as a side effect.
    """
    wanted_ids = result[id].unique()
    selected = original_df.loc[original_df.index.isin(wanted_ids)]
    selected = selected.reset_index(drop=True)
    print('Number of Rows=>', len(selected))
    return selected

# Corpus documents referenced by at least one training pair.
training_corpus = create_corpus(result=train_data, original_df=corpus_df)
training_corpus.head()

Number of Rows=> 517893


Unnamed: 0,text
0,1 3. Picture This!Photo-based writing can be a...
1,Acxiom Corporation is a marketing technology a...
2,"How much do wound, ostomy, and continence nurs..."
3,Cold fronts produce most of the severe weather...
4,Typical costs: Charges for the use of a hearse...


In [29]:
import re

# Enable the tqdm-backed progress_apply used by the cells below
tqdm.pandas()

# Lower-cased copy of the raw text; later cleaning steps work on this column
training_corpus['cleaned'] = training_corpus['text'].apply(str.lower)

# Dictionary of english Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not","can't": "can not","can't've": "cannot have",
"'cause": "because","could've": "could have","couldn't": "could not","couldn't've": "could not have",
"didn't": "did not","doesn't": "does not","don't": "do not","hadn't": "had not","hadn't've": "had not have",
"hasn't": "has not","haven't": "have not","he'd": "he would","he'd've": "he would have","he'll": "he will",
"he'll've": "he will have","how'd": "how did","how'd'y": "how do you","how'll": "how will","i'd": "i would",
"i'd've": "i would have","i'll": "i will","i'll've": "i will have","i'm": "i am","i've": "i have",
"isn't": "is not","it'd": "it would","it'd've": "it would have","it'll": "it will","it'll've": "it will have",
"let's": "let us","ma'am": "madam","mayn't": "may not","might've": "might have","mightn't": "might not",
"mightn't've": "might not have","must've": "must have","mustn't": "must not","mustn't've": "must not have",
"needn't": "need not","needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
"oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
"shan't've": "shall not have","she'd": "she would","she'd've": "she would have","she'll": "she will",
"she'll've": "she will have","should've": "should have","shouldn't": "should not",
"shouldn't've": "should not have","so've": "so have","that'd": "that would","that'd've": "that would have",
"there'd": "there would","there'd've": "there would have",
"they'd": "they would","they'd've": "they would have","they'll": "they will","they'll've": "they will have",
"they're": "they are","they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would",
"we'd've": "we would have","we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have",
"weren't": "were not","what'll": "what will","what'll've": "what will have","what're": "what are",
"what've": "what have","when've": "when have","where'd": "where did",
"where've": "where have","who'll": "who will","who'll've": "who will have","who've": "who have",
"why've": "why have","will've": "will have","won't": "will not","won't've": "will not have",
"would've": "would have","wouldn't": "would not","wouldn't've": "would not have","y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would","you'd've": "you would have","you'll": "you will","you'll've": "you will have",
"you're": "you are","you've": "you have"}


def _build_contractions_re(mapping):
    """Compile a regex matching any key of `mapping`, longest key first."""
    # Alternation takes the first alternative that matches at a position, so
    # a longer contraction must come before its own prefix ("can't've" before
    # "can't"); otherwise the short form wins and "'ve" is left unexpanded.
    longest_first = sorted(mapping, key=len, reverse=True)
    # Escape the keys so a custom mapping may contain regex metacharacters.
    return re.compile('(%s)' % '|'.join(re.escape(key) for key in longest_first))


# Regular expression for finding contractions
contractions_re = _build_contractions_re(contractions_dict)
# Remember which mapping the pre-compiled regex above belongs to.
_default_contractions = contractions_dict


# Function for expanding contractions
def expand_contractions(text, contractions_dict=contractions_dict):
    """Replace every contraction found in `text` with its expansion.

    Args:
        text: input string. The default mapping has lower-case keys only, so
            the text is expected to be lower-cased beforehand.
        contractions_dict: mapping from contraction to expansion. A mapping
            other than the default gets its own regex, so its keys are
            actually the ones being searched for.

    Returns:
        `text` with every matched key replaced by its mapped value. Matching
        is by substring, without word boundaries.
    """
    if not contractions_dict:
        # Nothing to expand; an empty alternation would match everywhere.
        return text
    if contractions_dict is _default_contractions:
        pattern = contractions_re
    else:
        pattern = _build_contractions_re(contractions_dict)

    def replace(match):
        return contractions_dict[match.group(0)]

    return pattern.sub(replace, text)

# Expand contractions in the lower-cased corpus text (with a progress bar)
training_corpus['cleaned'] = training_corpus['cleaned'].progress_apply(expand_contractions)

  0%|          | 0/517893 [00:00<?, ?it/s]

In [30]:
# Function for Cleaning Text
def clean_text(text):
    """Strip digit-bearing tokens, newlines, URLs and non-letters from `text`.

    The last step keeps only the characters a-z, so the input is expected to
    be lower-cased already; everything else becomes a single space. Runs of
    spaces are NOT collapsed here.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Raw strings: '\w' and '\d' in a normal literal are invalid escape
    # sequences, which newer Python versions warn about.
    text = re.sub(r'\w*\d\w*', '', text)   # drop any word containing a digit
    text = re.sub(r'\n', ' ', text)        # newlines -> spaces
    text = re.sub(r"http\S+", "", text)    # drop whatever is left of URLs
    text = re.sub(r'[^a-z]', ' ', text)    # anything but a-z -> space
    return text
 
# Run the regex-based cleaner over every corpus document
training_corpus['cleaned'] = training_corpus['cleaned'].apply(clean_text)

In [31]:
# Collapse each run of spaces left behind by the cleaning step into one space
multi_space_re = re.compile(' +')
training_corpus['cleaned'] = training_corpus['cleaned'].apply(
    lambda doc: multi_space_re.sub(' ', doc)
)

In [32]:
# Stop-word removal and lemmatisation with spaCy
import spacy

# Load the small English pipeline with the 'ner' and 'parser' components disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
nlp.max_length = 5000000


def _lemmatize_without_stopwords(text):
    """Return the space-joined lemmas of the tokens of `text` that are not stop words."""
    return ' '.join(token.lemma_ for token in nlp(text) if not token.is_stop)


# Lemmatised, stop-word-free version of every cleaned document
training_corpus['lemmatized'] = training_corpus['cleaned'].progress_apply(_lemmatize_without_stopwords)

  0%|          | 0/517893 [00:00<?, ?it/s]

In [33]:
#training_corpus['cleaned']=training_corpus['text'].apply(lambda x:preprocess_string(x,CUSTOM_FILTERS))

In [None]:
# Queries referenced by at least one training pair.
training_queries = create_corpus(result=train_data, original_df=queries_df, id='query-id')
training_queries.head()

Number of Rows=> 502949


Unnamed: 0,text
0,)what was the immediate impact of the success ...
1,_________ justice is designed to repair the ha...
2,what color is amber urine
3,is autoimmune hepatitis a bile acid synthesis ...
4,elegxo meaning


In [None]:
# Lowercasing the text
# (the query frame has a single 'text' column - there is no 'query' column,
# so reading training_queries['query'] raised a KeyError)
training_queries['cleaned']=training_queries['text'].apply(lambda x:x.lower())

# Expanding contractions
training_queries['cleaned']=training_queries['cleaned'].apply(lambda x:expand_contractions(x))

# Cleaning queries using RegEx
training_queries['cleaned']=training_queries['cleaned'].apply(lambda x: clean_text(x))

# Removing extra spaces
training_queries['cleaned']=training_queries['cleaned'].apply(lambda x: re.sub(' +',' ',x))

In [None]:
# One shuffled Series of training texts: lemmatised corpus documents plus
# cleaned queries.
# Select the processed columns directly. Both frames already have a 'text'
# column, so renaming 'lemmatized'/'cleaned' to 'text' produced duplicate
# column labels; ['text'] then returned DataFrames, and iterating the result
# yielded column names instead of documents.
combined_training = (
    pd.concat([training_corpus['lemmatized'], training_queries['cleaned']])
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
from gensim.models import Word2Vec

# Tokenise every training text on whitespace; Word2Vec receives a list of token lists.
# Note: this rebinds `train_data`, which held the training-pairs DataFrame until now.
train_data = [document.split() for document in combined_training]

# Train a word2vec model on the tokenised texts
# NOTE(review): the `size` keyword is tied to the installed gensim version - confirm it matches.
w2v_model = Word2Vec(
    train_data,
    size=300,
    min_count=2,
    window=5,
    sg=1,
    workers=4,
)