# Dataset Preparation

In [1]:
!pip install python-terrier -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m615.0 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m347.4/347.4 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.8/48.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from IPython.display import display
from xml.dom.minidom import parse, parseString
from pyterrier.measures import *

import random
import pyterrier as pt
import pandas as pd
import json
import os
import re
from tqdm import tqdm
tqdm.pandas()
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string
import math
from scipy.spatial import distance
import tensorflow as tf
import numpy as np
from gensim.models import Word2Vec, FastText
from transformers import AutoTokenizer, TFBertModel



if not pt.started():
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [4]:
articles = pd.read_excel("/content/drive/MyDrive/Tugas Akhir/Data/df_articles.xlsx")
train_data = pd.read_excel("/content/drive/MyDrive/Tugas Akhir/Data/df_query.xlsx")

In [5]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        781 non-null    object
 1   docno       776 non-null    object
 2   part        781 non-null    object
 3   chap        781 non-null    object
 4   sect        695 non-null    object
 5   subsect     311 non-null    object
 6   subsubsect  763 non-null    object
dtypes: object(7)
memory usage: 42.8+ KB


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1402 entries, 0 to 1401
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   qid       1402 non-null   object
 1   query     1402 non-null   object
 2   entail    1402 non-null   object
 3   label     1402 non-null   int64 
 4   art       1402 non-null   object
 5   art_code  1402 non-null   object
dtypes: int64(1), object(5)
memory usage: 65.8+ KB


In [7]:
articles.dropna(subset=['docno'], inplace=True)
articles.fillna('', inplace=True)
articles.info()

<class 'pandas.core.frame.DataFrame'>
Index: 776 entries, 0 to 780
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        776 non-null    object
 1   docno       776 non-null    object
 2   part        776 non-null    object
 3   chap        776 non-null    object
 4   sect        776 non-null    object
 5   subsect     776 non-null    object
 6   subsubsect  776 non-null    object
dtypes: object(7)
memory usage: 48.5+ KB


# Legal Stopwords

In [8]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [9]:
articles_text = articles['text'].tolist()
query_text = train_data['query'].tolist()

combined_text = articles_text + query_text

In [10]:
def preprocess_stop(text):
  # Remove punctuation and convert to lowercase
  text = text.translate(str.maketrans('', '', string.punctuation)).lower()
  # Tokenize the text
  tokens = nltk.word_tokenize(text)
  return tokens


all_tokens = []
for sentence in combined_text:
  all_tokens.extend(preprocess_stop(sentence))
word_freq = Counter(all_tokens)

unique_word = len(word_freq)
threshold = math.ceil(0.02 * unique_word)

additional_stop = word_freq.most_common(threshold)
add_stopwords = [word[0] for word in additional_stop]

In [11]:
for word in add_stopwords:
  stop_words.add(word)

In [12]:
stop_words

{'1',
 '2',
 'a',
 'about',
 'above',
 'act',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'apply',
 'are',
 'aren',
 "aren't",
 'article',
 'as',
 'at',
 'b',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'c',
 'can',
 'case',
 'cases',
 'claim',
 'contract',
 'couldn',
 "couldn't",
 'd',
 'demand',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'due',
 'during',
 'each',
 'even',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'intention',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'land',
 'll',
 'm',
 'ma',
 'may',
 'me',
 'mightn',
 "mightn't",
 'more',
 'mortgage',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn

# Global Query Expansion

## Query Expansion W2V



### Data Preprocessing

In [None]:
seed = 1111

In [None]:
# Function to transform sentence in DataFrame to a list of words
def sentence_to_list(sentence):
    return sentence.lower().split()
low = pd.DataFrame()
# Apply the function to the 'Sentences' column
low['sentences'] = articles['text'].apply(sentence_to_list)
low

Unnamed: 0,sentences
0,"[article, 1, 1, private, rights, must, be, con..."
1,"[article, 2, this, code, must, be, construed, ..."
2,"[article, 3, 1, the, enjoyment, of, private, r..."
3,"[article, 3, 2, if, the, person, making, a, ju..."
4,"[article, 4, the, age, of, majority, is, 20, y..."
...,...
776,"[article, 721, an, unborn, child, is, deemed, ..."
777,"[article, 722, 1, the, provisions, of, article..."
778,"[article, 723, the, court, may, order, a, pers..."
779,"[article, 724, in, the, following, cases, the,..."


In [None]:
q=pd.DataFrame()
q['sentences'] = train_data['query'].apply(sentence_to_list)
q

Unnamed: 0,sentences
0,"[in, the, case, where, a, person, under, curat..."
1,"[in, cases, any, third, party, commits, any, f..."
2,"[in, the, case, where, a, person, under, curat..."
3,"[in, the, case, where, a, person, under, curat..."
4,"[a, person, who, intends, to, conclude, the, c..."
...,...
1397,"[in, a, lawsuit, demanding, the, payment, of, ..."
1398,"[the, beneficiary, in, bad, faith, of, unjust,..."
1399,"[even, if, an, obligation, does, not, exist, a..."
1400,"[in, cases, where, an, obligor, mistakenly, be..."


In [None]:
df_training = pd.concat([low, q],ignore_index=True)
# low.append(q,)
final_data = df_training['sentences'].tolist()
len(final_data)

2178

In [None]:
final_data[2]

['article',
 '3',
 '1',
 'the',
 'enjoyment',
 'of',
 'private',
 'rights',
 'commences',
 'at',
 'birth',
 '2',
 'unless',
 'otherwise',
 'prohibited',
 'by',
 'applicable',
 'laws',
 'regulations',
 'or',
 'treaties',
 'foreign',
 'nationals',
 'enjoy',
 'private',
 'rights']

### Word2Vec Model

In [None]:
model_sg = Word2Vec(window=4, min_count=1,  workers=4, vector_size=128, sg=1, seed=seed)
model_sg.build_vocab(final_data, progress_per=1000)

model_cbow = Word2Vec(window=4, min_count=1,  workers=4 , vector_size=128, seed=seed)
model_cbow.build_vocab(final_data, progress_per=1000)

In [None]:
model_sg.train(final_data, total_examples=model_sg.corpus_count, epochs=100)
model_cbow.train(final_data, total_examples=model_cbow.corpus_count, epochs=100)

(7523612, 11619900)

In [None]:
model_sg.save("skipgram.w2v")
model_cbow.save("cbow.w2v")

In [None]:
model_sg = Word2Vec.load("skipgram.w2v")
model_cbow = Word2Vec.load("cbow.w2v")

In [None]:
w2v_sg= model_sg.wv
w2v_cbow = model_cbow.wv

## Fast Text Ekspansion

### Model

In [None]:
model_ft_sg = FastText(window=4, min_count=1,  workers=4, vector_size=128, sg=1, seed=seed)
model_ft_sg.build_vocab(final_data, progress_per=1000)
model_ft_sg.train(final_data, total_examples=model_ft_sg.corpus_count, epochs=100)

model_ft_cb = FastText(window=4, min_count=1, vector_size=128, workers=4, sg=0, seed=seed)
model_ft_cb.build_vocab(final_data, progress_per=1000)
model_ft_cb.train(final_data, total_examples=model_ft_cb.corpus_count, epochs=100)

(7523396, 11619900)

In [None]:
model_ft_sg.save("fasttext_sg.fasttext")
model_ft_cb.save("fasttext_cb.fasttext")

In [None]:
model_ft_sg = FastText.load("fasttext_sg.fasttext")
model_ft_cb = FastText.load("fasttext_cb.fasttext")

In [None]:
fasttext_sg = model_ft_sg.wv
fasttext_cb = model_ft_cb.wv

## Bert Contextual Similarity Check

### Load Pre-Trained Model

In [None]:


bert_tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased', use_fast=True, max_length=512)
bert_model = TFBertModel.from_pretrained('google-bert/bert-base-uncased', output_hidden_states=True, from_pt = True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [None]:
def tokenized_texts(text):

  return bert_tokenizer(text, return_tensors='pt', truncation=True)

def word_pos(text):
  list_of_word_pos = []
  tokenized_text = bert_tokenizer.tokenize(text, add_special_tokens=True)
  i = 0
  while i < len(tokenized_text) - 1:
    curr_token, next_token = tokenized_text[i], tokenized_text[i + 1]
    if next_token.startswith("##"):
      whole_word = curr_token
      pos = [i]
      i += 1
      while i < len(tokenized_text) and tokenized_text[i].startswith("##"):
        whole_word += tokenized_text[i][2:]
        pos.append(i)
        i += 1
      list_of_word_pos.append((whole_word, pos))
    else:
      list_of_word_pos.append((curr_token, [i]))
      i += 1
  return list_of_word_pos

In [None]:
def get_token_vecs(token_embeddings):
  token_vecs_sum = []
  for token in token_embeddings:
    sum_vec = tf.reduce_sum(token[-4:], axis = 0)
    token_vecs_sum.append(sum_vec)
  return token_vecs_sum

def similarity_vector(encoded_input):
  token_ids = np.array(encoded_input["input_ids"])
  token_types = np.array(encoded_input["token_type_ids"])
  token_masks = np.array(encoded_input["attention_mask"])

  outputs = bert_model([token_ids, token_types, token_masks])
  hidden_states = outputs[2]
  stacked_hidden_states = tf.stack(hidden_states)

  token_embeddings_sentence = tf.transpose(stacked_hidden_states, perm=[1, 2, 0, 3])

  return get_token_vecs(token_embeddings_sentence[0])

def cosine_sim(a, b):
  return 1 - distance.cosine(a, b)


In [None]:
def process(sentence, th):

  tokenized_text = tokenized_texts(sentence)
  word_position = word_pos(sentence)
  token = similarity_vector(tokenized_text)
  mean = tf.reduce_mean(token[word_position[0][1][0] : word_position[-1][1][-1]], axis=0)
  lst = []

  for i in word_position:
    if len(i[1]) == 1:
      sim = cosine_sim(mean, token[i[1][0]])
      if sim > th :
        lst.append(i[0])
    else:
      temp = tf.reduce_mean(token[i[1][0] :i[1][-1]+1], axis=0)
      sim = cosine_sim(mean, temp)
      if sim > th:
        lst.append(i[0])

  return lst

# Query Expansion

In [None]:
train_data_1 = train_data.copy()

In [None]:
def get_similar(model, word, threshold):
  data = model.most_similar(word, topn=1)
  result = [entry[0] for entry in data if entry[1]> threshold]
  return result

In [None]:
# stop_words = set()
def expand_query(query, model, threshold,th, isContext):
  res = []
  res_sentence=""
  q = query['query']
  if isContext :
    sentence = process(q, th)
  else:
    sentence = q.split()

  for word in sentence:
    if word not in stop_words:
      try:
        hasil = get_similar(model, word.lower(), threshold)
      except KeyError:
        continue
      else:
        for i in hasil:
          if i in res:
            continue
          else:
            res.append(" "+i)
        continue

  for i in res:
    res_sentence += i

  return q + res_sentence

In [None]:
threshold = 0.5
bert_th = 0.5

# Retrieval

## Indexing

In [None]:
!rm -rf ./coliee_index

# we need to use blocks = True
# All indexer classes expose a blocks boolean constructor argument to allow position information to be
# recoreded in the index. Defaults to False, i.e. positions are not recorded.
# Indexing will record "position" aka blocks information, however this makes the indices much larger and also slower.
pd_indexer = pt.DFIndexer("./coliee_index", type=pt.index.IndexingType(1), blocks = True)

# note that we also keep the articles as "text" in the metadata part
# (Add the entire dataframe as metadata)
# this will be useful when we re-rank model using LLM, such as T5, BERT, etc.
index_ref = pd_indexer.index(articles["text"], \
                             articles["text"], \
                             articles["docno"], \
                             articles["part"], \
                             articles["chap"], \
                             articles["sect"], \
                             articles["subsect"], \
                             articles["subsubsect"])

!rm -rf ./coliee_index_stem

pd_indexer_2 = pt.DFIndexer("./coliee_index_stem", type=pt.index.IndexingType(1), blocks = True,stemmer ='porter', stopwords='none')

index_ref_2 = pd_indexer_2.index(articles["text"], \
                             articles["text"], \
                             articles["docno"], \
                             articles["part"], \
                             articles["chap"], \
                             articles["sect"], \
                             articles["subsect"], \
                             articles["subsubsect"])

!rm -rf ./coliee_index_stop

pd_indexer_3 = pt.DFIndexer("./coliee_index_stop", type=pt.index.IndexingType(1), blocks = True,stemmer ='none')

index_ref_3 = pd_indexer_3.index(articles["text"], \
                             articles["text"], \
                             articles["docno"], \
                             articles["part"], \
                             articles["chap"], \
                             articles["sect"], \
                             articles["subsect"], \
                             articles["subsubsect"])

!rm -rf ./coliee_index_stop_stem

pd_indexer_4 = pt.DFIndexer("./coliee_index_stop_stem", type=pt.index.IndexingType(1), blocks = True,stemmer ='none', stopwords='none')

index_ref_4 = pd_indexer_4.index(articles["text"], \
                             articles["text"], \
                             articles["docno"], \
                             articles["part"], \
                             articles["chap"], \
                             articles["sect"], \
                             articles["subsect"], \
                             articles["subsubsect"])

# Local Query Expansion

In [None]:
bo1 = pt.rewrite.Bo1QueryExpansion(index_ref, fb_terms=10, fb_docs=2)
kl = pt.rewrite.KLQueryExpansion(index_ref, fb_terms=10, fb_docs=2)

rm3 = pt.rewrite.RM3(index_ref)
aq = pt.rewrite.AxiomaticQE(index_ref, fb_terms=10, fb_docs=2)

tfidf = pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
bm25 = pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

#Creating pipeline

bm25bo1 = bm25 >> bo1 >> bm25
bm25kl = bm25 >> kl >> bm25
bm25rm3 = bm25 >> rm3 >> bm25
bm25aq = bm25 >> aq >> bm25

tfidfbo1 = tfidf >> bo1 >> tfidf
tfidfkl = tfidf >> kl >> tfidf
tfidfrm3 = tfidf >> rm3 >> tfidf
tfidfaq = tfidf >> aq >> tfidf

## Pipeline Global Expansion

Embedding Model Available:
*   w2v_sg
*   w2v_cbow
*   fasttext_sg
*   fasttext_cbow

Hyperparameter:
*   Contextual
    *   Threshold 0.6
    *   Threshold 0.5
*   Embedding Model Threshold
    *   Threshold 0.4
    *   Threshold 0.45    






In [None]:
# experiment

# topics --- at the end we only run experiment for R04 (test data for COLIEE 2023)
topics = train_data[["qid", "query"]].drop_duplicates()
topics['category'] = topics.apply(lambda row: row.qid.split('-')[0], axis = 1)
topics_R04 = topics[topics["category"] == 'R04']

# creating qrels ---> all articles in the training data are relevant; all others
# are assumed to be non-relevant (including negative samples)
qrels_temp = train_data[["qid", "art_code", "label"]].copy()
qrels_q = qrels_temp.rename(columns={"art_code": "docno"})

### BM25 Model

In [None]:
bm25 = pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

# Non-Contextualized
bm25v11 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v12 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v21 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v22 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v31 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v32 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v41 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25v42 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30


# Contextualized
bm25vb13 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb14 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb15 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb16 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

bm25vb23 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb24 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb25 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb26 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

bm25vb33 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb34 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb35 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb36 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

bm25vb43 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb44 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb45 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
bm25vb46 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

In [None]:
# bm25v13 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v14 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v23 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v24 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v33 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v34 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v43 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25v44 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

### BM25 NYOBA

In [None]:
# bm25 = pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30


# bm25vb13 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb14 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb15 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb16 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

# bm25vb23 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb24 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb25 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb26 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

# bm25vb33 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb34 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb35 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb36 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

# bm25vb43 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb44 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb45 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
# bm25vb46 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30

### TF-IDF Model

In [None]:
tfidf = pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

# Non-Contextualized
tfidfv11 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv12 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv21 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv22 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv31 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv32 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv41 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv42 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0, False)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

# Contextualized
tfidfv13 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv14 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv15 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv16 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

tfidfv23 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv24 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv25 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv26 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

tfidfv33 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv34 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv35 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv36 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

tfidfv43 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv44 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv45 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.4, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
tfidfv46 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0.45, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

In [None]:
# tfidfv13 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv14 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv15 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv16 = pt.apply.query(lambda x: expand_query(x, w2v_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

# tfidfv23 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv24 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv25 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv26 = pt.apply.query(lambda x: expand_query(x, w2v_cbow, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

# tfidfv33 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv34 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv35 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv36 = pt.apply.query(lambda x: expand_query(x, fasttext_sg, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

# tfidfv43 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv44 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.5, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv45 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
# tfidfv46 = pt.apply.query(lambda x: expand_query(x, fasttext_cb, 0, 0.6, True)) >> pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30

## Experiment

In [None]:
for i in range(1,5):
  for j in range(3,5):
    print("tfidfv"+str(i)+str(j)+',')
model_embed = ["W2V-SG", "W2V-CBOW", "FT-SG", "FT-CBOW"]
for a in model_embed:
  for b in range(1,3):
    print('"'+str(a)+str(b)+'"')

tfidfv13,
tfidfv14,
tfidfv23,
tfidfv24,
tfidfv33,
tfidfv34,
tfidfv43,
tfidfv44,
"W2V-SG1"
"W2V-SG2"
"W2V-CBOW1"
"W2V-CBOW2"
"FT-SG1"
"FT-SG2"
"FT-CBOW1"
"FT-CBOW2"


In [None]:
eks_bm25 = pt.Experiment(
    [bm25,
     bm25v11,
     bm25v12,
     bm25v21,
     bm25v22,
     bm25v31,
     bm25v32,
     bm25v41,
     bm25v42,
     ],
    topics_R04,
    qrels_q,
    eval_metrics= [P@5, R@10, R@15, R@20, "recip_rank"],
    verbose=True,
    names=["BM25-BaseLine","W2V-SG1", "W2V-SG2", "W2V-CBOW1", "W2V-CBOW2","FT-SG1", "FT-SG2", "FT-CBOW1", "FT-CBOW2"],
    baseline =0
)

pt.Experiment:   0%|          | 0/9 [00:00<?, ?system/s]

In [None]:
eks_bm25

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,BM25-BaseLine,0.726728,0.188119,0.861386,0.89604,0.89604,,,,,...,,,,,,,,,,
1,W2V-SG1,0.696329,0.178218,0.866337,0.886139,0.891089,6.0,21.0,0.108245,0.0,...,0.024597,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
2,W2V-SG2,0.696341,0.178218,0.866337,0.886139,0.891089,6.0,21.0,0.108384,0.0,...,0.024597,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
3,W2V-CBOW1,0.672082,0.176238,0.841584,0.871287,0.90099,10.0,24.0,0.009922,2.0,...,0.057385,4.0,4.0,0.373717,2.0,5.0,0.16665,2.0,1.0,0.56629
4,W2V-CBOW2,0.686935,0.176238,0.856436,0.881188,0.891089,11.0,20.0,0.025341,2.0,...,0.057385,4.0,3.0,0.783057,2.0,4.0,0.319724,2.0,2.0,0.70745
5,FT-SG1,0.713654,0.180198,0.846535,0.891089,0.90099,8.0,15.0,0.341347,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629
6,FT-SG2,0.713654,0.180198,0.846535,0.891089,0.90099,8.0,15.0,0.341347,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629
7,FT-CBOW1,0.729017,0.19802,0.861386,0.876238,0.910891,13.0,13.0,0.869794,6.0,...,0.058402,3.0,2.0,1.0,1.0,4.0,0.15833,3.0,0.0,0.083249
8,FT-CBOW2,0.729017,0.19802,0.861386,0.876238,0.910891,13.0,13.0,0.869794,6.0,...,0.058402,3.0,2.0,1.0,1.0,4.0,0.15833,3.0,0.0,0.083249


In [None]:
eks_bm25_bert = pt.Experiment(
    [bm25,
bm25vb13,
bm25vb14,
bm25vb15,
bm25vb16,
bm25vb23,
bm25vb24,
bm25vb25,
bm25vb26,
bm25vb33,
bm25vb34,
bm25vb35,
bm25vb36,
bm25vb43,
bm25vb44,
bm25vb45,
bm25vb46,
     ],
    topics_R04,
    qrels_q,
    eval_metrics= [P@5, R@10, R@15, R@20, "recip_rank"],
    verbose=True,
    # names=["BM25-BaseLine","W2V-SG1", "W2V-SG2", "W2V-CBOW1", "W2V-CBOW2","FT-SG1", "FT-SG2", "FT-CBOW1", "FT-CBOW2"]
    baseline=0
)

pt.Experiment:   0%|          | 0/17 [00:00<?, ?system/s]

In [None]:
eks_bm25_bert

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,"RankCutoff(BR(BM25), 30)",0.726728,0.188119,0.861386,0.89604,0.89604,,,,,...,,,,,,,,,,
1,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.704579,0.182178,0.866337,0.886139,0.891089,6.0,19.0,0.222079,1.0,...,0.181012,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
2,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.704592,0.182178,0.866337,0.886139,0.891089,6.0,19.0,0.222328,1.0,...,0.181012,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
3,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.713427,0.186139,0.871287,0.886139,0.886139,8.0,16.0,0.445001,1.0,...,0.56629,2.0,0.0,0.15833,1.0,3.0,0.319724,1.0,3.0,0.319724
4,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.713427,0.186139,0.871287,0.886139,0.886139,8.0,16.0,0.445001,1.0,...,0.56629,2.0,0.0,0.15833,1.0,3.0,0.319724,1.0,3.0,0.319724
5,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.692573,0.180198,0.841584,0.881188,0.90099,11.0,20.0,0.089533,2.0,...,0.15833,4.0,4.0,0.373717,2.0,4.0,0.319724,2.0,1.0,0.56629
6,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.703212,0.182178,0.856436,0.881188,0.891089,12.0,17.0,0.16719,2.0,...,0.258873,4.0,3.0,0.783057,2.0,4.0,0.319724,2.0,2.0,0.70745
7,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.705656,0.186139,0.856436,0.886139,0.89604,13.0,15.0,0.282293,3.0,...,0.70745,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,1.0,1.0
8,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.723611,0.188119,0.876238,0.886139,0.89604,15.0,11.0,0.847573,2.0,...,1.0,3.0,0.0,0.083249,1.0,3.0,0.319724,1.0,1.0,1.0
9,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.728794,0.180198,0.846535,0.891089,0.90099,9.0,14.0,0.891871,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629


In [None]:
eks_tfidf =pt.Experiment(
    [
tfidf,
tfidfv11,
tfidfv12,
tfidfv21,
tfidfv22,
tfidfv31,
tfidfv32,
tfidfv41,
tfidfv42,
     ],
    topics_R04,
    qrels_q,
    eval_metrics= [P@5, R@10, R@15, R@20, "recip_rank"],
    verbose=True,
    names=["TF-IDF-BaseLine","W2V-SG1", "W2V-SG2", "W2V-CBOW1", "W2V-CBOW2","FT-SG1", "FT-SG2", "FT-CBOW1", "FT-CBOW2"],
    baseline=0
)

pt.Experiment:   0%|          | 0/9 [00:00<?, ?system/s]

In [None]:
eks_tfidf_bert =pt.Experiment(
    [
tfidf,
tfidfv13,
tfidfv14,
tfidfv15,
tfidfv16,
tfidfv23,
tfidfv24,
tfidfv25,
tfidfv26,
tfidfv33,
tfidfv34,
tfidfv35,
tfidfv36,
tfidfv43,
tfidfv44,
tfidfv45,
tfidfv46,
     ],
    topics_R04,
    qrels_q,
    eval_metrics= [P@5, R@10, R@15, R@20, "recip_rank"],
    verbose=True,
    # names=["TF-IDF-BaseLine","W2V-SG1", "W2V-SG2", "W2V-CBOW1", "W2V-CBOW2","FT-SG1", "FT-SG2", "FT-CBOW1", "FT-CBOW2"]
    baseline=0
)

pt.Experiment:   0%|          | 0/17 [00:00<?, ?system/s]

In [None]:
eks_tfidf_bert

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,"RankCutoff(BR(TF_IDF), 30)",0.735206,0.186139,0.856436,0.89604,0.89604,,,,,...,,,,,,,,,,
1,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.703438,0.182178,0.851485,0.886139,0.886139,5.0,18.0,0.093263,2.0,...,0.41693,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,3.0,0.319724
2,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.703451,0.182178,0.851485,0.886139,0.886139,5.0,18.0,0.093395,2.0,...,0.41693,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,3.0,0.319724
3,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.712679,0.186139,0.871287,0.881188,0.886139,7.0,16.0,0.222852,3.0,...,1.0,3.0,0.0,0.083249,0.0,3.0,0.083249,1.0,3.0,0.319724
4,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.712692,0.186139,0.871287,0.881188,0.886139,7.0,16.0,0.223117,3.0,...,1.0,3.0,0.0,0.083249,0.0,3.0,0.083249,1.0,3.0,0.319724
5,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.690631,0.180198,0.861386,0.876238,0.90099,8.0,20.0,0.051893,2.0,...,0.258873,5.0,2.0,0.783057,1.0,4.0,0.15833,2.0,1.0,0.56629
6,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.7078,0.182178,0.861386,0.876238,0.891089,9.0,17.0,0.169255,2.0,...,0.41693,5.0,2.0,0.783057,1.0,4.0,0.15833,2.0,2.0,0.70745
7,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.696693,0.186139,0.876238,0.886139,0.89604,10.0,16.0,0.087563,3.0,...,1.0,4.0,0.0,0.044941,1.0,3.0,0.319724,1.0,1.0,1.0
8,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.715415,0.188119,0.866337,0.886139,0.89604,11.0,13.0,0.296666,3.0,...,0.656968,4.0,1.0,0.482235,1.0,3.0,0.319724,1.0,1.0,1.0
9,"Compose(pt.apply.query(), RankCutoff(BR(TF_IDF...",0.735565,0.182178,0.866337,0.891089,0.89604,10.0,14.0,0.984405,1.0,...,0.41693,4.0,2.0,0.41693,1.0,2.0,0.56629,1.0,1.0,1.0


In [None]:
eks_bm25_bert

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,"RankCutoff(BR(BM25), 30)",0.726728,0.188119,0.861386,0.89604,0.89604,,,,,...,,,,,,,,,,
1,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.704579,0.182178,0.866337,0.886139,0.891089,6.0,19.0,0.222079,1.0,...,0.181012,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
2,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.704592,0.182178,0.866337,0.886139,0.891089,6.0,19.0,0.222328,1.0,...,0.181012,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
3,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.713427,0.186139,0.871287,0.886139,0.886139,8.0,16.0,0.445001,1.0,...,0.56629,2.0,0.0,0.15833,1.0,3.0,0.319724,1.0,3.0,0.319724
4,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.713427,0.186139,0.871287,0.886139,0.886139,8.0,16.0,0.445001,1.0,...,0.56629,2.0,0.0,0.15833,1.0,3.0,0.319724,1.0,3.0,0.319724
5,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.692573,0.180198,0.841584,0.881188,0.90099,11.0,20.0,0.089533,2.0,...,0.15833,4.0,4.0,0.373717,2.0,4.0,0.319724,2.0,1.0,0.56629
6,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.703212,0.182178,0.856436,0.881188,0.891089,12.0,17.0,0.16719,2.0,...,0.258873,4.0,3.0,0.783057,2.0,4.0,0.319724,2.0,2.0,0.70745
7,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.705656,0.186139,0.856436,0.886139,0.89604,13.0,15.0,0.282293,3.0,...,0.70745,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,1.0,1.0
8,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.723611,0.188119,0.876238,0.886139,0.89604,15.0,11.0,0.847573,2.0,...,1.0,3.0,0.0,0.083249,1.0,3.0,0.319724,1.0,1.0,1.0
9,"Compose(pt.apply.query(), RankCutoff(BR(BM25),...",0.728794,0.180198,0.846535,0.891089,0.90099,9.0,14.0,0.891871,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629


In [None]:
eks_tfidf

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,TF-IDF-BaseLine,0.735206,0.186139,0.856436,0.89604,0.89604,,,,,...,,,,,,,,,,
1,W2V-SG1,0.698817,0.182178,0.851485,0.886139,0.886139,5.0,20.0,0.056089,2.0,...,0.41693,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,3.0,0.319724
2,W2V-SG2,0.69883,0.182178,0.851485,0.886139,0.886139,5.0,20.0,0.056176,2.0,...,0.41693,3.0,2.0,0.764686,1.0,3.0,0.319724,1.0,3.0,0.319724
3,W2V-CBOW1,0.679134,0.176238,0.861386,0.876238,0.90099,8.0,24.0,0.018028,2.0,...,0.095751,5.0,2.0,0.783057,1.0,4.0,0.15833,2.0,1.0,0.56629
4,W2V-CBOW2,0.694939,0.178218,0.861386,0.876238,0.891089,8.0,20.0,0.044237,2.0,...,0.15833,5.0,2.0,0.783057,1.0,4.0,0.15833,2.0,2.0,0.70745
5,FT-SG1,0.720399,0.182178,0.866337,0.891089,0.89604,8.0,14.0,0.343554,1.0,...,0.41693,4.0,2.0,0.41693,1.0,2.0,0.56629,1.0,1.0,1.0
6,FT-SG2,0.720399,0.182178,0.866337,0.891089,0.89604,8.0,14.0,0.343554,1.0,...,0.41693,4.0,2.0,0.41693,1.0,2.0,0.56629,1.0,1.0,1.0
7,FT-CBOW1,0.732013,0.194059,0.856436,0.891089,0.910891,13.0,14.0,0.849012,6.0,...,0.15833,3.0,2.0,1.0,1.0,2.0,0.56629,3.0,0.0,0.083249
8,FT-CBOW2,0.732013,0.194059,0.856436,0.891089,0.910891,13.0,14.0,0.849012,6.0,...,0.15833,3.0,2.0,1.0,1.0,2.0,0.56629,3.0,0.0,0.083249


In [None]:
eks_bm25

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,BM25-BaseLine,0.726728,0.188119,0.861386,0.89604,0.89604,,,,,...,,,,,,,,,,
1,W2V-SG1,0.696329,0.178218,0.866337,0.886139,0.891089,6.0,21.0,0.108245,0.0,...,0.024597,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
2,W2V-SG2,0.696341,0.178218,0.866337,0.886139,0.891089,6.0,21.0,0.108384,0.0,...,0.024597,4.0,2.0,0.740688,1.0,3.0,0.319724,2.0,3.0,0.656968
3,W2V-CBOW1,0.672082,0.176238,0.841584,0.871287,0.90099,10.0,24.0,0.009922,2.0,...,0.057385,4.0,4.0,0.373717,2.0,5.0,0.16665,2.0,1.0,0.56629
4,W2V-CBOW2,0.686935,0.176238,0.856436,0.881188,0.891089,11.0,20.0,0.025341,2.0,...,0.057385,4.0,3.0,0.783057,2.0,4.0,0.319724,2.0,2.0,0.70745
5,FT-SG1,0.713654,0.180198,0.846535,0.891089,0.90099,8.0,15.0,0.341347,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629
6,FT-SG2,0.713654,0.180198,0.846535,0.891089,0.90099,8.0,15.0,0.341347,1.0,...,0.102744,2.0,4.0,0.319724,1.0,2.0,0.56629,2.0,1.0,0.56629
7,FT-CBOW1,0.729017,0.19802,0.861386,0.876238,0.910891,13.0,13.0,0.869794,6.0,...,0.058402,3.0,2.0,1.0,1.0,4.0,0.15833,3.0,0.0,0.083249
8,FT-CBOW2,0.729017,0.19802,0.861386,0.876238,0.910891,13.0,13.0,0.869794,6.0,...,0.058402,3.0,2.0,1.0,1.0,4.0,0.15833,3.0,0.0,0.083249




In [None]:
local_experiment = pt.Experiment(
    [bm25,tfidf, tfidfbo1, tfidfkl, tfidfrm3, tfidfaq, bm25bo1, bm25kl, bm25rm3, bm25aq ],
    topics_R04,
    qrels_q,
    eval_metrics=[P@5, R@10, R@15, R@20, "recip_rank"],
    names = ["BM25", "TF-IDF", "TF-Bo1", "TF-KLD", "TF-RM3", "TF-AQE", "BM25-Bo1", "BM25-KLD", "BM25-RM3", "BM25-AQE"],
    verbose = True,
    baseline=0
)

pt.Experiment:   0%|          | 0/10 [00:00<?, ?system/s]

In [None]:
local_experiment

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,BM25,0.726728,0.188119,0.861386,0.89604,0.89604,,,,,...,,,,,,,,,,
1,TF-IDF,0.735206,0.186139,0.856436,0.89604,0.89604,8.0,7.0,0.484421,1.0,...,0.56629,0.0,1.0,0.319724,0.0,0.0,,0.0,0.0,
2,TF-Bo1,0.726519,0.184158,0.846535,0.886139,0.89604,8.0,11.0,0.987725,2.0,...,0.41693,3.0,4.0,0.408081,0.0,2.0,0.15833,1.0,1.0,1.0
3,TF-KLD,0.722179,0.184158,0.841584,0.876238,0.891089,7.0,12.0,0.71973,2.0,...,0.41693,3.0,5.0,0.287281,0.0,4.0,0.044941,1.0,2.0,0.56629
4,TF-RM3,0.682405,0.180198,0.816832,0.856436,0.886139,7.0,22.0,0.027395,2.0,...,0.15833,1.0,7.0,0.028321,0.0,6.0,0.020158,1.0,3.0,0.319724
5,TF-AQE,0.735206,0.186139,0.856436,0.89604,0.89604,8.0,7.0,0.484421,1.0,...,0.56629,0.0,1.0,0.319724,0.0,0.0,,0.0,0.0,
6,BM25-Bo1,0.709538,0.178218,0.836634,0.886139,0.891089,4.0,14.0,0.097421,0.0,...,0.024597,1.0,4.0,0.13236,0.0,2.0,0.15833,1.0,2.0,0.56629
7,BM25-KLD,0.71006,0.180198,0.836634,0.881188,0.89604,2.0,14.0,0.108448,1.0,...,0.102744,0.0,4.0,0.058402,0.0,3.0,0.083249,1.0,1.0,1.0
8,BM25-RM3,0.649623,0.180198,0.806931,0.861386,0.871287,5.0,28.0,0.00057,0.0,...,0.044941,1.0,8.0,0.015622,1.0,6.0,0.051732,1.0,5.0,0.095751
9,BM25-AQE,0.726728,0.188119,0.861386,0.89604,0.89604,0.0,0.0,,0.0,...,,0.0,0.0,,0.0,0.0,,0.0,0.0,


In [None]:
qrels_q

Unnamed: 0,qid,docno,label
0,H25-2-I,95,1
1,H25-2-U,95,1
2,H25-2-E,13,1
3,H25-2-E,120,1
4,H25-2-O,14,1
...,...,...,...
1397,H22-27-5,633,1
1398,H22-28-1,704,1
1399,H22-28-2,705,1
1400,H22-28-3,706,1


In [None]:
bm25_stop_stem = pt.BatchRetrieve(index_ref, wmodel = "BM25") % 30
tfidf_stop_stem = pt.BatchRetrieve(index_ref, wmodel = "TF_IDF") % 30
bm25_stop = pt.BatchRetrieve(index_ref_3, wmodel = "BM25") % 30
tfidf_stop = pt.BatchRetrieve(index_ref_3, wmodel = "TF_IDF") % 30
bm25_stem = pt.BatchRetrieve(index_ref_2, wmodel = "BM25") % 30
tfidf_stem = pt.BatchRetrieve(index_ref_2, wmodel = "TF_IDF") % 30
bm25_kosong = pt.BatchRetrieve(index_ref_4, wmodel = "BM25") % 30
tfidf_kosong = pt.BatchRetrieve(index_ref_4, wmodel = "TF_IDF") % 30
stop = pt.Experiment(
    [bm25_kosong,tfidf_kosong, bm25_stop,tfidf_stop,bm25_stem,tfidf_stem,bm25_stop_stem,tfidf_stop_stem,],
    topics_R04,
    qrels_q,
    eval_metrics=[P@5, R@10, R@15, R@20, "recip_rank"],
    names = ["BM25", "TF-IDF", "BM25-Stop", "TF-IDF-Stop", "BM25-Stem", "TF-IDF-Stem", "BM25-Stop-Stem", "TF-IDF-Stop-Stem"],
    verbose = True,
    baseline=0
)

pt.Experiment:   0%|          | 0/8 [00:00<?, ?system/s]

In [None]:
stop

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank +,recip_rank -,recip_rank p-value,P@5 +,...,P@5 p-value,R@10 +,R@10 -,R@10 p-value,R@15 +,R@15 -,R@15 p-value,R@20 +,R@20 -,R@20 p-value
0,BM25,0.448044,0.10495,0.50495,0.549505,0.658416,,,,,...,,,,,,,,,,
1,TF-IDF,0.691698,0.172277,0.821782,0.841584,0.851485,54.0,3.0,2.756076e-09,30.0,...,9.510246e-08,38.0,1.0,1.488879e-10,36.0,1.0,7.739094e-10,26.0,2.0,1.627521e-06
2,BM25-Stop,0.702032,0.174257,0.816832,0.841584,0.871287,51.0,3.0,1.905615e-09,32.0,...,2.264082e-08,35.0,1.0,6.050178e-10,34.0,1.0,1.772659e-09,26.0,2.0,8.974304e-07
3,TF-IDF-Stop,0.699422,0.174257,0.836634,0.846535,0.856436,53.0,3.0,1.691236e-09,32.0,...,2.264082e-08,37.0,1.0,1.344455e-10,34.0,1.0,1.500025e-09,25.0,2.0,2.004024e-06
4,BM25-Stem,0.439958,0.10099,0.509901,0.554455,0.683168,17.0,20.0,0.5894767,1.0,...,0.3197242,4.0,2.0,0.797705,3.0,3.0,0.7406883,6.0,4.0,0.3197242
5,TF-IDF-Stem,0.710999,0.184158,0.851485,0.861386,0.866337,51.0,4.0,6.216784e-10,35.0,...,4.142467e-09,39.0,1.0,3.444185e-11,37.0,1.0,2.688896e-10,27.0,2.0,7.391518e-07
6,BM25-Stop-Stem,0.726728,0.188119,0.861386,0.89604,0.89604,53.0,4.0,4.492771e-10,37.0,...,1.066815e-09,39.0,1.0,2.366714e-11,41.0,1.0,1.455197e-11,29.0,1.0,6.91251e-08
7,TF-IDF-Stop-Stem,0.735206,0.186139,0.856436,0.89604,0.89604,54.0,3.0,1.373965e-10,35.0,...,4.57386e-09,39.0,1.0,2.860589e-11,41.0,1.0,1.455197e-11,29.0,1.0,6.91251e-08


In [None]:
stop.to_excel('stop.xlsx', index=False)

In [None]:
def process_table(df):
  columns_to_drop = ['recip_rank +', 'recip_rank -', 'P@5 +', 'P@5 -', 'R@10 +', 'R@10 -', 'R@15 +', 'R@15 -', 'R@20 +', 'R@20 -']
  df = df.drop(columns=columns_to_drop)
  return df

In [None]:
a = process_table(eks_bm25)

In [None]:
b = process_table(eks_tfidf)
c = process_table(eks_bm25_bert)
d = process_table(eks_tfidf_bert)

In [None]:
a=a.round(2)
b=b.round(2)
c=c.round(2)
d=d.round(2)

In [None]:
asda = ["W2V-CBOW", "W2V-SG","FT-CBOW", "FT-SG"]
asd12 = ["0,50", "0,60"]
sasd = ["0,40", "0,45"]
listas = ["Baseline"]
for i in asda:
  for j in sasd:
    for k in asd12:
      listas.append(i+"-"+j+"-BERT-"+k)

c['name'] = listas
d['name'] = listas

In [None]:
aprint = a.to_latex(index=False, longtable=True)
bprint = b.to_latex(index=False, longtable=True)
cprint = c.to_latex(index=False, longtable=True)
dprint = d.to_latex(index=False, longtable=True)

In [None]:
print(aprint)

\begin{longtable}{lrrrrrrrrrr}
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endfirsthead
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endhead
\midrule
\multicolumn{11}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
BM25-BaseLine & 0.730000 & 0.190000 & 0.860000 & 0.900000 & 0.900000 & NaN & NaN & NaN & NaN & NaN \\
W2V-SG1 & 0.700000 & 0.180000 & 0.870000 & 0.890000 & 0.890000 & 0.110000 & 0.020000 & 0.740000 & 0.320000 & 0.660000 \\
W2V-SG2 & 0.700000 & 0.180000 & 0.870000 & 0.890000 & 0.890000 & 0.110000 & 0.020000 & 0.740000 & 0.320000 & 0.660000 \\
W2V-CBOW1 & 0.670000 & 0.180000 & 0.840000 & 0.870000 & 0.900000 & 0.010000 & 0.060000 & 0.370000 & 0.170000 & 0.570000 \\
W2V-CBOW2 & 0.690000 & 0.180000 & 0.860000 & 0.880000 & 0.890000 & 0.030000 & 0.060000 &

In [None]:
print(bprint)

\begin{longtable}{lrrrrrrrrrr}
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endfirsthead
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endhead
\midrule
\multicolumn{11}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
TF-IDF-BaseLine & 0.735206 & 0.186139 & 0.856436 & 0.896040 & 0.896040 & NaN & NaN & NaN & NaN & NaN \\
W2V-SG1 & 0.698817 & 0.182178 & 0.851485 & 0.886139 & 0.886139 & 0.056089 & 0.416930 & 0.764686 & 0.319724 & 0.319724 \\
W2V-SG2 & 0.698830 & 0.182178 & 0.851485 & 0.886139 & 0.886139 & 0.056176 & 0.416930 & 0.764686 & 0.319724 & 0.319724 \\
W2V-CBOW1 & 0.679134 & 0.176238 & 0.861386 & 0.876238 & 0.900990 & 0.018028 & 0.095751 & 0.783057 & 0.158330 & 0.566290 \\
W2V-CBOW2 & 0.694939 & 0.178218 & 0.861386 & 0.876238 & 0.891089 & 0.044237 & 0.158330

In [None]:
print(cprint)

\begin{longtable}{lrrrrrrrrrr}
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endfirsthead
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endhead
\midrule
\multicolumn{11}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
Baseline & 0.726728 & 0.188119 & 0.861386 & 0.896040 & 0.896040 & NaN & NaN & NaN & NaN & NaN \\
W2V-CBOW-0,40-BERT-0,50 & 0.704579 & 0.182178 & 0.866337 & 0.886139 & 0.891089 & 0.222079 & 0.181012 & 0.740688 & 0.319724 & 0.656968 \\
W2V-CBOW-0,40-BERT-0,60 & 0.704592 & 0.182178 & 0.866337 & 0.886139 & 0.891089 & 0.222328 & 0.181012 & 0.740688 & 0.319724 & 0.656968 \\
W2V-CBOW-0,45-BERT-0,50 & 0.713427 & 0.186139 & 0.871287 & 0.886139 & 0.886139 & 0.445001 & 0.566290 & 0.158330 & 0.319724 & 0.319724 \\
W2V-CBOW-0,45-BERT-0,60 & 0.713427 & 0.186139 &

In [None]:
print(dprint)

\begin{longtable}{lrrrrrrrrrr}
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endfirsthead
\toprule
name & recip_rank & P@5 & R@10 & R@15 & R@20 & recip_rank p-value & P@5 p-value & R@10 p-value & R@15 p-value & R@20 p-value \\
\midrule
\endhead
\midrule
\multicolumn{11}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
Baseline & 0.735206 & 0.186139 & 0.856436 & 0.896040 & 0.896040 & NaN & NaN & NaN & NaN & NaN \\
W2V-CBOW-0,40-BERT-0,50 & 0.703438 & 0.182178 & 0.851485 & 0.886139 & 0.886139 & 0.093263 & 0.416930 & 0.764686 & 0.319724 & 0.319724 \\
W2V-CBOW-0,40-BERT-0,60 & 0.703451 & 0.182178 & 0.851485 & 0.886139 & 0.886139 & 0.093395 & 0.416930 & 0.764686 & 0.319724 & 0.319724 \\
W2V-CBOW-0,45-BERT-0,50 & 0.712679 & 0.186139 & 0.871287 & 0.881188 & 0.886139 & 0.222852 & 1.000000 & 0.083249 & 0.083249 & 0.319724 \\
W2V-CBOW-0,45-BERT-0,60 & 0.712692 & 0.186139 &

In [None]:
d

Unnamed: 0,name,recip_rank,P@5,R@10,R@15,R@20,recip_rank p-value,P@5 p-value,R@10 p-value,R@15 p-value,R@20 p-value
0,Baseline,0.735206,0.186139,0.856436,0.89604,0.89604,,,,,
1,"W2V-CBOW-0,40-BERT-0,50",0.703438,0.182178,0.851485,0.886139,0.886139,0.093263,0.41693,0.764686,0.319724,0.319724
2,"W2V-CBOW-0,40-BERT-0,60",0.703451,0.182178,0.851485,0.886139,0.886139,0.093395,0.41693,0.764686,0.319724,0.319724
3,"W2V-CBOW-0,45-BERT-0,50",0.712679,0.186139,0.871287,0.881188,0.886139,0.222852,1.0,0.083249,0.083249,0.319724
4,"W2V-CBOW-0,45-BERT-0,60",0.712692,0.186139,0.871287,0.881188,0.886139,0.223117,1.0,0.083249,0.083249,0.319724
5,"W2V-SG-0,40-BERT-0,50",0.690631,0.180198,0.861386,0.876238,0.90099,0.051893,0.258873,0.783057,0.15833,0.56629
6,"W2V-SG-0,40-BERT-0,60",0.7078,0.182178,0.861386,0.876238,0.891089,0.169255,0.41693,0.783057,0.15833,0.70745
7,"W2V-SG-0,45-BERT-0,50",0.696693,0.186139,0.876238,0.886139,0.89604,0.087563,1.0,0.044941,0.319724,1.0
8,"W2V-SG-0,45-BERT-0,60",0.715415,0.188119,0.866337,0.886139,0.89604,0.296666,0.656968,0.482235,0.319724,1.0
9,"FT-CBOW-0,40-BERT-0,50",0.735565,0.182178,0.866337,0.891089,0.89604,0.984405,0.41693,0.41693,0.56629,1.0
