In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there
# (google.colab is only importable inside a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/pygaggle-master/

In [None]:
ls

In [None]:
pip install --upgrade pip

In [None]:
pip install -r requirements.txt

In [None]:
cd /content/drive/MyDrive/Colab Notebooks/pygaggle-master/

In [None]:
# Metric Functions

import pandas as pd
import numpy as np

def calculate_accuracy(df_json, k=100):
  """Share of all answer-bearing passages that are ranked within the top ``k``.

  Counts are pooled over every row: the number of passages with
  ``has_answer`` set among the first ``k`` ranks, divided by the number of
  passages with ``has_answer`` set at any rank. Note that this is not the
  "at least one hit in the top k" definition of retrieval accuracy.

  Args:
    df_json: table-like object (e.g. a pandas DataFrame) whose ``'ctxs'``
      column holds, per row, the ranked list of passage dicts; each dict
      must have a truthy/falsy ``'has_answer'`` entry.
    k: rank cutoff; only the first ``k`` passages of a row count as hits.

  Returns:
    The pooled ratio as a float, or ``0.0`` when no passage in the whole
    input contains an answer (including empty input).
  """
  true_cnt = 0    # answer-bearing passages at any rank, summed over all rows
  true_k_cnt = 0  # answer-bearing passages within the first k ranks, summed over all rows
  for ctxs in df_json['ctxs']: # iterate per row
    for rank, ctx in enumerate(ctxs): # iterate per ctxs
      if ctx['has_answer']:
        if rank < k: # rank is 0-based, so ranks 0..k-1 are the top k
          true_k_cnt += 1
        true_cnt += 1
  if not true_cnt:
    # Nothing to retrieve anywhere: avoid dividing by zero.
    return 0.0
  return true_k_cnt / true_cnt

def calculate_precision(df_json, k=100):
  """Mean Precision@k over all rows.

  For each row, Precision@k is the number of passages with ``has_answer``
  set among the first ``k`` ranks divided by ``k``. (Dividing by the row's
  total number of answer-bearing passages instead would yield Recall@k.)

  Args:
    df_json: table-like object (e.g. a pandas DataFrame) whose ``'ctxs'``
      column holds, per row, the ranked list of passage dicts; each dict
      must have a truthy/falsy ``'has_answer'`` entry.
    k: rank cutoff. Rows with fewer than ``k`` passages are still divided
      by ``k``; the missing ranks count as misses.

  Returns:
    The average of the per-row Precision@k values, or ``0.0`` for empty
    input or a non-positive ``k``.
  """
  n_rows = len(df_json)
  if n_rows == 0 or k <= 0:
    return 0.0
  precision = 0
  for ctxs in df_json['ctxs']: # iterate per row
    # Slicing tolerates rows that hold fewer than k passages.
    hits = sum(1 for ctx in ctxs[:k] if ctx['has_answer'])
    precision += hits / k
  return precision / n_rows

def calculate_mrr(df_json, k=100):
  """Mean Reciprocal Rank, looking only at the first ``k`` passages of each row.

  A row contributes ``1 / rank`` of its first passage with ``has_answer``
  set (ranks start at 1), or ``0`` when none of its first ``k`` passages
  has an answer.

  Args:
    df_json: table-like object (e.g. a pandas DataFrame) whose ``'ctxs'``
      column holds, per row, the ranked list of passage dicts; each dict
      must have a truthy/falsy ``'has_answer'`` entry.
    k: rank cutoff; rows may hold fewer than ``k`` passages.

  Returns:
    The reciprocal ranks averaged over all rows, or ``0.0`` for empty input.
  """
  n_rows = len(df_json)
  if n_rows == 0:
    return 0.0
  mrr = 0
  for ctxs in df_json['ctxs']: # iterate per row
    # max(k, 0) keeps a negative k from slicing off the end of the list.
    for rank, ctx in enumerate(ctxs[:max(k, 0)], start=1): # iterate per ctxs
      if ctx['has_answer']:
        mrr += 1 / rank
        break # only the first hit counts
  return mrr / n_rows

def calculate_map(df_json, k=100):
  """Mean Average Precision over the first ``k`` passages of each row.

  For each row, the precision at every rank that holds a passage with
  ``has_answer`` set is summed and divided by the number of such passages
  found within the first ``k`` ranks. Rows without a hit contribute ``0``.

  Args:
    df_json: table-like object (e.g. a pandas DataFrame) whose ``'ctxs'``
      column holds, per row, the ranked list of passage dicts; each dict
      must have a truthy/falsy ``'has_answer'`` entry.
    k: rank cutoff; rows may hold fewer than ``k`` passages.

  Returns:
    The per-row average precisions averaged over all rows, or ``0.0`` for
    empty input.
  """
  n_rows = len(df_json)
  if n_rows == 0:
    return 0.0
  ap_total = 0 # running sum of per-row average precision
  for ctxs in df_json['ctxs']: # iterate per row
    true_cnt = 0
    precision_sum = 0
    # max(k, 0) keeps a negative k from slicing off the end of the list.
    for rank, ctx in enumerate(ctxs[:max(k, 0)], start=1): # iterate per ctxs
      if ctx['has_answer']:
        true_cnt += 1
        precision_sum += true_cnt / rank # precision at this hit's rank
    if true_cnt:
      ap_total += precision_sum / true_cnt
  return ap_total / n_rows

def calculate_ndcg(df_json, k=100):
  """Mean nDCG over the first ``k`` passages of each row, with binary gains.

  A passage with ``has_answer`` set at 0-based position ``p`` adds
  ``1 / log2(p + 2)`` to the row's DCG. The ideal DCG places the hits found
  within the first ``k`` ranks at the best positions (0, 1, 2, ...); hits
  beyond the cutoff are not part of the ideal. Rows without a hit
  contribute ``0``.

  Args:
    df_json: table-like object (e.g. a pandas DataFrame) whose ``'ctxs'``
      column holds, per row, the ranked list of passage dicts; each dict
      must have a truthy/falsy ``'has_answer'`` entry.
    k: rank cutoff; rows may hold fewer than ``k`` passages.

  Returns:
    The per-row nDCG values averaged over all rows, or ``0.0`` for empty
    input.
  """
  n_rows = len(df_json)
  if n_rows == 0:
    return 0.0
  ndcg = 0
  for ctxs in df_json['ctxs']: # iterate per row
    dcg = 0
    true_cnt = 0
    # Must start at 0: any other start value deflates every score
    # (a single hit at rank 1 would give 1 / 2 instead of 1).
    ideal_dcg = 0
    # max(k, 0) keeps a negative k from slicing off the end of the list.
    for pos, ctx in enumerate(ctxs[:max(k, 0)]): # iterate per ctxs
      if ctx['has_answer']:
        dcg += 1 / np.log2(pos + 2)
        ideal_dcg += 1 / np.log2(true_cnt + 2) # the n-th hit ideally sits at position n
        true_cnt += 1
    if ideal_dcg:
      ndcg += dcg / ideal_dcg
  return ndcg / n_rows

In [None]:
import json

def reading_json_file(path, top_n=100):
    """Load a retrieval-result JSON file and pull out the fields used for reranking.

    The file must contain a JSON list of records, each with a ``"question"``
    and a ``"ctxs"`` list whose items carry ``"id"``, ``"title"``, ``"text"``
    and ``"has_answer"``.

    Args:
        path: location of the UTF-8 encoded JSON file.
        top_n: maximum number of passages taken per question. Questions with
            fewer passages are returned with as many as they have.

    Returns:
        A tuple ``(queryy, corpus, data)``:
        ``queryy`` is the list of question strings;
        ``corpus[i][j]`` is ``[id, title, text, has_answer]`` for the j-th
        passage of the i-th question;
        ``data`` is the parsed JSON, unchanged.
    """
    with open(path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    queryy = [record["question"] for record in data]
    # Build the rows from the passages that exist rather than a fixed-size
    # placeholder grid, so short passage lists do not raise IndexError.
    corpus = [
        [[ctx["id"], ctx["title"], ctx["text"], ctx["has_answer"]]
         for ctx in record["ctxs"][:top_n]]
        for record in data
    ]

    return queryy, corpus, data

In [None]:
from pygaggle.rerank.base import Query, Text
from pygaggle.rerank.transformer import MonoBERT, MonoT5

## Reranker selection - set this to the model you want to run
## (MonoBERT is kept below as the commented-out alternative).
# reranker =  MonoBERT()
reranker =  MonoT5()

In [None]:
## Input path - set this to the JSON file to rerank
## (the second path is kept as a commented-out alternative).
file_path ='/content/drive/MyDrive/Colab Notebooks/NQ_test_top100_all.json'
#file_path ='/content/drive/MyDrive/Colab Notebooks/NQ_dev_top100_all.json'

# Load the question strings, the per-question passage fields, and the raw JSON records.
queryy, corpus, data = reading_json_file(file_path)

In [None]:
# Range of questions to process. Narrow it to rerank the dataset in chunks;
# the same range is used for the loop, the metrics and the saved file so the
# three can never disagree.
start, end = 0, len(data)

# Output file name - change it per dataset / chunk.
output_path = 'NQ_dev_top100_qe_reranked_t5_2.json'

for q in range(start, end):
  query = Query(queryy[q])
  passages = corpus[q]

  # Carry every passage field along as metadata so it can be written back
  # after reranking; the initial score of 0 is replaced by the reranker.
  texts = [Text(p[2], {'id': p[0], 'title': p[1], 'text': p[2], 'has_answer': p[3]}, 0) for p in passages]

  reranked = reranker.rerank(query, texts)

  # Overwrite the stored passages with the reranked ones, slot by slot.
  # zip() stops at the shorter list, so questions with fewer than 100
  # passages are handled without a hard-coded count.
  for ctx, ranked in zip(data[q]["ctxs"], reranked):
    ctx["id"] = ranked.metadata["id"]
    ctx["title"] = ranked.metadata["title"]
    ctx["text"] = ranked.metadata["text"]
    ctx["score"] = str(ranked.score)
    ctx["has_answer"] = ranked.metadata["has_answer"]

k = 10

# The metric helpers index their input as df_json['ctxs'][i], which a plain
# list of records does not support, so wrap the processed rows in a DataFrame.
results_df = pd.DataFrame(data[start:end])

print(f'Accuracy@{k}: {calculate_accuracy(results_df, k):.4f}')
print(f'Precision@{k}: {calculate_precision(results_df, k):.4f}')
print(f'MRR@{k}: {calculate_mrr(results_df, k):.4f}')
print(f'MAP@{k}: {calculate_map(results_df, k):.4f}')
print(f'nDCG@{k}: {calculate_ndcg(results_df, k):.4f}')

# Save exactly the questions that were reranked above.
with open(output_path, 'w') as file:
  json.dump(data[start:end], file)