In [1]:
from evaluation import f1_score, exact_match_score, f1, ems
import pickle as pkl
import json
import numpy as np
import itertools

In [2]:
def evaluate(dataset, retriever, k = None, round = 5, topks = [1, 5, 10, 20, 30], k_emb = 15):    
    if retriever != 'knn':
        res = json.load(open('../result/{}/{}_{}.json'.format(dataset, retriever, k), 'rb'))
    else:
        res = json.load(open('../result/{}/{}_{}_{}.json'.format(dataset, retriever, k_emb, k), 'rb'))
    
    
    filter_res = [r for r in res if r['prediction'] != 'System mistake']
    
    f1s, emss, accs = [], [], []
    
    if retriever not in ['golden', 'no']:
        recall, precision, sp_em = [], [], []
    
    for r in filter_res:
        accs.append(('1' in r['grade'])*1.0)
        
        if dataset in ['hotpotqa', '2WikiMQA', 'musique']:
            f1s.append(f1_score(r['prediction'], r['answer']))
            emss.append(exact_match_score(r['prediction'], r['answer']))
            
        elif dataset in ['iirc']:
            f1s.append(f1(r['prediction'], r['answer']))
            emss.append(ems(r['prediction'], r['answer']))
        
        r['corpus'] = list(itertools.chain(*[_.split('\n') for _ in r['corpus']]))
        if retriever not in ['golden', 'no']:
            evi = set([_[1] for _ in r['supports']])
            
            tmp_recall = []
            tmp_precision = []    
            tmp_sp_em = []
            for kk in topks:
                if kk <= k:
                    tmp = set(r['corpus'][:kk])

                    tmp_recall.append(len(evi.intersection(tmp))/len(evi))
                    tmp_precision.append(len(evi.intersection(tmp))/kk)
                    
                    if evi.issubset(tmp):
                        tmp_sp_em.append(1)
                    else:
                        tmp_sp_em.append(0)
                
            
            recall.append(tmp_recall)
            precision.append(tmp_precision)
            sp_em.append(tmp_sp_em)

    
    print('Acc:', np.mean(accs))
    print('F1:', np.mean(f1s))
    print('EM:', np.mean(emss))
    

    if retriever not in ['golden', 'no']:
        print('Recall:', np.mean(np.array(recall), axis = 0))
        print('Precision:', np.mean(np.array(precision), axis = 0))
        print('SP_EM:', np.mean(np.array(sp_em), axis = 0))

evaluate(dataset = '2WikiMQA', retriever = 'Golden', k = 30)

# IIRC

In [None]:
# Golden
evaluate(dataset = 'iirc', retriever = 'golden', k = 30)

In [None]:
# No
evaluate(dataset = 'iirc', retriever = 'no', k = 30)

In [None]:
# KNN
evaluate(dataset = 'iirc', retriever = 'knn', k = 30, k_emb = 15)

In [None]:
# Tf-IDF
evaluate(dataset = 'iirc', retriever = 'tf-idf', k = 30)

In [None]:
# Bm25
evaluate(dataset = 'iirc', retriever = 'bm25', k = 30)

In [None]:
# KG
evaluate(dataset = 'iirc', retriever = 'kg_test_docs_graph', k = 30)

In [None]:
# MDR
evaluate(dataset = 'iirc', retriever = 'mhop', k = 30)

In [None]:
# LLaMA
evaluate(dataset = 'iirc', retriever = 'llama', k = 30)

In [None]:
# DPR
evaluate(dataset = 'iirc', retriever = 'dpr', k = 30)

In [None]:
# T5
evaluate(dataset = 'iirc', retriever = 't5', k = 30)

In [None]:
# KG-T5
evaluate(dataset = 'iirc', retriever = 'kg-t5_test_docs_graph', k = 30)

In [None]:
# KG-LLaMA
evaluate(dataset = 'iirc', retriever = 'kg-llama_test_docs_graph', k = 30)

In [None]:
# IRCoT
evaluate(dataset = 'iirc', retriever = 'ircot', k = 30)

In [None]:
# KG-MDR
evaluate(dataset = 'iirc', retriever = 'kg-mdr_test_docs_graph', k = 30)

In [None]:
# LLM-MDR
evaluate(dataset = 'iirc', retriever = 'llm-mdr', k = 30)

# HotpotQA

In [None]:
# Golden
evaluate(dataset = 'hotpotqa', retriever = 'golden', k = 30)

In [None]:
# No
evaluate(dataset = 'hotpotqa', retriever = 'no', k = 30)

In [None]:
# bm25
evaluate(dataset = 'hotpotqa', retriever = 'bm25', k = 30)

In [None]:
# TF-IDF
evaluate(dataset = 'hotpotqa', retriever = 'tf-idf', k = 30)

In [None]:
# KNN
evaluate(dataset = 'hotpotqa', retriever = 'knn', k = 30, k_emb = 15)

In [None]:
# MDR
evaluate(dataset = 'hotpotqa', retriever = 'mhop', k = 30)

In [None]:
# DPR
evaluate(dataset = 'hotpotqa', retriever = 'dpr', k = 30)

In [None]:
# LLaMA
evaluate(dataset = 'hotpotqa', retriever = 'llama', k = 30)

In [None]:
# T5
evaluate(dataset = 'hotpotqa', retriever = 't5', k = 30)

In [None]:
# T5-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-t5_graph_tagme_0.8', k = 30)

In [None]:
# LLaMA-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-llama_test_docs_graph', k = 30)

In [None]:
# MDR-KG-TAGME
evaluate(dataset = 'hotpotqa', retriever = 'kg-mdr_test_docs_graph', k = 30)

In [None]:
# IRCOT
evaluate(dataset = 'hotpotqa', retriever = 'ircot', k = 30)

In [None]:
# llm-mdr
evaluate(dataset = 'hotpotqa', retriever = 'llm-mdr', k = 30)

In [None]:
# kg-chatgpt
evaluate(dataset = 'hotpotqa', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

# 2WikiMQA(Wikimultihop)

In [4]:
# Golden
evaluate(dataset = '2WikiMQA', retriever = 'Golden', k = 30)

Acc: 0.812
F1: 0.6942132756132755
EM: 0.538
Recall: [0.44316667 0.996      0.996      0.996      0.996     ]
Precision: [0.996  0.486  0.243  0.1215 0.081 ]
SP_EM: [0.    0.992 0.992 0.992 0.992]


In [5]:
# No
evaluate(dataset = '2WikiMQA', retriever = 'no', k = 30)

Acc: 0.358
F1: 0.3172920634920635
EM: 0.254


In [7]:
# MDR
evaluate(dataset = '2WikiMQA', retriever = 'MDR', k = 30)

Acc: 0.704
F1: 0.5824903539022184
EM: 0.44
Recall: [0.15216667 0.39133333 0.56833333 0.684      0.735     ]
Precision: [0.336      0.1748     0.1296     0.0789     0.05666667]
SP_EM: [0.    0.14  0.32  0.444 0.5  ]


In [10]:
# DPR
evaluate(dataset = '2WikiMQA', retriever = 'DPR', k = 30)

Acc: 0.718
F1: 0.6077136974136974
EM: 0.478
Recall: [0.166      0.48266667 0.6685     0.806      0.8805    ]
Precision: [0.372  0.2228 0.1566 0.0959 0.0708]
SP_EM: [0.    0.196 0.422 0.632 0.762]


In [12]:
# KNN
evaluate(dataset = '2WikiMQA', retriever = 'knn', k = 30, k_emb = 15)

Acc: 0.596
F1: 0.5079017205017206
EM: 0.4
Recall: [0.0075     0.0645     0.169      0.66016667 0.68266667]
Precision: [0.018      0.0324     0.0414     0.0778     0.05393333]
SP_EM: [0.    0.    0.02  0.392 0.408]


In [11]:
# bm25
evaluate(dataset = '2WikiMQA', retriever = 'bm25', k = 30)

Acc: 0.582
F1: 0.48706925296925296
EM: 0.398
Recall: [0.1375     0.40816667 0.50266667 0.58816667 0.62416667]
Precision: [0.33       0.192      0.1186     0.0706     0.05026667]
SP_EM: [0.    0.152 0.244 0.328 0.364]


In [13]:
# TF-IDF
evaluate(dataset = '2WikiMQA', retriever = 'tf-idf', k = 30)

Acc: 0.642
F1: 0.5404131313131313
EM: 0.424
Recall: [0.208      0.52416667 0.61216667 0.69566667 0.73666667]
Precision: [0.48   0.244  0.1434 0.0825 0.0588]
SP_EM: [0.    0.236 0.334 0.442 0.5  ]


In [None]:
# LLaMA
evaluate(dataset = '2WikiMQA', retriever = 'llama', k = 30)

In [14]:
# T5 
evaluate(dataset = '2WikiMQA', retriever = 't5', k = 30)

Acc: nan
F1: nan
EM: nan
Recall: nan
Precision: nan
SP_EM: nan


In [None]:
# IRCoT
evaluate(dataset = '2WikiMQA', retriever = 'ircot', k = 30)

In [None]:
# KG-T5-TAGME
evaluate(dataset = '2WikiMQA', retriever = 'kg-t5_test_docs_graph', k = 30)

In [None]:
# KG-LLaMA-TAGME
evaluate(dataset = '2WikiMQA', retriever = 'kg-llama_test_docs_graph', k = 30)

In [None]:
# KG-MDR-TAGME
evaluate(dataset = '2WikiMQA', retriever = 'kg-mdr_test_docs_graph', k = 30)

In [None]:
# llm-mdr
evaluate(dataset = '2WikiMQA', retriever = 'llm-mdr', k = 30)

In [15]:
# kg-chatgpt
evaluate(dataset = '2WikiMQA', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

FileNotFoundError: [Errno 2] No such file or directory: '../result/2WikiMQA/kg-chatgpt_test_docs_graph_30.json'

# MuSiQue

In [None]:
# no
evaluate(dataset = 'musique', retriever = 'no', k = 30)

In [None]:
# golden
evaluate(dataset = 'musique', retriever = 'golden', k = 30)

In [None]:
# knn
evaluate(dataset = 'musique', retriever = 'knn', k = 30)

In [None]:
# bm25
evaluate(dataset = 'musique', retriever = 'bm25', k = 30)

In [None]:
# tf-idf
evaluate(dataset = 'musique', retriever = 'tf-idf', k = 30)

In [None]:
# mdr
evaluate(dataset = 'musique', retriever = 'mhop', k = 30)

In [None]:
# dpr
evaluate(dataset = 'musique', retriever = 'dpr', k = 30)

In [None]:
# t5
evaluate(dataset = 'musique', retriever = 't5', k = 30)

In [None]:
# llama
evaluate(dataset = 'musique', retriever = 'llama', k = 30)

In [None]:
# kg
evaluate(dataset = 'musique', retriever = 'kg_test_docs_graph', k = 30)

In [None]:
# kg-t5
evaluate(dataset = 'musique', retriever = 'kg-t5_graph_tagme_0.9', k = 30)

In [None]:
# kg-mdr
evaluate(dataset = 'musique', retriever = 'kg-mdr_test_docs_graph', k = 30)

In [None]:
# kg-chatgpt
evaluate(dataset = 'musique', retriever = 'kg-chatgpt_test_docs_graph', k = 30)

In [None]:
# IRCoT
evaluate(dataset = 'musique', retriever = 'ircot', k = 30)

In [None]:
# llm-mdr
evaluate(dataset = 'musique', retriever = 'llm-mdr', k = 30)