In [79]:
import warnings
import regex as re
from pathlib import Path
import spacy
from spacy import displacy
import pandas as pd
import numpy as np
import coreferee
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import SmoothingFunction


def resolve_coreference(text):
    doc = nlp(text)
    doc_list = list(doc)
    # doc._.coref_chains.print()
    resolving_indecies = []
    for _,item in enumerate(doc._.coref_chains):
        resolving_indecies.extend(item)
        
    for word in resolving_indecies:
        new_word = ""
        for index in word:
            if doc[index]._.coref_chains.resolve(doc[index]) is not None:
                temp = []
                for item in doc._.coref_chains.resolve(doc[index]):
                    temp.append(str(item))
                new_word = ", ".join(temp)
            
                doc_list[index] = new_word

    final_doc = []
    for item in doc_list:
        final_doc.append(str(item))
    return " ".join(final_doc)

def extract_subjects(sentence):
    subjects = {}
    verbIdx = 0
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX" or token.dep_ == "ROOT":
            verbIdx += 1
            subjectFlag = False
            verb = token
            for child in token.children:
                if child.dep_ in ("nsubj", "csubj"):
                    subtree_tokens = [str(t) for t in child.subtree]
                    subjects[token] = (" ".join(subtree_tokens), verbIdx)
                    subjectFlag = True
                elif child.dep_ == "nsubjpass":
                    for child in verb.children:
                        if child.dep_ == "agent" and len(list(child.children)) > 0:
                            subject = [str(t) for t in list(child.children)[0].subtree]
                            subject = " ".join(subject)
                            break
                        else:
                            subject = "Unknown"
                    subjects[verb] = (subject, verbIdx)
                    subjectFlag = True
            if not subjectFlag:  # didn't find a normal subject
                if token.dep_ in ("relcl" , "acl"):
                    subject = str(token.head)
                    subjects[token] = (subject, verbIdx)  # should get the subtree of the subject
                elif token.dep_ in ("advcl", "conj"):
                    verb = token.head
                    if verb in subjects:
                        subjects[token] = (subjects[verb][0], verbIdx)
                    else:
                        subjects[token] = ("Unknown", verbIdx)  # replace "Unknown" with a suitable default
                elif token.dep_ == "xcomp":
                    verb = token.head
                    if verb in subjects:
                        subjects[token] = (subjects[verb][0], verbIdx)
                    else:
                        subjects[token] = ("Unknown", verbIdx)
                    for child in verb.subtree:
                        if child.dep_ in ("dobj", "dative", "pobj"):
                            subtree_tokens = [str(t) for t in child.subtree]
                            subjects[token] = (" ".join(subtree_tokens), verbIdx)
                            break
                else:
                    subjects[token] = ("Unknown", verbIdx)
                                        
    # (subject, verbIdx, verb)
    return [(v[0], k, v[1]) for k, v in subjects.items()]             
                            
def extract_objects(sentence):
    objects = []
    verbIdx = 0
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX" or token.dep_ == "ROOT":
            verbIdx += 1
            for child in token.children:
                if child.dep_ in ("dobj", "dative", "attr", "oprd", "acomp","ccomp", "xcomp", "nsubjpass"):
                    subtree_tokens = [str(t) for t in child.subtree]
                    objects.append((" ".join(subtree_tokens), token, verbIdx))
                    
    return objects

def extract_state(sentence):
    states = []
    verbIdx = 0
    for token in sentence:
        if token.pos_ =="VERB" or token.pos_ == "AUX":
            verbIdx += 1
            for child in token.children:
                if child.dep_ == "prep":
                    subtree_tokens = [str(t) for t in child.subtree]
                    states.append(((" ".join(subtree_tokens), token, verbIdx)))
    return states

def extract_time(sentence):
    times = {}
    verbIdx = 0
    year_pattern = re.compile(r'\b\d{4}\b')  # matches any four-digit number
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX" or token.dep_ == "ROOT":
            verbIdx += 1
            for child in token.subtree:
                if child.ent_type_ == "DATE" or child.ent_type_ == "TIME":
                    times[child.text] = (token, verbIdx)
                elif year_pattern.search(child.text):
                    year = year_pattern.search(child.text).group()
                    times[year] = (token, verbIdx)
    return [(k, v[0], v[1]) for k, v in times.items()]

def extract_location(sentence):
    locations = {}
    verbIdx = 0
    for token in sentence:
        if token.pos_ == "VERB" or token.pos_ == "AUX" or token.dep_ == "ROOT":
            verbIdx += 1
            for child in token.subtree:
                if child.ent_type_ in ("GPE", "LOC", "FAC"):
                    locations[child.text] = (token, verbIdx)
                    
    return [(k, v[0], v[1]) for k, v in locations.items()]
                                        

def extract_facts(sentence):
    sentence = nlp(sentence)
    states = extract_state(sentence)
    subjects = extract_subjects(sentence)
    objects = extract_objects(sentence)
    times = extract_time(sentence)
    locations = extract_location(sentence)
    
    facts = pd.DataFrame(columns=["Subject", "Relation", "verbIdx", "Objects", "States", "Times", "Locations"])
    
    for subject in subjects: #(Aly, is, 1), (Ziad,is, 2) 
        currentSubject = subject[0]
        verb = subject[1].lemma_
        verbIdx = subject[2]
        mask = (facts['Subject'] != currentSubject) | (facts['Relation'] != verb)
        if mask.all():
            new_row = pd.DataFrame([{"Subject": currentSubject, "Relation": verb, "verbIdx": verbIdx, "Objects": [], "States": [], "Times": [], "Locations": []}])
            facts = pd.concat([facts, new_row], ignore_index=True)

    for obj in objects: #(happy, is, 1), (good, is, 2)
        currentObj = obj[0]
        verb = obj[1].lemma_
        verbIdx = obj[2]
        mask = (facts['Relation'] == verb) & (facts['verbIdx'] == verbIdx)
        if mask.any():
            oldObjects = list(facts.loc[mask, "Objects"].values[0])
            oldObjects.append(currentObj)
            for idx in facts.loc[mask].index:
                facts.at[idx, "Objects"] = oldObjects
            
    for state in states:
        currentState = state[0]
        verb = state[1].lemma_
        verbIdx = state[2]
        mask = (facts['Relation'] == verb) & (facts['verbIdx'] == verbIdx)
        if mask.any():
            oldStates = list(facts.loc[mask, "States"].values[0])
            oldStates.append(currentState)
            for idx in facts.loc[mask].index:
                facts.at[idx, "States"] = oldStates
            
    for time in times:
        currentTime = time[0]
        verb = time[1].lemma_
        verbIdx = time[2]
        mask = (facts['Relation'] == verb) & (facts['verbIdx'] == verbIdx)
        if mask.any():
            oldTimes = list(facts.loc[mask, "Times"].values[0])
            oldTimes.append(currentTime)
            for idx in facts.loc[mask].index:
                facts.at[idx, "Times"] = oldTimes
            
    for location in locations:
        currentLocation = location[0]
        verb = location[1].lemma_
        verbIdx = location[2]
        mask = (facts['Relation'] == verb) & (facts['verbIdx'] == verbIdx)
        if mask.any():
            oldLocations = list(facts.loc[mask, "Locations"].values[0])
            oldLocations.append(currentLocation)
            for idx in facts.loc[mask].index:
                facts.at[idx, "Locations"] = oldLocations
            
    facts = facts.drop(columns=["verbIdx"])
    return facts
        
def preprocess_context(doc):
    text = doc.strip()
    text.replace(".", ",")
    resolved_text = resolve_coreference(text)
    resolved_text = resolved_text.strip()
    resolved_text = resolved_text.replace("  ", " ").replace(" ,", ",").replace(" .", ".").replace("\n", "")
    return resolved_text

def join_sentences_facts(sentences):
    all_facts = pd.DataFrame(columns=["Subject", "Relation", "Objects", "States", "Times", "Locations"])
    for sentence in sentences:
        facts = extract_facts(sentence)
        all_facts = pd.concat([all_facts, facts])
    all_facts = all_facts.groupby(["Subject", "Relation"], as_index=False).agg({
        "Objects": lambda x: [item for sublist in x for item in sublist],
        "States": lambda x: [item for sublist in x for item in sublist],
        "Times": lambda x: [item for sublist in x for item in sublist],
        "Locations": lambda x: [item for sublist in x for item in sublist]
    })
    return all_facts

def change_subject_relation(factsDF, isQuestion = True):
    if not isQuestion:
        factsDF = factsDF[~((factsDF["Subject"] == "Unknown") & (factsDF["Objects"].apply(len) == 0) & (factsDF["States"].apply(len) == 0) & (factsDF["Times"].apply(len) == 0) & (factsDF["Locations"].apply(len) == 0))]
        factsDF = factsDF.reset_index(drop=True)

    for index, row in factsDF.iterrows():
        factsDF.loc[index, "Subject"] = [row['Subject']]
        factsDF.loc[index, "Relation"] = [row['Relation']]
    return factsDF

def similarity(factRow, questionRow, column):
    if len(factRow[column]) == 0 or len(questionRow[column]) == 0 or factRow[column] == ["Unknown"] or questionRow[column] == ["Unknown"]:
        return 0
    columnString = " ".join(factRow[column])
    questionString = " ".join(questionRow[column])
    embeddingFact = model.encode(columnString)
    embeddingQuestion = model.encode(questionString)
    return util.cos_sim(embeddingFact, embeddingQuestion)

        
def cost_function(factsDf, questionFact, excludeColumns=[]):
    cost = 0
    maxFactIdx = 0
    columnNames = ["Subject","Relation", "Objects", "States", "Times", "Locations"]
    for column in excludeColumns:
        columnNames.remove(column)
    for factIdx, factRow in factsDf.iterrows():
        currCost = 0
        for _, questionRow in questionFact.iterrows():
            if len(factRow[excludeColumns[0]]) == 0:
                continue
            for column in columnNames:
                currCost += similarity(factRow, questionRow, column)
        if currCost > cost:
            cost = currCost
            maxFactIdx = factIdx
    return maxFactIdx, cost


def process_question_context(question, doc):
    splitted_question = question.split(" ")
    question_type = splitted_question[0].lower()
    question_nlp = nlp(question)
    if question_nlp[0].ent_type_ == "DATE":
        question_type = "when"
    resolved_doc = preprocess_context(doc)
    cleaned_doc = nlp(resolved_doc)
    sentences = [one_sentence.text.strip() for one_sentence in cleaned_doc.sents]
    
    questionDF = extract_facts(question)
    if len(questionDF) == 1:
        questionDF["Subject"] = question_nlp.text
        questionDF["Relation"] = question_nlp.text
        questionDF["Objects"] = question_nlp.text
        questionDF["States"] = question_nlp.text
        questionDF["Times"] = question_nlp.text
        questionDF["Locations"] = question_nlp.text
    factsDF = join_sentences_facts(sentences)
    
    newFactsDF = change_subject_relation(factsDF, False)
    newQuestionDF = change_subject_relation(questionDF, False)
    
    return newFactsDF, newQuestionDF, question_type

def get_answer(factsDF, questionDF, question_type):
    correctIdx, _ = cost_function(factsDF, questionDF, excludeColumns=[excludesPerQuestionType[question_type]])
    answer = factsDF.loc[correctIdx, excludesPerQuestionType[question_type]]
    if answer == []:
        answer = factsDF.loc[correctIdx, "States"]    
    return " ".join(answer)
    

# if __name__ == "__main__":
nlp = spacy.load('en_core_web_md')
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")
nlp.add_pipe('coreferee')
model = SentenceTransformer("all-MiniLM-L6-v2")

excludesPerQuestionType = {
    "when": "Times",
    "where": "Locations",
    "who": "Subject",
    "what": "Objects",
    "how": "States"
}   
    
# doc = """
# Lionel Andrés "Leo" Messi was born in 24 June 1987 is an Argentine professional footballer plays as a forward for and captains both Major League Soccer club Inter Miami and the Argentina national team.
# He played in Barcelona in 2010.
# Widely regarded as one of the greatest players of all time, Messi has won a record eight Ballon d'Or awards, a record six European Golden Shoes, and was named the world's best player for a record eight times by FIFA.
# Until 2021, he had spent his entire professional career with Barcelona, where he won a club-record 34 trophies, including ten La Liga titles, seven Copa del Rey titles, and the UEFA Champions League four times.
# With his country, he won the 2021 Copa América and the 2022 FIFA World Cup. A prolific goalscorer and creative playmaker, Messi holds the records for most goals, hat-tricks, and assists in La Liga. He has the most international goals by a South American male. Messi has scored over 800 senior career goals for club and country, and the most goals for a single club.
# """
# question = "how did messi play?"
# factsDF, questionDF, question_type = process_question_context(question, doc)
# answer = get_answer(factsDF, questionDF, question_type)

# print("========================================================")
# print("Question: ", question)
# print("Answer: ", answer)




In [80]:
import random
from datasets import load_dataset

dataset = load_dataset("rajpurkar/squad")
# train = dataset['train']
validation = dataset['validation']

random.seed(42)
import getopt
import sys

import pandas as pd
import json 
import regex as re
from tqdm import tqdm
import nltk

In [81]:
len(validation['question'])

10570

In [82]:
# get length of questions which start with when and doesn't contain what, who, where, how and save the fake when questions indicies
when_questions = []
fake_when_questions = []
fake_when_questions_indices = []
for index, question in enumerate(validation['question']):
    if question.startswith("When") and not any(word in question.lower() for word in [", what", ", who", ", where", ", how"]):
        when_questions.append(question)
    elif question.startswith("When"):
        fake_when_questions.append(question)
        fake_when_questions_indices.append(index)
        
    
        
# remove rows from validation which contains questions inside fake_when_questions_indices
validation = validation.filter(lambda example, idx: idx not in fake_when_questions_indices, with_indices=True)
fake_when_questions_indices

[260,
 1420,
 5063,
 5788,
 6600,
 6808,
 6885,
 6904,
 7362,
 7459,
 8962,
 9035,
 9599,
 9667,
 9877,
 10253,
 10403,
 10422]

In [83]:
len(validation['question'])

10552

In [86]:

def QuestionStartsWith_Accuracy(dataset, startsWith):

    correct = 0
    EM = 0
    BLEU = 0
    BLEU1 = 0
    BLEU2 = 0
    BLEU3 = 0
    BLEU4 = 0
    total = 0
    errors = []
    corrects = []
    empties = []
    kolo = 0
    for item in tqdm(dataset):
        random_number = random.randint(0, len(dataset))
        kolo += 1
        included_question = False
        try:
            context = item['context']
            context = re.sub(' +', ' ', context)
            
            question = item['question']
            tempQuestion = question.lower()
            question = re.sub(' +', ' ', question)
            
            answer = item['answers']['text'][0]
            title = item['title']
    
            # check if question starts with startsWith
            for start in startsWith:
                if tempQuestion.startswith(start):
                    included_question = True
                    break
            
            if included_question:
    
                total += 1
                if total == 74:
                    pass
                    
                factsDF, questionDF, question_type = process_question_context(question, context)
                outputAnswer = get_answer(factsDF, questionDF, question_type)
                if outputAnswer == "":
                    empties.append(kolo)
                    outputAnswer = "No_Answer_Found"
                
                # print("Question: " , question)
                # print("Answer: ", answer, "-------" , "Our Answer: ", outputAnswer)
                if outputAnswer in answer or answer in outputAnswer:
                    correct += 1
                    corrects.append(kolo)
                else:
                    # write to when_wrong_answers.txt
                    with open("results/when_wrong_answers.txt", "a") as file:
                        file.write(f"Question: {question}\n")
                        file.write(f"Answer: {answer}\n")
                        file.write(f"Our Answer: {outputAnswer}\n\n")
                        
                        
                if outputAnswer == answer:
                    EM += 1

                n = min(len(outputAnswer.split()), 4)
                if n == 0:
                    BLEUscore = 0
                else:
                    weights = [1.0/n]*n
                    smoothie = SmoothingFunction().method4
                    BLEUscore = nltk.translate.bleu_score.sentence_bleu([answer], outputAnswer, weights=weights, smoothing_function=smoothie)
                    
                    BLEU += BLEUscore
                    BLEU1 += nltk.translate.bleu_score.sentence_bleu([answer], outputAnswer, weights=(1, 0, 0, 0), smoothing_function=smoothie)
                    BLEU2 += nltk.translate.bleu_score.sentence_bleu([answer], outputAnswer, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
                    BLEU3 += nltk.translate.bleu_score.sentence_bleu([answer], outputAnswer, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie)
                    BLEU4 += nltk.translate.bleu_score.sentence_bleu([answer], outputAnswer, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
                    

            if kolo % 1000 == 0:
                print(f"Correct: {correct}, out of {total}: {100*correct/total}%")
                print(f"EM: {EM}, out of {total}: {100*EM/total}%")
                print(f"BLEU: {BLEU}, out of {total}: {100*BLEU/total}%")
                print(f"BLEU1: {BLEU1}, out of {total}: {100*BLEU1/total}%")
                print(f"BLEU2: {BLEU2}, out of {total}: {100*BLEU2/total}%")
                print(f"BLEU3: {BLEU3}, out of {total}: {100*BLEU3/total}%")
                print(f"BLEU4: {BLEU4}, out of {total}: {100*BLEU4/total}%")
        
        except Exception as e:
            print("title: ", title)
            print("Error in question number: ", total)
            print("Question: ", question)
            print("Answer: ", answer)
            print("Error: ", e)
            errors.append(total)
            # total -= 1
            print("\n\n")
    
    if total != 0:
        print(f"Correct: {correct}, out of {total}: {100*correct/total}%")
        print(f"EM: {EM}, out of {total}: {100*EM/total}%")
        print(f"BLEU: {BLEU}, out of {total}: {100*BLEU/total}%")
        print(f"BLEU1: {BLEU1}, out of {total}: {100*BLEU1/total}%")
        print(f"BLEU2: {BLEU2}, out of {total}: {100*BLEU2/total}%")
        print(f"BLEU3: {BLEU3}, out of {total}: {100*BLEU3/total}%")
        print(f"BLEU4: {BLEU4}, out of {total}: {100*BLEU4/total}%")
    else:
        print("No Questions found with the given starting word")
    print("Errors: ", errors)
    print("Empties: ", len(empties), empties)
    return correct, total, errors, corrects


if __name__ == "__main__":

    startsWith = ["where "]
    x = QuestionStartsWith_Accuracy(validation, startsWith)

  9%|▉         | 1000/10552 [00:10<03:16, 48.67it/s]

Correct: 11, out of 34: 32.35294117647059%
EM: 1, out of 34: 2.9411764705882355%
BLEU: 9.726965773204334, out of 34: 28.60872286236569%
BLEU1: 14.396272864234573, out of 34: 42.34197901245462%
BLEU2: 10.921672994256284, out of 34: 32.122567630165534%
BLEU3: 9.487509362101699, out of 34: 27.904439300299117%
BLEU4: 8.683629935592467, out of 34: 25.540088045860198%


 19%|█▉        | 1984/10552 [00:27<01:27, 98.10it/s] 

Correct: 33, out of 97: 34.02061855670103%
EM: 17, out of 97: 17.52577319587629%
BLEU: 32.11004231485293, out of 97: 33.10313640706488%
BLEU1: 42.40285022997555, out of 97: 43.71427858760366%
BLEU2: 33.87904057434809, out of 97: 34.92684595293617%
BLEU3: 30.476275837764373, out of 97: 31.41884106986018%
BLEU4: 28.715016302224946, out of 97: 29.603109589922624%


 29%|██▊       | 3009/10552 [00:40<02:19, 54.06it/s] 

Correct: 47, out of 134: 35.07462686567164%
EM: 21, out of 134: 15.671641791044776%
BLEU: 43.60780555196087, out of 134: 32.54313847161259%
BLEU1: 56.00398996791866, out of 134: 41.794022364118405%
BLEU2: 44.7500200667397, out of 134: 33.39553736323858%
BLEU3: 40.41391317289923, out of 134: 30.159636696193456%
BLEU4: 38.14017524240127, out of 134: 28.462817345075575%


 38%|███▊      | 3990/10552 [00:52<00:40, 163.07it/s]

Correct: 56, out of 164: 34.146341463414636%
EM: 24, out of 164: 14.634146341463415%
BLEU: 51.73249165763776, out of 164: 31.544202230266926%
BLEU1: 67.27648996505964, out of 164: 41.0222499786949%
BLEU2: 53.32943071000831, out of 164: 32.51794555488311%
BLEU3: 47.97926380951687, out of 164: 29.255648664339553%
BLEU4: 45.10963919704107, out of 164: 27.505877559171385%


 47%|████▋     | 4982/10552 [01:05<02:46, 33.51it/s] 

Correct: 64, out of 198: 32.323232323232325%
EM: 29, out of 198: 14.646464646464647%
BLEU: 60.14417567646138, out of 198: 30.375846301243122%
BLEU1: 79.3959717057536, out of 198: 40.09897560896646%
BLEU2: 62.344310071079086, out of 198: 31.487025288423784%
BLEU3: 55.5789375985511, out of 198: 28.07017050431874%
BLEU4: 51.905716543517265, out of 198: 26.21500835531175%


 55%|█████▌    | 5849/10552 [01:17<01:08, 68.21it/s] 

Correct: 71, out of 229: 31.004366812227076%
EM: 34, out of 229: 14.847161572052402%
BLEU: 69.21768330854937, out of 229: 30.22606258015256%
BLEU1: 90.3795995536418, out of 229: 39.46707404089162%
BLEU2: 71.47948388642934, out of 229: 31.213748422021542%
BLEU3: 63.93194461710433, out of 229: 27.917879745460404%
BLEU4: 59.738695705656546, out of 229: 26.086766683692815%


 66%|██████▋   | 6998/10552 [01:31<00:13, 259.64it/s]

Correct: 80, out of 258: 31.007751937984494%
EM: 35, out of 258: 13.565891472868216%
BLEU: 74.68774888511221, out of 258: 28.94873987795047%
BLEU1: 98.26541801939418, out of 258: 38.08737132534658%
BLEU2: 76.6020468490663, out of 258: 29.690715832971435%
BLEU3: 68.32207892410275, out of 258: 26.48142593957471%
BLEU4: 63.76198443310685, out of 258: 24.71394745469258%


 75%|███████▌  | 7962/10552 [01:35<00:08, 293.18it/s]

Correct: 85, out of 270: 31.48148148148148%
EM: 37, out of 270: 13.703703703703704%
BLEU: 78.02538250229046, out of 270: 28.898289815663134%
BLEU1: 102.90113804138355, out of 270: 38.111532607919834%
BLEU2: 80.19273511064534, out of 270: 29.70101300394272%
BLEU3: 71.5428648738088, out of 270: 26.497357360669923%
BLEU4: 66.84143136832775, out of 270: 24.75608569197324%


  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
 85%|████████▍ | 8956/10552 [01:57<00:24, 66.21it/s]

Correct: 93, out of 331: 28.09667673716012%
EM: 40, out of 331: 12.084592145015106%
BLEU: 89.70917598367197, out of 331: 27.102470085701505%
BLEU1: 119.75748870511767, out of 331: 36.180510182815006%
BLEU2: 90.66809508709726, out of 331: 27.392173742325458%
BLEU3: 79.94651884921835, out of 331: 24.153026842664154%
BLEU4: 74.11394286005749, out of 331: 22.39091929306873%


  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
  referred_head_lexeme.similarity(referring_head_lexeme)
 95%|█████████▍| 9997/10552 [02:26<00:13, 40.81it/s]

Correct: 108, out of 400: 27.0%
EM: 43, out of 400: 10.75%
BLEU: 102.98285502952864, out of 400: 25.74571375738216%
BLEU1: 141.21840620715292, out of 400: 35.30460155178823%
BLEU2: 104.74844810908009, out of 400: 26.18711202727002%
BLEU3: 91.03907405515244, out of 400: 22.759768513788107%
BLEU4: 83.50820165622403, out of 400: 20.877050414056008%


100%|██████████| 10552/10552 [02:43<00:00, 64.72it/s] 

Correct: 114, out of 431: 26.45011600928074%
EM: 44, out of 431: 10.208816705336426%
BLEU: 110.38802863047235, out of 431: 25.612071607998224%
BLEU1: 151.86970645919962, out of 431: 35.23659082580038%
BLEU2: 113.03159572667822, out of 431: 26.225428242848775%
BLEU3: 98.14737674219926, out of 431: 22.772013165243447%
BLEU4: 90.05480145366859, out of 431: 20.89438548809016%
Errors:  []
Empties:  47 [1195, 1508, 1668, 1964, 2042, 2409, 2431, 2473, 3393, 3892, 3906, 4220, 4584, 4592, 4624, 4628, 4631, 5077, 5459, 5460, 6151, 6389, 6398, 6626, 7485, 7825, 8735, 8748, 8755, 8764, 8791, 8805, 8825, 8826, 8829, 8844, 8887, 8889, 8890, 8920, 8942, 8956, 9086, 9467, 9473, 9542, 10116]





In [85]:
# [174, 315, 326, 349, 472, 488, 493, 498, 530, 667]
# when: 48%