# Defining parameterized evaluation functions for each metric

In [1]:
import pandas as pd
import gc
import torch
from datasets import Dataset
from mirage import factcc, trueTeacher, feqa, factacc

def load_dataset(dataset_path):
    """Read a CSV file and wrap its contents in a Hugging Face ``Dataset``.

    Args:
        dataset_path: Path handed to ``pandas.read_csv``.

    Returns:
        The ``Dataset`` produced by ``Dataset.from_pandas`` from the loaded
        DataFrame.
    """
    return Dataset.from_pandas(pd.read_csv(dataset_path))

def clear_gpu_memory():
    """Run the garbage collector and release cached CUDA memory on every GPU.

    ``gc.collect()`` is called first, then ``torch.cuda.empty_cache()`` is
    invoked once per device reported by ``torch.cuda.device_count()``.

    Each device is visited inside a ``torch.cuda.device`` context manager
    rather than with ``torch.cuda.set_device``, so the caller's current CUDA
    device is the same after this function returns as it was before. On a
    machine without GPUs the loop body never runs.

    Returns:
        None.
    """
    gc.collect()
    for device_id in range(torch.cuda.device_count()):
        # The context manager switches to `device_id` and restores the
        # previously selected device on exit.
        with torch.cuda.device(device_id):
            torch.cuda.empty_cache()

def evaluate_factcc(dataset, dataset_name, source_col, gen_col, save_folder):
    """Evaluate ``dataset`` with the FactCC metric.

    GPU caches are cleared, a new ``factcc`` pipeline is created on
    ``cuda:0``, and the dataset is evaluated with batched mapping
    (batch size 10) and truncation enabled.

    Args:
        dataset: Dataset passed to ``evaluate_dataset``.
        dataset_name: Not used by this function.
        source_col: Column name forwarded as ``source_col``.
        gen_col: Column name forwarded as ``gen_col``.
        save_folder: Forwarded as ``save_result_dataset_folder_path``.

    Returns:
        Whatever the metric's ``evaluate_dataset`` call returns.
    """
    clear_gpu_memory()

    factcc_metric = factcc()
    factcc_metric.create_pipeline(device="cuda:0")

    evaluation_args = dict(
        source_col=source_col,
        gen_col=gen_col,
        truncation=True,
        save_result_dataset_folder_path=save_folder,
        map_kwargs={"batched": True, "batch_size": 10},
    )
    return factcc_metric.evaluate_dataset(dataset, **evaluation_args)

def evaluate_trueTeacher(dataset, dataset_name, source_col, gen_col, save_folder):
    """Evaluate ``dataset`` with the TrueTeacher metric.

    GPU caches are cleared, then a ``trueTeacher`` pipeline is created in
    float16 with ``device_map="auto"``, an ``offload`` folder, and explicit
    per-device memory limits. The dataset is evaluated with truncation
    enabled and non-batched mapping.

    Args:
        dataset: Dataset passed to ``evaluate_dataset``.
        dataset_name: Not used by this function.
        source_col: Column name forwarded as ``source_col``.
        gen_col: Column name forwarded as ``gen_col``.
        save_folder: Forwarded as ``save_result_dataset_folder_path``.

    Returns:
        Whatever the metric's ``evaluate_dataset`` call returns.
    """
    clear_gpu_memory()

    metric = trueTeacher()
    metric.create_pipeline(
        torch_dtype=torch.float16,
        device_map="auto",
        offload_folder="offload",
        # Memory caps for GPU 0, GPU 1 and CPU offloading.
        max_memory={0: "22GB", 1: "22GB", "cpu": "20GB"}
    )

    map_kwargs = {"batched": False, "batch_size": 1}

    return metric.evaluate_dataset(
        dataset,
        source_col=source_col,
        gen_col=gen_col,
        truncation=True,
        save_result_dataset_folder_path=save_folder,
        # Forward the mapping options like the other evaluate_* helpers do;
        # previously this dict was built but never used.
        map_kwargs=map_kwargs
    )

def evaluate_feqa(dataset, dataset_name, source_col, gen_col, save_folder):
    """Evaluate ``dataset`` with the FEQA metric.

    GPU caches are cleared, a new ``feqa`` pipeline is created on ``cuda:0``,
    and the dataset is evaluated with batched mapping (batch size 10). The
    generated questions and answers are kept, and the question-generation
    pipeline is called with truncation at a maximum length of 512.

    Args:
        dataset: Dataset passed to ``evaluate_dataset``.
        dataset_name: Not used by this function.
        source_col: Column name forwarded as ``source_col``.
        gen_col: Column name forwarded as ``gen_col``.
        save_folder: Forwarded as ``save_result_dataset_folder_path``.

    Returns:
        Whatever the metric's ``evaluate_dataset`` call returns.
    """
    clear_gpu_memory()

    feqa_metric = feqa()
    feqa_metric.create_pipeline(device="cuda:0")

    evaluation_args = dict(
        source_col=source_col,
        gen_col=gen_col,
        keep_questions=True,
        keep_answers=True,
        qg_pipeline_call_args={"truncation": True, "max_length": 512},
        save_result_dataset_folder_path=save_folder,
        map_kwargs={"batched": True, "batch_size": 10},
    )
    return feqa_metric.evaluate_dataset(dataset, **evaluation_args)

def evaluate_qags(dataset, dataset_name, source_col, gen_col, save_folder):
    """Evaluate ``dataset`` with the QAGS metric.

    GPU caches are cleared, a new ``qags`` pipeline is created on ``cuda:0``,
    and the dataset is evaluated with batched mapping (batch size 10). The
    generated questions and answers are kept, and the question-generation
    pipeline is called with truncation at a maximum length of 512.

    Args:
        dataset: Dataset passed to ``evaluate_dataset``.
        dataset_name: Not used by this function.
        source_col: Column name forwarded as ``source_col``.
        gen_col: Column name forwarded as ``gen_col``.
        save_folder: Forwarded as ``save_result_dataset_folder_path``.

    Returns:
        Whatever the metric's ``evaluate_dataset`` call returns.
    """
    # `qags` is not part of the notebook's top-level `from mirage import ...`
    # line, so the name was undefined here. Import it locally so a failure
    # stays confined to this function.
    # NOTE(review): assumes `mirage` exports `qags` alongside the other
    # metrics - confirm against the installed package.
    from mirage import qags

    clear_gpu_memory()

    metric = qags()
    metric.create_pipeline(device="cuda:0")

    map_kwargs = {"batched": True, "batch_size": 10}

    return metric.evaluate_dataset(
        dataset,
        source_col=source_col,
        gen_col=gen_col,
        keep_questions=True,
        keep_answers=True,
        qg_pipeline_call_args={"truncation":True, "max_length":512},
        save_result_dataset_folder_path=save_folder,
        map_kwargs=map_kwargs
    )

def evaluate_factacc(dataset, dataset_name, source_col, gen_col, save_folder):
    """Evaluate ``dataset`` with the FactAcc metric.

    GPU caches are cleared, a new ``factacc`` pipeline is created on
    ``cuda:0``, and the dataset is evaluated with truncation enabled and
    non-batched mapping.

    Args:
        dataset: Dataset passed to ``evaluate_dataset``.
        dataset_name: Not used by this function.
        source_col: Column name forwarded as ``source_col``.
        gen_col: Column name forwarded as ``gen_col``.
        save_folder: Forwarded as ``save_result_dataset_folder_path``.

    Returns:
        Whatever the metric's ``evaluate_dataset`` call returns.
    """
    clear_gpu_memory()

    factacc_metric = factacc()
    factacc_metric.create_pipeline(device="cuda:0")

    evaluation_args = dict(
        source_col=source_col,
        gen_col=gen_col,
        truncation=True,
        save_result_dataset_folder_path=save_folder,
        # Batching is switched off because batched mapping produced errors
        # on the dialfact dataset.
        map_kwargs={"batched": False, "batch_size": 10},
    )
    return factacc_metric.evaluate_dataset(dataset, **evaluation_args)


In [2]:
# Registry of the metrics to run: display name -> evaluation function.
# Iteration order is the order in which the metrics are processed.
metric_functions = dict(
    FactCC=evaluate_factcc,
    # TrueTeacher=evaluate_trueTeacher,  # currently disabled
    FEQA=evaluate_feqa,
    FactAcc=evaluate_factacc,
)

# True datasets

In [3]:
# Folder holding the downloaded CSV files, relative to this notebook.
true_dataset_folder = "../../datasets/true/"

# Dataset name -> path of its CSV file inside `true_dataset_folder`.
dataset_dict = {
    name: true_dataset_folder + csv_file
    for name, csv_file in (
        ("begin", "begin_dev_download.csv"),
        ("dialfact", "dialfact_valid_download.csv"),
        ("fever", "fever_dev_download.csv"),
        ("mnbm", "mnbm_download.csv"),
        ("q2", "q2_download.csv"),
        ("qags_cnndm", "qags_cnndm_download.csv"),
        ("qags_xsum", "qags_xsum_download.csv"),
        ("summeval", "summeval_download.csv"),
        ("vitc", "vitc_dev_download.csv"),
    )
}
    

In [4]:
# Load every registered dataset and render it as a DataFrame for a visual check.
for dataset_name in dataset_dict:
    dataset_path = dataset_dict[dataset_name]
    data = load_dataset(dataset_path)
    display(data.to_pandas())

Unnamed: 0.1,Unnamed: 0,grounding,previous turn,generated_text,gold label,coarse label,full label set,label
0,0,early skiers used one long pole or spear.,i imagine it makes travel on snow much easier!...,"it is a long pole, or spear",entailment,entailment,not-generic;on-topic;uncooperative;faithful,1
1,1,"every country aims to provide basic education,...",yes before age 12. they have lasted from at le...,most countries aim to provide basic education.,entailment,entailment,not-generic;on-topic;cooperative;faithful,1
2,2,although modern skiing has evolved from beginn...,is it scary?,"yeah, it can be scary",generic,neutral,generic;on-topic;cooperative;hallucination-per...,0
3,3,although modern skiing has evolved from beginn...,is it scary?,"it has evolved from beginnings in scandinavia,...",entailment,entailment,not-generic;on-topic;uncooperative;faithful,1
4,4,popping of the kernels was achieved by hand on...,do you know how many flavours there are?,we know about a few varieties,generic,neutral,generic;on-topic;cooperative;hallucination-per...,0
...,...,...,...,...,...,...,...,...
831,831,"religious debates over the ""harry potter"" seri...",i do like some fantasy novels myself. i've nev...,j.k.rowling wrote the novels in the 1800s he a...,entailment,entailment,not-generic;on-topic;cooperative;faithful,1
832,832,almost as soon as automobiles had been invente...,that's so cool. i guess that's just our compet...,it is almost as soon as automobiles had been i...,entailment,entailment,not-generic;on-topic;uncooperative;faithful,1
833,833,"the twin albums ""use your illusion i"" and ""use...","wow, now that is impressive, what albums were ...","i think i'll have to check out the bbc album ""...",hallucination,neutral,not-generic;on-topic;cooperative;hallucination...,0
834,834,"skiing can be a means of transport, a recreati...",wow! skiing has been around that long? i guess...,"oh yeah! it's a means of transport, a recreati...",entailment,entailment,not-generic;on-topic;cooperative;faithful,1


Unnamed: 0.1,Unnamed: 0,grounding,generated_text,label
0,0,Physical attractiveness is the degree to which...,"Yes, it has, but what i think is just vanity t...",1
1,1,Physical attractiveness is the degree to which...,"Yes, it has, but anything you think is just va...",1
2,2,Physical attractiveness is the degree to which...,"Yes, it has, but one I think is just vanity to...",1
3,3,This has led to an interrelation of human fitn...,"Yes, it has, but some I think is just vanity t...",1
4,4,Evidence suggests that cognitive behavioral th...,It has been suggested that cognitive behaviora...,1
...,...,...,...,...
8684,8684,Red is the colour at the end of the visible sp...,Red is at the beginning of the visible spectru...,0
8685,8685,Red is the colour at the end of the visible sp...,Most people who love the color red are seen to...,0
8686,8686,The series will conclude with its eighth seaso...,"I have heard so much about Game of Thrones, I ...",1
8687,8687,The series will conclude with its eighth seaso...,The series will conclude with its eighth seaso...,0


Unnamed: 0.1,Unnamed: 0,id,grounding,generated_text,label
0,0,91198,Colin Kaepernick . Kaepernick began his profes...,Colin Kaepernick became a starting quarterback...,0
1,1,194462,Katherine Matilda `` Tilda '' Swinton ( born 5...,Tilda Swinton is a vegan.,0
2,2,137334,Soul Food is a 1997 American comedy-drama film...,Fox 2000 Pictures released the film Soul Food.,1
3,3,166626,"Anne Rice . Born in New Orleans , Rice spent m...",Anne Rice was born in New Jersey.,0
4,4,111897,Telemundo ( [ teleˈmundo ] ) is an American Sp...,Telemundo is a English-language television net...,0
...,...,...,...,...,...
18204,18204,17554,Amy Jade Winehouse ( 14 September 1983 -- 23 J...,Amy Winehouse died on 23 July 2011.,1
18205,18205,26217,Java . Formed mostly as the result of volcanic...,Java was formed by volcanic eruptions.,1
18206,18206,145641,Michael Hutchence . On the morning of 22 Novem...,Michael Hutchence died on a boat.,0
18207,18207,87517,"The Cyclades ( [ ˈsɪklədiːz ] Κυκλάδες , [ ci...",The Cyclades are located to the southeast of G...,1


Unnamed: 0.1,Unnamed: 0,bbcid,model,grounding,generated_text,label
0,0,34687720,BERTS2S,France's Dubuisson carded a 67 to tie with ove...,rory mcilroy will take a one-shot lead into th...,0
1,1,21267591,BERTS2S,Sheikh Ali Salman told the BBC that for nation...,the leader of bahrain\'s main opposition party...,1
2,2,29347895,BERTS2S,He died at his home in Cambridge following an ...,veteran classical music conductor christopher ...,0
3,3,37618111,BERTS2S,"In the year to the end of March, 57 victims of...",the number of homicides recorded by police in ...,0
4,4,37895159,BERTS2S,The Cherries went down 2-1 at Sunderland on Sa...,bournemouth manager eddie howe says his side a...,0
...,...,...,...,...,...,...
2495,2495,33721182,TranS2S,The Briton finished more than two minutes ahea...,paralympic champion sarah storey won her secon...,0
2496,2496,39017131,TranS2S,The victim was fatally injured outside a prope...,a 23-year-old man has died after a shooting ou...,0
2497,2497,40634994,TranS2S,Marcy Smith was woken up by her son David to f...,an 11-year-old boy has died in a house fire in...,0
2498,2498,35082344,TranS2S,The venture's backers include Tesla Motors and...,one of the world\'s biggest physicists has war...,0


Unnamed: 0.1,Unnamed: 0,model,grounding,generated_text,label
0,0,dodeca,In the gradual process of families improving t...,yes they are very well behaved . they are also...,0
1,1,memnet,In the gradual process of families improving t...,"i ' m not sure , but they ' ve been around for...",0
2,2,dodeca,Ross went from being a public-television perso...,he became famous on youtube in the 20th century .,0
3,3,memnet,Ross went from being a public-television perso...,he became popular in the 1980 ' s and 1980s .,1
4,4,dodeca,Stamp collecting proved to be an almost perfec...,stamp collecting was proven to be a perfect ho...,0
...,...,...,...,...,...
1083,1083,memnet,Dance is a performing art form consisting of p...,dance is a performing art form that involves s...,1
1084,1084,dodeca,Dance can be categorized and described by its ...,there are categories for it ' s choreography a...,1
1085,1085,memnet,Dance can be categorized and described by its ...,dance is categorized by its choreography and r...,1
1086,1086,dodeca,"Subsequently, in the new millennium, the popul...","yes , it is mostly sampling . it has also incr...",0


Unnamed: 0.1,Unnamed: 0,grounding,generated_text,label
0,0,Vitamin and mineral supplements are becoming m...,` the typical western diet is heavily processe...,1
1,1,England will send an under 20 team to the toul...,Aidy boothroyd will be the man in charge of th...,1
2,2,A southern iowa chiropractor accused of accept...,A chiropractor in iowa has surrendered his lic...,0
3,3,You'd have thought the celebrations would've s...,The new england patriots beat seattle seahawks...,0
4,4,"Surkhet, nepal ( cnn ) ten years ago, with her...",Nominations are open for cnn heroes 2015. Doyn...,0
...,...,...,...,...
230,230,( cnn ) did former new england patriot aaron h...,Aaron hernandez has pleaded not guilty to murd...,0
231,231,( cnn ) call it a little piece of heaven for a...,`` july 13th 2014 was the absolute worst day o...,0
232,232,Kim sears looks set to be a glowing bride afte...,Kim sears will marry her long-term partner lat...,0
233,233,David beckham's 40th birthday celebrations nex...,"David beckham's 40th birthday is on saturday, ...",0


Unnamed: 0.1,Unnamed: 0,grounding,generated_text,label
0,0,A g4s security van has been robbed outside a b...,Two security guards have been threatened durin...,1
1,1,London's first history day will be held on the...,Big ben's 150th anniversary has been chosen as...,0
2,2,India finished the opening day of the final te...,India's batsmen dominated the first day of the...,0
3,3,Winger dean cox says he will have to remain pa...,Former leyton orient striker dean cox says he ...,0
4,4,"A man, who downloaded thousands of images of c...",A man who admitted downloading and viewing chi...,1
...,...,...,...,...
234,234,Wales should be central in the debate on the u...,Welsh secretary stephen crabb has said wales s...,1
235,235,There has been a large increase in the number ...,The number of illegal immigrants detained or a...,0
236,236,"Fear of discrimination means 84,000 deaf and h...",Deaf people are being discriminated against at...,0
237,237,"Up to 4,000 people in wales could be affected ...",The future of bhs's pension scheme could be de...,1


Unnamed: 0.1,Unnamed: 0,id,grounding,generated_text,label
0,0,8764fb95bfad8ee849274873a92fb8d6b400eee2,Paul Merson has restarted his row with Andros ...,paul merson was brought on with only seven min...,0
1,1,8764fb95bfad8ee849274873a92fb8d6b400eee2,Paul Merson has restarted his row with Andros ...,paul merson has restarted his row with andros ...,1
2,2,8764fb95bfad8ee849274873a92fb8d6b400eee2,Paul Merson has restarted his row with Andros ...,paul merson has restarted his row with andros ...,1
3,3,8764fb95bfad8ee849274873a92fb8d6b400eee2,Paul Merson has restarted his row with Andros ...,paul merson has restarted his row with andros ...,1
4,4,8764fb95bfad8ee849274873a92fb8d6b400eee2,Paul Merson has restarted his row with Andros ...,paul merson has restarted his row with andros ...,1
...,...,...,...,...,...
1595,1595,e880fda4c25289f8325574246f0f8ed4ff5eb26b,A timewarp home which has remained unchanged s...,a timewarp home which has remained unchanged s...,1
1596,1596,e880fda4c25289f8325574246f0f8ed4ff5eb26b,A timewarp home which has remained unchanged s...,"The collector 's paradise in Horfield , Bristo...",1
1597,1597,e880fda4c25289f8325574246f0f8ed4ff5eb26b,A timewarp home which has remained unchanged s...,"the collector’s paradise in horfield , bristol...",1
1598,1598,e880fda4c25289f8325574246f0f8ed4ff5eb26b,A timewarp home which has remained unchanged s...,"the collector 's paradise in horfield , bristo...",1


Unnamed: 0.1,Unnamed: 0,id,grounding,generated_text,label
0,0,5ea2d97bc9e77c0009cda36d_1,Among the more than 512 guests and musical per...,Dragon Con had less than 1000 guests .,0
1,1,5ea2d97bc9e77c0009cda36d_2,Among the more than 6000 guests and musical pe...,Dragon Con had less than 1000 guests .,0
2,2,5ea2d97bc9e77c0009cda36d_3,Among the more than 512 guests and musical per...,Dragon Con had over 5000 guests .,0
3,3,5ea2d97bc9e77c0009cda36d_4,Among the more than 6000 guests and musical pe...,Dragon Con had over 5000 guests .,1
4,4,5ee3932bc9e77c0008cca539_1,"As of , more than cases of COVID-19 have been ...",COVID-19 has reached less than 185 countries .,0
...,...,...,...,...,...
63049,63049,5ed50881c9e77c000848ebc2_4,X-Men : Apocalypse is a 2016 American superher...,X-Men : Apocalypse is a video game .,0
63050,63050,5ed50880c9e77c000848d8ad_1,Yandex is a Russian book about specialized Int...,Yandex is only a book .,1
63051,63051,5ed50880c9e77c000848d8ad_2,Yandex -LRB- -LSB- ` yʌndɛks -RSB- Яндекс -RRB...,Yandex is only a book .,0
63052,63052,5ed50880c9e77c000848d8ad_3,Yandex -LRB- -LSB- ` yʌndɛks -RSB- Яндекс -RRB...,Yandex is a multinational technology company .,1


In [5]:
from tqdm.notebook import tqdm
import os

In [None]:
# Run every registered metric on every registered dataset.
for metric_name, metric in metric_functions.items():
    for dataset_name, dataset_path in dataset_dict.items():
        print(f"Processing Metric {metric_name} on dataset {dataset_name}")
        save_folder = f"/home/benjamin/work/datasets/MIRAGE/results/true/{dataset_name}/{metric_name}/"

        # An existing result folder is treated as "this combination was
        # already evaluated", so the work is only done when it is missing.
        if not os.path.exists(save_folder):
            data = load_dataset(dataset_path)
            metric(data, dataset_name, "grounding", "generated_text", save_folder)
        else:
            print("Already done")

Processing Metric FactCC on dataset begin
Already done
Processing Metric FactCC on dataset dialfact
Already done
Processing Metric FactCC on dataset fever
Already done
Processing Metric FactCC on dataset mnbm
Already done
Processing Metric FactCC on dataset q2
Already done
Processing Metric FactCC on dataset qags_cnndm
Already done
Processing Metric FactCC on dataset qags_xsum
Already done
Processing Metric FactCC on dataset summeval
Already done
Processing Metric FactCC on dataset vitc
Already done
Processing Metric FEQA on dataset begin
Already done
Processing Metric FEQA on dataset dialfact
Already done
Processing Metric FEQA on dataset fever
Already done
Processing Metric FEQA on dataset mnbm
Already done
Processing Metric FEQA on dataset q2
Already done
Processing Metric FEQA on dataset qags_cnndm
Already done
Processing Metric FEQA on dataset qags_xsum
Already done
Processing Metric FEQA on dataset summeval
Already done
Processing Metric FEQA on dataset vitc
Already done
Processi

Map:   0%|          | 0/8689 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Saving the dataset (0/1 shards):   0%|          | 0/8689 [00:00<?, ? examples/s]

Dataset saved in Hugging Face format at /home/benjamin/work/datasets/MIRAGE/results/true/dialfact/FactAcc/
Processing Metric FactAcc on dataset fever


Map:   0%|          | 0/18209 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): `xsumfaith_dataset` is not defined in the visible cells, and
# `dataset_name` / `save_folder` are whatever the evaluation loop above left
# behind. Define all three explicitly before running this cell, otherwise every
# call below writes to the same leftover folder.
#
# The evaluate_* helpers take exactly five arguments and build their own
# map_kwargs internally, so no map_kwargs argument is passed here.
factcc_results = evaluate_factcc(xsumfaith_dataset, dataset_name, "document", "summary", save_folder)
trueTeacher_results = evaluate_trueTeacher(xsumfaith_dataset, dataset_name, "document", "summary", save_folder)
feqa_results = evaluate_feqa(xsumfaith_dataset, dataset_name, "document", "summary", save_folder)
factacc_results = evaluate_factacc(xsumfaith_dataset, dataset_name, "document", "summary", save_folder)

# Results can be accessed as variables
print(factcc_results, trueTeacher_results, feqa_results, factacc_results)

In [None]:
import pandas as pd

# Load the BEGIN dev split on its own for ad-hoc inspection.
df = pd.read_csv("../../datasets/true/begin_dev_download.csv")

# Folder holding the downloaded CSV files, relative to this notebook.
true_dataset_folder = "../../datasets/true/"

# Dataset name -> path of its CSV file inside `true_dataset_folder`.
true_dataset_dict = {
    name: true_dataset_folder + csv_file
    for name, csv_file in (
        ("begin", "begin_dev_download.csv"),
        ("dialfact", "dialfact_valid_download.csv"),
        ("fever", "fever_dev_download.csv"),
        ("mnbm", "mnbm_download.csv"),
        ("q2", "q2_download.csv"),
        ("qags_cnndm", "qags_cnndm_download.csv"),
        ("qags_xsum", "qags_xsum_download.csv"),
        ("summeval", "summeval_download.csv"),
        ("vitc", "vitc_dev_download.csv"),
    )
}

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the seq2seq model are both loaded from the same checkpoint.
qg_checkpoint = "valhalla/t5-base-qg-hl"
tokenizer = AutoTokenizer.from_pretrained(qg_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(qg_checkpoint)

In [None]:
# Display the `n_positions` value stored in the loaded model's config.
# NOTE(review): presumably the maximum sequence length the checkpoint was
# configured for - confirm against the T5Config documentation.
model.config.n_positions