## [Task 1] Remove unaswerable QA pairs

Write your own script to remove unaswerable QA pairs from both train and validation sets.

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import json
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings('ignore')
import torch


## Dataset Download


In [3]:
import os
import urllib.request
from tqdm import tqdm

class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):    
    if not os.path.exists(data_path):
        os.makedirs(data_path)
        
    data_path = os.path.join(data_path, f'{suffix}.json')

    if not os.path.exists(data_path):
        print(f"Downloading CoQA {suffix} data split... (it may take a while)")
        download_url(url=url_path, output_path=data_path)
        print("Download completed!")

In [4]:
# Train data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa', url_path=train_url, suffix='train')

# Test data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path=test_url, suffix='test')  

##Data Inspection

In [5]:
data=json.load((open('/content/coqa/train.json')))
data['data'][0]['questions'][0]

{'input_text': 'When was the Vat formally opened?', 'turn_id': 1}

In [6]:
#stessa cosa giù
#data=json.load((open('/content/coqa/train.json')))
#qas=json_normalize(data['data'], ['questions'],['source','id','story'])
#ans=json_normalize(data['data'], ['answers'],['id'])
#train_df = pd.merge(qas,ans, left_on=['id','turn_id'],right_on=['id','turn_id'] )
#train_df.loc[ 10000: 108647,['turn_id','input_text_x','input_text_y','span_text'] ]

In [7]:
'''cols = ["text","question","answer","span_text"]

coqa=pd.read_json('/content/coqa/train.json')

comp_list = []
for index, row in coqa.iterrows():
    for i in range(len(row["data"]["questions"])):
        temp_list = []
        if row["data"]["answers"][i]["input_text"] != 'unknown':
          temp_list.append(row["data"]["story"])
          temp_list.append(row["data"]["questions"][i]["input_text"])
          temp_list.append(row["data"]["answers"][i]["input_text"])
          temp_list.append(row["data"]["answers"][i]["span_text"])
          comp_list.append(temp_list)

#create pandas DataFrame
new_df = pd.DataFrame(comp_list, columns=cols) 
#save in csv format
new_df.to_csv("CoQA_data.csv", index=False)
#read and use it as csv
data = pd.read_csv("CoQA_data.csv")
data.iloc[0:100]
'''

'cols = ["text","question","answer","span_text"]\n\ncoqa=pd.read_json(\'/content/coqa/train.json\')\n\ncomp_list = []\nfor index, row in coqa.iterrows():\n    for i in range(len(row["data"]["questions"])):\n        temp_list = []\n        if row["data"]["answers"][i]["input_text"] != \'unknown\':\n          temp_list.append(row["data"]["story"])\n          temp_list.append(row["data"]["questions"][i]["input_text"])\n          temp_list.append(row["data"]["answers"][i]["input_text"])\n          temp_list.append(row["data"]["answers"][i]["span_text"])\n          comp_list.append(temp_list)\n\n#create pandas DataFrame\nnew_df = pd.DataFrame(comp_list, columns=cols) \n#save in csv format\nnew_df.to_csv("CoQA_data.csv", index=False)\n#read and use it as csv\ndata = pd.read_csv("CoQA_data.csv")\ndata.iloc[0:100]\n'

In [8]:
def get_df(fname):
    with open(fname, 'r') as file:
        f = json.load(file)
        
    return pd.DataFrame(f['data'])

In [9]:
coqa = get_df('/content/coqa/train.json')

def clean_df(df):
    # we need only story question and answer column
    data = {
        'text' : [],
        'questions' : [],
        'answers' : [],
        'span_text' : [],
    }
    
    for r in df.iterrows():
        ques_leng = len(r[1].questions)
        for i in range(ques_leng):
            data['text'].append(r[1].story.replace('\n', ''))
            data['questions'].append(r[1]["questions"][i]['input_text'].replace('\n', ''))
            data['answers'].append(r[1]['answers'][i]['input_text'].replace('\n', ''))
            data['span_text'].append(r[1]['answers'][i]['span_text'])
                       
    return pd.DataFrame(data)



In [10]:
data = clean_df(coqa)

print("Number of question and answers: ", len(data))
data.head()

Number of question and answers:  108647


Unnamed: 0,text,questions,answers,span_text
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475,Formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research,he Vatican Library is a research library
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law",Vatican Library is a research library for hist...
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology",Vatican Library is a research library for hist...
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project,"March 2014, the Vatican Library began an initi..."


In [11]:
df = data.loc[data['answers']!='unknown']
print("Number of question and answers without unkown answering: ", len(df))

Number of question and answers without unkown answering:  107276


## [Task 2] Train, Validation and Test splits

CoQA only provides a train and validation set since the test set is hidden for evaluation purposes.

We'll consider the provided validation set as a test set. <br>
$\rightarrow$ Write your own script to:
* Split the train data in train and validation splits (80% train and 20% val)
* Perform splits such that a dialogue appears in one split only! (i.e., split at dialogue level)
* Perform splitting using the following seed for reproducibility: 42

#### Reproducibility Memo

Check back tutorial 2 on how to fix a specific random seed for reproducibility!

In [12]:
from sklearn.model_selection import train_test_split


# Random split
train, val = train_test_split(df,
                              train_size=0.80,
                              test_size=0.20,
                              random_state=42)

print('Dataset splits statistics: ')
print(f'Train data: {train.shape}')
print(f'Validation data: {val.shape}')


Dataset splits statistics: 
Train data: (85820, 4)
Validation data: (21456, 4)


In [13]:
train.head()

Unnamed: 0,text,questions,answers,span_text
54860,Where did that number come from? Eleven and Tw...,So how did they get to 28?,he took one day from each of the 30-day months...,he took one day from each of the 30-day months...
69607,"Abidjan, Ivory Coast (CNN) -- The European Uni...",How much was the package in value?,180 million euros,180 million euros
94456,"ATLANTA, Georgia (CNN) -- Michele Trobaugh reg...",Did she think Adams was untrustworthy?,No,She says she trusted him right away.
94333,"CHAPTER V--""BLOODY AS THE HUNTER"" The lads lay...",Who was he talking to?,Matcham,"""Ye but deride me,"" answered Matcham"
47220,"The Pleistocene (, often colloquially referred...",What does Pleistocene mean literally?,"""Most New.""","""Most New"""


## [Task 3] Model definition

Write your own script to define the following transformer-based models from [huggingface](https://HuggingFace.co/).

* [M1] DistilRoBERTa (distilberta-base)
* [M2] BERTTiny (bert-tiny)

**Note**: Remember to install the ```transformers``` python package!

**Note**: We consider small transformer models for computational reasons!

In [40]:
from transformers import BertForQuestionAnswering,AutoTokenizer,BertTokenizer,DistilBertTokenizer, DistilBertForQuestionAnswering,pipeline

## [Task 4] Question generation with text passage $P$ and question $Q$

We want to define $f_\theta(P, Q)$. 

Write your own script to implement $f_\theta$ for each model: M1 and M2.

#### Formulation

Consider a dialogue on text passage $P$. 

For each question $Q_i$ at dialogue turn $i$, your model should take $P$ and $Q_i$ and generate $A_i$.

In [44]:

def question_answer(question, text, qa_model = 'berttiny'):
    answer = None
    if qa_model == 'berttiny':
        print('model :==> BERTTiny')
        modelname = 'deepset/bert-base-cased-squad2'
        model = BertForQuestionAnswering.from_pretrained(modelname)
        tokenizer = BertTokenizer.from_pretrained(modelname)
        input_ids = tokenizer.encode(str(question), str(text))
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        sep_idx = input_ids.index(tokenizer.sep_token_id)
        num_seg_a = sep_idx+1
        num_seg_b = len(input_ids) - num_seg_a
        segment_ids = [0]*num_seg_a + [1]*num_seg_b
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

        answer_start = torch.argmax(output.start_logits)
        answer_end = torch.argmax(output.end_logits)

        if answer_end >= answer_start:
            answer = tokens[answer_start]
            for i in range(answer_start+1, answer_end+1):
                if tokens[i][0:2] == "##":
                    answer += tokens[i][2:]
                else:
                    answer += " " + tokens[i]

            if answer.startswith('[CLS]'):
                return None
    else:
        print('model :==> distilbert')
        modelname = 'distilbert-base-cased-distilled-squad'
        nlp = pipeline('question-answering', model=modelname,tokenizer=modelname)
        QA_input = {
            'question': question,
            'context':  text
        }
        res = nlp(QA_input)
        answer = res['answer']
    
    return answer


In [45]:
def get_sample_qa(df):
    sample = df.sample(1)
    while len(sample.text.values[0].split()) > 512 and len(sample.questions.values[0].split()) < 2:
        sample = df.sample(1)
        
    print(f'''

    {'  Text  '.center(80, '=')}

    {sample.text.values[0]}

    {'  Question  '.center(80, '=')}

    {sample.questions.values[0]}

    {'  Answer  '.center(80, '=')}

    {sample.answers.values[0]}

    ''')
    
    return sample

In [46]:
sample = get_sample_qa(train)
distilbert_ans = question_answer(sample.questions.values[0], sample.text.values[0],'distilbert')
print({'  predicted answer  '.center(80, '=')})
print(distilbert_ans)




    Baptism (from the Greek noun βάπτισμα "baptisma"; see below) is a Christian sacrament of admission and adoption, almost invariably with the use of water, into the Christian Church generally. The canonical Gospels report that Jesus was baptized—a historical event to which a high degree of certainty can be assigned. Baptism has been called a holy sacrament and an ordinance of Jesus Christ. In some denominations, baptism is also called christening, but for others the word "christening" is reserved for the baptism of infants. Baptism has also given its name to the Baptist churches and denominations. The usual form of baptism among the earliest Christians was for the candidate to be immersed, either totally (submerged completely under the water) or partially (standing or kneeling in water while water was poured on him or her). While John the Baptist's use of a deep river for his baptism suggests immersion, "The fact that he chose a permanent and deep river suggests that more than a t

Downloading:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/249M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Baptism


In [48]:
sample = get_sample_qa(train)
berttiny_ans = question_answer(sample.questions.values[0], sample.text.values[0])
print({'  predicted answer  '.center(80, '=')})
print(berttiny_ans)




    Frankfurt, officially Frankfurt am Main (Literally "Frankfurt on the Main", ), is a metropolis and the largest city in the German state of Hesse and the fifth-largest city in Germany, with a 2015 population of 732,688 within its administrative boundaries, and 2.3 million in its urban area. The city is at the centre of the larger Frankfurt Rhine-Main Metropolitan Region, which has a population of 5.5 million and is Germany's second-largest metropolitan region after Rhine-Ruhr. Since the enlargement of the European Union in 2013, the geographic centre of the EU is about to the east of Frankfurt's CBD, the Bankenviertel. Frankfurt is culturally and ethnically diverse, with around half of the population, and a majority of young people, having a migration background. A quarter of the population are foreign nationals, including many expatriates. Frankfurt is an alpha world city and a global hub for commerce, culture, education, tourism and transportation. It's the site of many global 

## [Task 5] Question generation with text passage $P$, question $Q$ and dialogue history $H$

We want to define $f_\theta(P, Q, H)$. Write your own script to implement $f_\theta$ for each model: M1 and M2.

#### Formulation

Consider a dialogue on text passage $P$. 

For each question $Q_i$ at dialogue turn $i$, your model should take $P$, $Q_i$, and $H = \{ Q_0, A_0, \dots, Q_{i-1}, A_{i-1} \}$ to generate $A_i$.

### Prove random, cose brutte

In [18]:
from allennlp_models.rc.tools import squad

In [None]:
from typing import List, Callable, Dict

In [16]:
from tensorflow import keras
from tensorflow.keras import layers

In [14]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [17]:
!pip install allennlp-models

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def create_bert_model(compile_info: List[Dict],
                      pretrained_model_name_or_path: str) -> keras.Model:
    """
    Create a Keras model given a list of layer information

    :param layers_info: a list of dictionaries, one for each layer
    :param compile_info: dictionary containing compile information

    :return
        model: the built keras sequential model
    """
    # Load pre-trained model
    bert = BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path,
                                                                 num_labels=2)
    
    model.summary()
    model.compile(**compile_info)
    return bert

In [None]:
pretrained_model_name_or_path = 'deepset/bert-base-cased-squad2'
compile_info = {
    'optimizer': keras.optimizers.Adam(learning_rate=1e-4),
    'loss': 'sparse_categorical_crossentropy',
    'metrics': [keras.metrics.SparseCategoricalAccuracy()],
}
# Training
training_info = {
    'verbose': 1,
    'epochs': 3,
    'batch_size': 16,
}
# Inference
prediction_info = {
    'batch_size': 16,
    'verbose': 1
}

model = create_bert_model(compile_info=compile_info,
                          pretrained_model_name_or_path=pretrained_model_name_or_path)

In [None]:
import random

def set_reproducibility(seed):
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

In [None]:
seeds = [42]
avg_metric_info = {}

for seed in seeds:
    print(f'Running with seed: {seed}')
    set_reproducibility(seed)
    
    model = create_bert_model(compile_info=compile_info,
                          pretrained_model_name_or_path=pretrained_model_name_or_path)
    model = train_model(model=model, x_train=x_train, y_train=y_train,
                        x_val=x_val, y_val=y_val, training_info=training_info,
                        show=False)

    test_predictions = predict_data(model=model, x=x_test,
                                          prediction_info=prediction_info)
    test_predictions = np.argmax(test_predictions, axis=-1)

    metric_info = evaluate_predictions(predictions=test_predictions,
                                       y=y_test,
                                       metrics=metrics,
                                       metric_names=metric_names)
    for key, value in metric_info.items():
        avg_metric_info.setdefault(key, []).append(value)

avg_metric_info = {key: np.mean(value) for key, value in avg_metric_info.items()}
print(f'Metrics info: \n{avg_metric_info}')