In [1]:
!pip install transformers
!pip install tensorflow_addons
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [104]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
from typing import *

import os
import urllib.request
from tqdm import tqdm
from copy import deepcopy
import tensorflow as tf
import tensorflow_addons as tfa
import torch

from transformers import BertForQuestionAnswering, TFAutoModelForQuestionAnswering
from transformers import AutoTokenizer, BertTokenizer

squad_v2=False

In [62]:
class DownloadProgressBar(tqdm):
  def update_to(self, b=1, bsize=1, tsize=None):
    if tsize is not None:
      self.total = tsize
    self.update(b*bsize - self.n)

def download_url(url, output_path):
  with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=url.split('/')[-1]) as t:
    urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def download_data(data_path, url_path, suffix):
  if not os.path.exists(data_path):
    os.makedirs(data_path)

  data_path = os.path.join(data_path,f'{suffix}.json')

  if not os.path.exists(data_path):
    print(f"Downloading CoQA {suffix} data split... (it may take a while)")
    download_url(url=url_path, output_path=data_path)
    print("Download Completed!")

In [63]:
#Train Data
train_url = "https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json"
download_data(data_path='coqa',url_path=train_url, suffix='train')

#Test Data
test_url = "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json"
download_data(data_path='coqa', url_path = test_url, suffix='test')

In [64]:
train_data = json.load((open('/content/coqa/train.json')))
qas = pd.json_normalize(train_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(train_data['data'], ['answers'],['id'])
train_df = pd.merge(qas,ans, left_on=['id','turn_id'], right_on=['id','turn_id'])

In [65]:
train_df['q_first_word']=train_df['input_text_x'].str.lower().str.extract(r'(\w+)')
train_df['q_first_two_words']=train_df['input_text_x'].str.lower().str.extract(r'^((?:\S+\s+){1}\S+).*')

In [66]:
train_df = train_df.loc[train_df['input_text_y']!='unknown']

In [67]:
test_data = json.load((open('/content/coqa/test.json')))
qas = pd.json_normalize(test_data['data'], ['questions'], ['source', 'id', 'story'])
ans = pd.json_normalize(test_data['data'], ['answers'],['id'])
test_df = pd.merge(qas,ans, left_on=['id','turn_id'], right_on=['id','turn_id'])
test_df = test_df.loc[test_df['input_text_y']!='unknown']

In [68]:
train, val = train_test_split(train_df, test_size=0.2, random_state=42)
train.head()

Unnamed: 0,input_text_x,turn_id,bad_turn_x,source,id,story,span_start,span_end,span_text,input_text_y,bad_turn_y,q_first_word,q_first_two_words
54860,So how did they get to 28?,20,,race,39dd6s19jpbtyxnmal6qgea8wr2ze3,Where did that number come from? Eleven and Tw...,1639,1740,he took one day from each of the 30-day months...,he took one day from each of the 30-day months...,,so,so how
69607,How much was the package in value?,9,,cnn,3ii4upycoj7fsz8vructj3gjsr7qdt,"Abidjan, Ivory Coast (CNN) -- The European Uni...",80,98,180 million euros,180 million euros,,how,how much
94456,Did she think Adams was untrustworthy?,6,,cnn,3wq3b2kge8gywyqusjv8nckbhrp1bi,"ATLANTA, Georgia (CNN) -- Michele Trobaugh reg...",426,462,She says she trusted him right away.,No,,did,did she
94333,Who was he talking to?,3,,gutenberg,3qapzx2qn4d41w5gd7yx8eyxhj320q,"CHAPTER V--""BLOODY AS THE HUNTER"" \n\nThe lads...",1208,1244,"""Ye but deride me,"" answered Matcham",Matcham,,who,who was
47220,What does Pleistocene mean literally?,15,,wikipedia,3nvc2eb65qzqj9xkpfnbjgx90ke3yk,"The Pleistocene (, often colloquially referred...",1410,1420,"""Most New""","""Most New.""",,what,what does


In [69]:
train = train[['id','story','input_text_x', 'input_text_y', 'span_text', 'span_start','span_end']]
val = val[['id','story','input_text_x', 'input_text_y', 'span_text', 'span_start','span_end']]
test_df = test_df[['id','story','input_text_x', 'input_text_y', 'span_text', 'span_start','span_end']]
train.rename(columns={'input_text_x': 'questions', 'input_text_y': 'answers', 'span_text': 'reasons'}, inplace=True)
val.rename(columns={'input_text_x': 'questions', 'input_text_y': 'answers', 'span_text': 'reasons'}, inplace=True)
test_df.rename(columns={'input_text_x': 'questions', 'input_text_y': 'answers', 'span_text': 'reasons'}, inplace=True)
display(train.head(),val.head(),test_df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,id,story,questions,answers,reasons,span_start,span_end
54860,39dd6s19jpbtyxnmal6qgea8wr2ze3,Where did that number come from? Eleven and Tw...,So how did they get to 28?,he took one day from each of the 30-day months...,he took one day from each of the 30-day months...,1639,1740
69607,3ii4upycoj7fsz8vructj3gjsr7qdt,"Abidjan, Ivory Coast (CNN) -- The European Uni...",How much was the package in value?,180 million euros,180 million euros,80,98
94456,3wq3b2kge8gywyqusjv8nckbhrp1bi,"ATLANTA, Georgia (CNN) -- Michele Trobaugh reg...",Did she think Adams was untrustworthy?,No,She says she trusted him right away.,426,462
94333,3qapzx2qn4d41w5gd7yx8eyxhj320q,"CHAPTER V--""BLOODY AS THE HUNTER"" \n\nThe lads...",Who was he talking to?,Matcham,"""Ye but deride me,"" answered Matcham",1208,1244
47220,3nvc2eb65qzqj9xkpfnbjgx90ke3yk,"The Pleistocene (, often colloquially referred...",What does Pleistocene mean literally?,"""Most New.""","""Most New""",1410,1420


Unnamed: 0,id,story,questions,answers,reasons,span_start,span_end
90158,3uj1cz6izhpw128f4sjfgr7sxvrs53,"Hong Kong, officially the Hong Kong Special Ad...",Are they involved with China in these groups?,no,such as the Asia-Pacific Economic Cooperation ...,1440,1559
90995,3ouygizwr7y0t36mf5994r6qtxgp0u,Volleyball has become a worldwide sport that i...,Did it remain popular for that age group?,Yes,popular with all age groups,48,75
88481,3ryc5t2d73totxql9isoon7d2tsrpj,(CNN)The suspect behind the knife attack on th...,was he invited to the event?,no,wasn't on the list of those invited,1553,1588
6129,3hutx6f6vunp4dxzfs08yfuffl8o2p,A Sudanese woman sentenced to die for refusing...,What about her mother's religion?,she was Ethiopian Orthodox,her mother was Ethiopian Orthodox,1366,1399
14367,3ty7zaog5fkzic962d418akrztkk0a,Baronets are a rank in the British aristocracy...,What was the cost?,"£1,095","£1,095",511,517


Unnamed: 0,id,story,questions,answers,reasons,span_start,span_end
0,3dr23u6we5exclen4th8uq9rb42tel,"Once upon a time, in a barn near a farm house,...",What color was Cotton?,white,a little white kitten named Cotton,59,93
1,3dr23u6we5exclen4th8uq9rb42tel,"Once upon a time, in a barn near a farm house,...",Where did she live?,in a barn,"in a barn near a farm house, there lived a lit...",18,80
2,3dr23u6we5exclen4th8uq9rb42tel,"Once upon a time, in a barn near a farm house,...",Did she live alone?,no,Cotton wasn't alone,196,215
3,3dr23u6we5exclen4th8uq9rb42tel,"Once upon a time, in a barn near a farm house,...",Who did she live with?,with her mommy and 5 sisters,with her mommy and 5 other sisters,281,315
4,3dr23u6we5exclen4th8uq9rb42tel,"Once upon a time, in a barn near a farm house,...",What color were her sisters?,orange and white,her sisters were all orange with beautiful whi...,428,490


In [70]:
from datasets import Dataset

data_train = Dataset.from_pandas(train)
data_val = Dataset.from_pandas(val)

In [71]:
from datasets import ClassLabel, Sequence
import random
import pandas as pd
from IPython.display import display, HTML


def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset) - 1)
        while pick in picks:
            pick = random.randint(0, len(dataset) - 1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
        elif isinstance(typ, Sequence) and isinstance(typ.feature, ClassLabel):
            df[column] = df[column].transform(
                lambda x: [typ.feature.names[i] for i in x]
            )
    display(HTML(df.to_html()))

In [72]:
show_random_elements(data_train)

Unnamed: 0,id,story,questions,answers,reasons,span_start,span_end,__index_level_0__
0,3fui0jhjpxyp360w0uultm1wq81339,"(CNN) -- Sidney Frank made millions marketing Jagermeister and other alcohol brands. Three years after his death, he's a big hit with students at the Ivy League college he briefly attended. \n\nSidney Frank, shown accepting an honorary degree in 2005, gave $100 million to Brown University. \n\nHe's a big hit not because of what he sold but because he's given dozens of them what he couldn't afford as a young man: an education at Rhode Island's Brown University. \n\nOn Sunday, 49 students from low-income families became the first four-year Sidney E. Frank Scholars to graduate from Brown, owing virtually nothing except gratitude to the late liquor magnate. \n\n""The world of difference that he made for each and every one of us is unbelievable, incredible,"" one of the Frank Scholars, 22-year-old Shane Reil, said Sunday. \n\nFrank -- who left Brown after one year in the late 1930s because he couldn't afford to stay -- gave the school a $100 million endowment in 2004. He stipulated that the fund's income go exclusively to covering all tuition and expenses for the neediest of Brown's admitted applicants. Hear graduates say how their dreams came true » \n\nFor this year's graduates, tuition and expenses came to a four-year total of about $180,000 each. The median annual income of the recipients' families was $18,984. \n\nThe gift was the largest single one ever given to Brown and one of the largest ever given for undergraduate scholarships in the United States, according to the school. \n\nReil, a history major who is preparing to co-chair a student conference on U.S.-South Korean relations and aspires to work in politics or foreign service, says the scholarship was the stuff of dreams.",Who is Shane Reil?,a history major .,"Reil, a history major who is preparing to co-chair a student conference on U.S.-South Korean relations and aspires to work in politics or foreign service, says the scholarship was the stuff of dreams.",1490,1690,66565
1,3vzlgyjeyla24xe35qwi43vfd6oxz4,"LONDON, England (CNN) -- The death of a teenage girl in a Welsh village in an apparent copycat suicide has raised fears she may have been part of an Internet death cult already blamed for the deaths of six young men. \n\nNatasha Randall, 17, who was found hanged in her bedroom in Blaengarw, near Bridgend, south Wales, on Thursday, was the seventh person believed to have killed themselves in the local area in the past 12 months, the UK's Press Association reported. \n\nPolice are examining Randall's computer after the teenager posted messages on a social networking site, Bebo, prior to her death dedicated to 20-year-old Liam Clarke, who was found hanged in a Bridgend park last month. \n\nThe message read: ""RIP Clarky boy!! gonna miss ya! always remember the gd times! love ya x. Me too!"" \n\nMessages have also been posted on Randall's page since her death, PA said. ""RIP tash - can't believe you done it!"" one said. Another read: ""Heyaa Babe. Just Poppin In To Say I Let My Balloon Off With A Message On It, Hope You Got It Ok And It Made You Laugh Up There."" \n\nFive more men aged between 17 and 27 have been found hanged in the area since January 2007. \n\nSpeaking to the Daily Mail newspaper, Liam Clarke's father, Kevin Clarke, said the seven who had killed themselves appeared to have known each other. \n\n""We don't know if it is some weird cult or copycat suicides or if they have had some bizarre pact to kill themselves,"" Clarke said.",According to who?,UK's Press Association,the UK's Press Association reported,430,465,22613
2,3pb5a5bd0v68y1d7xl4vpx2l0qwg7q,"CHAPTER XIII \n\nTHE WESTERN EXPRESS \n\nThe old miser was very much excited and began to pace the floor of his cottage. \n\n""Yes, I better tell the police, that's what I better do,"" he muttered. \n\n""There won't be any necessity to tell the police--if it was really my brother who did it,"" said Sam. \n\n""Why not, I'd like to know?"" challenged Hiram Duff. ""He ain't no better'n other folks."" \n\n""If he took the box, I and my family will see to it that you are repaid for your loss, Mr. Duff,"" answered the youngest Rover. \n\n""Humph! Do you guarantee that?"" demanded the old miser, suspiciously. \n\n""Yes."" \n\n""And you can take his word for it, sir,"" added Songbird. ""The Rovers are well-known and wealthy, and they will do exactly as they promise. \n\n""I've heard that name before. Didn't you have some trouble with the railroad company?"" asked Hiram Duff. ""About a busted-up flying machine?"" \n\n""Yes,"" replied Sam. \n\n""And got the best of that skinflint lawyer, Belright Fogg?"" \n\n""We made Mr. Fogg pay for the biplane, yes."" \n\n""I know all about it,"" chuckled Hiram Duff. ""Served Fogg right. And he lost his job with the railroad company, too."" The old man pursed up his lips. ""Well, if you'll give me your word that you will settle with me I won't go to the police. But I want every cent that is coming to me, understand that."" \n\n""You'll get it--if my brother took the box,"" answered Sam. ""But listen to me. First of all I want to find my brother. I think he ought to be under a doctor's care.""",Whose sibling is that?,Rover.,answered the youngest Rover.,482,512,82442
3,320duz38g7m1iwe9yutssn7urg5gjw,"Lying in the sun on a rock, the cougar saw Jeb and his son, Tom, before they saw it. Jeb put his bag down quickly and pulled his jacket open with both hands, making himself look big to the cougar. It worked. The cougar hesitated, ready to attack Jeb, but ready to forget the whole thing, too. \n\nJeb let go of his jacket, grasped Tom and held him across his body, making a cross. Now the cougar's enemy looked even bigger, and it rose up, ready to move away, but unfortunately Tom got scared and struggled free of Jeb. \n\n""Tom, no!"" shouted his father. \n\nBut Tom broke and ran and that's the last thing you do with a cougar. The second Tom broke free, Jeb threw himself on the cougar, just as it jumped from the rock. They hit each other in mid-air and both fell. The cougar was on Jeb in a flash, forgetting about Tom, which was what Jeb wanted. \n\nCougars are not as big as most people think and a determined man stands a chance, even with just his fists. As the cougar's claws got into his left shoulder, Jeb swung his fist at its eyes and hit hard. The animal howled and put its head back. Jeb followed up with his other fist. Then out of the corner of his eye, Jeb saw Tom. The boy was running back to help his father. \n\n""Knife, Tom"" shouted Jeb. \n\nThe boy ran to his father's bag, while Jeb stated shouting as well as hitting, to keep the cougar's attention away from Tom. Tom got the knife and ran over to Jeb. The cougar was moving its head in and out, trying to find a way through the wall Jeb was making out of his arms. Tom swung with the knife, into the cougar's back. It howled horribly and ran off into the mountains. \n\nThe whole fight had taken about thirty seconds.",Does he have a child?,Yes.,"Tom, no!"" shouted his father. \n",521,552,75165
4,3cn4lgxd5xob15goptsutlpfefyy4e,"""Ceci, wake up. It's an earthquake!"" That's what Cecilia Wallace heard her mother shouting on the early morning of February 27th. \n\nCecilia is a 7th-grader. She, her parents and her brother, Sam, were in Chile's capital city, Santiago, the day a big earthquake hit Chile. And like just about everyone else, they were shaken out of their sleep. \n\n""It was so frightening,"" Sam wrote. ""The shaking was so huge that I will never go on a ride again."" Cecilia and Sam wrote about their earthquake experiences. Their reports were later posted on the website. \n\nCecilia, Sam and their parents were staying in an apartment on the 15th floor of a building. They were lucky. Their building stayed standing, because it was built to withstand earthquakes. \n\nNot everyone was as lucky as the Wallace family. More than 800 people died. Many older buildings fell down during the earthquake. \n\nThe damage in Santiago wasn't as bad as in other parts of Chile. So the supermarkets were open for business on the morning of the quake. But it wasn't business as usual. ""The supermarkets have been crazy with people rushing to buy their food for the next while,"" Sam wrote. \n\nNot everyone was able to get money to buy food that morning. So Cecilia and Sam made food bags to _ to people who were begging outside the supermarket. ""We gave some to a kid of my age. I made sure he got cookies and bread."" Sam and Cecilia's mother wrote that the kids also collected money for the Red Cross. \n\nIt's certainly an experience Cecilia, Sam and their parents will never forget. Thankfully, they lived to tell their stories.",Where did the family live in Chile?,"in Chile's capital city, Santiago","in Chile's capital city, Santiago",201,234,52491
5,3lpw2n6lkt2cgf0jtxefvspgiwju50,".British people are famous for drinking tea. But brother and sister, Sarah and Bobby Green, became young millionaires when they opened a chain of American-style coffee shops in the UK. Having the idea: It started when Sarah took a weekend trip to New York to visit her brother Bobby. One evening, in a Thai restaurant, Sarah told Bobby how much she wished she could buy American-style coffee in London. Bobby suggested they started their own coffee shop. Sarah fell in love with the idea. Doing the Research: Back in London, she spent a whole day on the London subway, getting off the train at different stations to taste the coffee. ""It was terrible, and I knew there was a gap in the market."" In 1995, they opened their first Coffee Republic shop in central London. Making it work: The first year was very difficult. British people were not used to the names of American coffees, like latte and macchiato. But being successful was their dream and they were not going to give up. Today, there are over 100 Coffee Republic shops all over the country and the company has PS30 million a year. Advice for others: Sarah has now written a best-selling book about their experience, calledAnyone Can Do It ! She hopes it will help other young people to start their own businesses. She says, ""If you think you have the energy, then get out and follow your dream.""",What do you need to do to follow your dream?,Get out and follow it,"""If you think you have the energy, then get out and follow your dream.""",1284,1355,83517
6,3xm0hyn6nkzzktlgnc8opg8un5dpeh,"CHAPTER XXV \n\nMARCHING ORDERS \n\nA silence followed. To Mike, lying in bed, holding his breath, it seemed a long silence. As a matter of fact it lasted for perhaps ten seconds. Then Mr. Wain spoke. \n\n""You have been out, James?"" \n\nIt is curious how in the more dramatic moments of life the inane remark is the first that comes to us. \n\n""Yes, sir,"" said Wyatt. \n\n""I am astonished. Exceedingly astonished."" \n\n""I got a bit of a start myself,"" said Wyatt. \n\n""I shall talk to you in my study. Follow me there."" \n\n""Yes, sir."" \n\nHe left the room, and Wyatt suddenly began to chuckle. \n\n""I say, Wyatt!"" said Mike, completely thrown off his balance by the events of the night. \n\nWyatt continued to giggle helplessly. He flung himself down on his bed, rolling with laughter. Mike began to get alarmed. \n\n""It's all right,"" said Wyatt at last, speaking with difficulty. ""But, I say, how long had he been sitting there?"" \n\n""It seemed hours. About an hour, I suppose, really."" \n\n""It's the funniest thing I've ever struck. Me sweating to get in quietly, and all the time him camping out on my bed!"" \n\n""But look here, what'll happen?"" \n\nWyatt sat up. \n\n""That reminds me. Suppose I'd better go down."" \n\n""What'll he do, do you think?"" \n\n""Ah, now, what!"" \n\n""But, I say, it's awful. What'll happen?"" \n\n""That's for him to decide. Speaking at a venture, I should say----"" \n\n""You don't think----?"" \n\n""The boot. The swift and sudden boot. I shall be sorry to part with you, but I'm afraid it's a case of 'Au revoir, my little Hyacinth.' We shall meet at Philippi. This is my Moscow. To-morrow I shall go out into the night with one long, choking sob. Years hence a white-haired bank-clerk will tap at your door when you're a prosperous professional cricketer with your photograph in _Wisden_. That'll be me. Well, I suppose I'd better go down. We'd better all get to bed _some_ time to-night. Don't go to sleep.""",Who said something after it?,Mr. Wain,As a matter of fact it lasted for perhaps ten seconds. Then Mr. Wain spoke.,121,197,18671
7,3ywrv122cszv3xjlrvli7cz7km4u8m,"CHAPTER XVII \n\nTHREE DAYS \n\nLincoln awaited Graham in an apartment beneath the flying stages. He seemed curious to learn all that had happened, pleased to hear of the extraordinary delight and interest which Graham took in flying. Graham was in a mood of enthusiasm. ""I must learn to fly,"" he cried. ""I must master that. I pity all poor souls who have died without this opportunity. The sweet swift air! It is the most wonderful experience in the world."" \n\n""You will find our new times full of wonderful experiences,"" said Lincoln. ""I do not know what you will care to do now. We have music that may seem novel."" \n\n""For the present,"" said Graham, ""flying holds me. Let me learn more of that. Your aeronaut was saying there is some trades union objection to one's learning."" \n\n""There is, I believe,"" said Lincoln. ""But for you--! If you would like to occupy yourself with that, we can make you a sworn aeronaut to-morrow."" \n\nGraham expressed his wishes vividly and talked of his sensations for a while. ""And as for affairs,"" he asked abruptly. ""How are things going on?"" \n\nLincoln waved affairs aside. ""Ostrog will tell you that to-morrow,"" he said. ""Everything is settling down. The Revolution accomplishes itself all over the world. Friction is inevitable here and there, of course; but your rule is assured. You may rest secure with things in Ostrog's hands."" \n\n""Would it be possible for me to be made a sworn aeronaut, as you call it, forthwith--before I sleep?"" said Graham, pacing. ""Then I could be at it the very first thing to-morrow again....""",whose rule is assured?,Graham,Graham,924,930,105780
8,3ftyuglfsulqzdpx72oqlslsvfw5da,"Hellen Keller was born in 1880 in the USA. When she was about 19 months old, she got very ill. After many weeks, the doctor told her parents, ""Your daughter is better, but now she can't see and she can't hear."" Her mother and her father were very sad. After a few years , things got worse. There was no way for Helen to speak to other people. She heard nothing. She didn't understand anything. Then one day a teacher came. Her name was Anne Sullivan. She lived with Helen and her family. The teacher helped Helen learn words. Helen was a very bright child and soon she learned to spell her first word. When she was older, she went to college . Helen was a very old woman when she died. The world remembers her today as a brave and wonderful person. She was blind and deaf, but she found a way to see and hear. It helped many people in the world.",When?,1880,in 1880,23,30,80107
9,3auqqel7u5tdyn3i1hi8ajv8ft30vs,"CHAPTER V. \n\nIN LOWER EGYPT. \n\n""I am going on a journey,"" Ameres said to his son a few days after the return from the farm. ""I shall take you with me, Chebron, for I am going to view the progress of a fresh canal that is being made on our estate in Goshen. The officer who is superintending it has doubts whether, when the sluices are opened, it will altogether fulfill its purpose, and I fear that some mistake must have been made in the levels. I have already taught you the theory of the work; it is well that you should gain some practical experience in it; for there is no more useful or honorable profession than that of carrying out works by which the floods of the Nile are conveyed to the thirsty soil."" \n\n""Thank you, father. I should like it greatly,"" Chebron replied in a tone of delight, for he had never before been far south of Thebes. ""And may Amuba go with us?"" \n\n""Yes; I was thinking of taking him,"" the high priest said. ""Jethro can also go, for I take a retinue with me. Did I consult my own pleasure I would far rather travel without this state and ceremony; but as a functionary of state I must conform to the customs. And, indeed, even in Goshen it is as well always to travel in some sort of state. The people there are of a different race to ourselves. Although they have dwelt a long time in the land and conform to its customs, still they are notoriously a stubborn and obstinate people, and there is more trouble in getting the public works executed there than in any other part of the country.""",And what would he attain on this trip hopefully?,practical experience,; it is well that you should gain some practical experience in it;,495,562,2665


In [73]:
model_checkpoint = "distilroberta-base"
batch_size = 16
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [74]:
pad_on_right = tokenizer.padding_side == "right"

In [75]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["questions" if pad_on_right else "story"],
        examples["story" if pad_on_right else "questions"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=400,
        stride=200,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = {"answers": examples["reasons"][sample_index], "span_start":examples["span_start"][sample_index], "span_end":examples["span_end"][sample_index]}
        # If no answers are given, set the cls_index as answer.
        if answers["span_start"] == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["span_start"]
            end_char = answers["span_end"]

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while (
                    token_start_index < len(offsets)
                    and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [76]:
data_train_smaller = Dataset.from_dict(data_train[:500])
data_val_smaller = Dataset.from_dict(data_val[:500])

In [77]:
tokenized_datasets = data_train_smaller.map(
    prepare_train_features, batched=True, remove_columns=data_train.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [78]:
tokenized_datasets_v = data_val_smaller.map(
    prepare_train_features, batched=True, remove_columns=data_val.column_names
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [79]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFRobertaForQuestionAnswering.

Some layers of TFRobertaForQuestionAnswering were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
learning_rate = 2e-5
num_train_epochs = 2
weight_decay = 0.01

In [81]:
train_set = model.prepare_tf_dataset(
    tokenized_datasets,
    shuffle=True,
    batch_size=batch_size,
)

validation_set = model.prepare_tf_dataset(
    tokenized_datasets_v,
    shuffle=False,
    batch_size=batch_size,
)

In [82]:
from transformers import create_optimizer

total_train_steps = len(train_set) * num_train_epochs

optimizer, schedule = create_optimizer(
    init_lr=learning_rate, num_warmup_steps=0, num_train_steps=total_train_steps
)

In [83]:
import tensorflow as tf

model.compile(optimizer=optimizer, jit_compile=True, metrics=["accuracy"])

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [84]:
model.fit(
    train_set,
    validation_data=validation_set,
    epochs=num_train_epochs
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fed6dcf29d0>

In [26]:
batch = next(iter(validation_set))
output = model.predict_on_batch(batch)
output.keys()

odict_keys(['start_logits', 'end_logits'])

In [27]:
output.start_logits.shape, output.end_logits.shape

((16, 400), (16, 400))

In [28]:
import numpy as np

np.argmax(output.start_logits, -1), np.argmax(output.end_logits, -1)

(array([264,   0,   0, 291, 296, 167,   0, 339, 314,   0,   0, 262, 164,
          0, 296,  56]),
 array([131, 203, 103, 116, 152, 128, 172,  35, 135, 144, 167, 120, 150,
        187, 259, 238]))

In [29]:
n_best_size = 20

In [30]:
import numpy as np

start_logits = output.start_logits[0]
end_logits = output.end_logits[0]
# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        if (
            start_index <= end_index
        ):  # We need to refine that test to check the answer is inside the context
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": "",  # We need to find a way to get back the original substring corresponding to the answer in the context
                }
            )

In [33]:
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["questions" if pad_on_right else "story"],
        examples["story" if pad_on_right else "questions"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=400,
        stride=200,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [34]:
validation_features = data_val_smaller.map(
    prepare_validation_features,
    batched=True,
    remove_columns=data_val_smaller.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [35]:
validation_dataset = model.prepare_tf_dataset(
     validation_features,
     shuffle=False,
     batch_size=batch_size,
)

In [36]:
raw_predictions = model.predict(validation_dataset)



In [37]:
raw_predictions

TFQuestionAnsweringModelOutput(loss=None, start_logits=array([[ 0.4271657 ,  0.30052865,  0.14134446, ..., -0.01493698,
        -0.01493686, -0.01493686],
       [ 0.36933422,  0.17928202, -0.07047553, ...,  0.13830562,
         0.00629536,  0.18118   ],
       [ 0.38566452,  0.16822846, -0.02404393, ..., -0.11820608,
        -0.11820603, -0.11820603],
       ...,
       [ 0.41097334,  0.23812789,  0.13180749, ..., -0.04167751,
        -0.04167754, -0.04167754],
       [ 0.36087972,  0.04035559, -0.03299645, ...,  0.10421532,
        -0.04431308,  0.12851638],
       [ 0.37638766,  0.02158862, -0.06588422, ..., -0.10202938,
        -0.10202915, -0.10202915]], dtype=float32), end_logits=array([[-0.5190739 , -0.66327655, -0.67198646, ..., -0.54919183,
        -0.54919195, -0.54919195],
       [-0.5273227 , -0.6123692 , -0.5719215 , ..., -0.71364063,
        -0.52982175, -0.6679364 ],
       [-0.5315317 , -0.6139098 , -0.56993294, ..., -0.46260598,
        -0.46260613, -0.46260613],
     

In [38]:
max_answer_length = 40

In [39]:
start_logits = output.start_logits[0]
end_logits = output.end_logits[0]
offset_mapping = validation_features[0]["offset_mapping"]

#first feature comes from first example. For more general case we need to match example_id to example index
context = data_val_smaller[0]["story"]


#Gather indices best start/end logits:
start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()
valid_answers = []

for start_index in start_indexes:
  for end_index  in end_indexes:
    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
    # to part of the input_ids that are not in the context.
    if (start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None):
      continue

    # Don't consider answers with a length that is either < 0 or > max_answer_length.
    if(end_index < start_index or end_index - start_index + 1 > max_answer_length):
      continue
    if(start_index <= end_index):
      start_char = offset_mapping[start_index][0]
      end_char = offset_mapping[end_index][1]
      valid_answers.append({"score": start_logits[start_index] + end_logits[end_index],
                           "text": context[start_char:end_char]})
      
valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

[{'score': 0.22252774,
  'text': '. \n\nHong Kong used to be a British colony with the perpetual cession of Hong Kong Island from the Qing Empire after the First Opium War (1839'},
 {'score': 0.17686221,
  'text': '. Except in military defence and foreign affairs, Hong Kong maintains its independent executive, legislative and judiciary'},
 {'score': 0.15881693,
  'text': '. \n\nHong Kong used to be a British colony with the perpetual cession of Hong Kong'},
 {'score': 0.15745232,
  'text': '. \n\nHong Kong used to be a British colony with the perpetual cession of Hong Kong Island from the Qing Empire after the First Opium War (18'},
 {'score': 0.14902288,
  'text': '. \n\nHong Kong used to be a British colony with the perpetual cession of Hong'},
 {'score': 0.14461243,
  'text': '. \n\nUnder the principle of "one country, two systems", Hong Kong maintains a separate political and economic system from China'},
 {'score': 0.14208171,
  'text': '. \n\nHong Kong used to be a British colony

In [40]:
data_val_smaller[0]["answers"]

'no'

In [55]:
data_val_smaller[0]['questions']

'Are they involved with China in these groups?'

In [57]:
import collections

examples = data_val_smaller
features = validation_features

example_id_to_index = {k: i for i,k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
  features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [102]:
def postprocess_qa_predictions(examples, features, all_start_logits, all_end_logits, n_best_size=20, max_answer_length=30):
  
  example_id_to_index = {k: i for i,k in enumerate(examples["id"])}
  features_per_example = collections.defaultdict(list)
  for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)
  
  predictions = collections.OrderedDict()

  print(f"Post-processing{len(examples)} example preditions split into {len(features)} features.")


  for example_index, example in enumerate(tqdm(examples)):
    #indices of the features associated to current example
    feature_indices = features_per_example[example_index]
    min_null_score = None #used if squad_v2 is true
    valid_answers = []

    #loop through all features associated to current example
    for feature_index in feature_indices:
      #grab predictions of model for this feature.
      start_logits = all_start_logits[feature_index]
      end_logits = all_end_logits[feature_index]

      #map positions in out logits to span of texts inoriginal story
      cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
      feature_null_score = start_logits[cls_index]+end_logits[cls_index]
      if min_null_score is None or min_null_score < feature_null_score:
        min_null_score = feature_null_score
      
      #Go through all possibilities for 'n_best_size' greater start and end logits
      start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
      end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()
      for start_index in start_indexes:
        for end_index  in end_indexes:
          # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
          # to part of the input_ids that are not in the context.
          if (start_index >= len(offset_mapping) or end_index >= len(offset_mapping) or offset_mapping[start_index] is None or offset_mapping[end_index] is None):
            continue

          # Don't consider answers with a length that is either < 0 or > max_answer_length.
          if(end_index < start_index or end_index - start_index + 1 > max_answer_length):
            continue
          if(start_index <= end_index):
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append({"score": start_logits[start_index] + end_logits[end_index],
                                "text": context[start_char:end_char]})
    if len(valid_answers)>0:
      best_answer = sorted(valid_answers,key=lambda x: x["score"], reverse=True)[0]
    else:
      best_answer = {"text": "", "score": 0}
    
    if not squad_v2:
      predictions[example["id"]]=best_answer["text"]
    else:
      answer = (best_answer["text"] if best_answer["score"] > min_null_score else "")
      predictions[example["id"]] = answer

  return predictions

In [105]:
final_predictions = postprocess_qa_predictions(data_val_smaller, validation_features, raw_predictions["start_logits"], raw_predictions["end_logits"],)

Post-processing500 example preditions split into 127 features.


100%|██████████| 500/500 [00:00<00:00, 1545.00it/s]


In [107]:
from datasets import load_metric
metric = load_metric("squad")

In [113]:
if squad_v2:
    formatted_predictions = [
        {"id": k, "prediction_text": v, "no_answer_probability": 0.0}
        for k, v in final_predictions.items()
    ]
else:
    formatted_predictions = [
        {"id": k, "prediction_text": v} for k, v in final_predictions.items()
    ]
references = [
    {"id": ex["id"], "answers": ex["answers"]} for ex in data_val_smaller
]
metric.compute(predictions=formatted_predictions, references=references)
print(formatted_predictions)
print(references)

TypeError: ignored

In [116]:
from transformers import pipeline
question_answerer = pipeline("question-answering", "distilroberta-base", framework="tf") 

All model checkpoint layers were used when initializing TFRobertaForQuestionAnswering.

Some layers of TFRobertaForQuestionAnswering were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [117]:
question_answerer(context=data_val_smaller["story"], question=data_val_smaller["questions"])

[{'score': 1.879495903267525e-05,
  'start': 232,
  'end': 245,
  'answer': 'nationalities'},
 {'score': 4.3813564843731e-05,
  'start': 1305,
  'end': 1317,
  'answer': 'players must'},
 {'score': 8.743636135477573e-05,
  'start': 1560,
  'end': 1623,
  'answer': 'on the list of those invited for the event, according to police'},
 {'score': 1.4067890333535615e-05,
  'start': 0,
  'end': 10,
  'answer': 'A Sudanese'},
 {'score': 1.25002870845492e-05,
  'start': 198,
  'end': 249,
  'answer': 'created the hereditary Order of Baronets in England'},
 {'score': 2.5469362299190834e-05, 'start': 188, 'end': 189, 'answer': '\n'},
 {'score': 1.404230988555355e-05, 'start': 247, 'end': 251, 'answer': 'Sen.'},
 {'score': 4.708084452431649e-05,
  'start': 1375,
  'end': 1391,
  'answer': 'when the Burmese'},
 {'score': 1.5132605767576024e-05, 'start': 167, 'end': 168, 'answer': '\n'},
 {'score': 5.7525248848833144e-05,
  'start': 1089,
  'end': 1159,
  'answer': 'not just a filmmaker who followed