# TAPAS

https://huggingface.co/docs/transformers/model_doc/tapas <br>
https://paperswithcode.com/method/tapas <br>
https://arxiv.org/abs/2004.02349v2 <br>
https://ai.googleblog.com/2020/04/using-neural-networks-to-find-answers.html <br>

In [None]:
! rm -r transformers
! git clone https://github.com/huggingface/transformers.git
! cd transformers
! pip install ./transformers

In [None]:
! pip install torch-scatter==latest+cu101 -f https://pytorch-geometric.com/whl/torch-1.7.0.html

In [2]:
import requests, zipfile, io
import os

def download_files(dir_name):
    if not os.path.exists(dir_name): 
        # 28 training examples from the SQA training set + table csv data
        urls = ["https://www.dropbox.com/s/2p6ez9xro357i63/sqa_train_set_28_examples.zip?dl=1",
                "https://www.dropbox.com/s/abhum8ssuow87h6/table_csv.zip?dl=1"
        ]
        for url in urls:
            r = requests.get(url)
            z = zipfile.ZipFile(io.BytesIO(r.content))
            z.extractall()

dir_name = "sqa_data"
download_files(dir_name)

### Prep Data

In [6]:
! pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.1/242.1 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10


In [10]:
import pandas as pd

# data = pd.read_excel("sqa_train_set_28_examples.xlsx")
train_data = pd.read_csv('sqa_data/train.tsv', sep='\t')
train_data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"['(0, 4)', '(1, 4)', '(2, 4)', '(3, 4)', '(4, ...","['Louisiana State University', 'Valley HS (Las..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"['(0, 1)', '(1, 1)', '(2, 1)', '(3, 1)', '(4, ...","['Ben McDonald', 'Tyler Houston', 'Roger Salke..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"['(0, 1)']",['Ben McDonald']


In [11]:
train_data.keys()

Index(['id', 'annotator', 'position', 'question', 'table_file',
       'answer_coordinates', 'answer_text'],
      dtype='object')

In [9]:
len(train_data)

14541

In [12]:
import ast

def _parse_answer_coordinates(answer_coordinate_str):
    """
    Parses the answer_coordinates of a question.
    Args:
    answer_coordinate_str: A string representation of a Python list of tuple
      strings.
      For example: "['(1, 4)','(1, 3)', ...]"
    """

    try:
        answer_coordinates = []
        # make a list of strings
        coords = ast.literal_eval(answer_coordinate_str)
        # parse each string as a tuple
        for row_index, column_index in sorted(
            ast.literal_eval(coord) for coord in coords):
            answer_coordinates.append((row_index, column_index))
    except SyntaxError:
        raise ValueError('Unable to evaluate %s' % answer_coordinate_str)
  
    return answer_coordinates


def _parse_answer_text(answer_text):
    """
    Populates the answer_texts field of `answer` by parsing `answer_text`.
    Args:
    answer_text: A string representation of a Python list of strings.
      For example: "[u'test', u'hello', ...]"
    answer: an Answer object.
    """
    try:
        answer = []
        for value in ast.literal_eval(answer_text):
            answer.append(value)
    except SyntaxError:
        raise ValueError('Unable to evaluate %s' % answer_text)

    return answer

In [13]:
train_data['answer_coordinates'] = train_data['answer_coordinates'].apply(lambda coords_str: _parse_answer_coordinates(coords_str))
train_data['answer_text'] = train_data['answer_text'].apply(lambda txt: _parse_answer_text(txt))

train_data.head(10)

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve..."
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
5,nt-639,2,0,who are the players in the top 26?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J..."
6,nt-639,2,1,"of those, which one was from louisiana state u...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald]
7,nt-11649,0,0,what are all the names of the teams?,table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
8,nt-11649,0,1,"of these, which teams had any losses?",table_csv/204_135.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Cordoba CF, CD Malaga, Granada CF, UD Las Pal..."
9,nt-11649,0,2,"of these teams, which had more than 21 losses?",table_csv/204_135.csv,"[(15, 1)]",[CD Villarrobledo]


In [14]:
def get_sequence_id(example_id, annotator):
    if "-" in str(annotator):
        raise ValueError('"-" not allowed in annotator.')
    return f"{example_id}-{annotator}"

train_data['sequence_id'] = train_data.apply(lambda x: get_sequence_id(x.id, x.annotator), axis=1)
train_data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text,sequence_id
0,nt-639,0,0,where are the players from?,table_csv/203_149.csv,"[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, 4...","[Louisiana State University, Valley HS (Las Ve...",nt-639-0
1,nt-639,0,1,which player went to louisiana state university?,table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-0
2,nt-639,1,0,who are the players?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
3,nt-639,1,1,which ones are in the top 26 picks?,table_csv/203_149.csv,"[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1...","[Ben McDonald, Tyler Houston, Roger Salkeld, J...",nt-639-1
4,nt-639,1,2,"and of those, who is from louisiana state univ...",table_csv/203_149.csv,"[(0, 1)]",[Ben McDonald],nt-639-1


In [15]:
# let's group table-question pairs by sequence id, and remove some columns we don't need 
grouped = train_data.groupby(by='sequence_id').agg(lambda x: x.tolist())
grouped = grouped.drop(columns=['id', 'annotator', 'position'])
grouped['table_file'] = grouped['table_file'].apply(lambda x: x[0])
grouped.head(10)

Unnamed: 0_level_0,question,table_file,answer_coordinates,answer_text
sequence_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ns-1006-0,"[what are the game titles?, which of the games...",table_csv/203_583.csv,"[[(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, ...","[[Buggy Grand Prix: Kattobi! Dai-Sakusen, Gunb..."
ns-1006-1,"[what are the psikyo titles?, which of these h...",table_csv/203_583.csv,"[[(0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, ...","[[Buggy Grand Prix: Kattobi! Dai-Sakusen, Gunb..."
ns-1006-2,"[what are the notes for the listed games?, whi...",table_csv/203_583.csv,"[[(0, 4), (1, 4), (2, 4), (3, 4), (4, 4), (5, ...","[[, , Released and published in Europe by Play..."
ns-1013-0,[which countries received more than 10 medals?...,table_csv/204_922.csv,"[[(1, 1), (2, 1), (3, 1)], [(1, 1)]]","[[Colombia, Dominican Republic, Peru], [Colomb..."
ns-1013-1,[which countries were in taekwondo at the 2013...,table_csv/204_922.csv,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[Venezuela, Colombia, Dominican Republic, Per..."
ns-1013-2,[what are the total number of medals each coun...,table_csv/204_922.csv,"[[(0, 5), (1, 5), (2, 5), (3, 5), (4, 5), (5, ...","[[10, 17, 11, 14, 8, 3, 5, 3, 1, 1], [17], [Co..."
ns-1035-0,"[who are all of the drivers?, how many points ...",table_csv/204_641.csv,"[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, ...","[[Jim Clark, Denny Hulme, Chris Amon, Jack Bra..."
ns-1035-1,[who where the drivers at the 1967 british gra...,table_csv/204_641.csv,"[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, ...","[[Jim Clark, Denny Hulme, Chris Amon, Jack Bra..."
ns-1035-2,"[who are all of the drivers?, how many points ...",table_csv/204_641.csv,"[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, ...","[[Jim Clark, Denny Hulme, Chris Amon, Jack Bra..."
ns-1042-0,"[what are the land areas?, of those, which are...",table_csv/203_459.csv,"[[(0, 5), (1, 5), (2, 5), (3, 5), (4, 5), (5, ...","[[155 (60), 144 (55), 144 (56), 142 (55), 142 ..."


In [16]:
# path to the directory containing all csv files
table_csv_path = "table_csv"

item = grouped.iloc[0]
table = pd.read_csv(table_csv_path + item.table_file[9:]).astype(str) 

display(table)
print("")
print(item.question)

Unnamed: 0,Title,Release,6th Gen,Handheld,Note
0,Buggy Grand Prix: Kattobi! Dai-Sakusen,2003,PlayStation 2,,
1,Gunbird Special Edition / Gunbird 1&2,2004,PlayStation 2,,
2,Psikyo Shooting Collection Vol. 1: Strikers 19...,2004,PlayStation 2,,Released and published in Europe by Play It as...
3,Psikyo Shooting Collection Vol. 2: Sengoku Ace...,2004,PlayStation 2,,
4,Psikyo Shooting Collection Vol. 3: Sol Divide ...,2004,PlayStation 2,,
5,Taisen Hot Gimmick: Cosplay Mahjong,2004,PlayStation 2,,
6,Sengoku Cannon,2005,,PSP,
7,Taisen Hot Gimmick: Axes-Jong,2005,PlayStation 2,,



['what are the game titles?', 'which of the games have notes?']


### Tokenization

In [17]:
import torch
from transformers import TapasTokenizer

# initialize the tokenizer
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

In [18]:
encoding = tokenizer(table=table, queries=item.question, answer_coordinates=item.answer_coordinates, answer_text=item.answer_text,
                     truncation=True, padding="max_length", return_tensors="pt")
encoding.keys()

dict_keys(['input_ids', 'labels', 'numeric_values', 'numeric_values_scale', 'token_type_ids', 'attention_mask'])

In [19]:
tokenizer.decode(encoding["input_ids"][0])

'[CLS] what are the game titles? [SEP] title release 6th gen handheld note buggy grand prix : kattobi! dai - sakusen 2003 playstation 2 [EMPTY] [EMPTY] gunbird special edition / gunbird 1 & 2 2004 playstation 2 [EMPTY] [EMPTY] psikyo shooting collection vol. 1 : strikers 1945 i & ii 2004 playstation 2 [EMPTY] released and published in europe by play it as 1945 i & ii : the arcade games. psikyo shooting collection vol. 2 : sengoku ace & sengoku blade 2004 playstation 2 [EMPTY] [EMPTY] psikyo shooting collection vol. 3 : sol divide & dragon blaze 2004 playstation 2 [EMPTY] [EMPTY] taisen hot gimmick : cosplay mahjong 2004 playstation 2 [EMPTY] [EMPTY] sengoku cannon 2005 [EMPTY] psp [EMPTY] taisen hot gimmick : axes - jong 2005 playstation 2 [EMPTY] [EMPTY] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [22]:
# assert encoding["token_type_ids"].shape == (3, 512, 7)
assert encoding["token_type_ids"][0][:,3].sum() == 0

In [23]:
print(item.answer_text[0])

['Buggy Grand Prix: Kattobi! Dai-Sakusen', 'Gunbird Special Edition / Gunbird 1&2', 'Psikyo Shooting Collection Vol. 1: Strikers 1945 I & II', 'Psikyo Shooting Collection Vol. 2: Sengoku Ace & Sengoku Blade', 'Psikyo Shooting Collection Vol. 3: Sol Divide & Dragon Blaze', 'Taisen Hot Gimmick: Cosplay Mahjong', 'Sengoku Cannon', 'Taisen Hot Gimmick: Axes-Jong']


In [25]:
# for id, prev_label in zip (encoding["input_ids"][1], encoding["token_type_ids"][1][:,3]):
#     if id != 0: # we skip padding tokens
#         print(tokenizer.decode([id]), prev_label.item())

### Dataset

In [37]:
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        table = pd.read_csv(table_csv_path + item.table_file[9:]).astype(str) # TapasTokenizer expects the table data to be text only
        if item.position != 0:
          # use the previous table-question pair to correctly set the prev_labels token type ids
            previous_item = self.df.iloc[idx-1]
            encoding = self.tokenizer(table=table, 
                                    queries=[previous_item.question, item.question], 
                                    answer_coordinates=[previous_item.answer_coordinates, item.answer_coordinates], 
                                    answer_text=[previous_item.answer_text, item.answer_text],
                                    padding="max_length",
                                    truncation=True,
                                    return_tensors="pt"
            )
            # use encodings of second table-question pair in the batch
            encoding = {key: val[-1] for key, val in encoding.items()}
        else:
            # this means it's the first table-question pair in a sequence
            encoding = self.tokenizer(table=table, 
                                    queries=item.question, 
                                    answer_coordinates=item.answer_coordinates, 
                                    answer_text=item.answer_text,
                                    padding="max_length",
                                    truncation=True,
                                    return_tensors="pt"
            )
            # remove the batch dimension which the tokenizer adds 
            encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        return encoding

    def __len__(self):
        return len(self.df)

In [58]:
train_dataset = TableDataset(df=train_data, tokenizer=tokenizer)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2)

In [48]:
train_dataset[0]["token_type_ids"].shape

torch.Size([512, 7])

In [49]:
train_dataset[1]["input_ids"].shape

torch.Size([512])

In [50]:
batch = next(iter(train_dataloader))

In [51]:
batch["input_ids"].shape

torch.Size([16, 512])

In [52]:
batch["token_type_ids"].shape

torch.Size([16, 512, 7])

### Decoding & Verification

In [36]:
tokenizer.decode(batch["input_ids"][0])

'[CLS] where are the players from? [SEP] pick player team position school 1 ben mcdonald baltimore orioles rhp louisiana state university 2 tyler houston atlanta braves c valley hs ( las vegas, nv ) 3 roger salkeld seattle mariners rhp saugus ( ca ) hs 4 jeff jackson philadelphia phillies of simeon hs ( chicago, il ) 5 donald harris texas rangers of texas tech university 6 paul coleman saint louis cardinals of frankston ( tx ) hs 7 frank thomas chicago white sox 1b auburn university 8 earl cunningham chicago cubs of lancaster ( sc ) hs 9 kyle abbott california angels lhp long beach state university 10 charles johnson montreal expos c westwood hs ( fort pierce, fl ) 11 calvin murray cleveland indians 3b w. t. white high school ( dallas, tx ) 12 jeff juden houston astros rhp salem ( ma ) hs 13 brent mayne kansas city royals c cal state fullerton 14 steve hosey san francisco giants of fresno state university 15 kiki jones los angeles dodgers rhp hillsborough hs ( tampa, fl ) 16 greg bloss

In [39]:
#first example should not have any prev_labels set
assert batch["token_type_ids"][0][:,3].sum() == 0

In [40]:
tokenizer.decode(batch["input_ids"][1])

'[CLS] which player went to louisiana state university? [SEP] pick player team position school 1 ben mcdonald baltimore orioles rhp louisiana state university 2 tyler houston atlanta braves c valley hs ( las vegas, nv ) 3 roger salkeld seattle mariners rhp saugus ( ca ) hs 4 jeff jackson philadelphia phillies of simeon hs ( chicago, il ) 5 donald harris texas rangers of texas tech university 6 paul coleman saint louis cardinals of frankston ( tx ) hs 7 frank thomas chicago white sox 1b auburn university 8 earl cunningham chicago cubs of lancaster ( sc ) hs 9 kyle abbott california angels lhp long beach state university 10 charles johnson montreal expos c westwood hs ( fort pierce, fl ) 11 calvin murray cleveland indians 3b w. t. white high school ( dallas, tx ) 12 jeff juden houston astros rhp salem ( ma ) hs 13 brent mayne kansas city royals c cal state fullerton 14 steve hosey san francisco giants of fresno state university 15 kiki jones los angeles dodgers rhp hillsborough hs ( tamp

In [41]:
assert batch["labels"][0].sum() == batch["token_type_ids"][1][:,3].sum()
print(batch["token_type_ids"][1][:,3].sum())

tensor(132)


In [42]:
for id, prev_label in zip(batch["input_ids"][1], batch["token_type_ids"][1][:,3]):
    if id != 0:
        print(tokenizer.decode([id]), prev_label.item())

[CLS] 0
which 0
player 0
went 0
to 0
louisiana 0
state 0
university 0
? 0
[SEP] 0
pick 0
player 0
team 0
position 0
school 0
1 0
ben 0
mcdonald 0
baltimore 0
orioles 0
r 0
##hp 0
louisiana 1
state 1
university 1
2 0
tyler 0
houston 0
atlanta 0
braves 0
c 0
valley 1
hs 1
( 1
las 1
vegas 1
, 1
n 1
##v 1
) 1
3 0
roger 0
sal 0
##kel 0
##d 0
seattle 0
mariners 0
r 0
##hp 0
sa 1
##ug 1
##us 1
( 1
ca 1
) 1
hs 1
4 0
jeff 0
jackson 0
philadelphia 0
phillies 0
of 0
simeon 1
hs 1
( 1
chicago 1
, 1
il 1
) 1
5 0
donald 0
harris 0
texas 0
rangers 0
of 0
texas 1
tech 1
university 1
6 0
paul 0
coleman 0
saint 0
louis 0
cardinals 0
of 0
franks 1
##ton 1
( 1
tx 1
) 1
hs 1
7 0
frank 0
thomas 0
chicago 0
white 0
sox 0
1b 0
auburn 1
university 1
8 0
earl 0
cunningham 0
chicago 0
cubs 0
of 0
lancaster 1
( 1
sc 1
) 1
hs 1
9 0
kyle 0
abbott 0
california 0
angels 0
l 0
##hp 0
long 1
beach 1
state 1
university 1
10 0
charles 0
johnson 0
montreal 0
expo 0
##s 0
c 0
westwood 1
hs 1
( 1
fort 1
pierce 1
, 1
fl 1
) 

### Define Model

In [45]:
from transformers import TapasForQuestionAnswering

model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device)

Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['column_output_weights', 'output_bias', 'output_weights', 'column_output_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TapasForQuestionAnswering(
  (tapas): TapasModel(
    (embeddings): TapasEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings_0): Embedding(3, 768)
      (token_type_embeddings_1): Embedding(256, 768)
      (token_type_embeddings_2): Embedding(256, 768)
      (token_type_embeddings_3): Embedding(2, 768)
      (token_type_embeddings_4): Embedding(256, 768)
      (token_type_embeddings_5): Embedding(256, 768)
      (token_type_embeddings_6): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.07, inplace=False)
    )
    (encoder): TapasEncoder(
      (layer): ModuleList(
        (0): TapasLayer(
          (attention): TapasAttention(
            (self): TapasSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)


### Train Model

In [59]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # loop over the dataset multiple times
    print("Epoch:", epoch)
    for idx, batch in enumerate(train_dataloader):
        # get the inputs;
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels = batch["labels"].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
                       labels=labels)
        loss = outputs.loss
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()



Epoch: 0


RuntimeError: CUDA out of memory. Tried to allocate 24.00 MiB (GPU 0; 14.76 GiB total capacity; 8.25 GiB already allocated; 9.75 MiB free; 8.32 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Inference

In [50]:
import collections
import numpy as np

def compute_prediction_sequence(model, data, device):
    """Computes predictions using model's answers to the previous questions."""
  
    # prepare data
    input_ids = data["input_ids"].to(device)
    attention_mask = data["attention_mask"].to(device)
    token_type_ids = data["token_type_ids"].to(device)

    all_logits = []
    prev_answers = None

    num_batch = data["input_ids"].shape[0]

    for idx in range(num_batch):
        if prev_answers is not None:
            coords_to_answer = prev_answers[idx]
            # Next, set the label ids predicted by the model
            prev_label_ids_example = token_type_ids_example[:,3] # shape (seq_len,)
            model_label_ids = np.zeros_like(prev_label_ids_example.cpu().numpy()) # shape (seq_len,)

            # for each token in the sequence:
            token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
            for i in range(model_label_ids.shape[0]):
                segment_id = token_type_ids_example[:,0].tolist()[i]
                col_id = token_type_ids_example[:,1].tolist()[i] - 1
                row_id = token_type_ids_example[:,2].tolist()[i] - 1
                if row_id >= 0 and col_id >= 0 and segment_id == 1:
                    model_label_ids[i] = int(coords_to_answer[(col_id, row_id)])

            # set the prev label ids of the example (shape (1, seq_len) )
            token_type_ids_example[:,3] = torch.from_numpy(model_label_ids).type(torch.long).to(device)   

    prev_answers = {}
    # get the example
    input_ids_example = input_ids[idx] # shape (seq_len,)
    attention_mask_example = attention_mask[idx] # shape (seq_len,)
    token_type_ids_example = token_type_ids[idx] # shape (seq_len, 7)
    # forward pass to obtain the logits
    outputs = model(input_ids=input_ids_example.unsqueeze(0), 
                    attention_mask=attention_mask_example.unsqueeze(0), 
                    token_type_ids=token_type_ids_example.unsqueeze(0))
    logits = outputs.logits
    all_logits.append(logits)

    # convert logits to probabilities (which are of shape (1, seq_len))
    dist_per_token = torch.distributions.Bernoulli(logits=logits)
    probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(dist_per_token.probs.device) 

    # Compute average probability per cell, aggregating over tokens.
    # Dictionary maps coordinates to a list of one or more probabilities
    coords_to_probs = collections.defaultdict(list)
    prev_answers = {}
    for i, p in enumerate(probabilities.squeeze().tolist()):
        segment_id = token_type_ids_example[:,0].tolist()[i]
        col = token_type_ids_example[:,1].tolist()[i] - 1
        row = token_type_ids_example[:,2].tolist()[i] - 1
        if col >= 0 and row >= 0 and segment_id == 1:
            coords_to_probs[(col, row)].append(p)

    # Next, map cell coordinates to 1 or 0 (depending on whether the mean prob of all cell tokens is > 0.5)
    coords_to_answer = {}
    for key in coords_to_probs:
        coords_to_answer[key] = np.array(coords_to_probs[key]).mean() > 0.5
    prev_answers[idx+1] = coords_to_answer

    logits_batch = torch.cat(tuple(all_logits), 0)

    return logits_batch

In [51]:
data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 
        'Age': ["56", "45", "59"],
        'Number of movies': ["87", "53", "69"],
        'Date of birth': ["7 february 1967", "10 june 1996", "28 november 1967"]}
queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"]

table = pd.DataFrame.from_dict(data)

inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt")
logits = compute_prediction_sequence(model, inputs, device)

In [52]:
predicted_answer_coordinates, = tokenizer.convert_logits_to_predictions(inputs, logits.cpu().detach())

In [55]:
# handy helper function in case inference on Pandas dataframe
answers = []
for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
    # only a single cell:
        answers.append(table.iat[coordinates[0]])
    else:
        # multiple cells
        cell_values = []
    for coordinate in coordinates:
        cell_values.append(table.iat[coordinate])
    answers.append(", ".join(cell_values))

display(table)
print("")
for query, answer in zip(queries, answers):
    print(query)
    print("Predicted answer: " + answer)

Unnamed: 0,Actors,Age,Number of movies,Date of birth
0,Brad Pitt,56,87,7 february 1967
1,Leonardo Di Caprio,45,53,10 june 1996
2,George Clooney,59,69,28 november 1967



How many movies has George Clooney played in?
Predicted answer: 56, 87, 45, 53, 59, 69
How old is he?
Predicted answer: Leonardo Di Caprio, George Clooney
What's his date of birth?
Predicted answer: 7 february 1967, 10 june 1996, 28 november 1967


### Evaluation

In [57]:
test_data = pd.read_csv('sqa_data/test.tsv', sep='\t')

In [58]:
test_data.head()

Unnamed: 0,id,annotator,position,question,table_file,answer_coordinates,answer_text
0,nu-597,0,0,where did the championships take place?,table_csv/204_758.csv,"['(0, 3)', '(1, 3)', '(2, 3)', '(3, 3)', '(4, ...","['Memphis, Tennessee, USA', 'Coral Springs, Fl..."
1,nu-597,0,1,and on what dates?,table_csv/204_758.csv,"['(0, 2)', '(1, 2)', '(2, 2)', '(3, 2)', '(4, ...","['February 15, 1993', 'May 17, 1993', 'July 26..."
2,nu-597,0,2,"what about just atlanta, georgia, usa?",table_csv/204_758.csv,"['(7, 2)']","['May 2, 1994']"
3,nu-597,0,3,"now, which other event took place in that month?",table_csv/204_758.csv,"['(8, 3)']","['Pinehurst, USA']"
4,nu-597,1,0,what are the location names in the championship?,table_csv/204_758.csv,"['(0, 3)', '(1, 3)', '(2, 3)', '(3, 3)', '(4, ...","['Memphis, Tennessee, USA', 'Coral Springs, Fl..."


In [59]:
test_data.keys()

Index(['id', 'annotator', 'position', 'question', 'table_file',
       'answer_coordinates', 'answer_text'],
      dtype='object')

In [64]:
mini_test = test_data.loc[0:10]

In [65]:
test_dataset = TableDataset(df=mini_test, tokenizer=tokenizer)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=2)