In [None]:
import pandas as pd
import numpy as np
import time

import torch
from transformers import TapasTokenizer, TapasConfig, TapasForQuestionAnswering, AdamW

# Load the tokenizer from the "google/tapas-base" checkpoint; it is reused below
# both for building training examples and for encoding inference inputs.
tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

In [None]:
# Load the hand-built question dataset. Later cells read the columns
# `table_file`, `question`, `answer_coordinates`, `answer_text` and `float_answer`.
data = pd.read_csv('question.csv')
# Preview only the first rows instead of dumping the whole frame into the output.
data.head()

In [None]:
from ast import literal_eval

def _parse_literal(value):
    """Parse a stringified Python literal; pass already-parsed lists/tuples through.

    Passing parsed values through makes this cell safe to re-run: without it a
    second run would call ``literal_eval`` on a list and raise.
    """
    if isinstance(value, (list, tuple)):
        return value
    return literal_eval(value)


def _wrap_float_answer(value):
    """Wrap a scalar answer as ``[[value]]``, converting strings to float first.

    Values that are already lists (from an earlier run of this cell) are
    returned unchanged so repeated runs do not add extra nesting.
    """
    if isinstance(value, list):
        return value
    return [[float(value)]] if isinstance(value, str) else [[value]]


# The CSV stores these columns as the string form of Python literals.
for _column in ('answer_coordinates', 'answer_text', 'question'):
    data[_column] = data[_column].apply(_parse_literal)
data['float_answer'] = data['float_answer'].apply(_wrap_float_answer)

In [None]:
## set the batch size = 6 based on my own gpu memory. RTX2070

class TableDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one (table, question) row on demand.

    Args:
        data: DataFrame whose rows provide ``table_file``, ``question``,
            ``answer_coordinates``, ``answer_text`` and ``float_answer``.
        tokenizer: Callable invoked with ``table``, ``queries``,
            ``answer_coordinates``, ``answer_text``, ``truncation``, ``padding``
            and ``return_tensors`` keyword arguments; it must return a mapping
            of tensors whose first dimension can be squeezed away.
    """

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx`` as a dict of tensors."""
        # Read from the frame this dataset was constructed with, not from a
        # module-level `data` variable that may be missing or point elsewhere.
        item = self.data.iloc[idx]
        # Every cell is cast to str before the table is handed to the tokenizer.
        table = pd.read_csv(item.table_file).astype(str)
        encoding = self.tokenizer(
            table=table,
            queries=item.question,
            answer_coordinates=item.answer_coordinates,
            answer_text=item.answer_text,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # Drop the leading dimension the tokenizer adds; the DataLoader
        # re-adds a batch dimension when it collates examples.
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}
        # NOTE(review): `float_answer` is whatever nesting the frame holds —
        # confirm the resulting tensor shape is what the model expects.
        encoding["float_answer"] = torch.tensor(item.float_answer)
        return encoding

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.data)


# Batch size chosen to fit the memory of the GPU this notebook was run on.
BATCH_SIZE = 6

train_dataset = TableDataset(data, tokenizer)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
)

### Fine-tune the model

In [None]:
# Model configuration: these values are the default WTQ settings.
config = TapasConfig(
    num_aggregation_labels=4,
    use_answer_as_supervision=True,
    answer_loss_cutoff=0.664694,
    cell_selection_preference=0.207951,
    huber_loss_delta=0.121194,
    init_cell_selection_weights_to_zero=True,
    select_one_column=True,
    allow_empty_column_selection=False,
    temperature=0.0352513,
)
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Put the model in training mode before the optimisation loop.
model.train()

# Keys read from every batch, in the order they are moved to the device.
batch_keys = (
    "input_ids",
    "attention_mask",
    "token_type_ids",
    "labels",
    "numeric_values",
    "numeric_values_scale",
    "float_answer",
)

start = time.time()
for epoch in range(10):  # ten full passes over the training data
    print("Epoch:", epoch)
    for batch in train_dataloader:
        # Move each tensor the model needs onto the training device.
        (
            input_ids,
            attention_mask,
            token_type_ids,
            labels,
            numeric_values,
            numeric_values_scale,
            float_answer,
        ) = (batch[key].to(device) for key in batch_keys)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass; the loss is read from the model output.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
            numeric_values=numeric_values,
            numeric_values_scale=numeric_values_scale,
            float_answer=float_answer,
        )
        loss = outputs.loss
        print("Loss:", loss.item())

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
end = time.time()

print(f'Time elapsed: {(end-start)/60} minutes')

### save the fine-tuned model

In [None]:

output_model = 'models/fine_tuned.pth'


def save(model, optimizer, path=None):
    """Write the model and optimizer state dicts to a single checkpoint file.

    Args:
        model: Object exposing ``state_dict()``; stored under ``'model_state_dict'``.
        optimizer: Object exposing ``state_dict()``; stored under
            ``'optimizer_state_dict'``.
        path: Destination file. Defaults to the module-level ``output_model``.
    """
    import os  # local import keeps this cell runnable on its own

    target = output_model if path is None else path
    # torch.save does not create missing directories, so make sure the parent
    # folder exists before writing.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, target)

# Persist the fine-tuned model and optimizer state to `output_model`.
save(model, optimizer)

# load
#checkpoint = torch.load(output_model, map_location='cpu')
#model.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

### prediction

In [None]:
import collections
import numpy as np

def compute_prediction_sequence(model, data, device):
    """Run the model over a sequence of questions, feeding each answer forward.

    The examples in ``data`` are processed one at a time, in order. The cells
    predicted as answers for example ``idx`` are written into column 3 of
    ``token_type_ids`` (the previous-label channel) of example ``idx + 1``
    before that example is passed to the model.

    Args:
        model: Callable taking ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments (each with a leading batch
            dimension of 1) and returning an object with a ``logits`` attribute.
        data: Mapping with ``"input_ids"``, ``"attention_mask"`` and
            ``"token_type_ids"`` tensors whose first dimension indexes the
            examples. Columns 0, 1, 2 and 3 of the last dimension of
            ``token_type_ids`` are read as segment id, column id, row id and
            previous label respectively.
        device: Device the inputs are moved to before the forward passes.

    Returns:
        The per-example logits concatenated along dimension 0, one row per
        example. An empty batch yields an empty tensor with zero rows.

    Note:
        Column 3 of ``token_type_ids`` is overwritten in place; when the input
        tensor already lives on ``device`` this modifies the caller's tensor.
        Forward passes run under ``torch.no_grad()``, so the returned logits
        carry no autograd history.
    """
    # prepare data
    input_ids = data["input_ids"].to(device)
    attention_mask = data["attention_mask"].to(device)
    token_type_ids = data["token_type_ids"].to(device)

    all_logits = []
    # Cell predictions of the previous example; None until one has been processed.
    prev_coords_to_answer = None

    num_batch = input_ids.shape[0]

    with torch.no_grad():
        for idx in range(num_batch):
            # get the example
            input_ids_example = input_ids[idx]  # shape (seq_len,)
            attention_mask_example = attention_mask[idx]  # shape (seq_len,)
            token_type_ids_example = token_type_ids[idx]  # shape (seq_len, 7)

            # Convert the id columns to Python lists once per example rather
            # than once per token.
            segment_ids = token_type_ids_example[:, 0].tolist()
            col_ids = token_type_ids_example[:, 1].tolist()
            row_ids = token_type_ids_example[:, 2].tolist()

            # (token position, (col, row)) for every token that belongs to a
            # table cell: segment 1 with non-negative zero-based coordinates.
            cell_tokens = [
                (pos, (col - 1, row - 1))
                for pos, (segment, col, row) in enumerate(zip(segment_ids, col_ids, row_ids))
                if segment == 1 and col >= 1 and row >= 1
            ]

            if prev_coords_to_answer is not None:
                # Mark the tokens of the cells selected for the previous example.
                model_label_ids = np.zeros_like(token_type_ids_example[:, 3].cpu().numpy())
                for pos, coords in cell_tokens:
                    # A cell absent from the previous step counts as not selected.
                    model_label_ids[pos] = int(prev_coords_to_answer.get(coords, False))
                token_type_ids_example[:, 3] = (
                    torch.from_numpy(model_label_ids).type(torch.long).to(device)
                )

            # forward pass to obtain the logits
            outputs = model(
                input_ids=input_ids_example.unsqueeze(0),
                attention_mask=attention_mask_example.unsqueeze(0),
                token_type_ids=token_type_ids_example.unsqueeze(0),
            )
            logits = outputs.logits
            all_logits.append(logits)

            # convert logits to probabilities (shape (1, seq_len)); positions
            # with attention mask 0 are zeroed out
            dist_per_token = torch.distributions.Bernoulli(logits=logits)
            probabilities = dist_per_token.probs * attention_mask_example.type(
                torch.float32
            ).to(dist_per_token.probs.device)
            # squeeze(0) keeps a list even when seq_len is 1
            token_probs = probabilities.squeeze(0).tolist()

            # Collect the token probabilities of every cell.
            coords_to_probs = collections.defaultdict(list)
            for pos, coords in cell_tokens:
                coords_to_probs[coords].append(token_probs[pos])

            # A cell is selected when the mean probability of its tokens is > 0.5.
            prev_coords_to_answer = {
                coords: np.array(probs).mean() > 0.5
                for coords, probs in coords_to_probs.items()
            }

    if not all_logits:
        # Nothing to concatenate for an empty batch.
        return torch.empty((0,) + tuple(input_ids.shape[1:]), device=device)

    logits_batch = torch.cat(tuple(all_logits), 0)

    return logits_batch

In [None]:
# NOTE(review): `ddd` (the table) and `queries` are not defined in the cells
# visible here — confirm they are created before this cell runs.
# The training cell leaves the model in training mode; switch to eval mode so
# dropout is disabled and predictions are deterministic.
model.eval()
inputs = tokenizer(table=ddd, queries=queries, padding='max_length', return_tensors="pt")
logits = compute_prediction_sequence(model, inputs, device)