## **Basic Example of TAPAS Fine-tuning.**

The goal was to understand the code, not actually perform fine-tuning. Therefore, the datasets used are just an example of how should one prepare the data.

In [1]:
!pip install wandb



In [2]:
import ast
import torch
import pandas as pd
from transformers import TapasTokenizer, TapasConfig, TapasForQuestionAnswering, Trainer, TrainingArguments

#change your environment to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
#google/tapas-base-finetuned-wtq
#google/tapas-base-finetuned-sqa
#google/tapas-base-finetuned-tabfact
#google/tapas-base-finetuned-wikisql-supervised

config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config).to(device)

tokenizer = TapasTokenizer.from_pretrained("google/tapas-base", truncation=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['aggregation_classifier.bias', 'aggregation_classifier.weight', 'column_output_bias', 'column_output_weights', 'output_bias', 'output_weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
class TableDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        item = data.iloc[idx]
        table = pd.read_csv(table_csv_path + item.table_file, sep = ';').astype(
            str
        )

        encoding = self.tokenizer(
            table=table,
            queries=item.question,
            answer_coordinates=ast.literal_eval(item.answer_coordinates),
            answer_text=item.answer_text,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        encoding = {key: val.squeeze(0) for key, val in encoding.items()}

        #adding the float_answer which is also required (weak supervision for aggregation case)
        encoding["float_answer"] = torch.tensor(item.float_answer)

        #aggregation_label is required for tapas-base-finetuned-wikisql
        #encoding["aggregation_label"] = torch.tensor(item.aggregation_label)

        return encoding

    def __len__(self):
        return len(self.data)

table_csv_path = '/content/'
qa_csv_path = '/content/questions.csv'
data = pd.read_csv(qa_csv_path, sep = ";")
train_dataset = TableDataset(data, tokenizer)

In [5]:
config = TapasConfig(
    num_aggregation_labels=4,
    use_answer_as_supervision=True,
    answer_loss_cutoff=0.65,
    cell_selection_preference=0.20,
    huber_loss_delta=0.121194,
    init_cell_selection_weights_to_zero=True,
    select_one_column=True,
    allow_empty_column_selection=False,
    temperature=0.03,
    #temperature=0.02,
    model_max_length = 512,
    padding="max_length",
)

training_args = TrainingArguments(
    output_dir="./tapas-finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    num_train_epochs=3,
    weight_decay=0.01,
)

model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config).to(device)

Some weights of TapasForQuestionAnswering were not initialized from the model checkpoint at google/tapas-base and are newly initialized: ['aggregation_classifier.bias', 'aggregation_classifier.weight', 'column_output_bias', 'column_output_weights', 'output_bias', 'output_weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import wandb
wandb.init(mode="disabled")

#define a different test set
table_csv_test = '/content/questions.csv'
data_test = pd.read_csv(table_csv_test, sep = ";")

test_dataset = TableDataset(data_test[0:1], tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset = train_dataset,
    eval_dataset = train_dataset,
)

In [7]:
trainer.train()

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


Epoch,Training Loss,Validation Loss
1,No log,No log
2,No log,No log
3,No log,No log


  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
Non-default generation parameters: {'temperature': 0.03}
  text = normalize_for_match(row[col_index].text)
  cel

TrainOutput(global_step=3, training_loss=3.7545458475748696, metrics={'train_runtime': 109.7974, 'train_samples_per_second': 0.082, 'train_steps_per_second': 0.027, 'total_flos': 2368084543488.0, 'train_loss': 3.7545458475748696, 'epoch': 3.0})

In [8]:
trainer.save_model("/content/models/qa_model")

Non-default generation parameters: {'temperature': 0.03}


In [9]:
df_history = pd.DataFrame(trainer.state.log_history)
df_history

Unnamed: 0,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,7.0342,0.426,0.142,1.0,1,,,,,
1,6.6532,0.451,0.15,2.0,2,,,,,
2,5.7937,0.518,0.173,3.0,3,,,,,
3,,,,3.0,3,109.7974,0.082,0.027,2368085000000.0,3.754546


In [10]:
predictions, labels, metrics = trainer.predict(train_dataset, metric_key_prefix="predict")

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


In [11]:
data_ = {
    "Titles": ["Glorious visions in animation and performance","Medicine for the 99 percent","What is so special about the human brain?","The mysterious science of pain","The uncomplicated truth about women's sexuality"],
    "Views": ["946000","310677","3082440","887739","2303625"],
    "Speakers": ["Miwa Matreyek","Thomas Pogge","Suzana Herculano-Houzel","Joshua Pate","Sarah Barmak"],
    "Duration": ["671","1085","811","287","680"],
    "Comments": ["148","121","1050","0","17"],
    "Events": ["TEDGlobal 2010","TEDxCanberra","TEDGlobal 2013","TED-Ed","TEDxToronto"]
}

queries_ = ["what is the title with more comments?","what is the event with the longest duration?"]

model_ = TapasForQuestionAnswering.from_pretrained("/content/models/qa_model")

tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

table_ = pd.DataFrame.from_dict(data_)
inputs_ = tokenizer(table=table_, queries=queries_, padding="max_length", return_tensors="pt")
outputs_ = model_(**inputs_)

predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
    inputs_, outputs_.logits.detach(), outputs_.logits_aggregation.detach()
)

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


In [12]:
id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
answers = []

for coordinates in predicted_answer_coordinates:
    if len(coordinates) == 1:
        # only a single cell:
        answers.append(table_.iat[coordinates[0]])
    else:
        # multiple cells
        cell_values = []

        for coordinate in coordinates:
            cell_values.append(table_.iat[coordinate])

        answers.append(", ".join(cell_values))

In [13]:
for query, answer, predicted_agg in zip(queries_, answers, aggregation_predictions_string):
    print(query)
    print("Predicted answer: " + predicted_agg + " > " + answer)

what is the title with more comments?
Predicted answer: NONE > 
what is the event with the longest duration?
Predicted answer: NONE > 
