In [1]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    HfArgumentParser,
    TapexTokenizer, BartForConditionalGeneration
)
import os
import sys
import pandas as pd
from io import StringIO

# A notebook kernel defines no __file__, so the sibling project folders are
# resolved relative to the current working directory.
src_path = os.path.abspath(os.path.join(os.pardir, "src"))
datasets_path = os.path.abspath(os.path.join(os.pardir, "datasets"))

# Register each folder on sys.path only when it is absent, so re-running this
# cell never adds duplicate entries.
sys.path.extend(
    folder for folder in (src_path, datasets_path) if folder not in sys.path
)

from parsers.argument_classes import DatasetArguments
from utils.datasets_loader import load_datasets

In [2]:
def _convert_csv_string_to_table(csv_string: str) -> pd.DataFrame:
    """
    Convert a csv string to a table, including the header
    """
    df = pd.read_csv(StringIO(csv_string), delimiter=",", on_bad_lines="skip")
    columns = df.columns.astype(str).tolist()
    columns = [col.replace("Unnamed: 0", "") for col in columns]
    rows = df.values.astype(str).tolist()
    return pd.DataFrame(rows, columns = columns)

def main():
    """
    Run one table question-answering prediction and print it.

    Parses a fixed set of dataset arguments, loads the datasets through
    ``load_datasets`` with a table-size filter, loads the
    ``microsoft/tapex-base`` tokenizer and model, then generates and prints an
    answer for the first example of the ``test`` split.
    """
    overrides = {
        "dataset_root_dir": "../datasets",
        "train_max_samples_for_each_dataset": 10,
        "dataset_names": ["wtq"],
    }
    dataset_args = HfArgumentParser(DatasetArguments).parse_dict(overrides)[0]

    def filter_function(example):
        # An example is dropped only when a limit is configured (not None)
        # and the example's table exceeds it.
        max_rows = dataset_args.max_table_row_num
        max_width = dataset_args.max_table_width
        return not (
            (max_rows is not None and example["table_row_num"] > max_rows)
            or (max_width is not None and example["table_width"] > max_width)
        )

    datasets = load_datasets(dataset_args, filter_function=filter_function)

    # Disabled alternative: AutoTokenizer / AutoModelForSeq2SeqLM with the
    # "microsoft/tapex-large-finetuned-wtq" checkpoint.
    checkpoint = "microsoft/tapex-base"
    tokenizer = TapexTokenizer.from_pretrained(checkpoint)
    model = BartForConditionalGeneration.from_pretrained(checkpoint)

    test_split = datasets["test"]
    question = test_split["question"][0]
    table = test_split["table"][0]

    # The table is stored as CSV text; the tokenizer is given a DataFrame.
    encoding = tokenizer(
        _convert_csv_string_to_table(table),
        question,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    generated_ids = model.generate(**encoding)
    predicted_answer = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    print("Question:", question)
    print("Answer:", predicted_answer)


In [3]:
# Run the demo end to end; it prints the question and the decoded answer.
main()

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/14149 [00:00<?, ? examples/s]

Map:   0%|          | 0/3515 [00:00<?, ? examples/s]

Map:   0%|          | 0/4344 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14149 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3515 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4344 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/988 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]



Question: did dmitry mikhailovich golitsyn or andrey kirillovich razumovsky serve as ambassador longer?
Answer: [', dmitry mikhailovich golitsyn, andrey kirillovich']
