## Import libraries

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from sklearn.metrics import f1_score

## Load Dataset

In [None]:
# Load the "english" configuration of the khalidalt/tydiqa-goldp dataset.
# The cells below read its 'train' and 'validation' splits.
dataset = load_dataset("khalidalt/tydiqa-goldp", "english")

In [None]:
# Preview one training example: its question, the start of its passage and
# the answer span selected by the dataset's start/limit offsets.
idx = 0

# Fetch the row once and reuse it for every field.
example = dataset['train'][idx]
passage = example['passage_text']
answer_offsets = example['answers']

start_index = answer_offsets['start_byte'][0]
end_index = answer_offsets['limit_byte'][0]

print(f"Question: {example['question_text']}")
print(f"\nContext (truncated): {passage[0:512]} ...")
print(f"\nAnswer: {passage[start_index:end_index]}")

## Preprocessing (Tokenize)

In [None]:
# Tokenizer matching the "distilbert-base-cased-distilled-squad" checkpoint
# that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
# Upper bound on sequence length; process_samples also uses this value as the
# label for answers that were truncated away.
tokenizer.model_max_length = 512

In [None]:
# Flatten the nested 'answers' field so that process_samples can read it
# through dotted column names such as "answers.start_byte".
flattened_train_data = dataset['train'].flatten()
# NOTE: despite the "test" name, this is the dataset's 'validation' split.
flattened_test_data =  dataset['validation'].flatten()

In [None]:
def _align_answer_span(passage, start_char, end_char, reference_text=None):
    """Return ``(start_char, end_char)``, nudged left by 1-2 characters if needed.

    The offsets are only moved when ``reference_text`` is given, the slice at
    the original offsets does not reproduce it, and the slice one or two
    characters to the left does. Without a reference text there is nothing to
    validate against, so the offsets are returned unchanged.
    """
    if not reference_text or passage[start_char:end_char] == reference_text:
        return start_char, end_char

    for shift in (1, 2):
        shifted_start = start_char - shift
        # A negative start would wrap around to the end of the passage.
        if shifted_start < 0:
            break
        if passage[shifted_start:end_char - shift] == reference_text:
            return shifted_start, end_char - shift

    return start_char, end_char


def process_samples(sample):
    """Tokenize one flattened example and attach answer-span token labels.

    The passage is encoded as the first segment and the question as the
    second; only the passage may be truncated, and the result is padded to
    the tokenizer's maximum length.

    Args:
        sample: A flattened dataset row providing ``passage_text``,
            ``question_text``, ``answers.start_byte`` and
            ``answers.limit_byte``. An ``answers.text`` column, if present,
            is used to sanity-check the offsets.

    Returns:
        A dict with ``input_ids``, ``attention_mask``, ``start_positions``
        and ``end_positions``. Unanswerable examples point both positions at
        the [CLS] token. Answers that cannot be mapped to a token (for
        example because they were truncated away) are labelled with
        ``tokenizer.model_max_length``.
    """
    passage = sample['passage_text']
    tokenized_data = tokenizer(
        passage,
        sample['question_text'],
        truncation="only_first",
        padding="max_length",
    )

    input_ids = tokenized_data["input_ids"]
    cls_index = input_ids.index(tokenizer.cls_token_id)

    starts = sample["answers.start_byte"]
    limits = sample["answers.limit_byte"]

    # An empty answer list or a -1 offset both mean "no answer in this passage".
    if not starts or not limits or starts[0] == -1:
        start_position = cls_index
        end_position = cls_index
    else:
        # NOTE(review): the columns are named *_byte but are used as character
        # indices into passage_text - confirm they are character offsets for
        # this dataset, otherwise passages with multi-byte characters before
        # the answer will be mislabelled.
        answer_texts = sample["answers.text"] if "answers.text" in sample else None
        reference_text = answer_texts[0] if answer_texts else None

        # Validate the offsets against the dataset's own answer string.
        # Comparing against a slice taken with the same offsets would only
        # ever "match" on repeated characters (e.g. "99" inside "1999") and
        # would then move a correct label to the left.
        start_char, end_char = _align_answer_span(
            passage, starts[0], limits[0], reference_text
        )

        start_token = tokenized_data.char_to_token(start_char)
        # limit offset is exclusive, so the last answer character is end_char - 1.
        end_token = tokenized_data.char_to_token(end_char - 1)

        # None means the character has no token in the encoded passage
        # (typically truncated). model_max_length lies outside the sequence;
        # presumably the QA head's loss ignores such positions - see the
        # DistilBertForQuestionAnswering docs.
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_position = start_token
        end_position = end_token

    return {
        'input_ids': tokenized_data['input_ids'],
        'attention_mask': tokenized_data['attention_mask'],
        'start_positions': start_position,
        'end_positions': end_position,
    }


In [None]:
# Apply process_samples to every row of both splits, adding the
# input_ids / attention_mask / start_positions / end_positions columns.
processed_train_data = flattened_train_data.map(process_samples)
processed_test_data = flattened_test_data.map(process_samples)

## Model

In [None]:
# Question-answering model loaded from the same checkpoint as the tokenizer;
# it is fine-tuned by the Trainer below.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")

In [None]:
# Only these columns (the ones produced by process_samples) are exposed,
# returned as PyTorch tensors.
columns_to_return = ['input_ids','attention_mask', 'start_positions', 'end_positions']

processed_train_data.set_format(type='pt', columns=columns_to_return)
processed_test_data.set_format(type='pt', columns=columns_to_return)

## Compute Metrics

In [None]:
def compute_f1_metrics(pred):
    """Score predicted answer boundaries with macro-averaged F1.

    Each boundary is scored separately: the predicted token index is the
    argmax over the last axis of the logits, and F1 is macro-averaged with
    every distinct token index treated as its own class.

    Args:
        pred: Evaluation output exposing ``label_ids`` and ``predictions``.
            Index 0 of each is taken as the start boundary and index 1 as the
            end boundary (assumes the Trainer emits them in that order).

    Returns:
        A dict with the keys ``f1_start`` and ``f1_end``.
    """
    scores = {}
    for position, metric_name in enumerate(('f1_start', 'f1_end')):
        gold_indices = pred.label_ids[position]
        predicted_indices = pred.predictions[position].argmax(-1)
        scores[metric_name] = f1_score(
            gold_indices, predicted_indices, average='macro'
        )
    return scores

In [None]:
# Fine-tuning configuration; Trainer output is written under 'model_results'.
training_args = TrainingArguments(
    output_dir='model_results',     # output directory (overwritten if it exists, see next line)
    overwrite_output_dir=True,
    num_train_epochs=3,              # passes over the training split
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=20,                 # warmup steps at the start of training
    weight_decay=0.01,               # weight-decay coefficient
    logging_steps=50
)

# Train on the processed 'train' split and evaluate on the processed
# 'validation' split, reporting start/end F1 via compute_f1_metrics.
trainer = Trainer(
    model=model,                        
    args=training_args,                 
    train_dataset=processed_train_data, 
    eval_dataset=processed_test_data, 
    compute_metrics=compute_f1_metrics
)

# Run the fine-tuning loop.
trainer.train()

In [None]:
# Evaluate the fine-tuned model on the processed validation split; the result
# includes the f1_start / f1_end values from compute_f1_metrics.
trainer.evaluate(processed_test_data)

## Predict Answer

In [None]:
# Free-form context passage (about stocks) used by answer_by_model below.
text = r"""
Stocks (also capital stock, or sometimes interchangeably, shares) consist of all the shares[a] by which ownership of a corporation or company is divided.[1] A single share of the stock means fractional ownership of the corporation in proportion to the total number of shares. This typically entitles the shareholder (stockholder) to that fraction of the company's earnings, proceeds from liquidation of assets (after discharge of all senior claims such as secured and unsecured debt),[3] or voting power, often dividing these up in proportion to the amount of money each stockholder has invested. Not all stock is necessarily equal, as certain classes of stock may be issued, for example, without voting rights, with enhanced voting rights, or with a certain priority to receive profits or liquidation proceeds before or after other classes of shareholders.

Stock can be bought and sold privately or on stock exchanges. Such transactions are closely overseen by governments and regulatory bodies to prevent fraud, protect investors, and benefit the larger economy. The stocks are deposited with the depositories in the electronic format also known as Demat account. As new shares are issued by a company, the ownership and rights of existing shareholders are diluted in return for cash to sustain or grow the business. Companies can also buy back stock, which often lets investors recoup the initial investment plus capital gains from subsequent rises in stock price. Stock options issued by many companies as part of employee compensation do not represent ownership, but represent the right to buy ownership at a future time at a specified price. This would represent a windfall to the employees if the option were exercised when the market price is higher than the promised price, since if they immediately sold the stock they would keep the difference (minus taxes).

Stock bought and sold in private markets fall within the private equity realm of finance.
"""

# Questions to ask about the passage above; each one is answered independently.
questions = ["What does a single share of stock represent in terms of ownership?",
             "How are transactions of stocks overseen, and what is the purpose of government and regulatory oversight?",
             "What is the significance of companies buying back stock, and how does it impact investors?",
             "In what way do stock options issued as part of employee compensation differ from actual ownership, and what potential windfall do employees get if the options are exercised?"]

In [None]:
def answer_by_model(questions):
    """Print the model's extracted answer for each question about ``text``.

    Every question is paired with the module-level ``text`` passage, run
    through the module-level ``model``, and the span between the highest
    scoring start and end tokens is decoded and printed.

    Args:
        questions: Iterable of question strings.

    Returns:
        None. The question/answer pairs are written to stdout; the answer is
        an empty string when the predicted end precedes the predicted start.
    """
    # Imported locally because the notebook's import cell does not bring
    # torch into scope.
    import torch

    # Run on whatever device the model currently lives on instead of assuming
    # a GPU is present.
    device = model.device
    # Disable dropout so that predictions are deterministic.
    model.eval()

    for question in questions:
        # Same segment order as process_samples (passage first, question
        # second) so inference matches what the model was fine-tuned on.
        # Only the passage may be truncated to fit the model's maximum length.
        inputs = tokenizer(
            text,
            question,
            truncation="only_first",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        )

        input_ids = inputs["input_ids"][0].tolist()
        inputs = inputs.to(device)

        # No gradients are needed for inference.
        with torch.no_grad():
            answer_model = model(**inputs)

        # Batch size is 1, so the flat argmax is the token index directly.
        answer_start = int(answer_model['start_logits'].argmax())
        # +1 turns the inclusive end token into an exclusive slice bound.
        answer_end = int(answer_model['end_logits'].argmax()) + 1

        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        )

        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

In [None]:
# Print the model's answer for each of the sample questions defined above.
answer_by_model(questions)