In [1]:
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

tokenizer = RobertaTokenizerFast.from_pretrained('nur-dev/roberta-kaz-large')
model = RobertaForQuestionAnswering.from_pretrained('nur-dev/roberta-kaz-large')

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at nur-dev/roberta-kaz-large and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
from datasets import load_dataset, concatenate_datasets

# valid_ds = load_dataset("issai/kazqad", "kazqad", split="validation")
test_ds = load_dataset("issai/kazqad", "kazqad", split="test")
test_ds = test_ds.select(range(1000))
# dataset = load_dataset("Kyrmasch/sKQuAD", "kazqad", split="train")
# dataset2 = load_dataset("issai/kazqad", "nq-translate-kk", split="train")
# 
# dataset = concatenate_datasets([dataset1, dataset2, valid_ds])

In [3]:
dataset = load_dataset("Kyrmasch/sKQuAD", "default", split="train")

In [4]:
def tokenize_function(examples):
    return tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        max_length=384,
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

In [5]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    
    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answer"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            
            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)
    
    return tokenized_examples


In [6]:
def add_answer_start(example):
    context = example['context']
    answer_text = example['answer']
    
    # Find the position of the answer in the context
    start_idx = context.find(answer_text)
    
    if start_idx == -1:
        # Answer not found in context
        start_idx = None
    example['answers'] = {
        'text': [answer_text],
        'answer_start': [start_idx] if start_idx is not None else []
    }
    return example

In [7]:
dataset = dataset.map(add_answer_start)

In [8]:
def filter_missing_answers(example):
    return len(example['answers']['answer_start']) > 0

dataset = dataset.filter(filter_missing_answers)

In [9]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    
    # Initialize start and end positions
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        if len(answers["answer_start"]) == 0:
            # If no answer is found, set start and end positions to CLS index
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            
            # Find the start and end token indices
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            
            # If the answer is not fully inside the context, label as CLS index
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Move the token indices to the answer boundaries
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_position = token_start_index - 1
                
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_position = token_end_index + 1
                
                tokenized_examples["start_positions"].append(start_position)
                tokenized_examples["end_positions"].append(end_position)
    
    return tokenized_examples


In [10]:
tokenized_datasets = dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset.column_names
)

In [11]:
tokenized_datasets = dataset.map(
    prepare_train_features,
    batched=True,
    remove_columns=dataset.column_names
)

In [12]:
tokenized_test_datasets = test_ds.map(
    prepare_train_features,
    batched=True,
    remove_columns=test_ds.column_names
)

In [13]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    start_logits, end_logits = logits
    
    start_preds = np.argmax(start_logits, axis=-1)
    end_preds = np.argmax(end_logits, axis=-1)
    
    f1 = f1_score(labels[0], start_preds, average="weighted")
    accuracy = accuracy_score(labels[0], start_preds)
    
    return {"f1": f1, "accuracy": accuracy}


In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    #save_strategy="epoch",
    learning_rate=3e-06,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    adam_beta1=0.8,
    adam_beta2=0.999,
    num_train_epochs=30,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    #deepspeed="ds_config.json",
    bf16=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_test_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


[2024-11-03 10:23:45,431] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/user/anaconda3/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/user/anaconda3/compiler_compat/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,2.7232,4.299444,0.086783,0.097804
2,1.9889,4.228851,0.114204,0.134731
3,1.378,4.492192,0.109722,0.11976
4,1.1768,4.677872,0.113187,0.115768
5,0.7673,4.92091,0.112967,0.117764
6,0.5657,5.239927,0.10225,0.106786
7,0.4936,5.703248,0.106928,0.111776
8,0.3711,6.121781,0.112708,0.118762
9,0.2188,6.783863,0.106145,0.100798
10,0.1596,7.432183,0.107202,0.10479


TrainOutput(global_step=4860, training_loss=0.406978646350005, metrics={'train_runtime': 1069.1003, 'train_samples_per_second': 18.127, 'train_steps_per_second': 4.546, 'total_flos': 1.349875613058048e+16, 'train_loss': 0.406978646350005, 'epoch': 30.0})

In [None]:
import torch.nn.utils.prune as prune
def prune_model(model, amount=0.2):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            # Apply pruning to linear layers
            prune.l1_unstructured(module, name='weight', amount=amount)
    return model


In [None]:
model = prune_model(model, amount=0.2)
def remove_pruning(model):
    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            prune.remove(module, 'weight')
    return model


In [None]:
import torch.quantization

def quantize_model_dynamic(model):
    quantized_model = torch.quantization.quantize_dynamic(
        model,  # the model to quantize
        {torch.nn.Linear},  # layers to quantize
        dtype=torch.qint8  # data type for quantized weights
    )
    return quantized_model


In [None]:
quantized_model = quantize_model_dynamic(model)
torch.save(quantized_model.state_dict(), 'quantized_model.pth')
model.eval()
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model, inplace=True)
