<a href="https://colab.research.google.com/github/arct297/datathon/blob/main/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers datasets evaluate torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m9

In [6]:
import argparse
from datasets import load_dataset
import evaluate  # For computing metrics
from transformers import (
    RobertaTokenizerFast,
    RobertaForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    RobertaConfig,
)
import numpy as np
import torch
import gc

# hackathon models
MODEL_NAME = "nur-dev/roberta-kaz-large"
FINE_TUNE_DATASET = "Kyrmasch/sKQuAD"
TEST_DATASET = "issai/kazqad"

# Load tokenizer and set device
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"

def preprocess_data(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(inputs["offset_mapping"]):
        if "answers" in examples and len(examples["answers"][i]["text"]) > 0:
            answer = examples["answers"][i]
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])
        else:
            start_char = 0
            end_char = 0

        sequence_ids = inputs.sequence_ids(i)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        try:
            start_index = next(i for i, offset in enumerate(offsets) if offset[0] <= start_char < offset[1])
            end_index = next(i for i, offset in enumerate(offsets) if offset[0] < end_char <= offset[1])
        except StopIteration:
            start_index = context_start
            end_index = context_start

        start_positions.append(start_index)
        end_positions.append(end_index)

    inputs.update({"start_positions": start_positions, "end_positions": end_positions})
    return inputs

def compute_metrics(pred):
    metric = evaluate.load("squad")
    start_logits, end_logits = pred.predictions
    start_positions, end_positions = pred.label_ids

    predictions = []
    references = []

    for i in range(len(start_logits)):
        start_index = np.argmax(start_logits[i]).item()
        end_index = np.argmax(end_logits[i]).item()

        if end_index < start_index:
            end_index = start_index

        predicted_answer_tokens = tokenizer.convert_ids_to_tokens(start_logits[i].argmax().item())
        decoded_prediction = tokenizer.decode([start_index] + [end_index], skip_special_tokens=True)

        true_answer_start = start_positions[i]
        true_answer_end = end_positions[i]

        decoded_reference = tokenizer.decode([true_answer_start] + [true_answer_end], skip_special_tokens=True)

        predictions.append({
            "id": str(i),
            "prediction_text": decoded_prediction
        })
        references.append({
            "id": str(i),
            "answers": {
                "text": [decoded_reference],
                "answer_start": [true_answer_start]
            }
        })

    return metric.compute(predictions=predictions, references=references)

# Main function
def main(learning_rate, batch_size, num_epochs, output_dir):
    # Clear cache
    gc.collect()
    torch.cuda.empty_cache()

    # evaluation
    train_dataset = load_dataset(FINE_TUNE_DATASET, split="train")
    tokenized_train_dataset = train_dataset.map(preprocess_data, batched=True)

    test_dataset = load_dataset(TEST_DATASET, split="test[:1000]")
    tokenized_test_dataset = test_dataset.map(preprocess_data, batched=True)

    config = RobertaConfig.from_pretrained("roberta-large")
    model = RobertaForQuestionAnswering(config)
    model.to(device)
    model.gradient_checkpointing_enable()

    data_collator = DataCollatorWithPadding(tokenizer)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f"{output_dir}/logs",
        logging_steps=100,
        fp16=True,
        gradient_accumulation_steps=1
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_test_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # trainer
    trainer.train()
    eval_results = trainer.evaluate()

    # Log results
    print("Evaluation results:", eval_results)

    # Save model and tokenizer
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# entry point
if __name__ == "__main__":
    learning_rate = 5e-5
    batch_size = 8
    num_epochs = 3
    output_dir = "./output"

    main(learning_rate, batch_size, num_epochs, output_dir)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/486 [00:00<?, ?B/s]

datatset.csv:   0%|          | 0.00/427k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/4.88k [00:00<?, ?B/s]

(…)ing-comprehension-v1.0-kk-train.jsonl.gz:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

(…)omprehension-v1.0-kk-validation.jsonl.gz:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)ding-comprehension-v1.0-kk-test.jsonl.gz:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3163 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/764 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2713 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Exact Match,F1
1,2.3774,13.125302,34.2,1.7
2,0.9107,17.602142,33.3,1.2
3,0.2877,16.942347,33.0,1.2


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Evaluation results: {'eval_loss': 13.1253023147583, 'eval_exact_match': 34.2, 'eval_f1': 1.7, 'eval_runtime': 29.0356, 'eval_samples_per_second': 34.44, 'eval_steps_per_second': 4.305, 'epoch': 3.0}


In [29]:
import torch
import torch.nn.functional as F
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
import os
import re

MODEL_DIR = os.path.abspath("./output")

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_DIR)
model = RobertaForQuestionAnswering.from_pretrained(MODEL_DIR)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

def extract_relevant_sentence(question, context):
    question_keywords = re.findall(r'\b\w+\b', question)  # Разбиваем вопрос на слова
    sentences = context.split('.')  # Разбиваем контекст на предложения

    relevant_sentence = ""
    max_overlap = 0

    for sentence in sentences:
        overlap = sum(1 for word in question_keywords if word in sentence)
        if overlap > max_overlap:
            max_overlap = overlap
            relevant_sentence = sentence.strip()

    return relevant_sentence

def answer_question(question, context):
    relevant_sentence = extract_relevant_sentence(question, context)

    inputs = tokenizer.encode_plus(
        question,
        relevant_sentence,
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

    start_index = torch.argmax(start_logits, dim=1).item()
    end_index = torch.argmax(end_logits, dim=1).item()

    if end_index < start_index:
        end_index = start_index

    end_index = min(start_index + 20, input_ids.size(1) - 1)

    answer_ids = input_ids[0][start_index:end_index + 1]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    return answer

context = """
Қазақстан Орталық Азиядағы ең үлкен ел және әлемде аумағы бойынша тоғызыншы орында.
Ол мұнай, газ, көмір, металдар және минералдар сияқты табиғи ресурстардың үлкен қорына ие.
Ел сондай-ақ жасыл энергетиканы дамытуға және халықаралық экологиялық келісімдер аясында
ынтымақтастықты нығайтуға белсенді түрде ұмтылады.
"""

questions = [
  "Қазақстан аумағы бойынша қай орында орналасқан?",
  "Қазақстанда қандай табиғи ресурстардың үлкен қоры бар?",

]

for question in questions:
  answer = answer_question(question, context)
  print(f"""Вопрос: "{question}"\nОтвет: "{answer}"\n""")


Вопрос: "Қазақстан аумағы бойынша қай орында орналасқан?"
Ответ: "Қазақстан Орталық Азиядағы ең үлкен ел және әлемде аумағы бойынша тоғызыншы орында"

Вопрос: "Қазақстанда қандай табиғи ресурстардың үлкен қоры бар?"
Ответ: "Ол мұнай, газ, көмір, металдар және минералдар сияқты табиғи ресурстардың үлкен қорына ие"

