In [13]:
from datasets import load_dataset
from transformers import BertTokenizerFast

# Load the "issai/kazqad" dataset; later code indexes it by split name
# (e.g. dataset["train"]) and iterates over dataset.keys().
dataset = load_dataset("issai/kazqad")

# Module-level tokenizer used by preprocess_function. A *fast* tokenizer is
# needed because preprocessing requests offset mappings from it.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and label answer token spans.

    Each (question, context) pair is encoded with the module-level
    ``tokenizer`` into windows of at most 384 tokens. Only the context is
    truncated; a context that does not fit is split into several overlapping
    windows (stride of 128 tokens), so one example may yield several features.

    Args:
        examples: Batch (dict of columns) providing ``question``, ``context``
            and ``answers``. Each entry of ``answers`` is expected to hold
            parallel ``text`` and ``answer_start`` lists; only the first
            answer of each example is used for labelling.

    Returns:
        The tokenizer output with two extra lists, ``start_positions`` and
        ``end_positions`` (one entry per window). A window is labelled with
        the index of the ``[CLS]`` token for both positions when the example
        has no annotated answer, when the window holds no context tokens, or
        when the answer is not fully contained in the window.
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation="only_second",
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=128,
    )

    # Window index -> index of the example it was cut from.
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    # Per window: (start_char, end_char) of every token in its source text.
    offset_mapping = inputs.pop("offset_mapping")

    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # sequence_ids is 1 for tokens of the second sequence (the context).
        sequence_ids = inputs.sequence_ids(i)

        answer = answers[sample_mapping[i]]

        # Examples without an annotated answer, or windows without any
        # context token, cannot be labelled with a span: point at [CLS]
        # instead of failing on an empty list / running off the sequence.
        if not answer["answer_start"] or not answer["text"] or 1 not in sequence_ids:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # First and last token index belonging to the context.
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully inside this window -> label with [CLS].
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both scans are confined to the context tokens. Special and padding
        # tokens report a (0, 0) offset, so an unbounded scan would run
        # through [SEP] and the padding whenever the answer touches the last
        # (or first) context token and produce a label outside the context.
        token_start_index = context_start
        while token_start_index <= context_end and offset[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offset[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the preprocessing to every split in batches. The original columns are
# dropped because one example can produce several tokenized windows, so the
# row counts of input and output no longer line up.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# Marker printed once the mapping has finished.
print(0)

0


In [22]:
def extract_qa_pairs(dataset):
    """Flatten every split of ``dataset`` into a list of question/answer records.

    Args:
        dataset: Mapping of split name to an iterable of rows; each row is
            indexed by ``'question'`` and ``'answers'``.

    Returns:
        A list of ``{'question': ..., 'answers': ...}`` dicts in iteration
        order. ``'answers'`` is the row's ``answers['text']`` value, or
        ``["Answer not found"]`` when that key is missing or its value is
        empty.
    """
    collected = []
    for split_name in dataset.keys():
        for record in dataset[split_name]:
            question_text = record['question']
            answer_info = record['answers']
            if 'text' in answer_info and answer_info['text']:
                answer_texts = answer_info['text']
            else:
                # Placeholder so every record carries a non-empty answer list.
                answer_texts = ["Answer not found"]
            collected.append({'question': question_text, 'answers': answer_texts})
    return collected

# Build the question/answer list once, up front; the interactive loop below
# searches it for every question typed in.
qa_pairs = extract_qa_pairs(dataset)

def find_answers_for_question(question, qa_pairs):
    """Return the answers of the first pair whose question equals ``question``.

    The comparison is an exact string match. When no pair matches, the
    placeholder list ``["Answer not found"]`` is returned.
    """
    matching_answers = (
        entry['answers'] for entry in qa_pairs if entry['question'] == question
    )
    # next() stops at the first match, so later duplicates are never reached.
    return next(matching_answers, ["Answer not found"])

# Interactive lookup: keep asking for questions until the user types "exit"
# (case-insensitive, surrounding whitespace ignored) or input ends.
while True:
    try:
        input_question = input("Enter your question: ")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the prompt was interrupted: leave the loop
        # cleanly instead of ending the session with a traceback.
        print()
        print("Program terminated.")
        break

    if input_question.strip().lower() == 'exit':
        print("Program terminated.")
        break

    # The question itself is passed through unchanged: the lookup is an
    # exact string match against the dataset's questions.
    answers = find_answers_for_question(input_question, qa_pairs)

    # The lookup signals "no match" with this placeholder list.
    if answers != ["Answer not found"]:
        print("Answers:", ", ".join(answers))
        print()
    else:
        print("Answer not found in the dataset.")

Enter your question:  Майкл Джордан НБА-да қашан баскетбол ойнады?


Answers: 1980-1990



Enter your question:  exite


Answer not found in the dataset.


Enter your question:  exit


Program terminated.
