<a href="https://colab.research.google.com/github/aghadavood/NLP/blob/main/QaPersian_bert_fa_base_uncased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Hub identifier of the pretrained checkpoint that both the tokenizer and the model are loaded from.
model_name = "HooshvareLab/bert-fa-base-uncased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The same checkpoint loaded through the question-answering auto class.
# NOTE(review): presumably the QA head is newly initialized for a base checkpoint and only
# becomes useful after fine-tuning — confirm against the load-time warnings.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


In [None]:


def tokenize_dataset(dataset, tokenizer):
    """Tokenize a question-answering dataset and attach answer token spans.

    Every question is encoded together with its context as one sequence
    pair, and the character-level answer span is converted to token indices
    inside that same pair encoding, so the labels line up with the
    ``input_ids`` the model is trained on.

    Args:
        dataset: Object exposing ``map(fn, batched=..., remove_columns=...)``
            and a ``'train'`` entry with ``column_names``. Each batch must
            provide ``'question'``, ``'context'`` and ``'answers'`` columns;
            every answer is a mapping with ``'text'`` and ``'answer_start'``
            sequences, of which only the first element is used.
        tokenizer: Callable tokenizer whose batch output supports
            ``char_to_token(batch_index, char_index, sequence_index=...)``.

    Returns:
        The result of ``dataset.map``: the original columns are removed and
        replaced by the tokenizer outputs plus ``start_positions`` and
        ``end_positions``.
    """
    def preprocess_function(examples):
        questions = examples['question']
        contexts = examples['context']
        answers = examples['answers']

        inputs = tokenizer(questions, contexts, truncation=True, padding='max_length', max_length=512)

        start_positions = []
        end_positions = []

        for i, answer in enumerate(answers):
            texts = answer['text']
            char_starts = answer['answer_start']

            # Token 0 is used as the label for "no usable answer": missing
            # or empty annotations, and answers lost to truncation.
            start_token_idx = 0
            end_token_idx = 0

            if len(texts) > 0 and len(char_starts) > 0 and len(texts[0]) > 0:
                char_start = char_starts[0]
                char_end = char_start + len(texts[0]) - 1

                # Look the characters up in the question+context encoding
                # (sequence_index=1 selects the context). A context-only
                # encoding would ignore the offset added by the question
                # and the special tokens in front of the context.
                start_hit = inputs.char_to_token(i, char_start, sequence_index=1)
                end_hit = inputs.char_to_token(i, char_end, sequence_index=1)

                # Keep the span only when both ends survived truncation;
                # a half-truncated span would yield an inconsistent label.
                if start_hit is not None and end_hit is not None:
                    start_token_idx = start_hit
                    end_token_idx = end_hit

            start_positions.append(start_token_idx)
            end_positions.append(end_token_idx)

        inputs['start_positions'] = start_positions
        inputs['end_positions'] = end_positions
        return inputs

    tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)
    return tokenized_datasets

# Load the dataset
from datasets import load_dataset
# `load_dataset` is given the dataset identifier "SajjadAyoubi/persian_qa"; the result is
# later indexed by split name ('train', 'validation').
dataset = load_dataset("SajjadAyoubi/persian_qa")

# Tokenize the dataset
# Uses the notebook-level `tokenizer`; tokenize_dataset removes the raw columns and returns
# the tokenizer outputs together with start/end position labels.
tokenized_datasets = tokenize_dataset(dataset, tokenizer)
# Show the resulting object for a quick visual check.
print(tokenized_datasets)


In [None]:
from transformers import BertForQuestionAnswering, Trainer, TrainingArguments

# Training configuration for fine-tuning `model` on the tokenized dataset.
# NOTE(review): newer transformers releases reportedly rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints are written under this directory
    evaluation_strategy='epoch',  # evaluate once per epoch
    save_strategy='epoch',  # save once per epoch (same cadence as evaluation)
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,  # cap on the number of checkpoints kept
    load_best_model_at_end=True,  # reload the best checkpoint when training finishes
    metric_for_best_model="eval_loss",  # "best" is judged by evaluation loss ...
    greater_is_better=False,  # ... where a lower value is better
)
# Trainer wired to the notebook-level `model` and `tokenizer`, using the 'train' split for
# training and the 'validation' split for evaluation.
# NOTE(review): the `tokenizer=` keyword may be deprecated in recent transformers releases
# in favor of `processing_class` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Run the fine-tuning loop.
trainer.train()


In [None]:
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering

# Ensure the model and inputs are on the same device
# NOTE(review): this line only selects a device (CUDA when available, otherwise CPU); nothing
# in this cell moves `model` onto it — confirm the model already lives there (e.g. placed by
# the Trainer during training).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



def evaluate_example(question, context, model, tokenizer):
    """Return the answer text the model predicts for one question/context pair.

    The pair is tokenized (truncated to 512 tokens), run through ``model``,
    and the tokens between the highest-scoring start index and the
    highest-scoring end index (inclusive) are converted back to a string.
    When the predicted end lies before the predicted start, the span is
    empty and whatever ``tokenizer.convert_tokens_to_string`` yields for an
    empty token list is returned.

    Args:
        question: Question text.
        context: Passage in which to look for the answer.
        model: Question-answering model whose output exposes
            ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer compatible with ``model``.

    Returns:
        The decoded answer span.
    """
    # Place the inputs wherever the model's weights live instead of relying
    # on a notebook-level device variable that may not match the model.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True).to(model_device)

    # Inference must run with dropout disabled and without building an
    # autograd graph; the previous train/eval mode is restored afterwards so
    # the caller's model state is left untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    start_idx = int(torch.argmax(outputs.start_logits))
    end_idx = int(torch.argmax(outputs.end_logits))
    span_ids = inputs['input_ids'][0][start_idx:end_idx + 1]
    return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

# Example question and context
# The question roughly reads "What is the title of the article?" and the context
# "This article examines the effects of climate on plant growth."
question = "عنوان مقاله چیست؟"
context = "این مقاله به بررسی تاثیرات آب و هوایی بر رشد گیاهان می‌پردازد."
# Run the notebook-level model and tokenizer on the example and print the decoded span.
answer = evaluate_example(question, context, model, tokenizer)
print("Predicted Answer:", answer)


In [None]:
def tokenize_example(question, context):
    """Encode one question/context pair with the notebook-level tokenizer.

    The pair is truncated to at most 512 tokens and the tokenizer is asked
    for PyTorch tensors (``return_tensors='pt'``).

    Args:
        question: Question text.
        context: Passage paired with the question.

    Returns:
        The tokenizer's output for the pair.
    """
    encode_options = {
        'return_tensors': 'pt',
        'max_length': 512,
        'truncation': True,
    }
    return tokenizer(question, context, **encode_options)

# Example question and context
# The question roughly reads "Where is the capital of Spain?" and the context
# "Madrid, the capital of Spain, is one of the most beautiful cities in the world."
question = "پایتخت اسپانیا کجاست؟"
context = "مادرید، پایتخت اسپانیا، یکی از زیباترین شهرهای جهان است."

# Encode the pair and print the raw tokenizer output for inspection.
tokenized_inputs = tokenize_example(question, context)
print("Tokenized Inputs:", tokenized_inputs)


In [None]:
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering

# Use the GPU when one is present and fall back to the CPU otherwise, so this cell does not
# require CUDA to run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the tokenizer and model
# NOTE(review): this rebinds `tokenizer` and `model` to freshly loaded copies of the base
# checkpoint, so weights fine-tuned earlier in the notebook are not used by this cell —
# confirm that is intended, or load the saved fine-tuned checkpoint instead.
tokenizer = BertTokenizerFast.from_pretrained('HooshvareLab/bert-fa-base-uncased')
model = BertForQuestionAnswering.from_pretrained('HooshvareLab/bert-fa-base-uncased').to(device)

# Example question and context
# The question roughly reads "Where is the capital of Spain?" and the context
# "Madrid, the capital of Spain, is one of the most beautiful cities in the world."
question = "پایتخت اسپانیا کجاست؟"
context = "مادرید، پایتخت اسپانیا، یکی از زیباترین شهرهای جهان است."

# Tokenize inputs
inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)
inputs = {key: value.to(device) for key, value in inputs.items()}  # Ensure inputs are on the model's device

# Get model outputs
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Determine the start and end of the answer
start_idx = torch.argmax(start_logits)
end_idx = torch.argmax(end_logits)

# Convert token IDs back to the answer string
# The slice is inclusive of end_idx; it is empty when end_idx precedes start_idx.
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))
print("Predicted Answer:", answer)
