# Fine-tuning the mGPT Model for Armenian Q&A Task


In [1]:
!pip install datasets > /dev/null
!pip install accelerate -U > /dev/null
!pip install transformers[torch] > /dev/null
!pip install bitsandbytes > /dev/null

In [2]:
from google.colab import files
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from transformers import AutoModelForCausalLM
import torch
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
import os

# Set environment variable to avoid memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [3]:
uploaded = files.upload()
data_path = next(iter(uploaded))

Saving gpt_data.csv to gpt_data.csv


### Data Preparation

In [None]:
armenian = pd.read_csv(data_path)
armenian.dropna(subset=['question', 'answer'], inplace=True)

def clean_text(text):
    """Removes specific keywords from text strings."""
    return text.replace("question", "").replace("answer", "").strip()

armenian['question'] = armenian['question'].apply(clean_text)
armenian['answer'] = armenian['answer'].apply(clean_text)
armenian = armenian[armenian['question'] != ""]
armenian = armenian[armenian['answer'] != ""]

# Convert DataFrame to Hugging Face dataset
dataset = Dataset.from_pandas(armenian)


### Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained("ai-forever/mGPT-1.3B-armenian")

def tokenize_function(examples):
    """Tokenizes questions and answers for model input."""
    inputs = ["question: " + q + " <answer>" for q in examples['question']]
    targets = [a for a in examples['answer']]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    labels = tokenizer(targets, max_length=128, padding="max_length", truncation=True)
    labels["input_ids"] = [[-100 if token == tokenizer.pad_token_id else token for token in label]
                           for label in labels["input_ids"]]
    return {'input_ids': model_inputs['input_ids'], 'attention_mask': model_inputs['attention_mask'],
            'labels': labels['input_ids']}

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["question", "answer"])
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
temp_val_test_split = train_test_split['test'].train_test_split(test_size=0.5)
validation_dataset = temp_val_test_split['train']
test_dataset = temp_val_test_split['test']
dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

### Model Training

In [37]:
model = AutoModelForCausalLM.from_pretrained("ai-forever/mGPT-1.3B-armenian", torch_dtype=torch.float32)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    save_steps=500,
    logging_dir='./logs',
    logging_steps=50,
    max_grad_norm=1.0,
    fp16=True,  # Mixed precision
    report_to="none",
    optim="adamw_8bit"
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    tokenizer=tokenizer
)

# Train and save the model
if torch.cuda.is_available():
    torch.cuda.empty_cache()
trainer.train()
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")




vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Map:   0%|          | 0/10279 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
0,2.7105,2.690527
1,2.4381,2.585707
2,2.3951,2.554629
3,2.2826,2.55454
4,2.2357,2.596454


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [38]:
# Load and use the fine-tuned model
model_path = "/content/drive/MyDrive/fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Mounted at /content/drive


('/content/drive/MyDrive/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_model/vocab.json',
 '/content/drive/MyDrive/fine_tuned_model/merges.txt',
 '/content/drive/MyDrive/fine_tuned_model/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_model/tokenizer.json')

In [44]:
def generate_answer(question, max_length=128, num_return_sequences=1, temperature=0.7):
    """Generates an answer to a given question using the fine-tuned model."""
    prompt = "question: " + question + " <answer>"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True, padding="max_length")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length + 50,
        num_return_sequences=num_return_sequences,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, "").strip()
    return answer

# Example use of the fine-tuned model
question = "ով է քիմ քարդաշյանը?"  # "Who is Kim Kardashian?" in Armenian
answer = generate_answer(question)
print("Generated Answer:", answer)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generated Answer: Ղի����ֶ�֣խՂ������դ�ա� �ա�ի�ննկ��ե�ըա�� �


### How mGPT used to work

In [None]:
# def ask_question(model, tokenizer, question_text, max_new_tokens=200):  # Increased token limit
#     inputs = tokenizer.encode_plus("question: " + question_text + " <answer>", return_tensors='pt', padding=True, truncation=True, max_length=128)
#     answer_ids = model.generate(inputs['input_ids'],
#                                 attention_mask=inputs['attention_mask'],
#                                 max_new_tokens=max_new_tokens,  # Increased token generation limit
#                                 num_return_sequences=1,
#                                 temperature=0.9)  # Optionally tweak this for more randomness
#     answer_text = tokenizer.decode(answer_ids[0], skip_special_tokens=True)
#     return answer_text


# # Test the model with an example question
# example_question = "ինչպես է կոչվում Հարրի Փոթերի առաջին վեպի անունը:"
x
# answer = ask_question(model, tokenizer, example_question)
# print("Generated Answer:", answer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Answer: question: ինչպես է կոչվում Հարրի Փոթերի առաջին վեպի անունը: <answer> Որոշ հեղինակներ կարծում են, որ վեպը սկսվում է հենց Հարրի Փոթերի մասին գրած նամակով, որը նա ուղարկել էր Հոլմսին միայն վ�


### Try BPE

In [4]:

# df = pd.read_csv(data_path)
# armenian = df[['question_arm', 'answer_arm']]
# armenian.columns = ['question', 'answer']
# # Collect text directly from DataFrame
# text_data = armenian['question'].tolist() + armenian['answer'].tolist()

# # Train a BPE tokenizer
# from tokenizers import Tokenizer
# from tokenizers.models import BPE
# from tokenizers.pre_tokenizers import Whitespace
# from tokenizers.trainers import BpeTrainer

# tokenizer = Tokenizer(BPE())
# tokenizer.pre_tokenizer = Whitespace()
# trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

# # Train the tokenizer using the list of text data
# tokenizer.train_from_iterator(text_data, trainer)

# # Save the tokenizer
# tokenizer.save("armenian_bpe_tokenizer.json")


In [5]:
# from tokenizers import Tokenizer

# # Load the tokenizer
# tokenizer = Tokenizer.from_file("armenian_bpe_tokenizer.json")

# # Example Armenian text
# example_text = "Ի՞նչ է երկաթը։"  # "What is iron?" in Armenian

# # Encode the text
# encoded_output = tokenizer.encode(example_text)

# # Print token IDs and decoded tokens
# print("Token IDs:", encoded_output.ids)
# print("Tokens:", encoded_output.tokens)
# print("Decoded Text:", tokenizer.decode(encoded_output.ids))


Token IDs: [125, 154, 392, 161, 260, 15510, 194]
Tokens: ['Ի', '՞', 'նչ', 'է', 'երկ', 'աթը', '։']
Decoded Text: Ի ՞ նչ է երկ աթը ։
