## **Load text data:**

In [1]:
import torch

from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.utils.data import Dataset, DataLoader

In [2]:
file_path = 'ferdousi.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

data_inputs = []
labels1 = []

for i in range(3, len(lines), 2):
    data_input = lines[i - 1].strip()
    label = lines[i].strip()

    data_inputs.append(data_input)
    labels1.append(label)

print("Data Inputs:")
print(data_inputs[:5])

print("\nOutputs:")
print(labels1[:5])

X_train, X_test, y_train, y_test = train_test_split(data_inputs, labels1, test_size=0.2, random_state=42)


Data Inputs:
['به نام خداوند جان و خرد', 'خداوند نام و خداوند جای', 'خداوند کیوان و گردان سپهر', 'ز نام و نشان و گمان برترست', 'به بینندگان آفریننده را']

Outputs:
['کزین برتر اندیشه برنگذرد', 'خداوند روزی ده رهنمای', 'فروزنده ماه و ناهید و مهر', 'نگارندهٔ بر شده پیکرست', 'نبینی مرنجان دو بیننده را']


In [None]:
class PoemDataset(Dataset):
    def __init__(self, data_inputs, data_outputs, tokenizer, max_length=8):
        self.data_inputs = data_inputs
        self.data_outputs = data_outputs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.dataset_size = len(data_inputs)

        self.tokenizer.add_special_tokens({'pad_token': '<pad>', "unk_token": '<unk>'})

    def __len__(self):
        return len(self.data_inputs)

    def __getitem__(self, idx):
        idx = idx % self.dataset_size

        input_text = self.data_inputs[idx]
        output_text = self.data_outputs[idx]

        input_ids = self.tokenizer.encode(input_text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        labels = self.tokenizer.encode(output_text, max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')

        return {
            'input_ids': input_ids.squeeze(),
            'labels': labels.squeeze()
        }

In [None]:
model_name = "HooshvareLab/gpt2-fa"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
data_inputs = X_train
data_outputs = y_train
dataset = PoemDataset(data_inputs, data_outputs, tokenizer)
batch_size = 128
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [17]:
num_epochs = 10
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    total_batches = 0
    batsize = len(dataloader)
    for batch_index, batch in enumerate(dataloader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()
        total_batches += 1

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if (batch_index + 1) % 10 == 0: 
            average_loss = total_loss / total_batches
            print(f"Batch {batch_index + 1}/{batsize}, Loss: {average_loss:.4f}")

model.save_pretrained("fine_tuned_gpt2_fa")
tokenizer.save_pretrained("fine_tuned_gpt2_fa")



Epoch 1/10
Batch 10/311, Loss: 8.8037
Batch 20/311, Loss: 7.7912
Batch 30/311, Loss: 7.3302
Batch 40/311, Loss: 7.0510
Batch 50/311, Loss: 6.8698
Batch 60/311, Loss: 6.7403
Batch 70/311, Loss: 6.6477
Batch 80/311, Loss: 6.5724
Batch 90/311, Loss: 6.5137
Batch 100/311, Loss: 6.4696
Batch 110/311, Loss: 6.4290
Batch 120/311, Loss: 6.3944
Batch 130/311, Loss: 6.3646
Batch 140/311, Loss: 6.3424
Batch 150/311, Loss: 6.3211
Batch 160/311, Loss: 6.2991
Batch 170/311, Loss: 6.2828
Batch 180/311, Loss: 6.2698
Batch 190/311, Loss: 6.2558
Batch 200/311, Loss: 6.2422
Batch 210/311, Loss: 6.2293
Batch 220/311, Loss: 6.2169
Batch 230/311, Loss: 6.2044
Batch 240/311, Loss: 6.1931
Batch 250/311, Loss: 6.1823
Batch 260/311, Loss: 6.1734
Batch 270/311, Loss: 6.1650
Batch 280/311, Loss: 6.1554
Batch 290/311, Loss: 6.1484
Batch 300/311, Loss: 6.1423
Batch 310/311, Loss: 6.1350

Epoch 2/10
Batch 10/311, Loss: 5.8712
Batch 20/311, Loss: 5.8375
Batch 30/311, Loss: 5.8508
Batch 40/311, Loss: 5.8391
Batch 50/

('fine_tuned_gpt2_fa/tokenizer_config.json',
 'fine_tuned_gpt2_fa/special_tokens_map.json',
 'fine_tuned_gpt2_fa/vocab.json',
 'fine_tuned_gpt2_fa/merges.txt',
 'fine_tuned_gpt2_fa/added_tokens.json')

In [29]:
test_data_inputs = X_test
test_data_outputs = y_test

test_dataset = PoemDataset(test_data_inputs, test_data_outputs, tokenizer)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch_index, batch in enumerate(test_dataloader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, labels=labels)
        logits = outputs.logits

        predictions = torch.argmax(logits, dim=-1)

        total_correct += torch.sum(predictions == labels).item()
        total_samples += labels.numel()

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 56.4307


In [28]:
inputs_text = [" که رستم سلام داد همی", " همی رفت به انجا ببیند", "تو ای مرد سالم ببین این چنین"]
max_length = 13

for input_text in inputs_text:
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    with torch.no_grad():
        output_ids = model.generate(input_ids.to(device), max_length=max_length, num_beams=5, temperature=0.7, repetition_penalty=1000000.0)

    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print("input:")
    print(generated_text[:len(input_text)])
    print("Generated Text:")
    print(generated_text[len(input_text):])
    print()


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


input:
 که رستم سلام داد همی
Generated Text:
 به شاد رنج اوی تو و یاد



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


input:
 همی رفت به انجا ببیند
Generated Text:
 با نهفت و مغز پوشیده بگ

input:
تو ای مرد سالم ببین این چنین
Generated Text:
یم تندان زیر دشت

