In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from os import path
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from torch import tensor

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"using {device} device")

using cuda device


In [3]:
file_path = '/kaggle/input/ferdousi-txt/ferdousi.txt'

# read the file and igoore the first 2 lines
with open(file_path, 'r') as f:
    lines = f.readlines()[2:]

print(f"number of lines: {len(lines)}")


number of lines: 99217


In [4]:
# Create dataset    
class FerdousiDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        # self.transform = transform
        self.lines = self.load_lines()
        self.data = self._load_data()

    def load_lines(self):
        with open(self.root_dir, 'r') as f:
            lines = f.readlines()
        
        lines = [line.strip('\n') for line in lines[2:]]
        lines = lines[:2 * (len(lines) // 2)]
        return lines
    
    def __len__(self):
        return self.data['len']
    
    def _load_data(self):
        lines = self.lines
        
        # Split into santzes and lengths
        data ={
           'stanza_1': lines[0::2],
            'stanza_2': lines[1::2],
            'len': len(lines) // 2,
        }
        return data

    def __getitem__(self, idx):
        data = self.data
        if idx >= data['len']:
            raise IndexError("index out of range")
            
        return data['stanza_1'][idx], data['stanza_2'][idx]
        



In [5]:
# Create dataset
dataset = FerdousiDataset(file_path)

# Split into train and test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Print samples from the dataset
for i in range(5):
    print(f"stanza 1: {dataset[i][0]}")
    print(f"stanza 2: {dataset[i][1]}")
    # print(f"len: {dataset[i][2]}")
    print()

stanza 1: به نام خداوند جان و خرد
stanza 2: کزین برتر اندیشه برنگذرد

stanza 1: خداوند نام و خداوند جای
stanza 2: خداوند روزی ده رهنمای

stanza 1: خداوند کیوان و گردان سپهر
stanza 2: فروزنده ماه و ناهید و مهر

stanza 1: ز نام و نشان و گمان برترست
stanza 2: نگارندهٔ بر شده پیکرست

stanza 1: به بینندگان آفریننده را
stanza 2: نبینی مرنجان دو بیننده را



In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

model_name = "HooshvareLab/gpt2-fa"

In [7]:
print(dataset[1][1])

خداوند روزی ده رهنمای


In [48]:
special_tokens = {
    "pad_token": "<pad>",
    "unk_token": "<unk>",
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "sep_token": "<sep>",
}

# Make tokenizer specific to Persian
opt ={
    **special_tokens,
    "padding_side": "right",
    "model_max_length": 20,
    "model_name": model_name,
}

tokenizer = AutoTokenizer.from_pretrained(model_name, **opt)

tokenizer.add_special_tokens(special_tokens)

config = AutoConfig.from_pretrained(model_name)

print(config)

GPT2Config {
  "_name_or_path": "HooshvareLab/gpt2-fa",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 5,
  "embd_pdrop": 0.1,
  "eos_token_id": 5,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50,
      "top_k": 50,
      "top_p": 0.95
    }
  },
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 42001
}



In [9]:
class TokenizedDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.encodings = {'input_ids': [], 'labels': [], 'attention_mask': []}
        
        for stanza_1, stanza_2 in tqdm(dataset, desc='Tokenizing'):
            input_enc = tokenizer(
                f"{tokenizer.bos_token}{stanza_1}{tokenizer.sep_token}{stanza_2}{tokenizer.eos_token}", 
                truncation=True,
                padding="max_length"
            )

            target_enc = tokenizer(
                f"{tokenizer.bos_token}{stanza_1}{tokenizer.sep_token}{stanza_2}{tokenizer.eos_token}", 
                truncation=True,
                padding="max_length"
            )
            
            self.encodings['input_ids'].append(tensor(input_enc['input_ids']))
            self.encodings['labels'].append(tensor(target_enc['input_ids']))
            self.encodings['attention_mask'].append(tensor(input_enc['attention_mask']))

    def __getitem__(self, idx):
        return {key: value[idx].clone().detach() for key, value in self.encodings.items()}

    def __len__(self):
        return len(self.dataset)

In [10]:
tokenized_dataset_train = TokenizedDataset(train_dataset, tokenizer)
tokenized_dataset_test = TokenizedDataset(test_dataset, tokenizer)

batch_size = 128

train_dataloader = DataLoader(
    tokenized_dataset_train,
    batch_size=batch_size,
    shuffle=True,
    num_workers=os.cpu_count(),
    pin_memory=False
)
test_dataloader = DataLoader(
    tokenized_dataset_test,
    batch_size=batch_size,
    shuffle=True,
    num_workers=os.cpu_count(),
    pin_memory=False
)

if os.cpu_count() > 1:
    os.environ["TOKENIZERS_PARALLELISM"] = "true"


Tokenizing: 100%|██████████| 39686/39686 [00:14<00:00, 2757.49it/s]
Tokenizing: 100%|██████████| 9922/9922 [00:03<00:00, 2565.13it/s]


In [11]:
print(f"number of train batches: {len(train_dataloader)}")
# print encodings
print(train_dataloader.dataset.encodings.keys())

number of train batches: 311
dict_keys(['input_ids', 'labels', 'attention_mask'])


In [12]:
model = AutoModelForCausalLM.from_config(config)

model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(42003, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=42003, bias=False)
)

In [13]:
# Sample from the data loader
batch = next(iter(train_dataloader))

# Access the input_ids, labels, and attention_mask from the batch
input_ids = batch['input_ids']
labels = batch['labels']
attention_mask = batch['attention_mask']

# Print the shapes of the tensors
print(f"input_ids shape: {input_ids.shape}")
print(f"labels shape: {labels.shape}")
print(f"attention_mask shape: {attention_mask.shape}")


input_ids shape: torch.Size([128, 50])
labels shape: torch.Size([128, 50])
attention_mask shape: torch.Size([128, 50])


In [14]:
num_epochs = 20
loss_dict = {'train_losses': [], 'val_losses': [], 'gen_bleu': [], 'perplexity': []}

optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

for epoch in range(num_epochs):
    train_losses = []
    val_losses = []
    gen_bleu = []
    perplexity = []

    # Training
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Validation
    model.eval()
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
            loss = outputs.loss

            val_losses.append(loss.item())

    # Update loss_dict
    loss_dict['train_losses'].append(sum(train_losses) / len(train_losses))
    loss_dict['val_losses'].append(sum(val_losses) / len(val_losses))
    loss_dict['gen_bleu'].append(0)  # Replace with your BLEU score calculation
    loss_dict['perplexity'].append(0)  # Replace with your perplexity calculation

    # Print epoch results
    print(f"Epoch {epoch + 1}/{num_epochs}:")
    print(f"Train Loss: {loss_dict['train_losses'][-1]}")
    print(f"Val Loss: {loss_dict['val_losses'][-1]}")
    print()

    # Adjust learning rate
    scheduler.step()

# End of training loop

# Print one sample
sample_batch = next(iter(train_dataloader))
sample_input_ids = sample_batch['input_ids'].to(device)
sample_labels = sample_batch['labels'].to(device)
sample_attention_mask = sample_batch['attention_mask'].to(device)

sample_outputs = model(input_ids=sample_input_ids, labels=sample_labels, attention_mask=sample_attention_mask)
sample_loss = sample_outputs.loss

print(f"Sample Loss: {sample_loss.item()}")


Epoch 1/20:
Train Loss: 2.3532668677940247
Val Loss: 2.0957179497449827

Epoch 2/20:
Train Loss: 1.9384739096141705
Val Loss: 1.8498658216916597

Epoch 3/20:
Train Loss: 1.7709338814499294
Val Loss: 1.7182630101839702

Epoch 4/20:
Train Loss: 1.6701014820019149
Val Loss: 1.6328510703184667

Epoch 5/20:
Train Loss: 1.5832772339271963
Val Loss: 1.5537218726598299

Epoch 6/20:
Train Loss: 1.5001554673124355
Val Loss: 1.4861500905110285

Epoch 7/20:
Train Loss: 1.4222040172558505
Val Loss: 1.4157999662252574

Epoch 8/20:
Train Loss: 1.3485908661624626
Val Loss: 1.3565743978206928

Epoch 9/20:
Train Loss: 1.2795051815425469
Val Loss: 1.308380962946476

Epoch 10/20:
Train Loss: 1.2206347889455569
Val Loss: 1.2748454564656966

Epoch 11/20:
Train Loss: 1.1663779641271022
Val Loss: 1.2410783095237536

Epoch 12/20:
Train Loss: 1.1200137655834677
Val Loss: 1.217805871596703

Epoch 13/20:
Train Loss: 1.0754886911612997
Val Loss: 1.2011007849986737

Epoch 14/20:
Train Loss: 1.0331716407150318
Val L

In [54]:
def generate_sequence(poem, tokenizer, model, device):
    gen_encoding = {'input_ids': 0, 'attention_mask': 0}
    input_ = tokenizer(
        tokenizer.bos_token + poem + tokenizer.sep_token , # Masked auto encoder
        truncation=True,
        padding="max_length",
        return_tensors='pt'
    )
    gen_encoding['input_ids'] = input_["input_ids"].to(device)
    gen_encoding['attention_mask'] = torch.ones_like(gen_encoding['input_ids']).to(device)
    outputs = model.generate(
        **gen_encoding,
        top_k=50,
        max_length=30,
        num_beams=5,
        no_repeat_ngram_size=2,
        num_return_sequences=1,
        do_sample = False,
        pad_token_id=tokenizer.eos_token_id, # Open end generation
    )
    
    in_stanza = tokenizer.decode(gen_encoding['input_ids'][0], skip_special_tokens=True)
    out_stanza = tokenizer.decode(outputs[0], skip_special_tokens=True)
    out_stanza = out_stanza.replace(in_stanza, in_stanza + '  ,  ')
    print('Generated sequence: ')
    print(out_stanza)
    print("  ")

In [60]:
poems = [
    'توانا بود هر که دانا بود',
    'به نام خداوند جان و خرد',
    'هنر نزد ایرانیان است و بس',
]

not_poems = [
    'بفرستین مریض اول رو تو',
    'دکی دکی دکی دکی دکی جون',
    'محلمون داداشی نوبنیاد تا قاراشی',
    'سعیدا مرد نکونام نمیرد هرگز',
    'امشب بدجور حالم خراب',
]


In [61]:
for poem in poems:
    print("---------")
    generate_sequence(poem, tokenizer, model, device)

---------
Generated sequence: 
توانا بود هر که دانا بود  ,   نیستید در سر سر زمین
  
---------
Generated sequence: 
به نام خداوند جان و خرد  ,   درل گرد و دل دل روان
  
---------
Generated sequence: 
هنر نزد ایرانیان است و بس  ,   سترخیزید در از سرش
  


In [62]:
for poem in not_poems:
    print("----")
    generate_sequence(poem, tokenizer, model, device)

----
Generated sequence: 
بفرستین مریض اول رو تو  ,   سترخیزلان اخترگ و دل
  
----
Generated sequence: 
دکی دکی دکی دکی دکی جون  ,   در در لب رود و روان
  
----
Generated sequence: 
محلمون داداشی نوبنیاد تا قاراشی  ,  ید در لب گردن افراختند
  
----
Generated sequence: 
سعیدا مرد نکونام نمیرد هرگز  ,  که آن بر مغل دلگ و تن
  
----
Generated sequence: 
امشب بدجور حالم خراب  ,   بر در در میدان او سر کنار
  
