<a href="https://colab.research.google.com/github/alicmu2024/Generating-Persian-Poems-Using-a-Tiny-GPT-like-Model/blob/main/Persian_Poem_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so that files stored there can be
# accessed through the local filesystem (Colab-only: relies on google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
# Copy the dataset archive and a pretrained checkpoint from the mounted
# Drive into the current working directory; %%capture hides the cell output.
%cp /content/drive/MyDrive/GenAI/NanoGPT_Persian/archive.zip .
%cp /content/drive/MyDrive/GenAI/Practicing/best_model.pth .

In [3]:
# Extracting the dataset from archive
import os
import zipfile

# Where the archive lives and where its contents should be unpacked.
zip_file_path = './archive.zip'
extraction_path = './Dataset'

# Create the destination folder up front; exist_ok keeps a re-run of this
# cell from failing when the folder is already there.
os.makedirs(extraction_path, exist_ok=True)

# The context manager closes the archive handle once extraction is done.
with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extraction_path)

print("Extraction completed!")

Extraction completed!


In [4]:
import glob

def CombineTextFiles(input_folder, output_file):
    """
    Combine all .txt files in the input folder into a single output file.

    Only files directly inside ``input_folder`` are considered (no recursion).
    They are concatenated in sorted path order so the combined file is
    reproducible: ``glob`` returns matches in arbitrary order. Each file's
    content is followed by a newline.

    If ``output_file`` itself is a .txt file inside ``input_folder`` it is
    skipped, so the function never reads the file it is currently writing.

    :param input_folder: Path to the folder containing .txt files
    :param output_file: Path to the output file (overwritten if it exists)
    """
    output_abs = os.path.abspath(output_file)
    # Resolve the input list before opening the output for writing.
    txt_files = sorted(
        path
        for path in glob.glob(os.path.join(input_folder, '*.txt'))
        if os.path.abspath(path) != output_abs
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for txt_file in txt_files:
            with open(txt_file, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read() + "\n")
    print(f"Combined text files into {output_file}")

# Usage: merge the .txt files found directly in ./Dataset into ./combined_data.txt
CombineTextFiles('./Dataset', './combined_data.txt')

Combined text files into ./combined_data.txt


In [None]:
# Character-based tokenization training (starts from the checkpoint given via --pretrained_weights)
! python main.py --combined_file ./combined_data.txt \
--tokenizer char --max_vocab_size 41 \
--batch_size 128 --learning_rate 1e-4 --max_iters 2500 \
--eval_interval 200 --patience 300 \
--eval_iters 50 --accumulation_steps 4 \
--scheduler constant --step_size 500 --gamma 0.1 \
--pretrained_weights '/content/best_modell.pth' \
--temperature 1.0 --top_k 50 --top_p 0.95 --min_line_length 20

Number of tokens for training: 31610680
Number of tokens for validation: 3512298
Vocabulary size: 41
Loaded pretrained weights from /content/best_modell.pth
10.87 M parameters
Training Progress:   0% 0/2500 [00:00<?, ?it/s]step 0: train loss 1.4268, val loss 1.4441
Current learning rate: 0.000100
Training Progress:   8% 200/2500 [06:36<1:10:17,  1.83s/it]step 200: train loss 1.4064, val loss 1.4379
Current learning rate: 0.000100
Training Progress:  16% 400/2500 [13:17<1:04:13,  1.83s/it]step 400: train loss 1.4071, val loss 1.4361
Current learning rate: 0.000100
Training Progress:  24% 600/2500 [19:58<58:00,  1.83s/it]step 600: train loss 1.3997, val loss 1.4336
Current learning rate: 0.000100
Training Progress:  32% 800/2500 [26:39<51:54,  1.83s/it]step 800: train loss 1.3982, val loss 1.4352
Current learning rate: 0.000100
Training Progress:  40% 1000/2500 [33:20<45:54,  1.84s/it]step 1000: train loss 1.3946, val loss 1.4324
Current learning rate: 0.000100
Training Progress:  48% 12

In [None]:
import torch
from model import GPTLanguageModel
from dataset import CharTokenizer

# Load the trained model
vocab_size = 41  # Must match the vocabulary size the checkpoint was trained with
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPTLanguageModel(vocab_size).to(device)

# Load the model state.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when this runtime falls back to the CPU.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/GenAI/best_modelll.pth', map_location=device)
)
# Switch to evaluation mode; as the last expression of the cell, the returned
# module is also displayed as the cell output.
model.eval()

GPTLanguageModel(
  (token_embedding_table): Embedding(41, 384)
  (position_embedding_table): Embedding(512, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_affine

In [None]:
import torch
from model import GPTLanguageModel, post_process_text
from dataset import CharTokenizer

# Load your data directly in the notebook
combined_file_path = './combined_data.txt'  # Path to your combined data file
with open(combined_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Initialize the tokenizer from the full corpus text.
# NOTE(review): presumably this must yield the same vocabulary the checkpoint
# was trained with — confirm against dataset.CharTokenizer.
tokenizer = CharTokenizer(text)

# Load the trained model
vocab_size = 41  # Must match the vocabulary size the checkpoint was trained with
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPTLanguageModel(vocab_size).to(device)

# Load the model state.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when this runtime falls back to the CPU.
model.load_state_dict(torch.load('/content/best_modelll.pth', map_location=device))
model.eval()

# Ask the user for the prompt and the sampling settings.
# float()/int() raise ValueError if the typed text is not a valid number.
starting_word = input("Enter a starting word (in Persian): ")
temperature = float(input("Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): "))
top_k = int(input("Enter a value for top-k sampling (e.g., 50): "))
top_p = float(input("Enter a value for top-p sampling (e.g., 0.95): "))

# Generate the poem
# NOTE(review): generate_poem_from_word is neither imported nor defined in
# this cell; it must already exist in the kernel namespace — confirm where it
# comes from and import/define it explicitly so a fresh kernel can run this.
generated_poem = generate_poem_from_word(
    model,
    tokenizer,
    starting_word,
    max_new_tokens=200,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p
)

# Post-process the generated poem
formatted_poem = post_process_text(generated_poem, min_line_length=60)

# Print the formatted poem
print("Generated poem:")
print(formatted_poem)

Enter a starting word (in Persian): ز
Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): 0.95
Enter a value for top-k sampling (e.g., 50): 38
Enter a value for top-p sampling (e.g., 0.95): 0.85
Generated poem:
ز دو چشم تو بودی سرخ و بوی گل
چندانکه من در آن بودم باز گل
ا
ز خواب روی تو دیدم کارم به چشم
هر شب چو او بر او خوابیم دید
باری که نی تو بر آبیم دید
چون باد از دو دیده بر در من نهاد
ا


In [None]:
import torch
from model import GPTLanguageModel, post_process_text
from dataset import CharTokenizer

# Load your data directly in the notebook
combined_file_path = './combined_data.txt'  # Path to your combined data file
with open(combined_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Initialize the tokenizer from the full corpus text.
# NOTE(review): presumably this must yield the same vocabulary the checkpoint
# was trained with — confirm against dataset.CharTokenizer.
tokenizer = CharTokenizer(text)

# Load the trained model
vocab_size = 41  # Must match the vocabulary size the checkpoint was trained with
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPTLanguageModel(vocab_size).to(device)

# Load the model state.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when this runtime falls back to the CPU.
model.load_state_dict(torch.load('/content/best_modelll.pth', map_location=device))
model.eval()

# Ask the user for the prompt and the sampling settings.
# float()/int() raise ValueError if the typed text is not a valid number.
starting_word = input("Enter a starting word (in Persian): ")
temperature = float(input("Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): "))
top_k = int(input("Enter a value for top-k sampling (e.g., 50): "))
top_p = float(input("Enter a value for top-p sampling (e.g., 0.95): "))

# Generate the poem
# NOTE(review): generate_poem_from_word is neither imported nor defined in
# this cell; it must already exist in the kernel namespace — confirm where it
# comes from and import/define it explicitly so a fresh kernel can run this.
generated_poem = generate_poem_from_word(
    model,
    tokenizer,
    starting_word,
    max_new_tokens=200,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p
)

# Post-process the generated poem
formatted_poem = post_process_text(generated_poem, min_line_length=60)

# Print the formatted poem
print("Generated poem:")
print(formatted_poem)

Enter a starting word (in Persian): ت
Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): 0.85
Enter a value for top-k sampling (e.g., 50): 40
Enter a value for top-p sampling (e.g., 0.95): 0.75
Generated poem:
تا شد از آن نهان بی رنج و بی رنج
به نیکی تا چه از رنج آمد ز
رنج
همی گفت این چه بی گنج و بی هنج و چار
به بیش از پی مرد بی
هنج و چار
همه پاک بی نیک و بد پاک بی پاک
بدو گفت رو کای مرد


In [None]:
import torch
from model import GPTLanguageModel, post_process_text
from dataset import CharTokenizer

# Load your data directly in the notebook
combined_file_path = './combined_data.txt'  # Path to your combined data file
with open(combined_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Initialize the tokenizer from the full corpus text.
# NOTE(review): presumably this must yield the same vocabulary the checkpoint
# was trained with — confirm against dataset.CharTokenizer.
tokenizer = CharTokenizer(text)

# Load the trained model
vocab_size = 41  # Must match the vocabulary size the checkpoint was trained with
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = GPTLanguageModel(vocab_size).to(device)

# Load the model state.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded when this runtime falls back to the CPU.
model.load_state_dict(torch.load('/content/best_modelll.pth', map_location=device))
model.eval()

# Ask the user for the prompt and the sampling settings.
# float()/int() raise ValueError if the typed text is not a valid number.
starting_word = input("Enter a starting word (in Persian): ")
temperature = float(input("Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): "))
top_k = int(input("Enter a value for top-k sampling (e.g., 50): "))
top_p = float(input("Enter a value for top-p sampling (e.g., 0.95): "))

# Generate the poem
# NOTE(review): generate_poem_from_word is neither imported nor defined in
# this cell; it must already exist in the kernel namespace — confirm where it
# comes from and import/define it explicitly so a fresh kernel can run this.
generated_poem = generate_poem_from_word(
    model,
    tokenizer,
    starting_word,
    max_new_tokens=200,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p
)

# Post-process the generated poem
formatted_poem = post_process_text(generated_poem, min_line_length=60)

# Print the formatted poem
print("Generated poem:")
print(formatted_poem)

Enter a starting word (in Persian): ب
Enter a temperature value (e.g., 0.5 for less randomness, 1.0 for normal, 1.5 for more randomness): 0.90
Enter a value for top-k sampling (e.g., 50): 35
Enter a value for top-p sampling (e.g., 0.95): 0.85
Generated poem:
بران مهر تو چون دید مهر آمد
نگه کرد و بر شد آن را کار دیدم
م
را بهر چه دید اندر گریزان
به راه آمد به بازوی ماهان
بیامد یک
زمان اندر جهان خواست
که از باد سحرگه ناله برخاست
به ناله کر
