<a href="https://colab.research.google.com/github/aashishkant/GhalibGram/blob/master/mirza2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers




In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import GRU, Embedding
from transformers import TrainingArguments


In [None]:
# Output directory for the fine-tuned model: the current working directory.
# NOTE(review): none of the visible cells read this variable — confirm it is still needed.
fine_tuned_model_output_dir = "./"

In [None]:
# Load the whole training corpus from mirza.txt into a single string
with open("mirza.txt", "r", encoding="utf-8") as file:
    data = file.read()

In [None]:
def get_char_mapping(data):
    """Build character <-> integer index lookup tables for ``data``.

    Characters are indexed in sorted order, so the mapping is the same on
    every run. Iterating a plain ``set`` of strings yields an order that
    depends on per-process hash randomisation, which would make indices
    (and therefore any saved model weights) meaningless across sessions.

    Args:
        data: Iterable of characters, typically the corpus string.

    Returns:
        A ``(char_to_index, index_to_char)`` tuple of dictionaries that are
        inverses of each other.
    """
    unique_chars = sorted(set(data))
    char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
    index_to_char = dict(enumerate(unique_chars))
    return char_to_index, index_to_char
#char_to_index, index_to_char = get_char_mapping(data)
#print(get_char_mapping)print(char_to_index)

In [None]:
# Build the character <-> index lookup tables from the full corpus
char_to_index, index_to_char = get_char_mapping(data)

In [None]:
block_size = 64  # length, in characters, of each window sampled by get_batch
batch_size = 16  # number of windows sampled per batch
max_iters = 5000  # number of iterations of the training loop
learning_rate = 1e-4  # learning rate passed to the AdamW optimizer
eval_iters = 250  # batches averaged per split in estimate_loss; also the evaluation interval of the training loop
temperature = 1.0  # divisor applied to the logits before softmax when sampling in generate()
device = torch.device("cpu")  # batches, the model and the generation context are placed on this device

In [None]:
# Hold out the final 20% of the corpus (by character count) for validation
n = int(len(data) * 0.8)
train_data, val_data = data[:n], data[n:]


In [None]:
def get_batch(split):
    """Sample a random batch of index-encoded character windows.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors on ``device``, each stacking
        ``batch_size`` windows of ``block_size`` character indices. ``y`` is
        the same window as ``x`` shifted one character to the right, i.e.
        the next-character targets.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def encode(text):
        # Map every character of the window to its integer index
        return torch.tensor([char_to_index[ch] for ch in text])

    inputs = [encode(source[start:start + block_size]) for start in starts]
    targets = [encode(source[start + 1:start + block_size + 1]) for start in starts]

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y


In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level language model: embedding -> GRU -> dropout -> linear.

    Despite the name, each prediction is conditioned on the whole preceding
    sequence through the GRU, not only on the previous token.

    Args:
        vocab_size: Number of distinct token indices (also the output size).
        embedding_size: Dimension of the token embeddings.
        hidden_size: Dimension of the GRU hidden state.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        self.embedding = Embedding(vocab_size, embedding_size)
        self.gru = GRU(embedding_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, input, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            input: Long tensor of token indices, batch-first (batch, time).
            targets: Optional long tensor with the same shape as ``input``
                holding the expected next token at every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has one score per vocabulary
            entry for every input position and ``loss`` is the mean
            cross-entropy, or ``None`` when ``targets`` is not given.
        """
        embedded = self.embedding(input)
        gru_out, _ = self.gru(embedded)
        gru_out = self.dropout(gru_out)
        logits = self.fc(gru_out)

        if targets is None:
            return logits, None

        # Take the class count from the logits themselves rather than from a
        # module-level ``vocab_size`` global, so the model is self-contained.
        loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)), targets.reshape(-1)
        )
        return logits, loss

    @torch.no_grad()
    def generate(self, input, max_new_tokens, temperature=None):
        """Autoregressively sample ``max_new_tokens`` tokens after ``input``.

        Sampling runs in evaluation mode (dropout disabled) and without
        gradient tracking; the module's previous train/eval mode is restored
        afterwards.

        Args:
            input: Long tensor (batch, time) used as the initial context.
            max_new_tokens: Number of tokens to append.
            temperature: Softmax temperature. When ``None``, falls back to a
                module-level ``temperature`` variable if one exists (the
                previous behaviour), otherwise to ``1.0``.

        Returns:
            Long tensor (batch, time + max_new_tokens) containing the context
            followed by the sampled tokens.
        """
        if temperature is None:
            temperature = globals().get("temperature", 1.0)

        was_training = self.training
        self.eval()  # dropout must be off while sampling
        try:
            for _ in range(max_new_tokens):
                logits, _ = self(input)
                # Only the prediction for the last position is needed
                probs = F.softmax(logits[:, -1, :] / temperature, dim=-1)
                index_next = torch.multinomial(probs, num_samples=1)
                input = torch.cat((input, index_next), dim=1)
        finally:
            self.train(was_training)
        return input


In [None]:
# Model dimensions: one output class per distinct character in the corpus
embedding_size = 128
hidden_size = 256
vocab_size = len(set(data))

# Build the network on the target device and attach its optimizer
model = BigramLanguageModel(vocab_size, embedding_size, hidden_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the model loss over ``eval_iters`` random batches per split.

    The model is switched to evaluation mode for the measurement and put
    back into training mode before returning.

    Returns:
        Dict mapping ``'train'`` and ``'val'`` to a scalar tensor holding the
        mean loss for that split.
    """

    def mean_loss(split):
        # Collect one loss value per sampled batch, then average them
        scores = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            scores[step] = batch_loss.item()
        return scores.mean()

    model.eval()
    averages = {split: mean_loss(split) for split in ('train', 'val')}
    model.train()
    return averages


In [None]:

# Main optimisation loop. The loop variable is called ``step`` so that the
# builtin ``iter`` is not shadowed in the notebook namespace.
for step in range(max_iters):
    # Report the averaged train/val loss every ``eval_iters`` steps
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {step}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    xb, yb = get_batch('train')

    # Call the module itself (not .forward) so registered hooks also run
    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()

    # Clip AFTER backward() and BEFORE step(): clipping earlier would act on
    # the previous step's (or non-existent) gradients and leave the gradients
    # actually used by the optimizer unclipped.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()


step: 0, train loss: 0.087, val loss: 4.586
step: 250, train loss: 0.086, val loss: 4.559
step: 500, train loss: 0.085, val loss: 4.592
step: 750, train loss: 0.086, val loss: 4.672
step: 1000, train loss: 0.085, val loss: 4.622
step: 1250, train loss: 0.085, val loss: 4.639
step: 1500, train loss: 0.084, val loss: 4.666
step: 1750, train loss: 0.084, val loss: 4.657
step: 2000, train loss: 0.084, val loss: 4.657
step: 2250, train loss: 0.083, val loss: 4.659
step: 2500, train loss: 0.084, val loss: 4.665
step: 2750, train loss: 0.083, val loss: 4.671
step: 3000, train loss: 0.083, val loss: 4.688
step: 3250, train loss: 0.083, val loss: 4.694
step: 3500, train loss: 0.083, val loss: 4.616
step: 3750, train loss: 0.083, val loss: 4.711
step: 4000, train loss: 0.082, val loss: 4.680
step: 4250, train loss: 0.083, val loss: 4.771
step: 4500, train loss: 0.082, val loss: 4.711
step: 4750, train loss: 0.082, val loss: 4.705


In [None]:
# Loss of the last batch processed by the training loop (a single batch, not an average)
print(loss.item())


0.10828258842229843


In [None]:
# Start from a single token with index 0 and sample 500 further characters
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_indices = model.generate(context, max_new_tokens=500)[0].tolist()

# Decode the sampled indices back into text
generated_chars = ''.join(index_to_char[token] for token in generated_indices)
print(generated_chars)

hī pā se nikāle gar na ḳhaar ātish

Sharar hai rañg ba.ad iz.hār-e-tāb-e-jalva-e-tamkīñ
kare hai sañ teñ zulā kar āvāz gar nahīñ aatī

Dāġh-e-dil gar nazar nahīñ aatā
bū bhī ai chārā-gar nahīñ aatī

Ham vahāñ haiñ jahāñ se ham ko bi
āḳhir is dard kī davā kyā hai

Ham haiñ mushtāq aur vo be-zār
yā ilāhī ye mājrā kyā hai

Maiñ bhī muñh meñ zabān rakhtā huuñ
kaash pūchho ki mudda.ā kyā hai

Jab ki tujh bin nahīñ koī maujūd
phir ye hañgāma ai ḳhudā kyā hai

Ye parī-chehra log kaise haiñ
ġhamza o ishva o adā kyā hai

Shikan-e-zulf-e-ambarīñ kyuuñ hai
nigah-e-chashm-e-surma se kous kahāñ se aa.e haiñ
abr kyā chiiz hai havā kyā hai
āġhil gar nahīñ aatī

Dāġh-e-dil gar nazar nahīñ aatā
bū bir darvehrd yāġhāñ haiñ jahāñ se ham ko bhī
kuchh hamārī ḳhabar nahīñ aatī

Marte haiñ aarzū meñ marne hī baat jo chup huuñ
varna kyā baat kar nahīñ aatī

Kyuun na chīḳhūñ ki yaad karte haiñ
merī āvāz gar nahīñ aatī

Dāġh-e-dil gar nazar nahīñ aatā
bū bhī ai chārā-gar nahīñ aatī

Ham vahāñ haiñ jahāñ se ham 