# **Tiny Transformer**
This notebook implements a complete training pipeline for a tiny Hindi-to-English translation model, including:
- Preprocessing (tokenization & vocab creation)
- Sequence preparation (adding <SOS> and <EOS>)
- Dataset & DataLoader
- Training loop
- Inference function

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

## **1. Data**

In [None]:
# Parallel corpus: each entry is a (Hindi sentence, English translation) tuple.
# The English side is lowercase apart from "TV", and punctuation stays attached
# to the neighbouring word (e.g. "ready?").
pairs = [
    ("मैं खुश हूँ", "i am happy"),
    ("आप दुखी हैं", "you are sad"),
    ("वह थक गया है", "he is tired"),
    ("वह थकी हुई है", "she is tired"),
    ("हम भूखें हैं", "we are hungry"),
    ("वे व्यस्त हैं", "they are busy"),
    ("मुझे ठंड लग रही है", "i am cold"),
    ("तुम देरी से आए हो", "you are late"),
    ("वह खुश है", "she is happy"),
    ("हम तैयार हैं", "we are ready"),
    ("मैं दुखी हूँ", "i am sad"),
    ("तुम खुश हो", "you are happy"),
    ("वे तैयार हैं", "they are ready"),
    ("वह थका हुआ है", "he is tired"),
    ("तुम व्यस्त हो", "you are busy"),
    ("हम ठंडे हैं", "we are cold"),
    ("क्या आप तैयार हैं?", "are you ready?"),
    ("क्या वह आ रही है?", "is she coming?"),
    ("क्या तुम ठीक हो?", "are you okay?"),
    ("मैं ठीक हूँ", "i am fine"),
    ("क्या तुम व्यस्त हो?", "are you busy?"),
    ("वह स्कूल जा रहा है", "he is going to school"),
    ("वह बाजार जा रही है", "she is going to the market"),
    ("मैं पढ़ रहा हूँ", "i am studying"),
    ("तुम क्या कर रहे हो?", "what are you doing?"),
    ("मैं सो रहा हूँ", "i am sleeping"),
    ("तुम कहाँ जा रहे हो?", "where are you going?"),
    ("मैं घर जा रहा हूँ", "i am going home"),
    ("वह टीवी देख रहा है", "he is watching TV"),
    ("क्या तुम खाना खा चुके हो?", "have you eaten?"),
    ("मैंने खाना खा लिया है", "i have eaten"),
    ("तुम कौन हो?", "who are you?"),
    ("मैं एक छात्र हूँ", "i am a student"),
    ("वह डॉक्टर है", "he is a doctor"),
    ("वह शिक्षक है", "she is a teacher"),
    ("हम दोस्त हैं", "we are friends"),
    ("तुम मेरे दोस्त हो", "you are my friend"),
    ("आज मौसम अच्छा है", "the weather is good today"),
    ("बाहर बारिश हो रही है", "it is raining outside"),
    ("आज बहुत ठंड है", "it is very cold today"),
    ("क्या आप अंग्रेज़ी बोलते हैं?", "do you speak english?"),
    ("हाँ, मैं अंग्रेज़ी बोलता हूँ", "yes, i speak english"),
    ("नहीं, मैं अंग्रेज़ी नहीं बोलता", "no, i do not speak english"),
    ("कृपया धीरे बोलिए", "please speak slowly"),
    ("मुझे माफ़ करें", "i am sorry"),
    ("धन्यवाद", "thank you"),
    ("स्वागत है", "you are welcome"),
    ("क्या समय हुआ है?", "what time is it?"),
    ("मुझे नहीं पता", "i do not know"),
    ("मैं नहीं समझा", "i did not understand"),
    ("कृपया दोहराइए", "please repeat"),
]


## **2. Vocab building**

In [None]:
from collections import defaultdict

In [None]:
def build_vocab(sentences):
    """Build a word -> integer id mapping from an iterable of sentences.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other whitespace-separated word receives
    the next free id, in order of first appearance.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    for sentence in sentences:
        for token in sentence.split():
            # The next free id always equals the current vocabulary size;
            # setdefault leaves words that are already known untouched.
            vocab.setdefault(token, len(vocab))
    return vocab


In [None]:
# Split the parallel corpus into its Hindi (source) and English (target) sides.
hindi_sentences = [h for h, e in pairs]
english_sentences = [e for h, e in pairs]

# Each language gets its own word -> id vocabulary.
hindi_vocab = build_vocab(hindi_sentences)
english_vocab = build_vocab(english_sentences)

# Reverse mapping (id -> word) used to turn predicted ids back into text.
inv_english_vocab = {v: k for k, v in english_vocab.items()}

## **3. Tokenizer helpers**

In [None]:
def tokenize(sentence, vocab):
    """Convert a sentence into a list of vocabulary ids.

    The sentence is split on whitespace; words missing from ``vocab`` are
    mapped to the id of ``<UNK>``.
    """
    def lookup(word):
        return vocab.get(word, vocab["<UNK>"])

    return list(map(lookup, sentence.split()))

def add_tokens(seq, vocab):
    """Return a new list: ``seq`` wrapped in the ``<SOS>`` and ``<EOS>`` ids."""
    framed = [vocab["<SOS>"]] + seq
    framed.append(vocab["<EOS>"])
    return framed

## **4. Dataset**

In [None]:
class TranslationDataset(Dataset):
    """Dataset of (Hindi ids, English ids) tensor pairs.

    Every sentence is tokenized with its language's vocabulary and wrapped in
    ``<SOS>``/``<EOS>`` ids once, at construction time.
    """

    def __init__(self, pairs, hindi_vocab, english_vocab):
        def encode(sentence, vocab):
            # sentence -> ids -> ids with <SOS>/<EOS> -> 1-D tensor
            return torch.tensor(add_tokens(tokenize(sentence, vocab), vocab))

        self.data = [
            (encode(hin, hindi_vocab), encode(eng, english_vocab))
            for hin, eng in pairs
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
def collate_fn(batch):
    """Pad a list of (hindi, english) tensor pairs into two batch tensors.

    Each side is right-padded to the longest sequence in the batch with that
    language's ``<PAD>`` id; the results are batch-first.
    """
    hindi_seqs, english_seqs = zip(*batch)

    def pad(seqs, vocab):
        return pad_sequence(seqs, batch_first=True, padding_value=vocab["<PAD>"])

    return pad(hindi_seqs, hindi_vocab), pad(english_seqs, english_vocab)

In [None]:
# Wrap the sentence pairs in a Dataset and serve them in shuffled batches of
# two; collate_fn pads each batch to the length of its longest sentence.
dataset = TranslationDataset(pairs, hindi_vocab, english_vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

## **5. Model**

In [None]:
#helper modules - way srp

class TinyTransformer(nn.Module):
    """Minimal translation model: one cross-attention layer plus a projection.

    Each English input token (query) attends over the Hindi sentence
    (keys/values); the attended vector is projected to English-vocabulary
    logits. There is no positional encoding and no decoder self-attention, so
    the logits at a position depend only on the English token at that position
    and on the set of Hindi tokens.

    Args:
        hindi_vocab_size: Number of entries in the Hindi vocabulary.
        english_vocab_size: Number of entries in the English vocabulary.
        d_model: Embedding / attention width.
        pad_idx: Id of the padding token in the Hindi vocabulary. Positions
            holding this id are excluded from attention.
    """

    def __init__(self, hindi_vocab_size, english_vocab_size, d_model=32, pad_idx=0):
        super().__init__()
        self.d_model = d_model
        self.pad_idx = pad_idx

        # Embeddings
        self.hindi_emb = nn.Embedding(hindi_vocab_size, d_model)
        self.english_emb = nn.Embedding(english_vocab_size, d_model)

        # Single attention layer
        self.attention = nn.MultiheadAttention(d_model, num_heads=1, batch_first=True)

        # Output projection: d_model features in, one logit per English word out.
        self.ffn = nn.Linear(d_model, english_vocab_size)

    def forward(self, hindi, english):
        """Return logits of shape (batch, english_len, english_vocab_size).

        Args:
            hindi: Long tensor (batch, hindi_len) of source ids, right-padded
                with ``pad_idx``.
            english: Long tensor (batch, english_len) of target-side input ids.
        """
        h_emb = self.hindi_emb(hindi)
        e_emb = self.english_emb(english)

        # True where the source holds padding. Without this mask the padded
        # positions take part in the softmax, so a sentence's output changes
        # with the amount of padding its batch happens to need.
        src_padding_mask = hindi == self.pad_idx

        # attention(query, key, value)
        attended, _ = self.attention(
            e_emb, h_emb, h_emb, key_padding_mask=src_padding_mask
        )

        return self.ffn(attended)

## **6. Training**

`CUDA stands for Compute Unified Device Architecture — it is a parallel computing platform and API developed by NVIDIA that allows your GPU (Graphics Processing Unit) to perform general-purpose computations (not just graphics rendering).`

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TinyTransformer(len(hindi_vocab), len(english_vocab)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# ignore_index makes the loss skip target positions that hold the <PAD> id.
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab["<PAD>"])

In [None]:
# Train for 50 epochs over the whole dataloader, using teacher forcing.
for epoch in range(50):
    model.train()
    total_loss = 0

    for hindi, english in dataloader:
        hindi, english = hindi.to(device), english.to(device)

        # Shift English for teacher forcing: the model reads every token except
        # the last and is trained to predict every token except the first.
        input_english = english[:, :-1]
        target_english = english[:, 1:]

        output = model(hindi, input_english)
        # Flatten so the loss sees one row of logits per target token.
        output = output.reshape(-1, output.shape[-1])
        target = target_english.reshape(-1)

        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the sum of the per-batch losses for this epoch, not a mean.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 100.5661
Epoch 2, Loss: 78.6104
Epoch 3, Loss: 59.3238
Epoch 4, Loss: 54.4675
Epoch 5, Loss: 43.3650
Epoch 6, Loss: 38.0139
Epoch 7, Loss: 35.9755
Epoch 8, Loss: 30.1930
Epoch 9, Loss: 32.3335
Epoch 10, Loss: 31.3307
Epoch 11, Loss: 27.9239
Epoch 12, Loss: 27.0539
Epoch 13, Loss: 26.5243
Epoch 14, Loss: 25.7373
Epoch 15, Loss: 26.8565
Epoch 16, Loss: 27.4223
Epoch 17, Loss: 25.4458
Epoch 18, Loss: 22.4163
Epoch 19, Loss: 20.4710
Epoch 20, Loss: 20.8281
Epoch 21, Loss: 21.2567
Epoch 22, Loss: 19.1312
Epoch 23, Loss: 18.7586
Epoch 24, Loss: 20.7685
Epoch 25, Loss: 22.7718
Epoch 26, Loss: 18.9869
Epoch 27, Loss: 20.7499
Epoch 28, Loss: 21.4003
Epoch 29, Loss: 22.3462
Epoch 30, Loss: 21.4046
Epoch 31, Loss: 19.1117
Epoch 32, Loss: 18.7097
Epoch 33, Loss: 19.2413
Epoch 34, Loss: 18.6041
Epoch 35, Loss: 15.9459
Epoch 36, Loss: 14.8069
Epoch 37, Loss: 16.8873
Epoch 38, Loss: 17.2674
Epoch 39, Loss: 16.8566
Epoch 40, Loss: 15.5341
Epoch 41, Loss: 18.4584
Epoch 42, Loss: 20.6820


## **7. Inference**

In [None]:
def translate_sentence(hindi_sentence, max_len=10):
    """Greedily translate one Hindi sentence with the global ``model``.

    Decoding starts from ``<SOS>`` and repeatedly appends the highest-scoring
    next token until ``<EOS>`` is predicted or ``max_len`` tokens were produced.

    Args:
        hindi_sentence: Whitespace-separated Hindi text; unknown words are
            mapped to ``<UNK>`` by ``tokenize``.
        max_len: Maximum number of English tokens to generate.

    Returns:
        The generated English words joined by single spaces (``<EOS>`` is not
        included).

    Side effects:
        Switches ``model`` to eval mode.
    """
    eos_id = english_vocab["<EOS>"]

    model.eval()
    with torch.no_grad():
        hin = add_tokens(tokenize(hindi_sentence, hindi_vocab), hindi_vocab)
        hin_tensor = torch.tensor(hin).unsqueeze(0).to(device)

        # Shape (1, 1): a batch with a single <SOS> token.
        eng_input = torch.tensor([english_vocab["<SOS>"]], dtype=torch.long).unsqueeze(0).to(device)
        result = []

        for _ in range(max_len):
            output = model(hin_tensor, eng_input)
            # Only the logits at the last position predict the next token.
            next_token = output[0, -1].argmax(-1).item()
            if next_token == eos_id:
                break
            result.append(inv_english_vocab.get(next_token, "<UNK>"))
            eng_input = torch.cat([eng_input, torch.tensor([[next_token]], device=device)], dim=1)

        return ' '.join(result)

## **8. Test**

In [None]:
# Translate every Hindi sentence from the training pairs and print it next to
# the model's output (the reference English side is not used here).
print("\nTranslation Examples:\n")
for hindi, _ in pairs:
    print(f"{hindi} -> {translate_sentence(hindi)}")


Translation Examples:

मैं खुश हूँ -> i am happy
आप दुखी हैं -> you are we
वह थक गया है -> is tired
वह थकी हुई है -> is tired
हम भूखें हैं -> we
वे व्यस्त हैं -> they
मुझे ठंड लग रही है -> i is are i is are i is are i
तुम देरी से आए हो -> you are late
वह खुश है -> is happy
हम तैयार हैं -> we
मैं दुखी हूँ -> i am sad
तुम खुश हो -> you are happy
वे तैयार हैं -> they
वह थका हुआ है -> is tired
तुम व्यस्त हो -> you are late
हम ठंडे हैं -> we
क्या आप तैयार हैं? -> you ready?
क्या वह आ रही है? -> is it?
क्या तुम ठीक हो? -> you ready?
मैं ठीक हूँ -> i am fine
क्या तुम व्यस्त हो? -> you ready?
वह स्कूल जा रहा है -> is going going going going going going going going going
वह बाजार जा रही है -> is going going going going going going going going going
मैं पढ़ रहा हूँ -> i am studying
तुम क्या कर रहे हो? -> you ready?
मैं सो रहा हूँ -> i am sleeping
तुम कहाँ जा रहे हो? -> you doing?
मैं घर जा रहा हूँ -> i am going going going going going going going going
वह टीवी देख रहा है -> is watching TV is wa

---

### ✅ Notes:

* It uses **only 1 attention head**, so it's a *very tiny* model—great for learning.
* You can improve it by:

  * Using positional encodings
  * Adding encoder-decoder layers
  * Using a larger dataset
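* This model only partly works (see the translations above), even though the dataset is tiny and deterministic: each output token depends only on the current English input token and the Hindi sentence, so decoding can get stuck repeating a word such as "going".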

---



# **Add positional encodings, multiple attention heads and stacked layers**

1. Positional Encoding — Adds position awareness to the model.

2. Multi-head Attention — The first model already uses `nn.MultiheadAttention`, but with a single head; here we use several heads.

3. Stacked Layers — Multiple encoder and decoder layers.

In [None]:
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# ----------------------------
# 1. Data Setup
# ----------------------------

# Parallel corpus: each entry is a (Hindi sentence, English translation) tuple.
# The English side is lowercase apart from "TV", and punctuation stays attached
# to the neighbouring word (e.g. "ready?").
pairs = [
    ("मैं खुश हूँ", "i am happy"),
    ("आप दुखी हैं", "you are sad"),
    ("वह थक गया है", "he is tired"),
    ("वह थकी हुई है", "she is tired"),
    ("हम भूखें हैं", "we are hungry"),
    ("वे व्यस्त हैं", "they are busy"),
    ("मुझे ठंड लग रही है", "i am cold"),
    ("तुम देरी से आए हो", "you are late"),
    ("वह खुश है", "she is happy"),
    ("हम तैयार हैं", "we are ready"),
    ("मैं दुखी हूँ", "i am sad"),
    ("तुम खुश हो", "you are happy"),
    ("वे तैयार हैं", "they are ready"),
    ("वह थका हुआ है", "he is tired"),
    ("तुम व्यस्त हो", "you are busy"),
    ("हम ठंडे हैं", "we are cold"),
    ("क्या आप तैयार हैं?", "are you ready?"),
    ("क्या वह आ रही है?", "is she coming?"),
    ("क्या तुम ठीक हो?", "are you okay?"),
    ("मैं ठीक हूँ", "i am fine"),
    ("क्या तुम व्यस्त हो?", "are you busy?"),
    ("वह स्कूल जा रहा है", "he is going to school"),
    ("वह बाजार जा रही है", "she is going to the market"),
    ("मैं पढ़ रहा हूँ", "i am studying"),
    ("तुम क्या कर रहे हो?", "what are you doing?"),
    ("मैं सो रहा हूँ", "i am sleeping"),
    ("तुम कहाँ जा रहे हो?", "where are you going?"),
    ("मैं घर जा रहा हूँ", "i am going home"),
    ("वह टीवी देख रहा है", "he is watching TV"),
    ("क्या तुम खाना खा चुके हो?", "have you eaten?"),
    ("मैंने खाना खा लिया है", "i have eaten"),
    ("तुम कौन हो?", "who are you?"),
    ("मैं एक छात्र हूँ", "i am a student"),
    ("वह डॉक्टर है", "he is a doctor"),
    ("वह शिक्षक है", "she is a teacher"),
    ("हम दोस्त हैं", "we are friends"),
    ("तुम मेरे दोस्त हो", "you are my friend"),
    ("आज मौसम अच्छा है", "the weather is good today"),
    ("बाहर बारिश हो रही है", "it is raining outside"),
    ("आज बहुत ठंड है", "it is very cold today"),
    ("क्या आप अंग्रेज़ी बोलते हैं?", "do you speak english?"),
    ("हाँ, मैं अंग्रेज़ी बोलता हूँ", "yes, i speak english"),
    ("नहीं, मैं अंग्रेज़ी नहीं बोलता", "no, i do not speak english"),
    ("कृपया धीरे बोलिए", "please speak slowly"),
    ("मुझे माफ़ करें", "i am sorry"),
    ("धन्यवाद", "thank you"),
    ("स्वागत है", "you are welcome"),
    ("क्या समय हुआ है?", "what time is it?"),
    ("मुझे नहीं पता", "i do not know"),
    ("मैं नहीं समझा", "i did not understand"),
    ("कृपया दोहराइए", "please repeat"),
]


def build_vocab(sentences):
    """Assign an integer id to every distinct word found in ``sentences``.

    The four special tokens occupy ids 0-3; words (split on whitespace) are
    numbered from 4 upwards in the order they are first seen.
    """
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    all_words = (word for sent in sentences for word in sent.split())
    for word in all_words:
        if word in vocab:
            continue
        # The vocabulary size is always the next unused id.
        vocab[word] = len(vocab)
    return vocab

# Split the parallel corpus into its Hindi (source) and English (target) sides.
hindi_sentences = [h for h, e in pairs]
english_sentences = [e for h, e in pairs]

# One word -> id vocabulary per language, plus the id -> word mapping that
# inference uses to turn predicted English ids back into text.
hindi_vocab = build_vocab(hindi_sentences)
english_vocab = build_vocab(english_sentences)
inv_english_vocab = {v: k for k, v in english_vocab.items()}

def tokenize(sentence, vocab):
    """Split ``sentence`` on whitespace and look each word up in ``vocab``.

    Words that are not in the vocabulary are replaced by the ``<UNK>`` id.
    """
    token_ids = []
    for token in sentence.split():
        fallback = vocab["<UNK>"]
        token_ids.append(vocab.get(token, fallback))
    return token_ids

def add_tokens(seq, vocab):
    """Return ``seq`` with the ``<SOS>`` id prepended and the ``<EOS>`` id appended."""
    start = [vocab["<SOS>"]]
    body = start + seq
    end = [vocab["<EOS>"]]
    return body + end

class TranslationDataset(Dataset):
    """Pre-tokenized parallel corpus.

    Item ``i`` is a ``(hindi_ids, english_ids)`` pair of 1-D tensors, each
    starting with the ``<SOS>`` id and ending with the ``<EOS>`` id.
    """

    def __init__(self, pairs, hin_vocab, eng_vocab):
        samples = []
        for source_text, target_text in pairs:
            source_ids = tokenize(source_text, hin_vocab)
            target_ids = tokenize(target_text, eng_vocab)
            samples.append((
                torch.tensor(add_tokens(source_ids, hin_vocab)),
                torch.tensor(add_tokens(target_ids, eng_vocab)),
            ))
        self.data = samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

def collate_fn(batch):
    """Combine (hindi, english) tensor pairs into two padded batch tensors.

    Both outputs are batch-first; shorter sequences are right-padded with the
    ``<PAD>`` id of the corresponding vocabulary.
    """
    hin_seqs, eng_seqs = zip(*batch)
    hin_pad_id = hindi_vocab["<PAD>"]
    eng_pad_id = english_vocab["<PAD>"]
    hin_batch = pad_sequence(hin_seqs, batch_first=True, padding_value=hin_pad_id)
    eng_batch = pad_sequence(eng_seqs, batch_first=True, padding_value=eng_pad_id)
    return hin_batch, eng_batch

# Wrap the sentence pairs in a Dataset and serve them in shuffled batches of
# two; collate_fn pads each batch to the length of its longest sentence.
dataset = TranslationDataset(pairs, hindi_vocab, english_vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# ----------------------------
# 2. Positional Encoding
# ----------------------------

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to batch-first embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is registered as a buffer, so it moves
    with the module across devices but is not a trainable parameter.

    Args:
        d_model: Feature width of the embeddings (odd or even).
        max_len: Longest sequence length the table supports.
    """

    def __init__(self, d_model, max_len=100):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # the cosine term must drop the last frequency to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch, seq_len, d_model).

        Raises:
            RuntimeError: If ``seq_len`` exceeds the table length ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Fail with a clear message instead of an opaque broadcasting
            # error (or a silent broadcast when max_len == 1).
            raise RuntimeError(
                f"sequence length {seq_len} exceeds positional encoding "
                f"max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

# ----------------------------
# 3. Transformer Architecture
# ----------------------------

class EncoderLayer(nn.Module):
    """Post-norm encoder layer: self-attention, then a position-wise MLP."""

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, key_padding_mask=None):
        """Transform ``x`` of shape (batch, seq_len, d_model).

        Args:
            x: Input features.
            key_padding_mask: Optional bool tensor (batch, seq_len); True marks
                padding positions that no query may attend to.
        """
        attn_out, _ = self.attn(x, x, x, key_padding_mask=key_padding_mask)
        x = self.norm1(x + attn_out)
        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)
        return x

class DecoderLayer(nn.Module):
    """Post-norm decoder layer: causal self-attention, cross-attention, MLP."""

    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.cross_attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, tgt, memory, memory_key_padding_mask=None):
        """Transform ``tgt`` (batch, tgt_len, d_model) using encoder ``memory``.

        Args:
            tgt: Target-side features.
            memory: Encoder output of shape (batch, src_len, d_model).
            memory_key_padding_mask: Optional bool tensor (batch, src_len);
                True marks source padding that cross-attention must ignore.
        """
        tgt_len = tgt.size(1)
        # Strictly upper-triangular mask: position i may not look at j > i.
        # Target padding sits to the right of the real tokens, so this causal
        # mask already keeps real positions from attending to it.
        mask = torch.triu(torch.ones((tgt_len, tgt_len), device=tgt.device), 1).bool()
        self_out, _ = self.self_attn(tgt, tgt, tgt, attn_mask=mask)
        tgt = self.norm1(tgt + self_out)

        cross_out, _ = self.cross_attn(
            tgt, memory, memory, key_padding_mask=memory_key_padding_mask
        )
        tgt = self.norm2(tgt + cross_out)

        ff_out = self.ff(tgt)
        tgt = self.norm3(tgt + ff_out)
        return tgt

class Encoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder layers."""

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])

    def forward(self, src, src_key_padding_mask=None):
        """Encode source ids (batch, src_len) into (batch, src_len, d_model).

        ``src_key_padding_mask`` (optional, bool, True = padding) is handed to
        every layer's self-attention.
        """
        x = self.pos(self.embed(src))
        for layer in self.layers:
            x = layer(x, key_padding_mask=src_key_padding_mask)
        return x

class Decoder(nn.Module):
    """Token embedding + positional encoding + decoder layers + vocab projection."""

    def __init__(self, vocab_size, d_model, n_heads, d_ff, n_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff) for _ in range(n_layers)])
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory, memory_key_padding_mask=None):
        """Return logits (batch, tgt_len, vocab_size) for target ids ``tgt``.

        ``memory_key_padding_mask`` (optional, bool, True = padding) is handed
        to every layer's cross-attention.
        """
        x = self.pos(self.embed(tgt))
        for layer in self.layers:
            x = layer(x, memory, memory_key_padding_mask=memory_key_padding_mask)
        return self.out(x)

class TransformerModel(nn.Module):
    """Encoder-decoder Transformer for Hindi -> English translation.

    Args:
        hin_vocab_size: Size of the source (Hindi) vocabulary.
        eng_vocab_size: Size of the target (English) vocabulary.
        d_model: Embedding / attention width.
        n_heads: Number of attention heads per attention module.
        d_ff: Hidden width of the position-wise MLPs.
        n_layers: Number of encoder layers and of decoder layers.
        pad_idx: Id of the padding token in the source vocabulary; positions
            holding it are masked out of every attention over the source.
    """

    def __init__(self, hin_vocab_size, eng_vocab_size, d_model=32, n_heads=2, d_ff=64, n_layers=1, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.encoder = Encoder(hin_vocab_size, d_model, n_heads, d_ff, n_layers)
        self.decoder = Decoder(eng_vocab_size, d_model, n_heads, d_ff, n_layers)

    def forward(self, src, tgt):
        """Return target logits (batch, tgt_len, eng_vocab_size).

        Args:
            src: Source ids (batch, src_len), right-padded with ``pad_idx``.
            tgt: Target-side input ids (batch, tgt_len).
        """
        # Without this mask, padded source positions are attended to, so a
        # sentence's output depends on how much padding its batch needs.
        src_padding_mask = src == self.pad_idx
        memory = self.encoder(src, src_key_padding_mask=src_padding_mask)
        return self.decoder(tgt, memory, memory_key_padding_mask=src_padding_mask)

# ----------------------------
# 4. Train the Model
# ----------------------------

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerModel(len(hindi_vocab), len(english_vocab)).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# ignore_index makes the loss skip target positions that hold the <PAD> id.
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab["<PAD>"])

# Train for 90 epochs over the whole dataloader, using teacher forcing.
for epoch in range(90):
    model.train()
    total_loss = 0
    for hin, eng in dataloader:
        hin, eng = hin.to(device), eng.to(device)
        # Teacher forcing: the decoder reads every target token except the
        # last and is trained to predict every token except the first.
        input_eng = eng[:, :-1]
        target_eng = eng[:, 1:]

        out = model(hin, input_eng)
        # Flatten so the loss sees one row of logits per target token.
        out = out.reshape(-1, out.shape[-1])
        target = target_eng.reshape(-1)

        loss = criterion(out, target)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Report every fifth epoch; total_loss is the sum of the per-batch losses,
    # not a mean.
    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# ----------------------------
# 5. Inference
# ----------------------------

def translate(hindi_sentence, max_len=12):
    """Greedily translate one Hindi sentence with the global ``model``.

    The source sentence is encoded once; the decoder is then run repeatedly,
    each time appending the highest-scoring next token, until ``<EOS>`` is
    predicted or ``max_len`` tokens were produced.

    Args:
        hindi_sentence: Whitespace-separated Hindi text; unknown words are
            mapped to ``<UNK>`` by ``tokenize``.
        max_len: Maximum number of English tokens to generate.

    Returns:
        The generated English words joined by single spaces (``<EOS>`` is not
        included).

    Side effects:
        Switches ``model`` to eval mode.
    """
    eos_id = english_vocab["<EOS>"]

    model.eval()
    with torch.no_grad():
        hin = add_tokens(tokenize(hindi_sentence, hindi_vocab), hindi_vocab)
        hin_tensor = torch.tensor(hin).unsqueeze(0).to(device)

        # The source never changes during decoding, so encode it once instead
        # of re-running the encoder at every step.
        memory = model.encoder(hin_tensor)

        tgt = torch.tensor([[english_vocab["<SOS>"]]], device=device)
        result = []

        for _ in range(max_len):
            out = model.decoder(tgt, memory)
            # Only the logits at the last position predict the next token.
            next_token = out[0, -1].argmax(-1).item()
            if next_token == eos_id:
                break
            result.append(inv_english_vocab.get(next_token, "<UNK>"))
            tgt = torch.cat([tgt, torch.tensor([[next_token]], device=device)], dim=1)

        return ' '.join(result)

# ----------------------------
# 6. Evaluate Translations
# ----------------------------

# Translate every Hindi sentence from the training pairs and print it next to
# the model's output (the reference English side is not used here).
print("\nTranslations:\n")
for h, _ in pairs:
    print(f"{h} → {translate(h)}")


Epoch 5, Loss: 68.5514
Epoch 10, Loss: 40.3118
Epoch 15, Loss: 24.7517
Epoch 20, Loss: 14.5274
Epoch 25, Loss: 8.3308
Epoch 30, Loss: 4.5063
Epoch 35, Loss: 3.2067
Epoch 40, Loss: 1.5605
Epoch 45, Loss: 1.0506
Epoch 50, Loss: 0.7618
Epoch 55, Loss: 0.5921
Epoch 60, Loss: 0.4638
Epoch 65, Loss: 0.3820
Epoch 70, Loss: 0.3142
Epoch 75, Loss: 0.2628
Epoch 80, Loss: 0.2204
Epoch 85, Loss: 0.1898
Epoch 90, Loss: 0.1635

Translations:

मैं खुश हूँ → i am happy
आप दुखी हैं → you are sad
वह थक गया है → he is tired
वह थकी हुई है → she is tired
हम भूखें हैं → we are hungry
वे व्यस्त हैं → they are busy
मुझे ठंड लग रही है → i am cold
तुम देरी से आए हो → you are late
वह खुश है → she is happy
हम तैयार हैं → we are ready
मैं दुखी हूँ → i am sad
तुम खुश हो → you are happy
वे तैयार हैं → they are ready
वह थका हुआ है → he is tired
तुम व्यस्त हो → you are busy
हम ठंडे हैं → we are cold
क्या आप तैयार हैं? → are you ready?
क्या वह आ रही है? → is she coming?
क्या तुम ठीक हो? → are you okay?
मैं ठीक हूँ → i 

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def evaluate_bleu():
    """Print a smoothed sentence-level BLEU score for every pair in ``pairs``.

    Each Hindi sentence is translated once with ``translate``; the result is
    scored against its single English reference and the average over all
    pairs is printed at the end.

    The n-gram order is capped at the reference length. With the default
    4-gram weights, a reference shorter than four words has no 4-grams at
    all, so even a perfect translation of a three-word sentence is scored
    well below 1.0 (the missing 4-gram precision is replaced by a small
    smoothed value).
    """
    smoothie = SmoothingFunction().method4
    scores = []

    for hin, eng in pairs:
        reference = eng.split()
        # Run the model once and reuse the text for scoring and printing.
        translation = translate(hin)
        prediction = translation.split()

        max_order = max(1, min(4, len(reference)))
        weights = tuple(1.0 / max_order for _ in range(max_order))
        score = sentence_bleu(
            [reference], prediction, weights=weights, smoothing_function=smoothie
        )
        scores.append(score)
        print(f"{hin} → {translation}")
        print(f"Ref: {reference}")
        print(f"Hyp: {prediction}")
        print(f"BLEU: {score:.4f}\n")

    if not scores:
        # Nothing to average; avoid dividing by zero on an empty corpus.
        print("No sentence pairs to evaluate.")
        return

    avg_bleu = sum(scores) / len(scores)
    print(f"\n🔵 Average BLEU Score: {avg_bleu:.4f} ({avg_bleu*100:.2f})")

evaluate_bleu()


मैं खुश हूँ → i am happy
Ref: ['i', 'am', 'happy']
Hyp: ['i', 'am', 'happy']
BLEU: 0.5757

आप दुखी हैं → you are sad
Ref: ['you', 'are', 'sad']
Hyp: ['you', 'are', 'sad']
BLEU: 0.5757

वह थक गया है → he is tired
Ref: ['he', 'is', 'tired']
Hyp: ['he', 'is', 'tired']
BLEU: 0.5757

वह थकी हुई है → she is tired
Ref: ['she', 'is', 'tired']
Hyp: ['she', 'is', 'tired']
BLEU: 0.5757

हम भूखें हैं → we are hungry
Ref: ['we', 'are', 'hungry']
Hyp: ['we', 'are', 'hungry']
BLEU: 0.5757

वे व्यस्त हैं → they are busy
Ref: ['they', 'are', 'busy']
Hyp: ['they', 'are', 'busy']
BLEU: 0.5757

मुझे ठंड लग रही है → i am cold
Ref: ['i', 'am', 'cold']
Hyp: ['i', 'am', 'cold']
BLEU: 0.5757

तुम देरी से आए हो → you are late
Ref: ['you', 'are', 'late']
Hyp: ['you', 'are', 'late']
BLEU: 0.5757

वह खुश है → she is happy
Ref: ['she', 'is', 'happy']
Hyp: ['she', 'is', 'happy']
BLEU: 0.5757

हम तैयार हैं → we are ready
Ref: ['we', 'are', 'ready']
Hyp: ['we', 'are', 'ready']
BLEU: 0.5757

मैं दुखी हूँ → i am sad
Ref

In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

In [None]:
def evaluate_corpus_bleu():
    """Print every translation and the smoothed corpus-level BLEU over ``pairs``.

    Each Hindi sentence is translated once with ``translate``; n-gram
    statistics are then pooled over all sentences by ``corpus_bleu``.
    """
    references = []  # One list of reference token lists per sentence
    hypotheses = []  # One predicted token list per sentence

    for hin, eng in pairs:
        reference = eng.split()
        # Run the model once and reuse the text for scoring and printing.
        translation = translate(hin)
        prediction = translation.split()

        references.append([reference])  # corpus_bleu expects a list of references per sentence
        hypotheses.append(prediction)

        print(f"{hin} → {translation}")
        print(f"Ref: {reference}")
        print(f"Hyp: {prediction}\n")

    smoothie = SmoothingFunction().method4
    bleu = corpus_bleu(references, hypotheses, smoothing_function=smoothie)
    print(f"🔵 Corpus BLEU Score: {bleu:.4f} ({bleu*100:.2f})")

In [None]:
# Run the corpus-level BLEU evaluation over all sentence pairs.
evaluate_corpus_bleu()


मैं खुश हूँ → i am happy
Ref: ['i', 'am', 'happy']
Hyp: ['i', 'am', 'happy']

आप दुखी हैं → you are sad
Ref: ['you', 'are', 'sad']
Hyp: ['you', 'are', 'sad']

वह थक गया है → he is tired
Ref: ['he', 'is', 'tired']
Hyp: ['he', 'is', 'tired']

वह थकी हुई है → she is tired
Ref: ['she', 'is', 'tired']
Hyp: ['she', 'is', 'tired']

हम भूखें हैं → we are hungry
Ref: ['we', 'are', 'hungry']
Hyp: ['we', 'are', 'hungry']

वे व्यस्त हैं → they are busy
Ref: ['they', 'are', 'busy']
Hyp: ['they', 'are', 'busy']

मुझे ठंड लग रही है → i am cold
Ref: ['i', 'am', 'cold']
Hyp: ['i', 'am', 'cold']

तुम देरी से आए हो → you are late
Ref: ['you', 'are', 'late']
Hyp: ['you', 'are', 'late']

वह खुश है → she is happy
Ref: ['she', 'is', 'happy']
Hyp: ['she', 'is', 'happy']

हम तैयार हैं → we are ready
Ref: ['we', 'are', 'ready']
Hyp: ['we', 'are', 'ready']

मैं दुखी हूँ → i am sad
Ref: ['i', 'am', 'sad']
Hyp: ['i', 'am', 'sad']

तुम खुश हो → you are happy
Ref: ['you', 'are', 'happy']
Hyp: ['you', 'are', 'happy']

Sure! Here's a simple example of how to use a **pre-trained Transformer model** using the **Hugging Face Transformers** library. We'll use a **BERT-based model for sentiment analysis**.

---

### ✅ Install Requirements

If you haven’t already installed `transformers` and `torch`, run this:

```bash
pip install transformers torch
```

---

### ✅ Load and Use Pre-trained Transformer (e.g., `distilbert-base-uncased-finetuned-sst-2-english`)

```python
from transformers import pipeline

# Load pre-trained sentiment analysis pipeline
classifier = pipeline("sentiment-analysis")

# Test the model
text = "I love using transformers! They make NLP so easy."
result = classifier(text)

print(result)
```

**Output:**

```python
[{'label': 'POSITIVE', 'score': 0.9998}]
```

---

### ✅ Custom Example: Load Pre-trained Model and Tokenizer

You can also manually load the model and tokenizer:

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load pre-trained tokenizer and model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize input
inputs = tokenizer("Transformers are powerful!", return_tensors="pt")

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)

# Get prediction
logits = outputs.logits
predicted_class_id = torch.argmax(logits).item()

# Convert class ID to label
label = model.config.id2label[predicted_class_id]
print(f"Predicted Label: {label}")
```

---

### ✅ Common Tasks with Transformers

You can load different tasks by changing the `pipeline()` type:

| Task               | Pipeline Type            | Example Model                                           |
| ------------------ | ------------------------ | ------------------------------------------------------- |
| Sentiment Analysis | `"sentiment-analysis"`   | `distilbert-base-uncased-finetuned-sst-2-english`       |
| Question Answering | `"question-answering"`   | `bert-large-uncased-whole-word-masking-finetuned-squad` |
| Text Generation    | `"text-generation"`      | `gpt2`                                                  |
| Translation        | `"translation_en_to_fr"` | `t5-small`                                              |
| Summarization      | `"summarization"`        | `facebook/bart-large-cnn`                               |

---



Perfect! Here's a collection of **ready-to-run code snippets** using **popular Transformer models** from the [Hugging Face 🤗 `transformers`](https://huggingface.co/models) library — for BERT, GPT-2, RoBERTa, T5, and BART — covering the **most common NLP tasks**.

---

## ✅ 1. **DistilBERT (a distilled BERT) for Text Classification (Sentiment Analysis)**

```python
from transformers import pipeline

# Load sentiment-analysis pipeline with BERT fine-tuned on SST-2
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Predict
text = "This is a fantastic movie!"
result = classifier(text)
print(result)
```

---

## ✅ 2. **GPT-2 for Text Generation**

```python
from transformers import pipeline

# Load GPT-2 text generation pipeline
generator = pipeline("text-generation", model="gpt2")

# Generate text
prompt = "In the future, AI will"
result = generator(prompt, max_length=50, num_return_sequences=1)
print(result[0]['generated_text'])
```

---

## ✅ 3. **RoBERTa for Sentiment Classification**

```python
from transformers import pipeline

# Load RoBERTa fine-tuned for sentiment
classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")

# Predict
text = "I'm feeling awesome today!"
result = classifier(text)
print(result)
```

---

## ✅ 4. **T5 for Text-to-Text Tasks (e.g., Translation, Summarization)**

```python
from transformers import pipeline

# Load summarization pipeline with T5
summarizer = pipeline("summarization", model="t5-small")

# Summarize
text = "The Transformers library by Hugging Face makes it easy to use pretrained models for NLP tasks."
result = summarizer(text, max_length=30, min_length=5, do_sample=False)
print(result)
```

---

## ✅ 5. **BART for Summarization**

```python
from transformers import pipeline

# Load BART model for summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

text = """
Machine learning is a method of data analysis that automates analytical model building.
It is a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention.
"""
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
print(summary)
```

---

## ✅ 6. **T5 for English to French Translation**

```python
from transformers import pipeline

# T5 expects prompts for tasks
translator = pipeline("translation_en_to_fr", model="t5-small")

text = "The weather is nice today."
result = translator(text)
print(result[0]['translation_text'])
```

---

## ✅ 7. **BERT for Question Answering**

```python
from transformers import pipeline

# Load BERT QA pipeline
qa = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Ask question
context = "Hugging Face is a company that develops tools for natural language processing."
question = "What does Hugging Face do?"

result = qa(question=question, context=context)
print(result)
```

---

### ✅ Summary Table

| Model       | Task                       | Pretrained Example                          |
| ----------- | -------------------------- | ------------------------------------------- |
| **BERT**    | Classification, QA         | `bert-base-uncased`, SST-2, SQuAD           |
| **GPT-2**   | Text Generation            | `gpt2`                                      |
| **RoBERTa** | Classification             | `cardiffnlp/twitter-roberta-base-sentiment` |
| **T5**      | Translation, Summarization | `t5-small`                                  |
| **BART**    | Summarization              | `facebook/bart-large-cnn`                   |

---
