<a href="https://colab.research.google.com/github/anshulsinghkamboj-ml/nlp-/blob/main/lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import torch
import torch.nn as nn

In [7]:
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
corpus = [
    "artificial intelligence will shape the future",
    "machine learning enables artificial intelligence",
    "deep learning transforms data into predictions",
    "neural networks learn complex patterns"
]

In [10]:
def make_training_examples(text, seq_len=10):
    ids = tokenizer.encode(text)
    examples = []
    for i in range(1, len(ids)):
        input_ids = ids[:i]
        target_id = ids[i]
        input_ids = input_ids[-seq_len:]
        examples.append((input_ids, target_id))
    return examples

examples = []
for line in corpus:
    examples.extend(make_training_examples(line))

In [11]:
examples

[([101], 7976),
 ([101, 7976], 4454),
 ([101, 7976, 4454], 2097),
 ([101, 7976, 4454, 2097], 4338),
 ([101, 7976, 4454, 2097, 4338], 1996),
 ([101, 7976, 4454, 2097, 4338, 1996], 2925),
 ([101, 7976, 4454, 2097, 4338, 1996, 2925], 102),
 ([101], 3698),
 ([101, 3698], 4083),
 ([101, 3698, 4083], 12939),
 ([101, 3698, 4083, 12939], 7976),
 ([101, 3698, 4083, 12939, 7976], 4454),
 ([101, 3698, 4083, 12939, 7976, 4454], 102),
 ([101], 2784),
 ([101, 2784], 4083),
 ([101, 2784, 4083], 21743),
 ([101, 2784, 4083, 21743], 2951),
 ([101, 2784, 4083, 21743, 2951], 2046),
 ([101, 2784, 4083, 21743, 2951, 2046], 20932),
 ([101, 2784, 4083, 21743, 2951, 2046, 20932], 102),
 ([101], 15756),
 ([101, 15756], 6125),
 ([101, 15756, 6125], 4553),
 ([101, 15756, 6125, 4553], 3375),
 ([101, 15756, 6125, 4553, 3375], 7060),
 ([101, 15756, 6125, 4553, 3375, 7060], 102)]

In [14]:
def collate(batch):
    inputs = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    padded = tokenizer.pad({"input_ids": inputs}, return_tensors="pt")
    return padded["input_ids"], torch.tensor(targets)


In [15]:
from torch.utils.data import DataLoader

loader = DataLoader(examples, batch_size=8, shuffle=True, collate_fn=collate)


In [16]:
vocab_size = tokenizer.vocab_size
vocab_size

30522

In [17]:

class LSTM_LM(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        logits = self.fc(out[:, -1, :])
        return logits

model = LSTM_LM(vocab_size)


In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-3)

In [19]:
for epoch in range(20):
    for X, y in loader:
        optimizer.zero_grad()
        logits = model(X)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
    print("epoch", epoch, "loss:", loss.item())

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 0 loss: 9.62447452545166
epoch 1 loss: 8.938924789428711
epoch 2 loss: 9.31462287902832
epoch 3 loss: 4.729649543762207
epoch 4 loss: 3.766982078552246
epoch 5 loss: 3.257809638977051
epoch 6 loss: 2.0892677307128906
epoch 7 loss: 3.3763842582702637
epoch 8 loss: 2.709987163543701
epoch 9 loss: 2.887256622314453
epoch 10 loss: 2.890072822570801
epoch 11 loss: 3.3024349212646484
epoch 12 loss: 2.5401530265808105
epoch 13 loss: 0.9679878354072571
epoch 14 loss: 1.0676062107086182
epoch 15 loss: 1.0770823955535889
epoch 16 loss: 0.8845715522766113
epoch 17 loss: 1.2786500453948975
epoch 18 loss: 1.5766823291778564
epoch 19 loss: 0.5303216576576233


In [20]:
def generate(model, text, steps=10):
    ids = tokenizer.encode(text)
    model.eval()
    for _ in range(steps):
        seq = ids[-10:]
        inp = tokenizer.pad({"input_ids":[seq]}, return_tensors="pt")["input_ids"]
        logits = model(inp)
        next_id = logits.argmax().item()
        ids.append(next_id)
    return tokenizer.decode(ids)




[CLS] artificial intelligence [SEP] shape shape the future [SEP] [SEP] [SEP] [SEP] [SEP] [SEP]


In [21]:
print(generate(model, "data", 10))

[CLS] data [SEP] into predictions [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP] [SEP]
