# **Extract Texts**

In [1]:
from datasets import load_dataset

dataset = load_dataset("ptb_text_only")
texts = dataset['train']['sentence']

README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

ptb_text_only.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

The repository for ptb_text_only contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ptb_text_only.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/400k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/450k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42068 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3761 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3370 [00:00<?, ? examples/s]

# **Cleaning the texts**

In [2]:
cleaned_texts = [text.lower().strip() for text in texts if text.strip()]

# **Joining the cleaned text to a single big text**

In [3]:
full_text = " ".join(cleaned_texts)

# **Tokenizing the text**

In [4]:
# import nltk
# nltk.download('punkt')
# from nltk.tokenize import word_tokenize

# tokens = word_tokenize(full_text)

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokens = tokenizer.encode(full_text, add_special_tokens=False) 


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1090586 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
import torch

def create_dataset(tokens, context_size=5):
    inputs = []
    targets = []
    for i in range(len(tokens) - context_size):
        inputs.append(tokens[i:i+context_size])
        targets.append(tokens[i+context_size])
    return torch.tensor(inputs), torch.tensor(targets)

inputs, targets = create_dataset(tokens, context_size=5)

In [7]:
from sklearn.model_selection import train_test_split

train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.1, random_state=42
)

In [8]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
BATCH_SIZE = 64

train_data = TensorDataset(train_inputs, train_targets)
val_data = TensorDataset(val_inputs, val_targets)

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)


In [10]:
class NextWordPredictor(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, context_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)                    # [B, T, E]
        lstm_out, _ = self.lstm(embeds)               # [B, T, H]
        output = self.fc(lstm_out[:, -1, :])          # only last timestep
        return output


In [11]:
VOCAB_SIZE = tokenizer.vocab_size     # if using HuggingFace tokenizer
EMBED_DIM = 128
HIDDEN_DIM = 256
CONTEXT_SIZE = train_inputs.shape[1]

model = NextWordPredictor(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM, CONTEXT_SIZE).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [12]:
EPOCHS = 5

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Train Loss: {total_loss / len(train_loader):.4f}")


Epoch 1, Train Loss: 4.8340
Epoch 2, Train Loss: 4.1448
Epoch 3, Train Loss: 3.9053
Epoch 4, Train Loss: 3.7397
Epoch 5, Train Loss: 3.6084


In [13]:
model.eval()
val_loss = 0
with torch.no_grad():
    for batch_x, batch_y in val_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        output = model(batch_x)
        loss = criterion(output, batch_y)
        val_loss += loss.item()

print(f"Validation Loss: {val_loss / len(val_loader):.4f}")


Validation Loss: 4.3315


In [14]:
def predict_next_word(model, tokenizer, input_text, context_size):
    model.eval()
    tokens = tokenizer.encode(input_text, add_special_tokens=False)[-context_size:]
    input_tensor = torch.tensor([tokens], dtype=torch.long).to(device)
    with torch.no_grad():
        output = model(input_tensor)
        next_token_id = torch.argmax(output, dim=1).item()
        return tokenizer.decode([next_token_id])

# Example usage
predict_next_word(model, tokenizer, "he opened the", context_size=CONTEXT_SIZE)


' dollar'

In [22]:
predict_next_word(model, tokenizer, "i am going", context_size=CONTEXT_SIZE)

' to'