In [63]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import re
from tqdm import tqdm
import time

In [64]:
df = pd.read_csv("/content/medium_data.csv")

In [77]:
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date,text
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30,A Beginner’s Guide to Word Embedding with Gens...
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30,Hands-on Graph Neural Networks with PyTorch & ...
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30,How to Use ggplot2 in Python A Grammar of Grap...
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30,Databricks: How to Save Files in CSV on Your L...
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30,A Step-by-Step Implementation of Gradient Desc...


In [66]:
df["text"] = df["title"].fillna("") + " " + df["subtitle"].fillna("") # Combine title and subtitle to create a new 'text' column, handling missing values

In [67]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r"[^a-zA-Z\s]", "", text) # Remove special characters and numbers
  return text

texts = df["text"].apply(clean_text).tolist()

In [68]:
tokens = []
for text in texts:
  tokens.extend(text.split()) # Tokenize text into words

vocab = sorted(set(tokens))
word_to_idx = {word: i for i, word in enumerate(vocab)} # Map words to unique integer indices
idx_to_word = {i: word for word, i in word_to_idx.items()} # Map integer indices back to words
vocab_size = len(vocab)

print("Vocabulary size: ", vocab_size)

Vocabulary size:  10422


In [69]:
SEQ_LENGTH = 5 # Length of input sequences for the model

input_sequences = []
target_words = []

for text in texts:
  words = text.split()
  for i in range(len(words) - SEQ_LENGTH):
    input_sequences.append(words[i:i+SEQ_LENGTH]) # Creating input sequences of SEQ_LENGTH
    target_words.append(words[i+SEQ_LENGTH]) # The word immediately following the sequence is the target

In [70]:
X = [[word_to_idx[w] for w in seq] for seq in input_sequences] # Converting input sequences to numerical indices
y = [word_to_idx[w] for w in target_words] # Converting target words to numerical indices

X = torch.tensor(X, dtype=torch.long)
y = torch.tensor(y, dtype=torch.long)

print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  torch.Size([46700, 5])
y shape:  torch.Size([46700])


In [71]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx] # Return a sample (input sequence, target word)


dataset = TextDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

In [72]:
class NextWordLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=dropout_rate if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = out[:, -1, :] # Get the output from the last time step
        out = self.fc(out)
        return out

In [73]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 200  # Dimension of word embeddings
hidden_dim = 256 # Number of units in the LSTM hidden state
num_layers = 2   # Number of LSTM layers
dropout_rate = 0.3 # Dropout rate for regularization

model = NextWordLSTM(vocab_size=vocab_size, embed_dim=embed_dim, hidden_dim=hidden_dim, num_layers=num_layers, dropout_rate=dropout_rate).to(device)

criterion = nn.CrossEntropyLoss() # Loss function
optimizer = optim.Adam(model.parameters(), lr=0.001) # Optimizer to update weights and minimize loss during training

In [74]:
EPOCHS = 70
torch.set_grad_enabled(True)
model.train()

best_loss = float("inf")
patience = 3   # number of epochs to wait
wait = 0

for epoch in range(EPOCHS):
    total_loss = 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Avg Loss: {avg_loss:.4f}")

    if avg_loss < best_loss:
        best_loss = avg_loss
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print("!! Early stopping triggered !!")
            break

Epoch 1/70, Avg Loss: 7.2145
Epoch 2/70, Avg Loss: 6.3853
Epoch 3/70, Avg Loss: 5.8129
Epoch 4/70, Avg Loss: 5.2285
Epoch 5/70, Avg Loss: 4.6155
Epoch 6/70, Avg Loss: 4.0005
Epoch 7/70, Avg Loss: 3.4180
Epoch 8/70, Avg Loss: 2.8881
Epoch 9/70, Avg Loss: 2.4303
Epoch 10/70, Avg Loss: 2.0493
Epoch 11/70, Avg Loss: 1.7291
Epoch 12/70, Avg Loss: 1.4500
Epoch 13/70, Avg Loss: 1.2257
Epoch 14/70, Avg Loss: 1.0373
Epoch 15/70, Avg Loss: 0.8775
Epoch 16/70, Avg Loss: 0.7514
Epoch 17/70, Avg Loss: 0.6451
Epoch 18/70, Avg Loss: 0.5561
Epoch 19/70, Avg Loss: 0.4869
Epoch 20/70, Avg Loss: 0.4240
Epoch 21/70, Avg Loss: 0.3881
Epoch 22/70, Avg Loss: 0.3506
Epoch 23/70, Avg Loss: 0.3146
Epoch 24/70, Avg Loss: 0.2964
Epoch 25/70, Avg Loss: 0.2809
Epoch 26/70, Avg Loss: 0.2658
Epoch 27/70, Avg Loss: 0.2531
Epoch 28/70, Avg Loss: 0.2360
Epoch 29/70, Avg Loss: 0.2303
Epoch 30/70, Avg Loss: 0.2136
Epoch 31/70, Avg Loss: 0.2193
Epoch 32/70, Avg Loss: 0.2056
Epoch 33/70, Avg Loss: 0.1976
Epoch 34/70, Avg Lo

In [75]:
def predict_next_word(model, text, top_k=3):
    model.eval()
    words = clean_text(text).split()[-SEQ_LENGTH:] # Clean and get the last SEQ_LENGTH words from input text
    words = [word_to_idx[w] for w in words if w in word_to_idx] # Converting words to indices, only if present in vocab

    if len(words) < SEQ_LENGTH:
        return "Not enough words" # Handle cases where input text is too short

    x = torch.tensor([words], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(x) # Get model output (logits)
        probs = torch.softmax(output, dim=1) # Convert logits to probabilities
        top = torch.topk(probs, top_k) # Get the top K predicted words and their probabilities

    return [idx_to_word[idx.item()] for idx in top.indices[0]] # Return the actual words corresponding to the top K indices

In [76]:
model.eval()
torch.set_grad_enabled(False)

text = "a beginners guide to word embedding"

print("\n Live generation preview:\n")

for _ in tqdm(range(10), desc="Predicting next words", ncols=80): # Loop 1 for repeating the next-word prediction process 10 times
    preds = predict_next_word(model, text, top_k=1)

    if preds == "Not enough words":
        print("\nNot enough words to continue.")
        break

    next_word = preds[0] # Get the predicted next word
    text += " " + next_word # Append the predicted word to the current text

    time.sleep(0.3) # Small delay for visualization
    print(f'\n{text}') # Print the evolving text


for _ in range(20): # Loop 2 for completing sentence
    preds = predict_next_word(model, text, top_k=1) # Predict the next single word

    if preds == "Not enough words":
        break

    next_word = preds[0] # Get the predicted next word
    text += " " + next_word # Append the predicted word to the current text

    if len(text.split()) > 30: # Stop if the generated text exceeds 30 words
        break

print("\nFinal Output:")
print(text)


 Live generation preview:



Predicting next words:  10%|██                   | 1/10 [00:00<00:02,  3.31it/s]


a beginners guide to word embedding with


Predicting next words:  20%|████▏                | 2/10 [00:00<00:02,  3.31it/s]


a beginners guide to word embedding with gensim


Predicting next words:  30%|██████▎              | 3/10 [00:00<00:02,  3.31it/s]


a beginners guide to word embedding with gensim wordvec


Predicting next words:  40%|████████▍            | 4/10 [00:01<00:01,  3.30it/s]


a beginners guide to word embedding with gensim wordvec model


Predicting next words:  50%|██████████▌          | 5/10 [00:01<00:01,  3.30it/s]


a beginners guide to word embedding with gensim wordvec model using


Predicting next words:  60%|████████████▌        | 6/10 [00:01<00:01,  3.29it/s]


a beginners guide to word embedding with gensim wordvec model using machine


Predicting next words:  70%|██████████████▋      | 7/10 [00:02<00:00,  3.29it/s]


a beginners guide to word embedding with gensim wordvec model using machine learning


Predicting next words:  80%|████████████████▊    | 8/10 [00:02<00:00,  3.29it/s]


a beginners guide to word embedding with gensim wordvec model using machine learning ml


Predicting next words:  90%|██████████████████▉  | 9/10 [00:02<00:00,  3.30it/s]


a beginners guide to word embedding with gensim wordvec model using machine learning ml and


Predicting next words: 100%|████████████████████| 10/10 [00:03<00:00,  3.30it/s]


a beginners guide to word embedding with gensim wordvec model using machine learning ml and deep

Final Output:
a beginners guide to word embedding with gensim wordvec model using machine learning ml and deep learning dl dl and cause rights exploring included reinforcement learning algorithm rights jetson nvidia and



