In [1]:
import torch
import pytorch_lightning as pl 
import pandas as pd
import os
import numpy as np
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
medium_data = pd.read_csv('medium_data.csv')
medium_data.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [3]:
print("Number of records: ", medium_data.shape[0])
print("Number of fields: ", medium_data.shape[1])

Number of records:  6508
Number of fields:  10


In [4]:
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace(u'\xa0',u' '))
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace('\u200a',' '))

In [5]:
from transformers import BertTokenizer

# Initialize a BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [6]:
vocab_size = tokenizer.vocab_size

In [7]:
tokenizer(medium_data.iloc[0]['title'])

{'input_ids': [101, 1037, 4088, 3678, 1521, 1055, 5009, 2000, 2773, 7861, 8270, 4667, 2007, 8991, 5332, 2213, 2773, 2475, 3726, 2278, 2944, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
input_sequences = []
for line in medium_data['title']:
    token_list = tokenizer(line).input_ids
    #print(token_list)
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [9]:
# print(input_sequences)
print("Total input sequences: ", len(input_sequences))

Total input sequences:  79316


In [10]:
X = [lst[:-1] for lst in input_sequences]
Y = [lst[-1] for lst in input_sequences]

In [11]:
max_length = max(map(len, X))

In [12]:
X = [[0] * (max_length - len(lst)) + lst for lst in X]

In [13]:
X = torch.tensor(X, dtype=torch.long)  # Use torch.long if your data type is integer

In [14]:
Y = torch.tensor(Y, dtype=torch.long)  # Use torch.long if your data type is integer

In [15]:
import torch.nn.functional as F

In [16]:
print(X.shape)
print(Y.shape)

torch.Size([79316, 126])
torch.Size([79316])


In [17]:
# Define the BiLSTM model
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=0)
        self.bilstm = nn.LSTM(embedding_dim, hidden_size, 
                              num_layers=3,
                              dropout=0.2,
                              batch_first=True, 
                              bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, vocab_size)  # Multiply by 2 because of bidirectional
        self.dropout = nn.Dropout(0.4)
        
        
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out[:, -1, :])
        return output

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


cuda


In [19]:
model = BiLSTMModel(vocab_size, 256, 128).to(device)


In [20]:
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [21]:
# Define a custom dataset
class SequenceDataset(Dataset):
    def __init__(self, input_sequences, target_sequences):
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        ys = F.one_hot(self.target_sequences[idx], num_classes=vocab_size)

        return self.input_sequences[idx], ys


In [22]:
dataset = SequenceDataset(X, Y)

In [23]:
# Create a DataLoader
batch_size = 256
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [24]:
# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.AdamW(model.parameters(), lr=0.001)


In [25]:
from tqdm import tqdm

In [26]:
# Training loop with DataLoader
epochs = 10
for epoch in range(epochs):
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    tqdm_dataloader = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}", 
                           leave=False,
                           bar_format='{desc}: {percentage:3.0f}%|{bar:50}{r_bar}',
                           colour='green')

    for input_batch, target_batch in tqdm_dataloader:
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)

        optimizer.zero_grad()
        outputs = model(input_batch)
        loss = criterion(outputs, target_batch.argmax(dim=1))
        
        total_loss += loss.item()
        
        # Get top-k predictions
        _, predicted_indices = outputs.topk(k=3, dim=1)

        # Check if the correct label is in the top-k predictions
        correct_predictions += torch.any(predicted_indices == torch.argmax(target_batch, dim=1, keepdim=True), dim=1).sum().item()
        total_samples += target_batch.size(0)

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(dataloader)
    accuracy = correct_predictions / total_samples

    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {average_loss:.4f}, Accuracy: {accuracy * 100:.2f}%")

    

                                                                                                     4.30it/s]

Epoch 1/10, Average Loss: 6.8051, Accuracy: 16.93%


                                                                                                     4.22it/s]

Epoch 2/10, Average Loss: 5.8152, Accuracy: 27.21%


                                                                                                     3.91it/s]

Epoch 3/10, Average Loss: 5.4745, Accuracy: 30.39%


                                                                                                     4.19it/s]

Epoch 4/10, Average Loss: 5.2297, Accuracy: 32.67%


                                                                                                     4.72it/s]

Epoch 5/10, Average Loss: 5.0149, Accuracy: 34.78%


                                                                                                     3.91it/s]

Epoch 6/10, Average Loss: 4.8216, Accuracy: 36.51%


                                                                                                     4.44it/s]

Epoch 7/10, Average Loss: 4.6386, Accuracy: 38.27%


                                                                                                     4.09it/s]

Epoch 8/10, Average Loss: 4.4605, Accuracy: 39.75%


                                                                                                     4.14it/s]

Epoch 9/10, Average Loss: 4.3002, Accuracy: 41.10%


                                                                                                      4.16it/s]

Epoch 10/10, Average Loss: 4.1413, Accuracy: 42.65%




In [35]:
def predict(seed_text,next_words=3):
    
    print("Actual : ",seed_text)

    # Generate next words
    for _ in range(next_words):
        # Tokenize seed_text
        token_list = tokenizer(seed_text,)['input_ids']
        
        token_list = [0] * (max_length - len(token_list)) + token_list
        
        token_list = torch.tensor(token_list, dtype=torch.long).unsqueeze(0)  # Use torch.long if your data type is integer
                
        
        # Move token_list to GPU if available
        token_list = token_list.to(device)
        with torch.no_grad():
            output = model(token_list)
            

        # Get the index of the predicted word
        predicted_index = torch.argmax(output, dim=-1)
        
        
        # Convert index to word
        output_word = tokenizer.decode(predicted_index)

        # Update the seed_text
        seed_text += " " + output_word
        
    print("Predict : ",seed_text)

In [43]:
print(medium_data.iloc[78]['title'])

How To Limit Your Exposure To The Surveillance Capitalism


In [44]:
# Example seed_text
seed_text = "How To Limit Your Exposure"

In [45]:
# Print the generated text
predict(seed_text,next_words=4)

Actual :  How To Limit Your Exposure
Predict :  How To Limit Your Exposure in [SEP] [SEP] [SEP]
