In [1]:
import torch
import pytorch_lightning as pl 
import pandas as pd
import os
from bs4 import BeautifulSoup
import re
import demoji
import numpy as np
from torch.nn import MultiheadAttention
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
import torchtext
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import seed_everything
from torch.utils.data import random_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
medium_data = pd.read_csv('medium_data.csv')
medium_data.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [3]:
print("Number of records: ", medium_data.shape[0])
print("Number of fields: ", medium_data.shape[1])

Number of records:  6508
Number of fields:  10


In [4]:
def remove_html_tags(title):
    soup = BeautifulSoup(title, 'html.parser')
    return soup.get_text()

In [5]:
def remove_non_alphabetic(title):
    return re.sub('[^a-zA-Z]', ' ', title)

In [6]:
def remove_imojis(title):
    return demoji.replace(title, '')

In [7]:
def preprocessing_title(title):
    title = remove_html_tags(title)
    title = remove_imojis(title)
    title = remove_non_alphabetic(title)
    title.replace(u'\xa0', u' ')
    title.replace('\x200a', ' ')
    return title

In [8]:
titles = medium_data['title'].apply(preprocessing_title)

  soup = BeautifulSoup(title, 'html.parser')


In [9]:
tokenizer = get_tokenizer('basic_english')

In [10]:
tokenized_titles = [tokenizer(title) for title in titles]

In [11]:
features_vocab = torchtext.vocab.build_vocab_from_iterator(
    tokenized_titles,
    min_freq=2,
    specials=['<pad>', '<oov>'],
    special_first=True
)

In [12]:
features_vocab_total_words = len(features_vocab)
print(f'Total number of words in features vocabulary: {features_vocab_total_words}')

Total number of words in features vocabulary: 3571


In [13]:
print('<pad> -> '+ str(features_vocab['<pad>']))
print('<oov> -> '+ str(features_vocab['<oov>']))

<pad> -> 0
<oov> -> 1


In [14]:
titlr = "Hello, How are you"
tt = tokenizer(titlr) 
print(tt)
xx = [features_vocab[word] if word in features_vocab.get_itos() else features_vocab['<pad>'] for word in tt]
xx

['hello', ',', 'how', 'are', 'you']


[0, 0, 6, 29, 10]

In [15]:
vocab_size = features_vocab_total_words

In [16]:
input_sequences = []
for line in tokenized_titles:
    token_list = [features_vocab[word] if word in features_vocab.get_itos() else features_vocab['<pad>'] for word in line]
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [17]:
print("Total input sequences: ", len(input_sequences))

Total input sequences:  44910


In [18]:
X = [lst[:-1] for lst in input_sequences]
Y = [lst[-1] for lst in input_sequences]

In [19]:
max_length = max(map(len, X))

In [20]:
print(max_length)

23


In [21]:
X = [[0] * (max_length - len(lst)) + lst for lst in X]

In [22]:
X = torch.tensor(X, dtype=torch.long)  # Use torch.long if your data type is integer
Y = torch.tensor(Y, dtype=torch.long)  # Use torch.long if your data type is integer

In [23]:
print(X.shape)
print(Y.shape)

torch.Size([44910, 23])
torch.Size([44910])


In [97]:
class BiLSTMModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(BiLSTMModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embedding_dim,
                                      padding_idx=0)
        
        self.mha = nn.MultiheadAttention(embedding_dim, 
                                         num_heads = 4,
                                         batch_first=True,
                                         dropout=0.4)
        
        self.bilstm = nn.LSTM(embedding_dim, 
                              hidden_size, 
                              num_layers=3,
                              dropout=0.4,
                              batch_first=True, 
                              bidirectional=True)
        
        self.fc = nn.Linear(hidden_size*2, vocab_size)  # Multiply by 2 because of bidirectional
        self.dropout = nn.Dropout(0.4)
        self.logsftmx = nn.LogSoftmax(dim=1)
        self.criterion = nn.NLLLoss(ignore_index=0)
        self.val_accuracy = 0.0
        self.total_predictions = 0
        


    def forward(self, x):
        embedded = self.embedding(x)
        mha_out,mha_out_wghts = self.mha(embedded, embedded, embedded)
        lstm_out, _ = self.bilstm(mha_out)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out[:, -1, :])
        output = self.logsftmx(output)
        return output
    
    def training_step(self, batch, batch_idx) :
        x,label = batch
        output = self.forward(x)
        loss = self.criterion(output, label.argmax(dim=1))
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx) :
        x,label = batch
        output = self.forward(x)
        loss = self.criterion(output, label.argmax(dim=1))
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        
        # Get top-k predictions
        _, predicted_indices = output.topk(k=3, dim=1)

        # Check if the correct label is in the top-k predictions
        self.val_accuracy += torch.any(predicted_indices == torch.argmax(label, dim=1, keepdim=True), dim=1).sum().item()
        self.total_predictions += label.size(0)
    
    def on_validation_epoch_end(self):
        accuracy = self.val_accuracy / self.total_predictions
        self.log('val_accuracy', accuracy)
        print(f" Accuracy: {accuracy * 100:.2f}%")

    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.001)
        
    

In [98]:
# Define a custom dataset
class SequenceDataset(Dataset):
    def __init__(self, input_sequences, target_sequences):
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        ys = F.one_hot(self.target_sequences[idx], num_classes=vocab_size)

        return self.input_sequences[idx], ys


In [99]:
class SequenceDataModule(pl.LightningDataModule):
    def __init__(self, input_sequences, target_sequences, batch_size=32):
        super(SequenceDataModule, self).__init__()
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        # Split dataset into training and validation sets
        total_samples = len(self.input_sequences)
        val_samples = int(0.2 * total_samples)  # Adjust the validation split as needed
        train_samples = total_samples - val_samples

        self.train_dataset, self.val_dataset = random_split(
            dataset=SequenceDataset(self.input_sequences, self.target_sequences),
            lengths=[train_samples, val_samples],
            generator=torch.Generator().manual_seed(42)  # Set seed for reproducibility
        )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=3,shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, num_workers=3,batch_size=self.batch_size,shuffle=False)

In [100]:
epochs = 50
batch_size = 512

In [101]:
# Instantiate your SequenceDataModule
data_module = SequenceDataModule(X, Y, batch_size=batch_size)

data_module.setup()

In [102]:
# Optionally, inspect the datasets and dataloaders
print("Train Dataset Length:", len(data_module.train_dataset))
print("Val Dataset Length:", len(data_module.val_dataset))

Train Dataset Length: 35928
Val Dataset Length: 8982


In [103]:
model = BiLSTMModel(vocab_size, embedding_dim=128, hidden_size=256)

In [104]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'BestNWP',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_accuracy',
    mode = 'max'
)

In [105]:
trainer = pl.Trainer(devices=-1, 
                  accelerator="gpu",
                  check_val_every_n_epoch=5,
                  callbacks=[checkpoint_callback],
                  max_epochs=epochs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [106]:
trainer.fit(model=model,datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type               | Params
-------------------------------------------------
0 | embedding | Embedding          | 457 K 
1 | mha       | MultiheadAttention | 66.0 K
2 | bilstm    | LSTM               | 3.9 M 
3 | fc        | Linear             | 1.8 M 
4 | dropout   | Dropout            | 0     
5 | logsftmx  | LogSoftmax         | 0     
6 | criterion | NLLLoss            | 0     
-------------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.198    Total estimated model params size (MB)


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 11.74it/s]

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 11.49it/s] Accuracy: 0.20%
Epoch 4: 100%|██████████| 89/89 [00:17<00:00,  5.10it/s, loss=6.22, v_num=4, train_loss=6.330] Accuracy: 13.38%
Epoch 4: 100%|██████████| 89/89 [00:17<00:00,  5.10it/s, loss=6.22, v_num=4, train_loss=6.330, val_loss=6.300]

Epoch 4, global step 355: 'val_accuracy' reached 0.13382 (best 0.13382), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 9: 100%|██████████| 89/89 [00:18<00:00,  4.76it/s, loss=5.66, v_num=4, train_loss=5.700, val_loss=6.300] Accuracy: 15.84%
Epoch 9: 100%|██████████| 89/89 [00:18<00:00,  4.76it/s, loss=5.66, v_num=4, train_loss=5.700, val_loss=6.090]

Epoch 9, global step 710: 'val_accuracy' reached 0.15836 (best 0.15836), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 14: 100%|██████████| 89/89 [00:17<00:00,  5.22it/s, loss=5.25, v_num=4, train_loss=5.240, val_loss=6.090] Accuracy: 17.15%
Epoch 14: 100%|██████████| 89/89 [00:17<00:00,  5.22it/s, loss=5.25, v_num=4, train_loss=5.240, val_loss=6.110]

Epoch 14, global step 1065: 'val_accuracy' reached 0.17154 (best 0.17154), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 19: 100%|██████████| 89/89 [00:17<00:00,  5.16it/s, loss=4.88, v_num=4, train_loss=4.850, val_loss=6.110] Accuracy: 18.01%
Epoch 19: 100%|██████████| 89/89 [00:17<00:00,  5.16it/s, loss=4.88, v_num=4, train_loss=4.850, val_loss=6.220]

Epoch 19, global step 1420: 'val_accuracy' reached 0.18013 (best 0.18013), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 24: 100%|██████████| 89/89 [00:18<00:00,  4.89it/s, loss=4.49, v_num=4, train_loss=4.500, val_loss=6.220] Accuracy: 18.52%
Epoch 24: 100%|██████████| 89/89 [00:18<00:00,  4.89it/s, loss=4.49, v_num=4, train_loss=4.500, val_loss=6.340]

Epoch 24, global step 1775: 'val_accuracy' reached 0.18522 (best 0.18522), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 29: 100%|██████████| 89/89 [00:18<00:00,  4.93it/s, loss=4.21, v_num=4, train_loss=4.180, val_loss=6.340] Accuracy: 18.82%
Epoch 29: 100%|██████████| 89/89 [00:18<00:00,  4.93it/s, loss=4.21, v_num=4, train_loss=4.180, val_loss=6.460]

Epoch 29, global step 2130: 'val_accuracy' reached 0.18818 (best 0.18818), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 34: 100%|██████████| 89/89 [00:17<00:00,  5.16it/s, loss=3.92, v_num=4, train_loss=3.900, val_loss=6.460] Accuracy: 19.03%
Epoch 34: 100%|██████████| 89/89 [00:17<00:00,  5.16it/s, loss=3.92, v_num=4, train_loss=3.900, val_loss=6.560]

Epoch 34, global step 2485: 'val_accuracy' reached 0.19030 (best 0.19030), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 39: 100%|██████████| 89/89 [00:16<00:00,  5.28it/s, loss=3.64, v_num=4, train_loss=3.650, val_loss=6.560] Accuracy: 19.22%
Epoch 39: 100%|██████████| 89/89 [00:16<00:00,  5.28it/s, loss=3.64, v_num=4, train_loss=3.650, val_loss=6.690]

Epoch 39, global step 2840: 'val_accuracy' reached 0.19222 (best 0.19222), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 44: 100%|██████████| 89/89 [00:16<00:00,  5.34it/s, loss=3.48, v_num=4, train_loss=3.430, val_loss=6.690] Accuracy: 19.31%
Epoch 44: 100%|██████████| 89/89 [00:16<00:00,  5.34it/s, loss=3.48, v_num=4, train_loss=3.430, val_loss=6.810]

Epoch 44, global step 3195: 'val_accuracy' reached 0.19314 (best 0.19314), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 49: 100%|██████████| 89/89 [00:15<00:00,  5.63it/s, loss=3.28, v_num=4, train_loss=3.230, val_loss=6.810] Accuracy: 19.39%
Epoch 49: 100%|██████████| 89/89 [00:15<00:00,  5.62it/s, loss=3.28, v_num=4, train_loss=3.230, val_loss=6.940]

Epoch 49, global step 3550: 'val_accuracy' reached 0.19395 (best 0.19395), saving model to 'checkpoints/BestNWP-v1.ckpt' as top 1


Epoch 49: 100%|██████████| 89/89 [00:16<00:00,  5.54it/s, loss=3.28, v_num=4, train_loss=3.200, val_loss=6.940]

`Trainer.fit` stopped: `max_epochs=50` reached.


Epoch 49: 100%|██████████| 89/89 [00:16<00:00,  5.54it/s, loss=3.28, v_num=4, train_loss=3.200, val_loss=6.940]


In [107]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [112]:
def predict(seed_text,next_words=3):
    
    print("Actual : ",seed_text)

    # Generate next words
    for _ in range(next_words):
        # Tokenize seed_text
        token_list = tokenizer(seed_text) 
        token_list = [features_vocab[word] if word in features_vocab.get_itos() else features_vocab['<oov>'] for word in token_list]   
        token_list = [0] * (max_length - len(token_list)) + token_list
        
        token_list = torch.tensor(token_list, dtype=torch.long).unsqueeze(0)  # Use torch.long if your data type is integer
                
        # Move token_list to GPU if available
        token_list = token_list #.to(device)
        with torch.no_grad():
            output = model(token_list)
            
        # Get the index of the predicted word
        predicted_index = torch.argmax(output, dim=-1)
        
        # Convert index to word
        output_word = features_vocab.lookup_token(predicted_index.item())

        # Update the seed_text
        seed_text += " " + output_word
        
    print("Predict : ",seed_text)

In [113]:
print(medium_data.iloc[15]['title'])

How to Automate Hyperparameter Optimization


In [114]:
seed_text = "How to Automate"

In [129]:
# Print the generated text
predict(seed_text,next_words=2)

Actual :  How to Automate
Predict :  How to Automate hyperparameter optimization


In [130]:
print(medium_data.iloc[16]['title'])

Ideas: Design Methodologies for Data Sprints


In [152]:
predict(seed_text='Ideas: Design Methodologies',next_words=3)

Actual :  Ideas: Design Methodologies
Predict :  Ideas: Design Methodologies weather marketing with
