In [1]:
import torch
import pytorch_lightning as pl 
import pandas as pd
import os
import numpy as np
import torch.nn as nn
from transformers import BertTokenizer
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import seed_everything
from torch.utils.data import random_split


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
medium_data = pd.read_csv('medium_data.csv')
medium_data.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [3]:
print("Number of records: ", medium_data.shape[0])
print("Number of fields: ", medium_data.shape[1])

Number of records:  6508
Number of fields:  10


In [4]:
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace(u'\xa0',u' '))
medium_data['title'] = medium_data['title'].apply(lambda x: x.replace('\u200a',' '))

In [5]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [6]:
vocab_size = tokenizer.vocab_size

In [7]:
tokenizer(medium_data.iloc[0]['title'])

{'input_ids': [101, 1037, 4088, 3678, 1521, 1055, 5009, 2000, 2773, 7861, 8270, 4667, 2007, 8991, 5332, 2213, 2773, 2475, 3726, 2278, 2944, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
input_sequences = []
for line in medium_data['title']:
    token_list = tokenizer(line).input_ids
    #print(token_list)
    
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)


In [9]:
print("Total input sequences: ", len(input_sequences))

Total input sequences:  79316


In [10]:
X = [lst[:-1] for lst in input_sequences]
Y = [lst[-1] for lst in input_sequences]

In [11]:
max_length = max(map(len, X))

In [12]:
X = [[0] * (max_length - len(lst)) + lst for lst in X]

In [13]:
X = torch.tensor(X, dtype=torch.long)  # Use torch.long if your data type is integer
Y = torch.tensor(Y, dtype=torch.long)  # Use torch.long if your data type is integer

In [14]:
print(X.shape)
print(Y.shape)

torch.Size([79316, 126])
torch.Size([79316])


In [15]:
class BiLSTMModel(pl.LightningModule):
    def __init__(self, vocab_size, embedding_dim, hidden_size):
        super(BiLSTMModel,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=0)
        self.bilstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)  # Multiply by 2 because of bidirectional
        self.dropout = nn.Dropout(0.4)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)
        self.val_accuracy = 0.0
        self.total_predictions = 0
        


    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.bilstm(embedded)
        lstm_out = self.dropout(lstm_out)
        output = self.fc(lstm_out[:, -1, :])
        return output
    
    def training_step(self, batch, batch_idx) :
        x,label = batch
        output = self.forward(x)
        loss = self.criterion(output, label.argmax(dim=1))
        self.log('train_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx) :
        x,label = batch
        output = self.forward(x)
        loss = self.criterion(output, label.argmax(dim=1))
        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)
        
        # Get top-k predictions
        _, predicted_indices = output.topk(k=3, dim=1)

        # Check if the correct label is in the top-k predictions
        self.val_accuracy += torch.any(predicted_indices == torch.argmax(label, dim=1, keepdim=True), dim=1).sum().item()
        self.total_predictions += label.size(0)
    
    def on_validation_epoch_end(self):
        accuracy = self.val_accuracy / self.total_predictions
        self.log('val_accuracy', accuracy, on_step=False, on_epoch=True, prog_bar=True)

    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.001)
        
    

In [16]:
# Define a custom dataset
class SequenceDataset(Dataset):
    def __init__(self, input_sequences, target_sequences):
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences

    def __len__(self):
        return len(self.input_sequences)

    def __getitem__(self, idx):
        ys = F.one_hot(self.target_sequences[idx], num_classes=vocab_size)

        return self.input_sequences[idx], ys


In [17]:
class SequenceDataModule(pl.LightningDataModule):
    def __init__(self, input_sequences, target_sequences, batch_size=32):
        super(SequenceDataModule, self).__init__()
        self.input_sequences = input_sequences
        self.target_sequences = target_sequences
        self.batch_size = batch_size

    def setup(self, stage=None):
        # Split dataset into training and validation sets
        total_samples = len(self.input_sequences)
        val_samples = int(0.2 * total_samples)  # Adjust the validation split as needed
        train_samples = total_samples - val_samples

        self.train_dataset, self.val_dataset = random_split(
            dataset=SequenceDataset(self.input_sequences, self.target_sequences),
            lengths=[train_samples, val_samples],
            generator=torch.Generator().manual_seed(42)  # Set seed for reproducibility
        )

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=3,shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, num_workers=3,batch_size=self.batch_size)

In [18]:
epochs = 50
batch_size = 256

In [19]:
# Instantiate your SequenceDataModule
data_module = SequenceDataModule(X, Y, batch_size=batch_size)

data_module.setup()

In [20]:
# Optionally, inspect the datasets and dataloaders
print("Train Dataset Length:", len(data_module.train_dataset))
print("Val Dataset Length:", len(data_module.val_dataset))

Train Dataset Length: 63453
Val Dataset Length: 15863


In [21]:
model = BiLSTMModel(vocab_size, 256, 128)

In [22]:
checkpoint_callback = ModelCheckpoint(
    dirpath = 'checkpoints',
    filename = 'BestNWP',
    save_top_k = 1,
    verbose = True,
    monitor = 'val_loss',
    mode = 'min'
)

In [23]:
trainer = pl.Trainer(devices=-1, 
                  accelerator="gpu",
                  check_val_every_n_epoch=5,
                  max_epochs=epochs)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer.fit(model=model,datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 7.8 M 
1 | bilstm    | LSTM             | 395 K 
2 | fc        | Linear           | 7.8 M 
3 | dropout   | Dropout          | 0     
4 | criterion | CrossEntropyLoss | 0     
-----------------------------------------------
16.1 M    Trainable params
0         Non-trainable params
16.1 M    Total params
64.212    Total estimated model params size (MB)


Epoch 15:  58%|█████▊    | 145/248 [00:16<00:11,  8.73it/s, loss=2.75, v_num=3, train_loss_step=2.970, train_loss_epoch=2.770, val_loss_step=5.050, val_loss_epoch=5.390, val_accuracy=0.392]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [26]:
def predict(seed_text,next_words=3):
    
    print("Actual : ",seed_text)

    # Generate next words
    for _ in range(next_words):
        # Tokenize seed_text
        token_list = tokenizer(seed_text,)['input_ids']
        
        token_list = [0] * (max_length - len(token_list)) + token_list
        
        token_list = torch.tensor(token_list, dtype=torch.long).unsqueeze(0)  # Use torch.long if your data type is integer
                
        # Move token_list to GPU if available
        token_list = token_list.to(device)
        with torch.no_grad():
            output = model(token_list)
            
        # Get the index of the predicted word
        predicted_index = torch.argmax(output, dim=-1)
        
        # Convert index to word
        output_word = tokenizer.decode(predicted_index)

        # Update the seed_text
        seed_text += " " + output_word
        
    print("Predict : ",seed_text)

In [27]:
seed_text = "What if AI model"

In [28]:
# Print the generated text
predict(seed_text,next_words=4)

Actual :  What if AI model
Predict :  What if AI model in a ##or ?
