In [1]:
import os
import re
import nltk
import spacy
import torch 
import random

import openpyxl
from collections import Counter

import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import torch.nn.utils as utils
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer
from transformers import BertTokenizer, BertForPreTraining


from TransformerModel import *

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
nlp = spacy.load('en_core_web_sm')
device

'cuda'

In [3]:
# Custom Dataset
class ArticleDataset(Dataset):
    def __init__(self, articles, vocab, seq_len=50):
        self.articles = articles
        self.vocab = vocab
        self.seq_len = seq_len
        self.tokenized_articles = self.tokenize_articles()

    def tokenize_articles(self):
        tokenized = []
        for article in self.articles:
            tokens = [self.vocab[token.text.lower()] for token in nlp(article) if token.text.lower() in self.vocab]
            if len(tokens) > self.seq_len:
                tokenized += [tokens[i:i+self.seq_len] for i in range(0, len(tokens) - self.seq_len + 1)]
        return tokenized

    def __len__(self):
        return len(self.tokenized_articles)

    def __getitem__(self, idx):
        seq = torch.tensor(self.tokenized_articles[idx])
        return seq[:-1], seq[1:]
    
# Function to load articles and build vocabulary from an Excel file
def load_data_and_vocab(df):

    # Assuming the articles are in a column named 'article_text'
    articles = df['article_text'].tolist()
    
    vocab = Counter()
    
    # Build vocabulary
    for article in articles:
        vocab.update([token.text.lower() for token in nlp(article)])
    
    # Create vocabulary dictionary with a <PAD> token
    vocab = {word: idx for idx, (word, _) in enumerate(vocab.items(), start=1)}
    vocab['<PAD>'] = 0
    
    return articles, vocab

# Custom collate function to handle the padding
def collate_fn(batch):
    inputs, targets = zip(*batch)  # Unzip the batch into inputs and targets
    inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab['<PAD>'])
    targets = pad_sequence(targets, batch_first=True, padding_value=vocab['<PAD>'])
    return inputs, targets

In [4]:
# Example usage
data = pd.read_excel("../../data/Articles.xlsx")

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s.,]", "", text)
    
    return text

topics = data["NewsType"].values
data["Article"] = data["Article"].apply(lambda x: clean_text(x))
data["Heading"] = data["Heading"].apply(lambda x: clean_text(x))
data["article_text"] = data["Heading"] + ". " + data["Article"]
data["article_text"] = data['article_text'].astype(str).apply(openpyxl.utils.escape.unescape)
data["article_text"] = data["article_text"].replace(r'\s+|\\n', ' ', regex=True) 

In [5]:
articles, vocab = load_data_and_vocab(data)

dataset = ArticleDataset(articles, vocab)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [6]:
# Hyperparameters
vocab_size = len(vocab) # 53529
embed_size = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
forward_expansion = 2048
dropout = 0.1
max_len = 100

# Initialize the model
model = TransformerModel(vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, forward_expansion, dropout, max_len).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
optimizer = optim.Adam(model.parameters(), lr=3e-4)

In [7]:
# Training loop
for epoch in range(2):  # Adjust the number of epochs as needed
    model.train()
    for batch_idx, (inputs, targets) in enumerate(data_loader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        output = model(inputs, targets, src_mask=None, tgt_mask=None)  # Add masks as needed
        
        # Reshape output and targets to match the CrossEntropyLoss requirements
        output = output.view(-1, vocab_size)
        targets = targets.view(-1)

        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        if batch_idx % 1000 == 0:
            print(f"Epoch {epoch+1}, Batch {batch_idx}, Loss: {loss.item()}")

Epoch 1, Batch 0, Loss: 11.063329696655273
Epoch 1, Batch 1000, Loss: 1.0386213064193726
Epoch 1, Batch 2000, Loss: 0.37171775102615356
Epoch 1, Batch 3000, Loss: 0.15745075047016144
Epoch 1, Batch 4000, Loss: 0.07022745907306671
Epoch 1, Batch 5000, Loss: 0.05629092827439308
Epoch 1, Batch 6000, Loss: 0.03750816732645035
Epoch 1, Batch 7000, Loss: 0.00822608545422554
Epoch 1, Batch 8000, Loss: 0.007244476117193699
Epoch 1, Batch 9000, Loss: 0.01773599535226822
Epoch 1, Batch 10000, Loss: 0.005674783606082201
Epoch 1, Batch 11000, Loss: 0.001688830554485321
Epoch 1, Batch 12000, Loss: 0.0021751674357801676
Epoch 1, Batch 13000, Loss: 0.0039748442359268665
Epoch 1, Batch 14000, Loss: 0.002291230484843254
Epoch 1, Batch 15000, Loss: 0.005225534550845623
Epoch 1, Batch 16000, Loss: 0.00411038426682353
Epoch 1, Batch 17000, Loss: 0.0026115996297448874
Epoch 1, Batch 18000, Loss: 0.0023951444309204817
Epoch 1, Batch 19000, Loss: 0.002825278090313077
Epoch 1, Batch 20000, Loss: 0.00165949796

In [7]:
# Saving the model
# model_save_path = 'trained-transformer_model.pth'
# torch.save(model.state_dict(), model_save_path)

# Loading the model
model_load_path = 'trained-transformer_model.pth'

# Ensure you initialize the model with the same architecture as before
loaded_model = TransformerModel(vocab_size, embed_size, num_heads, num_encoder_layers, num_decoder_layers, forward_expansion, dropout, max_len).to(device)

# Load the saved state dictionary into the model
loaded_model.load_state_dict(torch.load(model_load_path))

# Set the model to evaluation mode
loaded_model.eval()

TransformerModel(
  (embedding): Embedding(53529, 512)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (attention): MultiHeadAttention(
          (values): Linear(in_features=64, out_features=64, bias=False)
          (keys): Linear(in_features=64, out_features=64, bias=False)
          (queries): Linear(in_features=64, out_features=64, bias=False)
          (fc_out): Linear(in_features=512, out_features=512, bias=True)
        )
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (feed_forward): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDec

In [12]:
def generate_text(model, vocab, input_text, max_length=200):
    model.eval()
    with torch.no_grad():
        # Convert input text to tokens
        input_tokens = [vocab.get(token, vocab['<PAD>']) for token in input_text.split()]
        input_tensor = torch.tensor(input_tokens).unsqueeze(0).to(device)  # Batch size of 1

        generated_text = input_tokens.copy()

        for _ in range(max_length):
            # Ensure positional encoding matches input sequence length
            if input_tensor.size(1) > model.positional_encoding.size(1):
                # If input is longer than positional encoding, pad positional encoding
                positional_encoding = torch.cat([model.positional_encoding, 
                                                 model.positional_encoding[:, -1:, :].repeat(1, input_tensor.size(1) - model.positional_encoding.size(1), 1)], dim=1)
            else:
                positional_encoding = model.positional_encoding[:, :input_tensor.size(1), :]

            # Generate the next token
            embed_x = model.dropout(model.embedding(input_tensor) + positional_encoding)
            output = model.encoder(embed_x, None)  # Removed src_mask argument
            next_token = output[:, -1, :].argmax(-1).item()
            generated_text.append(next_token)

            # Update input_tensor for the next iteration
            input_tensor = torch.tensor(generated_text).unsqueeze(0).to(device)

        # Convert generated tokens back to words
        generated_text = [list(vocab.keys())[list(vocab.values()).index(tok)] for tok in generated_text]
        return " ".join(generated_text)




# Example usage
keyword = "business"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)


business member lower daniel information 0.55 0.55 5 signals compare national was texas ease 0.48 indo 8.48 sindh karachi slipped unions ease 0.48 indo above highs.the developer 7pc remained an recorded momentum upswing sovereign tools ease 0.48 indo above highs.the developer 7pc remained 15,900 momentum upswing sovereign easing at inflation afp half ang tambangraya week survey anticipated half ang tambangraya week momentum upswing sovereign tools 8.48 sindh karachi slipped 252.78 above highs.the developer 7pc remained fact stream nbs expectations inflation afp half ang tambangraya week survey anticipated half ang tambangraya week survey anticipated half ang tambangraya week momentum upswing sovereign tools 8.48 sindh karachi slipped transporters last national traveling.meanwhile trading.trainbuilders over toasted public commuters adding 50.3 michael 0.93 asian finance at 252.78 above highs.the developer 7pc remained fact was texas at bukhari growth including back chief mumbai 3,370.59

In [13]:
keyword = "karachi"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

karachi down of could economy.the do 0.55 momentum upswing sovereign easing at 252.78 managers later later later later later later later later later later later later later later later factories ahead estate 1,045 lead remained fact stream figure week finishing 16.0 trade.the chief rej decides commuters adding malaysias rej decides commuters adding delivery half managers later factories ahead estate 1,045 lead remained fact 3.41 18.22 estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045 worlds economy.the policy level while expansion estate 1,045

In [14]:
keyword = "health"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

health bureau figure 57.51.the 0.93 asian finance secondlargest release fact 3.41 csr close driver trade.the chief mumbai 3,370.59.agribusiness economy.the investment bukhari other 18.22 estate november.the demand.chinas it unions ang tambangraya week survey economy responses public commuters adding 50.3 michael 0.93 asian finance secondlargest sg3.30 158.63 indo above highs.the developer ang tambangraya week survey anticipated half ang tambangraya week survey economy responses public commuters adding 50.3 michael 0.93 asian finance secondlargest sg3.30 158.63 indo above highs.the developer ang tambangraya week survey anticipated half ang tambangraya week survey anticipated half ang tambangraya week survey anticipated half ang tambangraya week survey economy responses public commuters adding 50.3 michael 0.93 asian finance secondlargest sg3.30 50.3 michael 0.93 asian finance secondlargest sg3.30 50.3 michael 0.93 asian finance secondlargest sg3.30 50.3 michael 0.93 asian 8.48 finishing

In [15]:
keyword = "finance"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

finance examining karachi slipped transporters tambang lead banks.it 1,171.80 gains 15,900 momentum upswing sovereign easing at 252.78 managers 0.44 8.48 sindh karachi slipped transporters last national traveling.meanwhile trading.trainbuilders over hong momentum upswing sovereign tools 8.48 sindh karachi slipped transporters last national traveling.meanwhile trading.trainbuilders over hong momentum upswing sovereign tools until trading.trainbuilders over hong momentum upswing sovereign easing at 252.78 managers 0.44 was texas at 252.78 managers 0.44 was texas at 252.78 managers 0.44 was texas at bukhari refused managers 0.44 was texas at bukhari refused managers november.the demand.chinas it unions ang tambangraya week product compare momentum upswing sovereign tools 8.48 sindh karachi slipped transporters last national traveling.meanwhile trading.trainbuilders over hong momentum upswing sovereign tools 8.48 sindh karachi slipped transporters last national traveling.meanwhile trading.

In [16]:
keyword = "economy and growth"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

economy and growth including back estate november.the demand.chinas it 24.89 value 9.12 decides commuters adding 50.3 michael 0.93 asian finance secondlargest slipped transporters last national traveling.meanwhile trading.trainbuilders tuesday back estate 1,045 lead refused 0.55 0.55 0.55 0.55 0.55 0.55 0.55 5 signals cmc january sydney pmi company currency , president investors adding 50.3 until trading.trainbuilders tuesday back estate 1,045 lead refused 0.55 0.55 0.55 5 signals cmc january sydney pmi company currency , president investors adding 50.3 until trading.trainbuilders tuesday back estate 1,045 lead refused managers later 50.3 until trading.trainbuilders tuesday back chief rej january sydney pmi company currency , president investors adding 50.3 until trading.trainbuilders tuesday back chief rej january sydney pmi company currency , president investors adding 50.3 until trading.trainbuilders tuesday back estate 1,045 lead largescale peoples inflation afp half ang tambangray

In [17]:
keyword = "business and markets"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

business and markets reduction economy.the investment 5,242.77.coal remained november.the demand.chinas 3.41 level while 9.12 trade.the chief rej january csr close driver holidays.with below for 0.57 24.89 value 9.12 trade.the chief mumbai 3,370.59.agribusiness economy.the investment bukhari texas ease 0.48 indo 8.48 finishing 9.12 trade.the chief rej decides commuters adding 50.3 michael 0.93 asian finance at 252.78 above highs.the developer ang tambangraya week survey anticipated half ang tambangraya week survey economy responses public commuters adding 50.3 michael 0.93 asian finance at bukhari texas january sydney pmi company currency , quantitative 3.41 level while expansion estate 1,045 lead remained november.the demand.chinas it unions ang tambangraya week survey anticipated half ang tambangraya week survey anticipated half ang tambangraya week survey economy responses public commuters adding 50.3 michael 0.93 asian finance at bukhari growth including back estate november.the de

In [18]:
keyword = "situation of pakistan"
generated_paragraph = generate_text(model, vocab, keyword, max_length=200)
print(generated_paragraph)

situation of pakistan year year year year year year year year year when 5.51 karachi slipped bonds texas ease 0.48 indo above growth made examining karachi slipped unions 1.88 1.07 sovereign tools ease 0.48 indo above highs.the compare national national national national national national national national national national national national national signals 8.48 sindh karachi slipped unions ease 0.48 rej decides commuters adding 50.3 michael 0.93 asian finance at 252.78 above highs.the developer 7pc remained fact stream nbs for 0.57 parts 1,045 lead remained fact stream nbs for 0.57 parts 1,045 lead remained fact stream nbs for 0.57 parts 1,045 lead remained fact stream nbs for 0.57 parts 1,045 lead remained fact stream nbs for 0.57 it unions ang tambangraya week survey anticipated half ang tambangraya week momentum upswing sovereign tools ease 0.48 indo above highs.the developer 7pc remained fact cent signals cmc january sydney pmi commuters adding 50.3 michael 0.93 asian finance at 