In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/text-data/tokenized_output.txt
/kaggle/input/text-data/financial_tokenizer.model


In [2]:
import torch
import sentencepiece as spm
from torch.utils.data import Dataset
import numpy as np

class FinancialDataset(Dataset):
    def __init__(self, tokenized_txt_path, tokenizer_path, max_len=512):
        self.tokenized_txt_path = tokenized_txt_path
        self.tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
        self.max_len = max_len
        self.data = self.load_data()

    def load_data(self):
        with open(self.tokenized_txt_path, 'r') as f:
            text = f.read().split('\n')
        sentences = [line.strip() for line in text if line.strip() and not line.startswith('---')]
        return sentences
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sentence = self.data[idx]
        
        # Encode the sentence into tokens (input sequence)
        input_ids = self.tokenizer.encode(sentence)
        
        # Pad the input sequence to max_len
        if len(input_ids) < self.max_len:
            padding_length = self.max_len - len(input_ids)
            input_ids = input_ids + [0] * padding_length  # Pad with 0s
        
        # Truncate to max_len
        input_ids = input_ids[:self.max_len]
        
        # Create input and target sequences (target sequence is the same as input shifted by 1)
        input_tensor = torch.tensor(input_ids[:-1], dtype=torch.long)  # All tokens except the last one for input
        target_tensor = torch.tensor(input_ids[1:], dtype=torch.long)  # All tokens except the first one for target
        
        return input_tensor, target_tensor


In [3]:
from torch.utils.data import DataLoader
import torch

def collate_fn(batch):
    """
    Custom collate function to pad sequences in a batch to the same length.
    """
    input_tensors, target_tensors = zip(*batch)
    
    # Pad sequences to the maximum length in the batch
    input_padded = torch.nn.utils.rnn.pad_sequence(input_tensors, batch_first=True, padding_value=0)
    target_padded = torch.nn.utils.rnn.pad_sequence(target_tensors, batch_first=True, padding_value=0)
    
    return input_padded, target_padded

# Initialize dataset and dataloader with the custom collate_fn
dataset = FinancialDataset(tokenized_txt_path='/kaggle/input/text-data/tokenized_output.txt', tokenizer_path='/kaggle/input/text-data/financial_tokenizer.model', max_len=512)
dataloader = DataLoader(dataset, batch_size=8, collate_fn=collate_fn, shuffle=True)

In [5]:
import torch.nn as nn
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=300, hidden_dim=512, num_layers=2):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x):
        embedded = self.embedding(x)                # (batch_size, seq_len, embedding_dim)
        lstm_out, _ = self.lstm(embedded)           # (batch_size, seq_len, hidden_dim*2)
        output = self.fc(lstm_out)                  # (batch_size, seq_len, vocab_size)
        return output

In [8]:
import os
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
vocab_size = len(dataset.tokenizer)
embedding_dim = 300
hidden_dim = 512

# Model, optimizer, loss
model = BiLSTMModel(vocab_size, embedding_dim, hidden_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Load checkpoint if it exists
start_epoch = 0
checkpoint_path = "bilstm_checkpoint.pth"

if os.path.exists(checkpoint_path):
    print("Loading checkpoint...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming from epoch {start_epoch}")

# Training loop
num_epochs = 3
for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0

    for input_tensor, target_tensor in dataloader:
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        optimizer.zero_grad()
        output = model(input_tensor)
        loss = criterion(output.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.6f}")

    # Save checkpoint
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': avg_loss,
    }, checkpoint_path)
    print(f"Checkpoint saved at epoch {epoch+1}")


Loading checkpoint...
Resuming from epoch 3


  checkpoint = torch.load(checkpoint_path, map_location=device)


In [10]:
import math

def calculate_perplexity(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_tensor, target_tensor in dataloader:
            input_tensor = input_tensor.to(device)
            target_tensor = target_tensor.to(device)

            output = model(input_tensor)  # (batch, seq_len, vocab_size)
            loss = criterion(output.view(-1, vocab_size), target_tensor.view(-1))
            
            total_loss += loss.item() * target_tensor.numel()  # scale by number of tokens
            total_tokens += target_tensor.numel()

    average_loss = total_loss / total_tokens
    perplexity = math.exp(average_loss)
    return perplexity

# Usage
perplexity = calculate_perplexity(model, dataloader, criterion, device)
print(f"Perplexity: {perplexity:.4f}")


Perplexity: 1.0000


In [12]:
def generate_text(model, tokenizer, prompt, max_new_tokens=50, device='cpu', temperature=1.0):
    model.eval()
    
    # Encode the prompt
    input_ids = tokenizer.encode(prompt)
    input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

    generated = input_ids[:]
    
    for _ in range(max_new_tokens):
        input_tensor = torch.tensor(generated[-512:], dtype=torch.long).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(input_tensor)
        
        logits = output[0, -1, :] / temperature  # Get logits for the last token
        probabilities = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probabilities, num_samples=1).item()

        generated.append(next_token)

        # If tokenizer has an end-of-sentence token, you can break on that.
        # For example:
        # if next_token == tokenizer.eos_id():
        #     break

    return tokenizer.decode(generated)


In [13]:
prompt = "what are the latest global trends in finance"
generated_text = generate_text(model, dataset.tokenizer, prompt, max_new_tokens=50, device=device)
print("\nGenerated text:\n")
print(generated_text)



Generated text:

what are the latest global trends in finance                                                  
