In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input tree and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import sentencepiece as spm
from torch.utils.data import Dataset
import numpy as np

class FinancialDataset(Dataset):
    """Next-token-prediction dataset over a line-per-sentence text file.

    Every non-empty line of the file that is not a ``---`` separator becomes
    one example. An example is encoded with the SentencePiece tokenizer,
    truncated / right-padded to exactly ``max_len`` ids, and returned as an
    ``(input, target)`` pair where the target is the input shifted left by
    one position, so both tensors hold ``max_len - 1`` ids.

    Args:
        tokenized_txt_path: Path to the UTF-8 text file, one sentence per line.
        tokenizer_path: Path to the SentencePiece ``.model`` file.
        max_len: Fixed length each encoded sentence is padded/truncated to.
        pad_id: Token id used for right-padding. The default of 0 matches the
            ``padding_value`` used by ``collate_fn``.
            NOTE(review): with SentencePiece's default training flags id 0 is
            ``<unk>`` rather than a dedicated pad token - confirm against the
            tokenizer model actually used.
    """

    def __init__(self, tokenized_txt_path, tokenizer_path, max_len=512, pad_id=0):
        self.tokenized_txt_path = tokenized_txt_path
        self.tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path)
        self.max_len = max_len
        self.pad_id = pad_id
        self.data = self.load_data()

    def load_data(self):
        """Read the text file and return its usable lines, stripped.

        Blank lines and separator lines (those starting with ``---`` once
        surrounding whitespace is removed) are dropped.

        Returns:
            list[str]: One stripped sentence per retained line, in file order.
        """
        sentences = []
        # Explicit encoding: the platform default is locale-dependent and can
        # mis-decode or fail on non-ASCII text.
        with open(self.tokenized_txt_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                # Test the stripped text so indented separators are dropped too.
                if stripped and not stripped.startswith('---'):
                    sentences.append(stripped)
        return sentences

    def __len__(self):
        """Return the number of sentences loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and return its shifted (input, target) pair.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Two ``torch.long`` tensors of
            ``max_len - 1`` ids each; ``target[i]`` is the id that follows
            ``input[i]`` in the padded sequence.
        """
        # Truncate first, then right-pad, so the sequence is exactly max_len long.
        token_ids = self.tokenizer.encode(self.data[idx])[:self.max_len]
        token_ids = token_ids + [self.pad_id] * (self.max_len - len(token_ids))

        input_tensor = torch.tensor(token_ids[:-1], dtype=torch.long)  # all but the last id
        target_tensor = torch.tensor(token_ids[1:], dtype=torch.long)  # all but the first id
        return input_tensor, target_tensor


In [5]:
from torch.utils.data import DataLoader
import torch

def collate_fn(batch):
    """
    Collate (input, target) pairs into two batch-first tensors.

    Each group is right-padded with 0 up to the longest sequence in that
    group, and the result is returned as ``(inputs, targets)``.
    """
    inputs, targets = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence

    # Same padding settings for both halves of the pair
    return tuple(
        pad(group, batch_first=True, padding_value=0)
        for group in (inputs, targets)
    )

# Build the dataset from the Kaggle input files, then batch it in shuffled
# groups of 8 through the custom collate_fn
dataset = FinancialDataset(
    tokenized_txt_path='/kaggle/input/text-data/tokenized_output.txt',
    tokenizer_path='/kaggle/input/text-data/financial_tokenizer.model',
    max_len=512,
)
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Define BiLSTM model (as in previous code)
class BiLSTMModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of token ids; sizes both the embedding table and
            the output logits.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Run the LSTM in both directions (default, as before).
            NOTE(review): this notebook trains with targets equal to the
            input shifted by one, so the backward direction has already read
            the token being predicted at each position. Pass ``False`` for a
            strictly left-to-right language model.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # The LSTM concatenates the outputs of every direction on the last axis.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, vocab_size)

    def forward(self, x):
        """Map a (batch, seq) tensor of token ids to (batch, seq, vocab_size) logits."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)  # final hidden/cell states are not needed
        return self.fc(lstm_out)

# Hyperparameters
embedding_dim = 256
hidden_dim = 512
vocab_size = len(dataset.tokenizer)  # Size of the vocabulary from SentencePiece model
model = BiLSTMModel(vocab_size=vocab_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim).to(device)

# Optimizer and Loss Function
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Every example is right-padded with id 0 (see FinancialDataset / collate_fn).
# ignore_index keeps those padded targets out of the loss so that only the
# real token positions are scored; otherwise padding dominates the objective.
# NOTE(review): any genuine target with id 0 is ignored as well - confirm that
# id 0 is not a meaningful token in the tokenizer.
pad_id = 0
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

# Training Loop
num_epochs = 3
log_every = 100  # batches between progress lines (avoids one print per batch)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    num_batches = 0  # batches that actually contributed to total_loss
    for input_tensor, target_tensor in dataloader:
        # Move tensors to the same device (either GPU or CPU)
        input_tensor, target_tensor = input_tensor.to(device), target_tensor.to(device)

        # A batch whose targets are all padding has nothing to score; the mean
        # over zero tokens would be NaN and would corrupt the weights.
        if not (target_tensor != pad_id).any():
            continue

        optimizer.zero_grad()
        output = model(input_tensor)
        # Flatten (batch, seq, vocab) / (batch, seq) so each position is one sample
        loss = criterion(output.view(-1, vocab_size), target_tensor.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        if num_batches % log_every == 0:
            # Report the running mean, not the ever-growing sum
            print(f"Epoch {epoch+1}/{num_epochs}, batch {num_batches}/{len(dataloader)}, running mean loss: {total_loss/num_batches}")
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/max(num_batches, 1)}")


In [None]:
import math

def calculate_perplexity(model, dataloader, loss_fn=None):
    """Return ``exp`` of the mean per-batch loss of ``model`` over ``dataloader``.

    Args:
        model: Module mapping a (batch, seq) tensor of token ids to
            (batch, seq, vocab) logits. It is switched to eval mode.
        dataloader: Iterable of ``(input_tensor, target_tensor)`` batches.
        loss_fn: Callable ``(logits_2d, targets_1d) -> scalar tensor``.
            Defaults to the notebook-level ``criterion`` used for training.

    Returns:
        float: ``math.exp`` of the average of the per-batch losses. Batches
        are weighted equally, not by token count.

    Raises:
        ValueError: If no batch produced a usable loss (e.g. empty dataloader).
    """
    if loss_fn is None:
        loss_fn = criterion

    # Run on whatever device the model already lives on instead of assuming
    # CUDA, so evaluation also works on CPU-only hosts.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else None

    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for input_tensor, target_tensor in dataloader:
            if eval_device is not None:
                input_tensor = input_tensor.to(eval_device)
                target_tensor = target_tensor.to(eval_device)

            output = model(input_tensor)
            # Take the vocabulary size from the logits rather than a global
            batch_loss = loss_fn(
                output.reshape(-1, output.size(-1)),
                target_tensor.reshape(-1),
            ).item()
            # A batch with no scorable targets (e.g. a loss with ignore_index
            # on an all-padding batch) yields NaN; skip it rather than let it
            # poison the average.
            if math.isnan(batch_loss):
                continue
            total_loss += batch_loss
            num_batches += 1

    if num_batches == 0:
        raise ValueError("calculate_perplexity: dataloader produced no batches to score")

    avg_loss = total_loss / num_batches
    return math.exp(avg_loss)

# Score the trained model on the same batches it was trained on and report it
perplexity = calculate_perplexity(model=model, dataloader=dataloader)
print(f"Perplexity: {perplexity}")
