In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os, sys
import pandas as pd

sys.path.append("..")
from functions import build_vocab, encode_sequences

In [10]:
# Create custom Dataset
class TextDataset(Dataset):
    """Map-style dataset over pre-encoded ``(sequence, target)`` training pairs.

    Args:
        sequences: Indexable, sized collection of ``(sequence, target)``
            pairs, where ``sequence`` is a list of integer token ids and
            ``target`` is the integer id of the token that follows it.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        """Return the number of ``(sequence, target)`` pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return pair ``idx`` as a tuple of two ``torch.long`` tensors."""
        sequence, target = self.sequences[idx]
        # Pin the dtype instead of letting torch infer it from the data: an
        # empty sequence would otherwise become float32, which is not a valid
        # index tensor for an embedding lookup or a class-index loss target.
        return (
            torch.tensor(sequence, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

# Define LSTM model
class NextWordLSTM(nn.Module):
    """Embedding -> stacked LSTM -> linear head that scores the next token.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            scores produced per input sequence.
        embed_size: Width of each embedding vector.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_size,
        )
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Score every vocabulary entry as the continuation of each sequence.

        Args:
            x: Batch of token-id sequences, laid out batch-first.

        Returns:
            Tensor with one row of ``vocab_size`` unnormalised scores per
            input sequence.
        """
        embedded = self.embedding(x)
        hidden_states, _ = self.lstm(embedded)
        # Only the last time step is used to predict the next word.
        final_step = hidden_states[:, -1, :]
        return self.fc(final_step)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable being
    defined before it is called.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        train_loader: Sized iterable yielding ``(sequences, targets)`` tensor
            batches.
        criterion: Callable ``criterion(outputs, targets)`` returning a
            scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        num_epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: Average batch loss of each epoch, in epoch order.

    Raises:
        ValueError: If ``train_loader`` contains no batches (the average
            loss would otherwise be a division by zero).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    # Follow the model's own placement; fall back to CPU for a module that
    # has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_losses = []
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            sequences = sequences.to(target_device)
            targets = targets.to(target_device)

            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss

            # Log progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {batch_loss:.4f}')

        # Log epoch summary
        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

    return epoch_losses

# Predict the next word
def predict_next_word(model, sequence, word_to_idx, idx_to_word):
    """Return the word the model scores highest as the continuation of ``sequence``.

    The input tensor is created on the device the model's parameters live
    on, so the function does not depend on a module-level ``device``
    variable. The model is left in evaluation mode afterwards.

    Args:
        model: Module mapping a ``(1, len(sequence))`` tensor of token ids to
            a ``(1, num_classes)`` tensor of scores.
        sequence: Non-empty list of integer token ids.
        word_to_idx: Unused; kept so existing callers keep working.
        idx_to_word: Mapping from token id to word.

    Returns:
        The entry of ``idx_to_word`` for the highest-scoring id.

    Raises:
        ValueError: If ``sequence`` is empty.
    """
    if len(sequence) == 0:
        raise ValueError("sequence must contain at least one token id")

    model.eval()

    # Follow the model's own placement; fall back to CPU for a module that
    # has no parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Explicit integer dtype; unsqueeze adds the batch dimension.
    inputs = torch.tensor(sequence, dtype=torch.long, device=target_device).unsqueeze(0)
    with torch.no_grad():
        scores = model(inputs)
        predicted_idx = torch.argmax(scores, dim=1).item()
    return idx_to_word[predicted_idx]

def get_csv_files(folder_path):
    """Return the paths of the ``.csv`` entries in ``folder_path``, sorted by name.

    ``os.listdir`` returns entries in arbitrary order, so the names are
    sorted to make the result (and any slice a caller takes of it)
    reproducible across runs and machines.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        list[str]: ``folder_path`` joined with each entry name ending in
        ``'.csv'``, in ascending name order.
    """
    return [
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.endswith('.csv')
    ]

def read_csv_files(folder_path, max_files=5):
    """Collect the text column of the CSV files found in ``folder_path``.

    Each file contributes its ``'text'`` column if present, otherwise its
    ``'Content'`` column; files with neither are skipped with a warning.

    Args:
        folder_path: Directory passed to ``get_csv_files``.
        max_files: Non-negative maximum number of files to read, taken from
            the front of the list returned by ``get_csv_files``. ``None``
            reads every file. Defaults to 5.

    Returns:
        list: Column values from all files that were read, in file order.
        Values are returned exactly as pandas parsed them, so missing cells
        appear as NaN.

    Raises:
        FileNotFoundError: If ``folder_path`` contains no CSV files.
    """
    csv_files = get_csv_files(folder_path)
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    if max_files is not None:
        csv_files = csv_files[:max_files]

    all_texts = []
    for file_path in csv_files:
        df = pd.read_csv(file_path)
        # 'text' takes precedence over 'Content' when a file has both.
        text_column = next(
            (column for column in ('text', 'Content') if column in df.columns),
            None,
        )
        if text_column is None:
            print(f"Warning: No 'text' or 'Content' column found in {file_path}")
            continue
        all_texts.extend(df[text_column].tolist())

    return all_texts

# Read data from the CSV files in the 'x' folder.
# NOTE(review): `emails_folder_path` is defined but never read in this cell —
# confirm whether the emails data was meant to be loaded as well.
x_folder_path = '../../data/x'
emails_folder_path = '../../data/emails'

texts = read_csv_files(x_folder_path)

print(f"Loaded {len(texts)} text samples from CSV files.")

# Build the vocabulary and cap it at MAX_VOCAB_SIZE words.
# NOTE(review): the cap keeps the words with the smallest ids; these are the
# "most common" words only if build_vocab assigns ids in descending frequency
# order — confirm against functions.build_vocab.
MAX_VOCAB_SIZE = 5000
word_to_idx, tokenized_texts = build_vocab(texts)
kept_words = sorted(word_to_idx, key=word_to_idx.get)[:MAX_VOCAB_SIZE]
# Re-number the kept words 0..N-1 so every id is a valid row of an embedding
# sized len(word_to_idx), whatever id scheme build_vocab used.
word_to_idx = {word: idx for idx, word in enumerate(kept_words)}
# Add the unknown token with the next free id; setdefault keeps the existing
# id if build_vocab already produced an '<UNK>' entry.
word_to_idx.setdefault('<UNK>', len(word_to_idx))

# Filter sequences to only include words in the vocabulary
def filter_sequences(tokenized_texts, word_to_idx, seq_length=10):
    """Build encoded (context, target) pairs from fully in-vocabulary windows.

    A window of ``seq_length`` consecutive tokens is slid over each text; the
    token right after the window is its target. A pair is kept only when
    every context token and the target are keys of ``word_to_idx``.

    Args:
        tokenized_texts: Iterable of token lists.
        word_to_idx: Mapping from token to integer id.
        seq_length: Number of context tokens per pair.

    Returns:
        list[tuple[list[int], int]]: ``(context_ids, target_id)`` pairs in
        the order they occur in ``tokenized_texts``.
    """
    min_tokens = seq_length + 1
    kept_pairs = []
    for tokens in tokenized_texts:
        # Texts too short to hold one full context plus a target are skipped.
        if len(tokens) >= min_tokens:
            for end in range(seq_length, len(tokens)):
                context = tokens[end - seq_length:end]
                next_word = tokens[end]
                if any(token not in word_to_idx for token in context):
                    continue
                if next_word not in word_to_idx:
                    continue
                context_ids = [word_to_idx[token] for token in context]
                kept_pairs.append((context_ids, word_to_idx[next_word]))
    return kept_pairs

# Number of context words the model sees when predicting the next one.
SEQ_LENGTH = 4
sequences = filter_sequences(tokenized_texts, word_to_idx, seq_length=SEQ_LENGTH)


print(f"Vocabulary size: {len(word_to_idx)}")
print(f"Number of sequences: {len(sequences)}")

# Fail early with a clear message instead of an opaque sampler error later.
if not sequences:
    raise ValueError("No training sequences were produced; check the input texts and SEQ_LENGTH")

# Seed torch so weight initialisation and batch shuffling repeat across runs.
SEED = 42
torch.manual_seed(SEED)

# Create Dataset and DataLoader
dataset = TextDataset(sequences)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the model, loss function, and optimizer
vocab_size = len(word_to_idx)
embed_size = 128
hidden_size = 256
num_layers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

model = NextWordLSTM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Starting model training...")

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=10)

print("Model training completed.")

# Test prediction (example sequence)
example_words = ['so', 'sad', 'to', 'learn']  # Sequence from data
# Words missing from the capped vocabulary map to '<UNK>' rather than
# raising KeyError.
unk_idx = word_to_idx['<UNK>']
example_sequence = [word_to_idx.get(word, unk_idx) for word in example_words]
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
predicted_word = predict_next_word(model, example_sequence, word_to_idx, idx_to_word)
print(f'Predicted next word: {predicted_word}')

Loaded 50000 text samples from CSV files.
Number of names replaced: 20105
Vocabulary size: 5001
Number of sequences: 325969
Using device: cuda
Starting model training...
Epoch [1/10], Batch [100/10187], Loss: 6.6217
Epoch [1/10], Batch [200/10187], Loss: 6.5695
Epoch [1/10], Batch [300/10187], Loss: 5.6999
Epoch [1/10], Batch [400/10187], Loss: 5.7570
Epoch [1/10], Batch [500/10187], Loss: 5.8981
Epoch [1/10], Batch [600/10187], Loss: 6.0360
Epoch [1/10], Batch [700/10187], Loss: 6.2257
Epoch [1/10], Batch [800/10187], Loss: 5.8280
Epoch [1/10], Batch [900/10187], Loss: 5.9156
Epoch [1/10], Batch [1000/10187], Loss: 6.0553
Epoch [1/10], Batch [1100/10187], Loss: 5.7014
Epoch [1/10], Batch [1200/10187], Loss: 5.7064
Epoch [1/10], Batch [1300/10187], Loss: 6.4004
Epoch [1/10], Batch [1400/10187], Loss: 5.9957
Epoch [1/10], Batch [1500/10187], Loss: 6.3230
Epoch [1/10], Batch [1600/10187], Loss: 6.0752
Epoch [1/10], Batch [1700/10187], Loss: 6.0973
Epoch [1/10], Batch [1800/10187], Loss: 

In [30]:
# Encode the prompt, mapping out-of-vocabulary words to '<UNK>' instead of
# raising KeyError.
# NOTE(review): this prompt has 8 words, while the training cell builds
# sequences with seq_length=4 — confirm longer contexts are intended here.
prompt = "what a waste of time i was hoping"
unk_idx = word_to_idx['<UNK>']
indata = [word_to_idx.get(word, unk_idx) for word in prompt.split()]
print(predict_next_word(model, indata, word_to_idx, idx_to_word))


street


In [12]:
import json

# Persist the trained weights (the state_dict only, not the module object).
model_save_path = 'next_word_lstm_model.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, model_save_path)
print(f"Model saved to {model_save_path}")

# Persist the vocabulary (word_to_idx dictionary) as JSON.
vocab_save_path = 'vocabulary.json'
with open(vocab_save_path, 'w') as vocab_file:
    json.dump(word_to_idx, vocab_file)
print(f"Vocabulary saved to {vocab_save_path}")


Model saved to next_word_lstm_model.pth
Vocabulary saved to vocabulary.json
