In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os, sys
import pandas as pd

sys.path.append("..")
from functions import build_vocab, encode_sequences

In [13]:
# Create custom Dataset
class TextDataset(Dataset):
    """Map-style dataset over pre-encoded ``(sequence, target)`` pairs.

    Args:
        sequences: indexable, sized collection whose items unpack into
            ``(sequence, target)``: the input token indices and the index of
            the token to predict.
    """

    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        """Return the number of ``(sequence, target)`` pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two ``torch.long`` tensors."""
        sequence, target = self.sequences[idx]
        # Force int64: embedding lookups and class-index targets for
        # CrossEntropyLoss need integer indices, and the dtype would otherwise
        # follow whatever the upstream encoder produced (e.g. numpy int32).
        return (
            torch.tensor(sequence, dtype=torch.long),
            torch.tensor(target, dtype=torch.long),
        )

# Define GRU model
class GRUWordPredictor(nn.Module):
    """Next-word classifier: embedding -> stacked GRU -> linear layer over the vocabulary.

    Args:
        vocab_size: number of embedding rows and number of output logits.
        embed_size: width of each token embedding.
        hidden_size: width of the GRU hidden state.
        num_layers: number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the GRU consumes and produces (batch, seq_len, features).
        self.gru = nn.GRU(
            embed_size,
            hidden_size,
            num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits of shape (batch, vocab_size) for index batch ``x``.

        ``x`` holds token indices laid out as (batch, seq_len).
        """
        embedded = self.embedding(x)
        step_outputs, _hidden = self.gru(embedded)
        # Only the output at the final time step feeds the classifier.
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Batches are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        train_loader: sized iterable yielding ``(sequences, targets)`` tensor pairs.
        criterion: callable ``criterion(outputs, targets)`` returning a scalar loss tensor.
        optimizer: optimizer bound to ``model``'s parameters.
        num_epochs: number of full passes over ``train_loader``.

    Raises:
        ValueError: if ``train_loader`` contains no batches (the per-epoch
            average loss would otherwise divide by zero).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; nothing to train on")

    # Follow the model's own placement; fall back to CPU for parameter-less modules.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            sequences, targets = sequences.to(device), targets.to(device)

            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Log progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {loss.item():.4f}')

        # Log epoch summary
        avg_loss = total_loss / num_batches
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

# Predict the next word
def predict_next_word(model, sequence, word_to_idx, idx_to_word):
    """Return the word the model scores highest as the continuation of ``sequence``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: trained ``nn.Module`` that maps a (1, seq_len) index tensor to
            one row of logits.
        sequence: non-empty sequence of integer token indices.
        word_to_idx: unused here; kept so existing call sites keep working.
        idx_to_word: mapping from predicted index to its word.

    Raises:
        ValueError: if ``sequence`` is empty.
        KeyError: if the predicted index is missing from ``idx_to_word``.
    """
    if len(sequence) == 0:
        raise ValueError("sequence must contain at least one token index")

    # Follow the model's own placement instead of relying on a global `device`.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    # Explicit long dtype keeps the input valid as embedding indices; unsqueeze adds the batch dimension.
    batch = torch.tensor(sequence, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(batch)
        predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_word[predicted_idx]

def get_first_csv_file(folder_path):
    """Return the path of the alphabetically first ``.csv`` file in ``folder_path``.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to make the choice deterministic. Directories whose name happens to end in
    ``.csv`` are ignored.

    Args:
        folder_path: directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the selected file name.

    Raises:
        FileNotFoundError: if the directory contains no ``.csv`` file.
    """
    csv_files = sorted(
        name
        for name in os.listdir(folder_path)
        if name.endswith('.csv') and os.path.isfile(os.path.join(folder_path, name))
    )
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    return os.path.join(folder_path, csv_files[0])

# Read data from the first CSV file in the 'x' folder
csv_file_path = get_first_csv_file('../../data/x')
df = pd.read_csv(csv_file_path)

# The CSV is expected to have a 'text' column; fail early with a clear message otherwise.
if 'text' not in df.columns:
    raise KeyError(f"Expected a 'text' column in {csv_file_path}; found columns: {list(df.columns)}")
texts = df['text'].tolist()

print(f"Loaded {len(texts)} text samples from CSV.")

# Preprocess and tokenize
# NOTE(review): build_vocab / encode_sequences come from the local `functions` module;
# presumably each encoded item is a (seq_length token indices, next-token index) pair — confirm there.
word_to_idx, tokenized_texts = build_vocab(texts)
sequences = encode_sequences(tokenized_texts, word_to_idx, seq_length=4)

print(f"Vocabulary size: {len(word_to_idx)}")
print(f"Number of sequences: {len(sequences)}")

# Seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Create Dataset and DataLoader
dataset = TextDataset(sequences)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the model, loss function, and optimizer
vocab_size = len(word_to_idx)
embed_size = 128
hidden_size = 256
num_layers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

model = GRUWordPredictor(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Starting model training...")

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=10)

print("Model training completed.")

# Test prediction (example sequence)
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
example_words = ['so', 'sad', 'to', 'learn']  # Sequence from data
# Guard the lookup: with a different CSV these words may be absent, and a KeyError
# here would abort the cell only after the whole training run has finished.
missing_words = [word for word in example_words if word not in word_to_idx]
if missing_words:
    print(f"Skipping example prediction; words not in vocabulary: {missing_words}")
else:
    example_sequence = [word_to_idx[word] for word in example_words]
    predicted_word = predict_next_word(model, example_sequence, word_to_idx, idx_to_word)
    print(f'Predicted next word: {predicted_word}')

Loaded 10000 text samples from CSV.
Vocabulary size: 19081
Number of sequences: 85750
Using device: cpu
Starting model training...
tensor([[-0.0534,  0.0292, -0.0948,  ..., -0.0064,  0.0378,  0.0447],
        [-0.1494,  0.0193,  0.0090,  ..., -0.0116,  0.0004,  0.0169],
        [-0.0241,  0.0933, -0.0353,  ...,  0.0609, -0.0525,  0.0170],
        ...,
        [-0.0761, -0.0011,  0.0111,  ..., -0.0441,  0.0716,  0.0361],
        [-0.1903, -0.0013,  0.0024,  ...,  0.0356, -0.0318,  0.0465],
        [-0.1121, -0.0996, -0.0502,  ...,  0.0680,  0.0145, -0.0032]],
       grad_fn=<AddmmBackward0>)
tensor([[-0.0109, -0.0333,  0.0772,  ..., -0.1020, -0.0193,  0.0415],
        [-0.0027,  0.0034,  0.0416,  ..., -0.0959,  0.0368, -0.0672],
        [-0.0583, -0.0087,  0.1026,  ...,  0.0087,  0.0136,  0.0163],
        ...,
        [-0.1601, -0.0029,  0.0411,  ...,  0.0812,  0.0280, -0.0103],
        [-0.0852,  0.0564, -0.0322,  ..., -0.0841,  0.0747, -0.0248],
        [-0.0477,  0.0437,  0.0325,  ..

KeyboardInterrupt: 

In [None]:
# Encode a short prompt as vocabulary indices and show the model's next-word guess.
# Every prompt word must already be a key of word_to_idx.
indata = [word_to_idx[token] for token in "hello how are".split()]
next_word_guess = predict_next_word(model, indata, word_to_idx, idx_to_word)
print(next_word_guess)

In [None]:
import json
# Output locations for the two training artefacts.
# NOTE(review): the weights file name says "lstm" although `model` is a GRUWordPredictor;
# the name is kept as is in case other code loads this exact path.
model_save_path = 'next_word_lstm_model.pth'
vocab_save_path = 'vocabulary.json'

# Save the trained model: only the state_dict (parameters/buffers) is written,
# so loading requires re-creating the model with the same hyperparameters first.
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the vocabulary (word_to_idx dictionary) as JSON
with open(vocab_save_path, 'w') as vocab_file:
    json.dump(word_to_idx, vocab_file)
print(f"Vocabulary saved to {vocab_save_path}")