In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import re
import os
import pandas as pd
import time

# Preprocessing: Clean and Tokenize Text Data
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+", "", text)  # Remove user names
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    text = text.lower().strip()  # Lowercase and strip whitespaces
    return text

# Tokenize the text
def tokenize_text(text):
    return text.split()

# Build a vocabulary and tokenize the dataset
def build_vocab(texts):
    tokenized_texts = [tokenize_text(clean_text(text)) for text in texts]
    all_words = [word for text in tokenized_texts for word in text]
    word_counts = Counter(all_words)
    sorted_words = sorted(word_counts, key=word_counts.get, reverse=True)
    
    # Create a mapping from word to index
    word_to_idx = {word: idx+1 for idx, word in enumerate(sorted_words)}
    word_to_idx['<PAD>'] = 0  # Padding index
    return word_to_idx, tokenized_texts

# Convert sequences of words to sequences of integers
def encode_sequences(tokenized_texts, word_to_idx, seq_length=4):
    sequences = []
    for tokens in tokenized_texts:
        if len(tokens) < seq_length:
            continue
        for i in range(seq_length, len(tokens)):
            seq = tokens[i-seq_length:i]  # Input sequence of words
            target = tokens[i]  # Target word (next word)
            encoded_seq = [word_to_idx[word] for word in seq]
            encoded_target = word_to_idx[target]
            sequences.append((encoded_seq, encoded_target))
    return sequences




In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
import pandas as pd

# Create custom Dataset for character sequences
class CharDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        sequence, target = self.sequences[idx]
        return torch.tensor(sequence), torch.tensor(target)

# Define LSTM model for character-level prediction
class NextCharLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(NextCharLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.layer_norm = nn.LayerNorm(hidden_size)  # Layer normalization for stability
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, lengths=None):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        lstm_out = torch.mean(lstm_out, dim=1)  # Mean pooling across the sequence
        lstm_out = self.layer_norm(lstm_out)
        out = self.fc(lstm_out)
        return out

# Training loop with gradient clipping and learning rate scheduling
def train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            sequences, targets = sequences.to(device), targets.to(device)

            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()

            # Apply gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()

            # Log progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        # Log epoch summary
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

        # Step the learning rate scheduler
        scheduler.step()

# Predict the next character
def predict_next_char(model, sequence, char_to_idx, idx_to_char):
    model.eval()
    sequence = torch.tensor(sequence).unsqueeze(0).to(device)  # Add batch dimension
    with torch.no_grad():
        output = model(sequence)
        predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_char[predicted_idx]

# Build character-level vocabulary
def build_char_vocab(texts):
    all_text = ''.join(texts)
    chars = sorted(set(all_text))
    char_to_idx = {ch: i for i, ch in enumerate(chars)}
    idx_to_char = {i: ch for i, ch in enumerate(chars)}
    return char_to_idx, idx_to_char

# Encode sequences as character indices
def encode_char_sequences(texts, char_to_idx, seq_length=100):
    sequences = []
    for text in texts:
        encoded_text = [char_to_idx[char] for char in text]
        for i in range(0, len(encoded_text) - seq_length):
            input_seq = encoded_text[i:i + seq_length]
            target = encoded_text[i + seq_length]
            sequences.append((input_seq, target))
    return sequences

# Read data from all text files in the 'x' folder
def get_all_text_files(folder_path):
    text_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    if not text_files:
        raise FileNotFoundError(f"No text files found in {folder_path}")
    return [os.path.join(folder_path, f) for f in text_files]

# Load text data
text_file_paths = get_all_text_files('/home/m/dev/scaleout/data/x')
texts = []
max_files = 1

for file_path in text_file_paths[:max_files]:
    text_data = pd.read_csv(file_path)
    texts.extend(text_data['text'].tolist())

# Remove empty lines and strip whitespace
texts = [text.strip() for text in texts if text.strip()]

# Build character-level vocabulary and encode sequences
char_to_idx, idx_to_char = build_char_vocab(texts)
sequences = encode_char_sequences(texts, char_to_idx, seq_length=100)  # Use character sequences of length 100

# Create Dataset and DataLoader
dataset = CharDataset(sequences)
train_loader = DataLoader(dataset, batch_size=4, shuffle=True)

# Model parameters
vocab_size = len(char_to_idx)
embed_size = 128
hidden_size = 256
num_layers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model
model = NextCharLSTM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.9)

# Train the model
train_model(model, train_loader, criterion, optimizer, scheduler, num_epochs=10)

# Test prediction (example sequence)
example_sequence = [char_to_idx[ch] for ch in "hello there, how d"]
predicted_char = predict_next_char(model, example_sequence, char_to_idx, idx_to_char)
print(f'Predicted next character: {predicted_char}')


Epoch [1/10], Batch [100/15033], Loss: 4.0246
Epoch [1/10], Batch [200/15033], Loss: 4.7928
Epoch [1/10], Batch [300/15033], Loss: 3.7313
Epoch [1/10], Batch [400/15033], Loss: 3.4175
Epoch [1/10], Batch [500/15033], Loss: 3.8200
Epoch [1/10], Batch [600/15033], Loss: 2.7125
Epoch [1/10], Batch [700/15033], Loss: 3.2242
Epoch [1/10], Batch [800/15033], Loss: 3.0812
Epoch [1/10], Batch [900/15033], Loss: 3.1804
Epoch [1/10], Batch [1000/15033], Loss: 2.9325
Epoch [1/10], Batch [1100/15033], Loss: 4.0897
Epoch [1/10], Batch [1200/15033], Loss: 3.5993
Epoch [1/10], Batch [1300/15033], Loss: 2.9626
Epoch [1/10], Batch [1400/15033], Loss: 2.8005
Epoch [1/10], Batch [1500/15033], Loss: 3.3586
Epoch [1/10], Batch [1600/15033], Loss: 3.8237
Epoch [1/10], Batch [1700/15033], Loss: 3.6323
Epoch [1/10], Batch [1800/15033], Loss: 2.5344
Epoch [1/10], Batch [1900/15033], Loss: 3.8406
Epoch [1/10], Batch [2000/15033], Loss: 3.2437
Epoch [1/10], Batch [2100/15033], Loss: 3.3370
Epoch [1/10], Batch [2

In [18]:
example_sequence = [char_to_idx[ch] for ch in "hello there, how"]
predicted_text = ""
current_sequence = example_sequence.copy()

while True:
    predicted_char = predict_next_char(model, current_sequence, char_to_idx, idx_to_char)
    predicted_text += predicted_char
    
    if predicted_char in [' ', '.', '!', '?']:
        break
    
    current_sequence = current_sequence[1:] + [char_to_idx[predicted_char]]

print(f'Predicted text: {predicted_text}')


Predicted text:  


In [None]:
import json
# Save the trained model
model_save_path = 'next_word_lstm_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Save the vocabulary (word_to_idx dictionary)
vocab_save_path = 'vocabulary.json'
with open(vocab_save_path, 'w') as f:
    json.dump(word_to_idx, f)
print(f"Vocabulary saved to {vocab_save_path}")
print(f"Vocabulary Length: {len(word_to_idx)}")

Model saved to next_word_lstm_model.pth
Vocabulary saved to vocabulary.json
Vocabulary Length: 56116
