In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import re
import os
import pandas as pd

# Preprocessing: Clean and Tokenize Text Data
def clean_text(text):
    """Normalize a raw text sample before tokenization.

    URLs (anything starting with ``http`` up to the next whitespace) are
    removed, every character that is not an ASCII letter or whitespace is
    dropped, and the result is lowercased and stripped.

    Args:
        text: The raw sample. Missing values (``None`` or a float NaN, which
            is what pandas produces for empty CSV cells) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, possibly empty.
    """
    if not isinstance(text, str):
        # `text != text` is true only for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    return text.lower().strip()  # Lowercase and strip whitespaces

# Tokenize the text
def tokenize_text(text):
    """Split ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

# Build a vocabulary and tokenize the dataset
def build_vocab(texts):
    """Tokenize every text and index the resulting words by frequency.

    Each text is cleaned and split into words. Words are numbered from 1 in
    descending order of frequency (words with equal counts keep the order in
    which they were first seen); index 0 is assigned to the '<PAD>' token.

    Args:
        texts: Iterable of raw text samples.

    Returns:
        A ``(word_to_idx, tokenized_texts)`` tuple: the word -> index mapping
        and the list of token lists, one per input text.
    """
    tokenized_texts = []
    frequencies = Counter()
    for raw in texts:
        words = tokenize_text(clean_text(raw))
        tokenized_texts.append(words)
        frequencies.update(words)

    # most_common() sorts by descending count and is stable for ties.
    word_to_idx = {
        word: rank
        for rank, (word, _count) in enumerate(frequencies.most_common(), start=1)
    }
    word_to_idx['<PAD>'] = 0  # Padding index
    return word_to_idx, tokenized_texts

# Convert sequences of words to sequences of integers
def encode_sequences(tokenized_texts, word_to_idx, seq_length=4):
    """Turn token lists into (context indices, next-word index) pairs.

    A window of ``seq_length`` consecutive words is slid over each text; every
    window is paired with the word that immediately follows it, and both are
    mapped to integers through ``word_to_idx``.

    Args:
        tokenized_texts: Iterable of token lists.
        word_to_idx: Mapping from word to integer index.
        seq_length: Number of context words per sample.

    Returns:
        A list of ``(encoded_context, encoded_target)`` tuples, where
        ``encoded_context`` is a list of ``seq_length`` indices.
    """
    encoded_pairs = []
    for words in tokenized_texts:
        # Texts shorter than the window are skipped outright; a text of exactly
        # seq_length words yields nothing because no word follows the window.
        if len(words) >= seq_length:
            for end in range(seq_length, len(words)):
                context_ids = [word_to_idx[w] for w in words[end - seq_length:end]]
                target_id = word_to_idx[words[end]]
                encoded_pairs.append((context_ids, target_id))
    return encoded_pairs




In [4]:
# Create custom Dataset
class TextDataset(Dataset):
    """Dataset over pre-encoded ``(context indices, target index)`` pairs."""

    def __init__(self, sequences):
        # Indexable collection of (sequence, target) pairs.
        self.sequences = sequences

    def __len__(self):
        """Number of stored pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return pair ``idx`` as a ``(context tensor, target tensor)`` tuple."""
        context_ids, target_id = self.sequences[idx]
        context_tensor = torch.tensor(context_ids)
        target_tensor = torch.tensor(target_id)
        return context_tensor, target_tensor

# Define LSTM model
class NextWordLSTM(nn.Module):
    """Next-word classifier: embedding -> stacked LSTM -> linear layer.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embed_size: Dimension of each word embedding.
        hidden_size: Hidden state size of the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map a batch of index sequences (batch, seq_len) to logits (batch, vocab_size)."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        # Only the output at the last time step feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Each batch is moved to the device the model's parameters live on, so the
    function does not depend on any module-level ``device`` variable. Progress
    is printed every 100 batches, and the average loss is printed after each
    epoch.

    Args:
        model: The ``nn.Module`` to optimize; it is switched to training mode.
        train_loader: Sized iterable yielding ``(sequences, targets)`` tensor
            batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` contains no batches (previously this
            surfaced as a ZeroDivisionError when averaging the loss).
    """
    num_batches = len(train_loader)
    if num_batches == 0:
        raise ValueError("train_loader is empty; there is nothing to train on")

    # Follow the model's own device; a parameter-less model leaves batches where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            if target_device is not None:
                sequences, targets = sequences.to(target_device), targets.to(target_device)

            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Log progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {loss.item():.4f}')

        # Log epoch summary
        avg_loss = total_loss / num_batches
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

# Predict the next word
def predict_next_word(model, sequence, word_to_idx, idx_to_word):
    """Return the word the model scores highest as the continuation of ``sequence``.

    The input is moved to the device the model's parameters live on, so the
    function does not depend on any module-level ``device`` variable. The
    model is switched to evaluation mode and left in that mode.

    Args:
        model: Trained module mapping a (1, seq_len) index tensor to logits of
            shape (1, vocab_size).
        sequence: List of integer word indices forming the context.
        word_to_idx: Unused; kept so existing callers keep working.
        idx_to_word: Mapping from index to word used to decode the prediction.

    Returns:
        The decoded word for the highest-scoring index.

    Raises:
        KeyError: If the predicted index is missing from ``idx_to_word``.
    """
    model.eval()
    batch = torch.tensor(sequence).unsqueeze(0)  # Add batch dimension
    # Follow the model's own device; a parameter-less model leaves the input where it is.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        batch = batch.to(first_param.device)
    with torch.no_grad():
        output = model(batch)
        predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_word[predicted_idx]

def get_first_csv_file(folder_path):
    """Return the path of the alphabetically first ``.csv`` file in a folder.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    to make the choice deterministic when the folder holds several CSV files.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the selected file name.

    Raises:
        FileNotFoundError: If the folder contains no name ending in ``.csv``
            (also raised by ``os.listdir`` if the folder itself is missing).
    """
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    return os.path.join(folder_path, csv_files[0])

# Locate and load the training corpus: the first CSV file found in the 'x' folder
# (a path relative to the current working directory).
csv_file_path = get_first_csv_file('x')
df = pd.read_csv(csv_file_path)

# The corpus is read from the 'text' column; adjust if the column name is different.
texts = df['text'].tolist()

print(f"Loaded {len(texts)} text samples from CSV.")

# Build the vocabulary from all texts, then turn each text into
# (4-word context, next word) training pairs encoded as integer indices.
word_to_idx, tokenized_texts = build_vocab(texts)
sequences = encode_sequences(tokenized_texts, word_to_idx, seq_length=4)

print(f"Vocabulary size: {len(word_to_idx)}")
print(f"Number of sequences: {len(sequences)}")

# Wrap the pairs in a Dataset and serve them in shuffled mini-batches of 32.
dataset = TextDataset(sequences)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Model hyperparameters. vocab_size counts the '<PAD>' entry as well, and is used
# both as the embedding table size and as the number of output classes.
vocab_size = len(word_to_idx)
embed_size = 128
hidden_size = 256
num_layers = 2
# Prefer the GPU when CUDA is available; the model below is moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

# Cross-entropy over the vocabulary logits, optimized with Adam (learning rate 0.001).
model = NextWordLSTM(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Starting model training...")

# Train the model for 10 full passes over the data; progress is printed by train_model.
train_model(model, train_loader, criterion, optimizer, num_epochs=10)

print("Model training completed.")

# Test prediction on a hand-picked 4-word context. Each lookup raises KeyError
# if the word is absent from the vocabulary built above.
example_sequence = [word_to_idx['so'], word_to_idx['sad'], word_to_idx['to'], word_to_idx['learn']]  # Words expected to occur in the data
# Inverse mapping (index -> word), needed to decode the predicted index.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}
predicted_word = predict_next_word(model, example_sequence, word_to_idx, idx_to_word)
print(f'Predicted next word: {predicted_word}')

Loaded 10000 text samples from CSV.
Vocabulary size: 16865
Number of sequences: 93724
Using device: cpu
Starting model training...
Epoch [1/10], Batch [100/2929], Loss: 7.8576
Epoch [1/10], Batch [200/2929], Loss: 7.4485
Epoch [1/10], Batch [300/2929], Loss: 6.7471
Epoch [1/10], Batch [400/2929], Loss: 6.2216
Epoch [1/10], Batch [500/2929], Loss: 6.8894
Epoch [1/10], Batch [600/2929], Loss: 6.3081
Epoch [1/10], Batch [700/2929], Loss: 6.5213
Epoch [1/10], Batch [800/2929], Loss: 6.5693
Epoch [1/10], Batch [900/2929], Loss: 6.8036
Epoch [1/10], Batch [1000/2929], Loss: 6.7776
Epoch [1/10], Batch [1100/2929], Loss: 6.8371
Epoch [1/10], Batch [1200/2929], Loss: 7.0833
Epoch [1/10], Batch [1300/2929], Loss: 6.0505
Epoch [1/10], Batch [1400/2929], Loss: 6.7945
Epoch [1/10], Batch [1500/2929], Loss: 5.3987
Epoch [1/10], Batch [1600/2929], Loss: 7.6319
Epoch [1/10], Batch [1700/2929], Loss: 6.8224
Epoch [1/10], Batch [1800/2929], Loss: 6.8201
Epoch [1/10], Batch [1900/2929], Loss: 6.4734
Epoc

In [12]:
# Encode the prompt word by word (a word missing from the vocabulary raises
# KeyError) and print the model's predicted next word.
indata = [word_to_idx[token] for token in "hello how are".split()]
print(predict_next_word(model, indata, word_to_idx, idx_to_word))


i
