In [6]:
# !!!VERY BAD DON*T USE THIS!!!
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import re
import os
import pandas as pd
from transformers import BertTokenizer, BertModel

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocessing: Clean Text Data
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters and numbers
    text = text.lower().strip()  # Lowercase and strip whitespaces
    return text

# Tokenize and encode the text using BERT tokenizer
def tokenize_and_encode(text, max_length=128):
    cleaned_text = clean_text(text)
    encoded = tokenizer.encode_plus(
        cleaned_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoded['input_ids'].squeeze(), encoded['attention_mask'].squeeze()

# Create sequences for next word prediction within a single text
def create_sequences(input_ids, attention_mask, seq_length=4):
    sequences = []
    for i in range(seq_length, len(input_ids)):
        seq_input_ids = input_ids[i-seq_length:i]
        seq_attention_mask = attention_mask[i-seq_length:i]
        target = input_ids[i]
        if target not in [tokenizer.pad_token_id, tokenizer.sep_token_id, tokenizer.cls_token_id]:
            sequences.append((seq_input_ids, seq_attention_mask, target))
    return sequences

# Create custom Dataset
class TextDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        input_ids, attention_mask, target = self.sequences[idx]
        return input_ids, attention_mask, target

# Define LSTM model with BERT embeddings
class NextWordLSTM(nn.Module):
    def __init__(self, hidden_size, num_layers):
        super(NextWordLSTM, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(768, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, tokenizer.vocab_size)
    
    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            bert_outputs = self.bert(input_ids, attention_mask=attention_mask)
        bert_embeddings = bert_outputs.last_hidden_state
        lstm_out, _ = self.lstm(bert_embeddings)
        lstm_out = lstm_out[:, -1, :]  # Take the output of the last LSTM cell
        out = self.fc(lstm_out)
        return out

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10, max_batches=100000000):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch_idx, (input_ids, attention_mask, targets) in enumerate(train_loader):
            if batch_idx >= max_batches:
                break
            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            
            # Forward pass
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, targets)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            
            if batch_idx % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        
        # Log epoch summary
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

# Predict the next word
def predict_next_word(model, sequence, attention_mask):
    model.eval()
    with torch.no_grad():
        output = model(sequence.unsqueeze(0).to(device), attention_mask.unsqueeze(0).to(device))
        predicted_idx = torch.argmax(output, dim=1).item()
    predicted_word = tokenizer.decode([predicted_idx])
    return predicted_word if predicted_word != '[PAD]' else 'Unable to predict'

def get_first_csv_file(folder_path):
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    return os.path.join(folder_path, csv_files[0])

# Read data from the first CSV file in the 'x' folder
csv_file_path = get_first_csv_file('../../data/x')
df = pd.read_csv(csv_file_path)

# Assuming the CSV has a 'text' column. Adjust if the column name is different.
texts = df['text'].tolist()

print(f"Loaded {len(texts)} text samples from CSV.")

# Preprocess and tokenize
sequences = []
for text in texts:
    input_ids, attention_mask = tokenize_and_encode(text)
    sequences.extend(create_sequences(input_ids, attention_mask))

print(f"Number of sequences: {len(sequences)}")


Loaded 10000 text samples from CSV.
Number of sequences: 131024


In [7]:
# Create Dataset and DataLoader
dataset = TextDataset(sequences)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)


# Define the model, loss function, and optimizer
hidden_size = 256
num_layers = 2
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f"Using device: {device}")

Using device: cpu


In [8]:

model = NextWordLSTM(hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print("Starting model training...")

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=3, max_batches=1000)

print("Model training completed.")

Starting model training...
Epoch [1/3], Batch [1/4095], Loss: 10.3150
Epoch [1/3], Batch [101/4095], Loss: 6.2973
Epoch [1/3], Batch [201/4095], Loss: 7.7169
Epoch [1/3], Batch [301/4095], Loss: 7.8870
Epoch [1/3], Batch [401/4095], Loss: 7.3806
Epoch [1/3], Batch [501/4095], Loss: 7.3208
Epoch [1/3], Batch [601/4095], Loss: 8.0329
Epoch [1/3], Batch [701/4095], Loss: 6.6193
Epoch [1/3], Batch [801/4095], Loss: 6.6307
Epoch [1/3], Batch [901/4095], Loss: 6.8697
Epoch [1/3] completed, Average Loss: 1.7862
Epoch [2/3], Batch [1/4095], Loss: 6.9962
Epoch [2/3], Batch [101/4095], Loss: 7.0526
Epoch [2/3], Batch [201/4095], Loss: 7.0492
Epoch [2/3], Batch [301/4095], Loss: 6.6422
Epoch [2/3], Batch [401/4095], Loss: 7.1796
Epoch [2/3], Batch [501/4095], Loss: 7.1068
Epoch [2/3], Batch [601/4095], Loss: 7.2947
Epoch [2/3], Batch [701/4095], Loss: 7.3314
Epoch [2/3], Batch [801/4095], Loss: 7.2943
Epoch [2/3], Batch [901/4095], Loss: 6.1930
Epoch [2/3] completed, Average Loss: 1.7017
Epoch [3

In [12]:

# Test prediction (example sequence)
example_text = "so sad to learn"
input_ids, attention_mask = tokenize_and_encode(example_text)
predicted_word = predict_next_word(model, input_ids, attention_mask)
print(f'Predicted next word: {predicted_word}')

indata = "Hello my name is"
input_ids, attention_mask = tokenize_and_encode(indata)
print(predict_next_word(model, input_ids, attention_mask))

Predicted next word: i
i
