In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os, sys
import pandas as pd

sys.path.append("..")
from functions import build_vocab, encode_sequences
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [7]:
import re
from collections import Counter
import re


def clean_text(text):
    """Normalize a raw text string before tokenization.

    The text is lowercased, surrounding whitespace is removed, and any run of
    three or more identical characters is shortened to exactly two
    (for example ``"sooo"`` becomes ``"soo"``). Newlines are not collapsed
    because the pattern's ``.`` does not match them.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    # (.)\1+ matches a character followed by one or more copies of itself;
    # the replacement keeps two copies, so doubled letters stay untouched.
    return re.sub(r'(.)\1+', r'\1\1', trimmed)

# Tokenize the text
def tokenize_text(text):
    """Split a string into tokens on runs of whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens; empty when ``text`` is empty or only whitespace.
    """
    tokens = text.split()
    return tokens

def build_vocab(texts, max_vocab_size=1000):
    """Build a word-to-index vocabulary from raw texts.

    Every text is cleaned with ``clean_text`` and split with
    ``tokenize_text``. Tokens that contain ``'https'`` or any
    non-alphanumeric character are not counted towards the vocabulary, but
    they are still present in the returned token lists.

    Args:
        texts: Iterable of raw text strings.
        max_vocab_size: Maximum number of most frequent words to keep.
            ``None`` keeps every counted word. Defaults to 1000.

    Returns:
        A ``(word_to_idx, tokenized_texts)`` tuple. ``word_to_idx`` maps the
        kept words to indices ``1..n`` in descending frequency order, plus
        ``'<PAD>'`` at index 0 and ``'<UNK>'`` at index ``n + 1``.
        ``tokenized_texts`` holds one token list per input text, unfiltered.
    """
    tokenized_texts = [tokenize_text(clean_text(text)) for text in texts]

    word_counts = Counter(
        word
        for tokens in tokenized_texts
        for word in tokens
        if 'https' not in word and all(char.isalnum() for char in word)
    )

    # most_common() breaks ties by first occurrence, which is the same order
    # a stable descending sort over the counts produces.
    top_words = [word for word, _ in word_counts.most_common(max_vocab_size)]

    # Real words are inserted first so the dict (and any JSON dump of it)
    # keeps the same key order as before: words, then <PAD>, then <UNK>.
    word_to_idx = {word: idx for idx, word in enumerate(top_words, start=1)}
    word_to_idx['<PAD>'] = 0  # Padding index
    word_to_idx['<UNK>'] = len(word_to_idx)  # Unknown word token
    return word_to_idx, tokenized_texts

def is_illegal_word(word, vocab=None):
    """Tell whether a token must be excluded from training windows.

    A token is illegal when it contains ``'https'``, contains any
    non-alphanumeric character, or is missing from the vocabulary.

    Args:
        word: The token to check.
        vocab: Mapping (or other container) of known words. When omitted,
            the notebook-level ``word_to_idx`` is used, which keeps existing
            single-argument calls working but requires that global to exist.

    Returns:
        ``True`` if the token is illegal, ``False`` otherwise.
    """
    if vocab is None:
        # Backward-compatible fallback to the notebook's global vocabulary.
        vocab = word_to_idx
    if 'https' in word:
        return True
    if any(not char.isalnum() for char in word):
        return True
    return word not in vocab

def encode_sequences(tokenized_texts, word_to_idx, seq_length=6):
    """Turn token lists into (input indices, target index) training pairs.

    A window of ``seq_length`` consecutive tokens is slid over every text,
    and the token right after the window is the prediction target. A window
    is dropped when any of its tokens, or its target, contains ``'https'``,
    contains a non-alphanumeric character, or is missing from
    ``word_to_idx``.

    The legality check uses the ``word_to_idx`` argument itself, so the
    function does not depend on any notebook-level vocabulary.

    Args:
        tokenized_texts: Iterable of token lists.
        word_to_idx: Mapping from word to integer index.
        seq_length: Number of input tokens per sample. Defaults to 6.

    Returns:
        A list of ``(encoded_seq, encoded_target)`` tuples, where
        ``encoded_seq`` is a list of ``seq_length`` indices and
        ``encoded_target`` is a single index.
    """
    def _usable(word):
        # Same three conditions as the window filter, against the argument.
        return (
            'https' not in word
            and all(char.isalnum() for char in word)
            and word in word_to_idx
        )

    sequences = []
    for tokens in tokenized_texts:
        # Classify and encode every token once; None marks a token that
        # disqualifies any window (or target) it appears in.
        encoded = [word_to_idx[word] if _usable(word) else None for word in tokens]
        # Texts with no token after a full window yield an empty range.
        for end in range(seq_length, len(tokens)):
            window = encoded[end - seq_length:end + 1]  # inputs + target
            if None in window:
                continue
            sequences.append((window[:-1], window[-1]))
    return sequences

# Create custom Dataset
class TextDataset(Dataset):
    """Dataset over pre-encoded ``(sequence, target)`` pairs.

    Each item is returned as a pair of tensors built from the stored
    sequence and target.
    """

    def __init__(self, sequences):
        # Indexable collection of (encoded_seq, encoded_target) pairs.
        self.sequences = sequences

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``(sequence_tensor, target_tensor)``."""
        encoded_seq, encoded_target = self.sequences[idx]
        sequence_tensor = torch.tensor(encoded_seq)
        target_tensor = torch.tensor(encoded_target)
        return sequence_tensor, target_tensor

# Define GRU model
class NextWordGRU(nn.Module):
    """Next-word predictor: embedding -> GRU -> linear layer over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores per sample.
        embed_size: Embedding dimension fed into the GRU.
        hidden_size: GRU hidden state size.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.gru = nn.GRU(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return one score per vocabulary entry for each sequence in ``x``.

        ``x`` holds token indices; with ``batch_first=True`` it is laid out
        as (batch, sequence position).
        """
        embedded = self.embedding(x)
        step_outputs, _ = self.gru(embedded)
        # Only the output at the final sequence position feeds the classifier.
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and print progress.

    Batches are moved to the device that holds the model's parameters, so
    the function does not depend on a notebook-level ``device`` variable.
    Progress is printed every 100 batches and once per epoch.

    Args:
        model: The ``nn.Module`` to train.
        train_loader: Sized iterable yielding ``(sequences, targets)`` tensor
            batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``. Defaults to 10.

    Returns:
        None. The model is left in training mode.
    """
    # Follow the model's own placement; fall back to CPU for a model that
    # has no parameters at all.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    num_batches = len(train_loader)
    for epoch in range(num_epochs):
        total_loss = 0.0
        batches_seen = 0
        for batch_idx, (sequences, targets) in enumerate(train_loader):
            sequences = sequences.to(target_device)
            targets = targets.to(target_device)

            # Forward pass
            outputs = model(sequences)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batches_seen += 1

            # Log progress every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{num_batches}], Loss: {loss.item():.4f}')

        # Log epoch summary; an empty loader reports nan instead of raising
        # ZeroDivisionError.
        avg_loss = total_loss / batches_seen if batches_seen else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}] completed, Average Loss: {avg_loss:.4f}')

# Predict the next word
def predict_next_word(model, sequence, word_to_idx, idx_to_word):
    """Return the word the model scores highest as the next token.

    The input is moved to the device that holds the model's parameters, so
    the function does not depend on a notebook-level ``device`` variable.

    Args:
        model: Trained ``nn.Module`` mapping a batch of index sequences to
            per-word scores.
        sequence: Sequence of integer word indices forming the prompt.
        word_to_idx: Unused; kept so existing callers keep working.
        idx_to_word: Mapping from index back to word.

    Returns:
        The word whose index has the highest score.

    Note:
        The model is switched to eval mode and left that way.
    """
    model.eval()

    # Follow the model's own placement; fall back to CPU for a model that
    # has no parameters at all.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Explicit integer dtype: embedding lookups need integer indices, and
    # as_tensor also accepts an existing tensor without a copy warning.
    batch = torch.as_tensor(sequence, dtype=torch.long, device=target_device).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():
        output = model(batch)
        predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_word[predicted_idx]

def get_first_csv_file(folder_path):
    """Return the path of the alphabetically first CSV file in a folder.

    ``os.listdir`` returns entries in arbitrary order, so the candidates are
    compared by name to make "first" deterministic.

    Args:
        folder_path: Directory to search (not recursive).

    Returns:
        ``folder_path`` joined with the smallest file name ending in ``.csv``.

    Raises:
        FileNotFoundError: If the folder contains no ``.csv`` files.
    """
    csv_files = [name for name in os.listdir(folder_path) if name.endswith('.csv')]
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")
    return os.path.join(folder_path, min(csv_files))

# Read data from the first 3 CSV files (by sorted file name) in the 'x' folder
folder_path = '../../data/x'
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
csv_files = sorted(csv_files)[:3]  # Take the first 3 files

# Fail early with a clear message: pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

dfs = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# Concatenate all dataframes into one with a fresh 0..n-1 index
df = pd.concat(dfs, ignore_index=True)

# Assuming the CSV has a 'text' column. Adjust if the column name is different.
texts = df['text'].tolist()

print(f"Loaded {len(texts)} text samples from CSV.")

# Build the vocabulary, then turn every text into fixed-length training
# windows of 4 input words plus the following target word.
word_to_idx, tokenized_texts = build_vocab(texts)
sequences = encode_sequences(tokenized_texts, word_to_idx, seq_length=4)

print(f"Vocabulary size: {len(word_to_idx)}")
print(f"Number of sequences: {len(sequences)}")


Loaded 30000 text samples from CSV.
Vocabulary size: 1002
Number of sequences: 47888


In [8]:

# Wrap the encoded sequences in a Dataset and serve them as shuffled
# mini-batches of 32.
dataset = TextDataset(sequences)
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

# Model hyperparameters
vocab_size = len(word_to_idx)
embed_size, hidden_size, num_layers = 128, 256, 2

# Use the GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")

# Model, loss function, and optimizer
model = NextWordGRU(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

print("Starting model training...")

# Train the model
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)

Using device: cpu
Starting model training...
Epoch [1/10], Batch [100/1497], Loss: 5.7747
Epoch [1/10], Batch [200/1497], Loss: 6.0244
Epoch [1/10], Batch [300/1497], Loss: 5.1599
Epoch [1/10], Batch [400/1497], Loss: 5.2224
Epoch [1/10], Batch [500/1497], Loss: 4.7782
Epoch [1/10], Batch [600/1497], Loss: 4.7490
Epoch [1/10], Batch [700/1497], Loss: 4.3544
Epoch [1/10], Batch [800/1497], Loss: 4.4572
Epoch [1/10], Batch [900/1497], Loss: 4.2286
Epoch [1/10], Batch [1000/1497], Loss: 4.4709
Epoch [1/10], Batch [1100/1497], Loss: 5.5663
Epoch [1/10], Batch [1200/1497], Loss: 4.9151
Epoch [1/10], Batch [1300/1497], Loss: 5.3888
Epoch [1/10], Batch [1400/1497], Loss: 4.2341
Epoch [1/10] completed, Average Loss: 4.9469
Epoch [2/10], Batch [100/1497], Loss: 4.0155
Epoch [2/10], Batch [200/1497], Loss: 4.7878
Epoch [2/10], Batch [300/1497], Loss: 4.3421
Epoch [2/10], Batch [400/1497], Loss: 3.8245
Epoch [2/10], Batch [500/1497], Loss: 4.6494
Epoch [2/10], Batch [600/1497], Loss: 3.5959
Epoch

In [11]:
import json

# Write the vocabulary (word_to_idx dictionary) to disk as JSON
vocab_save_path = 'vocabulary.json'
with open(vocab_save_path, 'w') as vocab_file:
    vocab_file.write(json.dumps(word_to_idx))
print(f"Vocabulary saved to {vocab_save_path}")

# Save the model. This stores the model object itself rather than
# model.state_dict().
model_save_path = 'model.pth'
torch.save(model, model_save_path)
print(f"Model saved to {model_save_path}")

Vocabulary saved to vocabulary.json
Model saved to model.pth


In [12]:
# Invert the vocabulary so a predicted index can be turned back into a word.
idx_to_word = {index: token for token, index in word_to_idx.items()}

# Encode a three-word prompt; every word must already be in the vocabulary,
# otherwise the lookup raises KeyError.
indata = [word_to_idx[token] for token in "hello how are".split()]
next_word = predict_next_word(model, indata, word_to_idx, idx_to_word)
print(next_word)

you


In [13]:
# Invert the vocabulary so encoded samples can be displayed as words.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Print up to 100 training samples starting at offset 1000. The count is
# clamped to the dataset size so a smaller corpus prints fewer (or no)
# samples instead of raising IndexError.
preview_start = 1000
preview_count = max(0, min(100, len(dataset) - preview_start))
for i in range(preview_count):
    sequence, target = dataset[i + preview_start]
    sequence_words = [idx_to_word[idx.item()] for idx in sequence]
    target_word = idx_to_word[target.item()]
    print(f"Sequence {i + 1}: {' '.join(sequence_words)} -> {target_word}")

Sequence 1: hate that i keep -> missing
Sequence 2: that i keep missing -> out
Sequence 3: i keep missing out -> on
Sequence 4: keep missing out on -> the
Sequence 5: missing out on the -> fun
Sequence 6: out on the fun -> cuz
Sequence 7: on the fun cuz -> of
Sequence 8: the fun cuz of -> the
Sequence 9: fun cuz of the -> time
Sequence 10: this is exactly how -> it
Sequence 11: is exactly how it -> feels
Sequence 12: exactly how it feels -> wearing
Sequence 13: how it feels wearing -> a
Sequence 14: definitely isnt the word -> i
Sequence 15: i love money 2 -> though
Sequence 16: gonna watch it then -> sleep
Sequence 17: in one of those -> days
Sequence 18: one of those days -> when
Sequence 19: of those days when -> i
Sequence 20: those days when i -> really
Sequence 21: days when i really -> just
Sequence 22: when i really just -> want
Sequence 23: i really just want -> to
Sequence 24: really just want to -> work
Sequence 25: just want to work -> from
Sequence 26: it a game or -> just