Set up for data

In [2]:
import wget, os, gzip, pickle, random, re, sys

IMDB_URL = 'http://dlvu.github.io/data/imdb.{}.pkl.gz'
IMDB_FILE = 'imdb.{}.pkl.gz'

PAD, START, END, UNK = '.pad', '.start', '.end', '.unk'

def load_imdb(final=False, val=5000, seed=0, voc=None, char=False):

    cst = 'char' if char else 'word'

    imdb_url = IMDB_URL.format(cst)
    imdb_file = IMDB_FILE.format(cst)

    if not os.path.exists(imdb_file):
        wget.download(imdb_url)

    with gzip.open(imdb_file) as file:
        sequences, labels, i2w, w2i = pickle.load(file)

    if voc is not None and voc < len(i2w):
        nw_sequences = {}

        i2w = i2w[:voc]
        w2i = {w: i for i, w in enumerate(i2w)}

        mx, unk = voc, w2i['.unk']
        for key, seqs in sequences.items():
            nw_sequences[key] = []
            for seq in seqs:
                seq = [s if s < mx else unk for s in seq]
                nw_sequences[key].append(seq)

        sequences = nw_sequences

    if final:
        return (sequences['train'], labels['train']), (sequences['test'], labels['test']), (i2w, w2i), 2

    # Make a validation split
    random.seed(seed)

    x_train, y_train = [], []
    x_val, y_val = [], []

    val_ind = set( random.sample(range(len(sequences['train'])), k=val) )
    for i, (s, l) in enumerate(zip(sequences['train'], labels['train'])):
        if i in val_ind:
            x_val.append(s)
            y_val.append(l)
        else:
            x_train.append(s)
            y_train.append(l)

    return (x_train, y_train), \
           (x_val, y_val), \
           (i2w, w2i), 2


def gen_sentence(sent, g):

    symb = '_[a-z]*'

    while True:

        match = re.search(symb, sent)
        if match is None:
            return sent

        s = match.span()
        sent = sent[:s[0]] + random.choice(g[sent[s[0]:s[1]]]) + sent[s[1]:]

def gen_dyck(p):
    open = 1
    sent = '('
    while open > 0:
        if random.random() < p:
            sent += '('
            open += 1
        else:
            sent += ')'
            open -= 1

    return sent

def gen_ndfa(p):

    word = random.choice(['abc!', 'uvw!', 'klm!'])

    s = ''
    while True:
        if random.random() < p:
            return 's' + s + 's'
        else:
            s+= word

def load_brackets(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='dyck')

def load_ndfa(n=50_000, seed=0):
    return load_toy(n, char=True, seed=seed, name='ndfa')

def load_toy(n=50_000, char=True, seed=0, name='lang'):

    random.seed(0)

    if name == 'lang':
        sent = '_s'

        toy = {
            '_s': ['_s _adv', '_np _vp', '_np _vp _prep _np', '_np _vp ( _prep _np )', '_np _vp _con _s' , '_np _vp ( _con _s )'],
            '_adv': ['briefly', 'quickly', 'impatiently'],
            '_np': ['a _noun', 'the _noun', 'a _adj _noun', 'the _adj _noun'],
            '_prep': ['on', 'with', 'to'],
            '_con' : ['while', 'but'],
            '_noun': ['mouse', 'bunny', 'cat', 'dog', 'man', 'woman', 'person'],
            '_vp': ['walked', 'walks', 'ran', 'runs', 'goes', 'went'],
            '_adj': ['short', 'quick', 'busy', 'nice', 'gorgeous']
        }

        sentences = [ gen_sentence(sent, toy) for _ in range(n)]
        sentences.sort(key=lambda s : len(s))

    elif name == 'dyck':

        sentences = [gen_dyck(7./16.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    elif name == 'ndfa':

        sentences = [gen_ndfa(1./4.) for _ in range(n)]
        sentences.sort(key=lambda s: len(s))

    else:
        raise Exception(name)

    tokens = set()
    for s in sentences:

        if char:
            for c in s:
                tokens.add(c)
        else:
            for w in s.split():
                tokens.add(w)

    i2t = [PAD, START, END, UNK] + list(tokens)
    t2i = {t:i for i, t in enumerate(i2t)}

    sequences = []
    for s in sentences:
        if char:
            tok = list(s)
        else:
            tok = s.split()
        sequences.append([t2i[t] for t in tok])

    return sequences, (i2t, t2i)

Load  NDFA  data

In [3]:
import torch
import torch.nn as nn 
import numpy as np

In [4]:
x_train, (i2w, w2i) = load_ndfa(n=150_000)

In [5]:
len(x_train)

150000

Batching and Padding 

In [96]:
def batch(x_train):
    # batching 
    x_batches = []
    # cut of value for batches -> batches are created with sequences that contain a max diff of 100
    batch_buffer = 15 
    # key for batching -? [index, current seq length]
    start = [0, len(x_train[0])] 
    # batch
    for i, val in enumerate(x_train):
        # if seq length is greater than batch_buffer create batch 
        if len(val) - start[1] > batch_buffer:
            # create batch
            x_batches.append(x_train[start[0] : i])
            # update index and current seq length
            start[0] = i
            start[1] = len(val)
    
    return x_batches

def padding(x_batches):
    # padded batches 
    px_batches = []
    py_batches = []
    # apply padding per batch
    for batch in x_batches:
        xp_batch = [] # current patted batch
        yp_batch = []

        # get maximal seq length for current batch
        max_size = max(len(seq) for seq in batch)
        # loop over seq in batch
        for seq in batch:
            # apply padding, start and ending char to seq and apped
            xp_batch.append([1] + seq + [0]*(max_size - len(seq)) + [2])
            yp_batch.append([0] +  seq[1:] + [0]*(max_size - len(seq)) + [2, 0])

            # yp_batch.append([0] + seq + [0]*(max_size - len(seq) ) + [2])
            
        # append padded batch to padded batches
        px_batches.append(xp_batch)
        py_batches.append(yp_batch)
    
    return px_batches, py_batches

In [94]:
# get padded batched train
x_padbatches, y_padbatches = padding(batch(x_train)) 
xtens_pb = [torch.tensor(batch) for batch in x_padbatches]
ytens_pb = [torch.tensor(batch) for batch in y_padbatches]
data_set = [(x, y) for x,y in zip(xtens_pb, ytens_pb)]

### Autoregressive LSTM 

In [8]:
class autoLSTM(nn.Module):
    def __init__(self, embedding_size, hidden_size, embedding_dim, num_layers = 1): 
        #  input_size, hidden_size, num_layers, num_classes):
        super(autoLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.emb = nn.Embedding(embedding_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers= num_layers, batch_first=True)
        self.lin1 = nn.Linear(hidden_size, embedding_size)

    def forward(self, x, states):
        # create emebeddings
        x_emb = self.emb(x)

        # pass through rnn
        out, _ = self.rnn(x_emb, states)

        # predict
        out = self.lin1(out) 

        return out

In [67]:
def train_autoreg(xtens_pb, ytens_pb, embedding_size, hidden_size, epochs, alpha, num_layers, opt = "sgd"):
    batch_size = len(xtens_pb)
    embedding_dim = 150

    # inti network
    rnn = autoLSTM(embedding_size, hidden_size, embedding_dim, num_layers = num_layers)

    # select optimizer
    if opt == "adam":
        optimizer = torch.optim.Adam(rnn.parameters(), alpha)
    elif opt == "adamdelta":
        optimizer = torch.optim.Adadelta(rnn.parameters(), alpha)
    else:
        optimizer = torch.optim.SGD(rnn.parameters(), alpha)

    # set objective function 
    obj_func = nn.CrossEntropyLoss() 

    data_set = [(x, y) for x,y in zip(xtens_pb, ytens_pb)]

    # epoch loss 
    e_loss = {"loss": [], "norm_loss": []}
    for epoch in range(epochs):
        # shuffle training data 
        np.random.shuffle(data_set) 

        # init batch loss
        batch_loss = 0.0
        # loop over batches
        for idx, batch in enumerate(data_set):
            optimizer.zero_grad()

            # settings for LSTM
            h0 = torch.zeros(num_layers, batch[0].shape[0], hidden_size) 
            c0 = torch.zeros(num_layers, batch[0].shape[0], hidden_size)

            # get ouput 
            outputs = rnn(batch[0], (h0, c0))

            # reshape data for outputs#
            outputs = outputs.reshape(-1, embedding_size)
            targets = batch[1].reshape(-1)

            # print(f"output dim {outputs.size()}, target dims {targets.size()}")
          
            # get loss 
            loss = obj_func(outputs, targets)
            
            # update network
            loss.backward()
            optimizer.step()

            # update batch loss
            batch_loss += loss.item()

        print(f"Epoch {epoch}:\nBatch loss: {batch_loss}, normalized loss: {batch_loss/batch_size}")
        # store loss
        e_loss["loss"].append(batch_loss)
        e_loss["norm_loss"].append(batch_loss/batch_size)


### Trainning 

In [95]:
# set hyperparameters
embedding_size = 32
hidden_size = 16
epochs = 3
num_classes = len(w2i)
alpha = 0.03
num_layers = 1

train_autoreg(xtens_pb, ytens_pb, embedding_size, hidden_size, epochs, alpha, num_layers, opt = "adam")

Epoch 0:
Batch loss: 17.622108101844788, normalized loss: 1.9580120113160875
Epoch 1:
Batch loss: 4.604785189032555, normalized loss: 0.511642798781395
Epoch 2:
Batch loss: 2.312642350792885, normalized loss: 0.25696026119920945


sequnceing strategy:

xp_batch.append([1] + seq + [0]*(max_size - len(seq)) + [2])
yp_batch.append( [0] + seq + [0]*(max_size - len(seq)) + [2])

In [38]:
# set hyperparameters
embedding_size = 32
hidden_size = 16
epochs = 3
num_classes = len(w2i)
alpha = 0.03
num_layers = 1

train_autoreg(xtens_pb, ytens_pb, embedding_size, hidden_size, epochs, alpha, num_layers, opt = "adam")

Epoch 0:
Batch loss: 17.205260932445526, normalized loss: 1.911695659160614
Epoch 1:
Batch loss: 3.2505833879113197, normalized loss: 0.36117593199014664
Epoch 2:
Batch loss: 0.6622391305863857, normalized loss: 0.07358212562070952


sequnceing strategy:

xp_batch.append( seq + [0]*(max_size - len(seq)) + [2])
yp_batch.append( [0] + seq[1:] + [0]*(max_size - len(seq)) + [2])

In [87]:
# set hyperparameters
embedding_size = 32
hidden_size = 16
epochs = 3
num_classes = len(w2i)
alpha = 0.03
num_layers = 1

train_autoreg(xtens_pb, ytens_pb, embedding_size, hidden_size, epochs, alpha, num_layers, opt = "adam")

Epoch 0:
Batch loss: 17.33151751756668, normalized loss: 1.92572416861852
Epoch 1:
Batch loss: 2.9026977717876434, normalized loss: 0.3225219746430715
Epoch 2:
Batch loss: 0.5765323266386986, normalized loss: 0.06405914740429984


In [13]:
index = [1, 2, 3, 4, 5, 6]
def cycle_permute(index):
    for i in range(len(index)- 2, 0, -1):
        index = index[i:] + index[:i]
    for i in range(1, len(index), 2):
        index = index[i : ] + index[: i]
    return index


print("Initial indeces: ", index)
for i in range(5):
    index = cycle_permute(index)
    print(f"Permute. num {i +1}, indices = {index}")


Initial indeces:  [1, 2, 3, 4, 5, 6]
Permute. num 1, indices = [2, 3, 4, 5, 6, 1]
Permute. num 2, indices = [3, 4, 5, 6, 1, 2]
Permute. num 3, indices = [4, 5, 6, 1, 2, 3]
Permute. num 4, indices = [5, 6, 1, 2, 3, 4]
Permute. num 5, indices = [6, 1, 2, 3, 4, 5]


In [17]:
def shuffle(list):
    converge = 1
    while converge > 1e-6:

        iterater = 1 if (list[int(len(list)/2)] * 2) / 3 < 1 else (list[int(len(list)/2)] * 2) / 3

        for i in range(int(iterater)):
            list = cycle_permute(list)
            list = list[:i] + list[i:]
        
        converge /= list[int(len(list)/2)]
    
    return list

In [18]:
print("Initial indeces: ", index)
for i in range(5):
    index = shuffle(index)
    print(f"Permute. num {i +1}, indices = {index}")

Initial indeces:  [2, 3, 4, 5, 6, 1]
Permute. num 1, indices = [5, 6, 1, 2, 3, 4]
Permute. num 2, indices = [6, 1, 2, 3, 4, 5]
Permute. num 3, indices = [2, 3, 4, 5, 6, 1]
Permute. num 4, indices = [5, 6, 1, 2, 3, 4]
Permute. num 5, indices = [6, 1, 2, 3, 4, 5]


In [20]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.distributions as dist
import numpy as np
import random
import re
from data_rnn import *

# ------------- Data Preparation -------------

# Load the NDFA dataset
x_train, (i2w, w2i) = load_ndfa(n=150_000, seed=0)

# Define Hyperparameters
embedding_dim = 32    # e=32
hidden_size = 16      # h=16
num_layers = 2        # Single layer
batch_size = 64
epochs = 10
learning_rate = 0.001
max_length = 50       # Adjust based on your data
vocab_size = len(i2w) #15 for NDFA

# Convert sequences to PyTorch tensors
x_train_tensors = [torch.tensor(seq, dtype=torch.long) for seq in x_train]

# Define maximum sequence length (based on your data or set a reasonable limit)
max_length = 50  # Adjust as needed

# Pad sequences with the `.pad` token (index 0) at the end
x_train_padded = pad_sequence(
    x_train_tensors,
    batch_first=True,
    padding_value=w2i['.pad']
)

# Truncate or pad to `max_length`
if x_train_padded.size(1) > max_length:
    x_train_padded = x_train_padded[:, :max_length]
else:
    padding = (0, max_length - x_train_padded.size(1))
    x_train_padded = torch.nn.functional.pad(x_train_padded, padding, value=w2i['.pad'])

# Creating Targets for Next-Token Prediction
# Input: all tokens except the last
# Target: all tokens except the first
x_input = x_train_padded[:, :-1]  # Shape: (batch, time)
y_target = x_train_padded[:, 1:]  # Shape: (batch, time)

# Create TensorDataset
dataset = TensorDataset(x_input, y_target)

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# ------------- Model Definition -------------

class SimpleLSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers):
        super(SimpleLSTMModel, self).__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=w2i['.pad']
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """
        Forward pass of the model.

        Args:
            x (torch.LongTensor): Input tensor of shape (batch, time)

        Returns:
            torch.FloatTensor: Output tensor of shape (batch, time, vocab_size)
        """
        embedded = self.embedding(x)  # Shape: (batch, time, embedding_dim)
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch, time, hidden_size)
        logits = self.fc(lstm_out)  # Shape: (batch, time, vocab_size)
        return logits

# ------------- Device Configuration -------------

# Check for MPS (Apple GPU) support
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using Apple GPU (MPS) for training.")
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print("Using CUDA GPU for training.")
else:
    device = torch.device('cpu')
    print("Using CPU for training.")

# Initialize the model
model = SimpleLSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers
)

# Move the model to the selected device
model.to(device)

# ------------- Loss Function and Optimizer -------------

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=w2i['.pad'])  # Ignore padding in loss
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# ------------- Sampling Function -------------

def generate_sequence(model, seed_seq, w2i, i2w, device, max_length=50):
    """
    Generate a sequence by sampling from the model starting with a seed sequence.

    Args:
        model (nn.Module): The trained PyTorch model.
        seed_seq (list of int): The seed sequence as a list of integer indices.
        w2i (dict): Word-to-index mapping.
        i2w (list): Index-to-word mapping.
        device (torch.device): The device to run the model on.
        max_length (int): Maximum length of the generated sequence.

    Returns:
        list of str: The generated sequence as a list of tokens.
    """
    model.eval()
    generated_seq = seed_seq.copy()
    input_tensor = torch.tensor([generated_seq], dtype=torch.long).to(device)

    with torch.no_grad():
        for _ in range(max_length - len(seed_seq)):
            outputs = model(input_tensor)
            last_logits = outputs[0, -1, :]
            probs = F.softmax(last_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            generated_seq.append(next_token)
            if next_token == w2i['s']:
                break
            input_tensor = torch.tensor([generated_seq], dtype=torch.long).to(device)

    generated_tokens = [i2w[idx] if idx < len(i2w) else '.unk' for idx in generated_seq]
    return generated_tokens

# ------------- Training Loop -------------

# Define a seed sequence
seed_sequence = [w2i['s']]  # Start with 's'

# Number of samples to generate per epoch
num_samples = 10

# Training Loop
model.train()  # Ensure the model is in training mode

losses = []  # To track training loss

for epoch in range(1, epochs + 1):
    epoch_loss = 0
    for batch_x, batch_y in dataloader:
        # Move data to the appropriate device
        batch_x = batch_x.to(device)  # Shape: (batch, time)
        batch_y = batch_y.to(device)  # Shape: (batch, time)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_x)  # Shape: (batch, time, vocab_size)

        # Reshape outputs and targets for loss computation
        outputs = outputs.view(-1, vocab_size)  # Shape: (batch * time, vocab_size)
        batch_y = batch_y.view(-1)  # Shape: (batch * time)

        # Compute loss
        loss = criterion(outputs, batch_y)

        # Backward pass and optimization
        loss.backward()

        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        # Accumulate loss
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    losses.append(avg_loss)
    print(f'Epoch {epoch}/{epochs}, Loss: {avg_loss:.4f}')

    # Generate and print samples after each epoch, similar to the Q7 to check the model performance
    print(f'\n--- Samples after Epoch {epoch} ---')
    for i in range(num_samples):
        generated = generate_sequence(
            model=model,
            seed_seq=seed_sequence,
            w2i=w2i,
            i2w=i2w,
            device=device,
            max_length=max_length
        )
        # Join tokens to form a string
        generated_str = ''.join(generated)
        print(f'Sample {i + 1}: {generated_str}')
    print('-----------------------------------\n')
    # If you don't want to generate samples after per epoch, comment the part above

# ------------- Evaluation -------------

def evaluate(model, dataloader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            outputs = model(batch_x)
            outputs = outputs.view(-1, vocab_size)
            batch_y = batch_y.view(-1)

            loss = criterion(outputs, batch_y)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Evaluate on the training data (or replace with validation DataLoader)
val_loss = evaluate(model, dataloader, criterion, device)
print(f'Validation Loss: {val_loss:.4f}')

# ------------- Making Final Predictions -------------

# Function to convert indices to tokens
def indices_to_tokens(indices, i2w):
    return [i2w[idx] if idx < len(i2w) else '.unk' for idx in indices]

# Example prediction
sample_sequence = [w2i['s']]  # Starting with 's'
generated_tokens = generate_sequence(
    model=model,
    seed_seq=sample_sequence,
    w2i=w2i,
    i2w=i2w,
    device=device,
    max_length=max_length
)
# Join tokens to form a string
generated_str = ''.join(generated_tokens)
print("\nFinal Sample:", generated_str)

Using CPU for training.
Epoch 1/10, Loss: 0.4731

--- Samples after Epoch 1 ---
Sample 1: suvw!uvw!uvw!uvw!s
Sample 2: suvw!uvw!uvw!uvw!uvw!uvw!s
Sample 3: sklm!klm!klm!klm!klm!s
Sample 4: ss
Sample 5: ss
Sample 6: ss
Sample 7: ss
Sample 8: sklm!klm!klm!s
Sample 9: ss
Sample 10: ss
-----------------------------------

Epoch 2/10, Loss: 0.2458

--- Samples after Epoch 2 ---
Sample 1: ss
Sample 2: sabc!abc!abc!abc!abc!s
Sample 3: suvw!uvw!uvw!s
Sample 4: sklm!klm!klm!klm!s
Sample 5: sabc!abc!abc!s
Sample 6: sklm!klm!klm!klm!klm!klm!klm!s
Sample 7: sklm!klm!klm!s
Sample 8: sklm!klm!s
Sample 9: sabc!abc!s
Sample 10: sklm!klm!s
-----------------------------------

Epoch 3/10, Loss: 0.2433

--- Samples after Epoch 3 ---
Sample 1: sabc!abc!abc!abc!abc!abc!abc!s
Sample 2: suvw!uvw!uvw!uvw!uvw!uvw!s
Sample 3: sklm!klm!klm!klm!klm!klm!s
Sample 4: sabc!abc!abc!s
Sample 5: sabc!s
Sample 6: ss
Sample 7: ss
Sample 8: sklm!klm!klm!klm!klm!klm!s
Sample 9: sabc!s
Sample 10: sabc!abc!abc!abc!abc!abc!abc