In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Config

In [148]:
test_split = 0.05
validation_split = 0.05
learning_rate = 1e-3
input_size = 300
hidden_size = 225
bottleneck_size = 200
epochs = 5
batch_size = 128
use_hidden_layer = False
denoising = 'masking'
noise_factor = 0.3
alpha_weight = 1
beta_weight = 0.5
experiment_name = f"autoencoder_300to{bottleneck_size}_v3.1"
dataset = "data/embeddings/base/clipped.glove.6B.300d.txt"

In [149]:
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")

In [150]:
print(gpu, device)

False cpu


# Load & Prepare Embeddings for Training

In [7]:
words = []
vectors = []
with open(dataset, "r", encoding='utf8') as fp:
    for line in fp:
        line = line.split()
        word = line[0]
        vector = np.asarray(line[1:], 'float32')
        words.append(word)
        vectors.append(vector)
vectors = torch.from_numpy(np.asarray(vectors))

In [8]:
test_split = int(test_split * len(words))
validation_split = int(validation_split* len(words))

In [9]:
print(len(words), vectors.shape)
print(test_split, validation_split)

400000 torch.Size([400000, 300])
20000 20000


In [10]:
train_vectors, test_vectors, train_words, test_words = train_test_split(
    vectors, words, test_size=test_split, random_state=seed
)

In [11]:
train_vectors, validation_vectors, train_words, validation_words = train_test_split(
    train_vectors, train_words, test_size=validation_split, random_state=seed
)

In [12]:
train_vectors.shape, validation_vectors.shape, test_vectors.shape

(torch.Size([360000, 300]), torch.Size([20000, 300]), torch.Size([20000, 300]))

In [13]:
# Note: We don't actually use these words since the model doesn't care about them.
# We just compute them in case we want to check some particular word or something.
len(train_words), len(validation_words), len(test_words)

(360000, 20000, 20000)

In [151]:
train_dataloader = torch.utils.data.DataLoader(
    train_vectors, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)
validation_dataloader = torch.utils.data.DataLoader(
    validation_vectors, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)
test_dataloader = torch.utils.data.DataLoader(
    test_vectors, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)

# Model Time

In [152]:
class AutoEncoderWithoutHiddenLayer(nn.Module):
    def __init__(self, input_size, bottleneck_size):
        super().__init__()
        self.encoder = nn.Linear(in_features=input_size, out_features=bottleneck_size)
        self.encoder_activation = nn.Tanh()
        
        self.decoder = nn.Linear(in_features=bottleneck_size, out_features=input_size)
        self.decoder_activation = nn.Tanh()
        
        self.decoder.weight = nn.Parameter(self.encoder.weight.transpose(0,1))
    
    def forward(self, features):
        latent_representation = self.encoder_activation(self.encoder(features))
        reconstructed_input = self.decoder_activation(self.decoder(latent_representation))
        return reconstructed_input
    
    def encode(self, features):
        return self.encoder_activation(self.encoder(features))

In [153]:
class AutoEncoderWithHiddenLayer(nn.Module):
    def __init__(self, input_size, hidden_size, bottleneck_size):
        super().__init__()
        self.encoder_input = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.encoder_input_activation = nn.ReLU(True)
        self.encoder_hidden = nn.Linear(in_features=hidden_size, out_features=bottleneck_size)
        self.encoder_hidden_activation = nn.Tanh()
        
        self.decoder_hidden = nn.Linear(in_features=bottleneck_size, out_features=hidden_size)
        self.decoder_hidden_activation = nn.ReLU(True)
        self.decoder_output = nn.Linear(in_features=hidden_size, out_features=input_size)
        self.decoder_output_activation = nn.Tanh()
        
        self.decoder_hidden.weight = nn.Parameter(self.encoder_hidden.weight.transpose(0,1))
        self.decoder_output.weight = nn.Parameter(self.encoder_input.weight.transpose(0,1))
        
        self.encoder = [
            self.encoder_input, self.encoder_input_activation, self.encoder_hidden, self.encoder_hidden_activation
        ]
        self.decoder = [
            self.decoder_hidden, self.decoder_hidden_activation, self.decoder_output, self.decoder_output_activation
        ]
    
    def forward(self, features):
        # Encoder
        for layer in self.encoder:
            features = layer(features)
        # Decoder
        for layer in self.decoder:
            features = layer(features)
        return features
    
    def encode(self, features):
        for layer in self.encoder:
            features = layer(features)
        return features

In [154]:
if use_hidden_layer:
    model = AutoEncoderWithHiddenLayer(input_size, hidden_size, bottleneck_size).to(device)
else:
    model = AutoEncoderWithoutHiddenLayer(input_size, bottleneck_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [155]:
model

AutoEncoderWithoutHiddenLayer(
  (encoder): Linear(in_features=300, out_features=200, bias=True)
  (encoder_activation): Tanh()
  (decoder): Linear(in_features=200, out_features=300, bias=True)
  (decoder_activation): Tanh()
)

In [156]:
def create_masking_noise(shape, amount):
    """
    Shape: Shape of the noise matrix
    Amount: The amount of the matrix that should be masked (zero'd) out
    """
    mask = np.ones(shape)
    amount = int(shape[0] * amount)
    mask[:, :amount] = 0
    for x in mask:
        np.random.shuffle(x)
    return torch.from_numpy(mask.astype(np.float32))

In [157]:
for epoch in range(epochs):
    train_loss = 0
    validation_loss = 0
    
    # Training Loop
    for iteration, batch in enumerate(tqdm(train_dataloader)):
        # Reset gradients back to zero for this iteration
        optimizer.zero_grad()
        
        if denoising == 'additive':
            # Add noise to our inputs
            noise = torch.randn(batch.shape) * noise_factor
            noisy_batch = torch.clamp(batch + noise, -1, +1)
            
            # Move batch to device
            noisy_batch = noisy_batch.to(device)

            # Run our model & get outputs
            outputs = model(noisy_batch)
            
            # Calculate reconstruction loss
            batch_loss = criterion(outputs, batch)
        elif denoising == 'masking':
            # Create masking noise and mask inputs
            noise = create_masking_noise(batch.shape, noise_factor)
            masked_batch = np.multiply(batch, noise)
            
            # Move batch to device
            masked_batch = masked_batch.to(device)
            
            # Run model & get outputs
            outputs = model(masked_batch)
            
            # Calculate reconstruction loss
            # We calculate the error for the masked dimensions separately from the unmasked ones
            # We can then assign a weight to each of the two components 
            unmasked_error = criterion(outputs * noise, masked_batch)
            masked_error = criterion(outputs * (1 - noise), batch * (1 - noise))
            batch_loss = (alpha_weight * masked_error) + (beta_weight * unmasked_error)
        else:
            # Move batch to device
            batch = batch.to(device)

            # Run our model & get outputs
            outputs = model(batch)
            
            # Calculate reconstruction loss
            batch_loss = criterion(outputs, batch)
                  
        # Backprop
        batch_loss.backward()
        
        # Update our optimizer parameters
        optimizer.step()
        
        # Add the batch's loss to the total loss for the epoch
        train_loss += batch_loss.item()
        
    # Validation Loop
    with torch.no_grad():
        for iteration, batch in enumerate(tqdm(validation_dataloader)):
            # Move batch to device
            batch = batch.to(device)

            # Run our model & get outputs
            outputs = model(batch)

            # Calculate reconstruction loss
            batch_loss = criterion(outputs, batch)

            # Add the batch's loss to the total loss for the epoch
            validation_loss += batch_loss.item()
    
    # Compute the average losses for this epoch
    train_loss = train_loss / len(train_dataloader)
    validation_loss = validation_loss / len(validation_dataloader)
    
    # Print Metrics
    print(
        f"Epoch: {epoch+1}/{epochs}, Train Reconstruction Loss = {train_loss}, \
        Validation Reconstruction Loss = {validation_loss}"
    )

100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:18<00:00, 153.08it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 145.76it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 1/5, Train Reconstruction Loss = 0.02092375290516196,         Validation Reconstruction Loss = 0.017712685368528033


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:19<00:00, 144.49it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 127.10it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 2/5, Train Reconstruction Loss = 0.020048815937142674,         Validation Reconstruction Loss = 0.017682940206804852


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:18<00:00, 150.94it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 124.59it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 3/5, Train Reconstruction Loss = 0.02001157213218306,         Validation Reconstruction Loss = 0.01759489730095408


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:20<00:00, 140.37it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 138.78it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 4/5, Train Reconstruction Loss = 0.019991760944349672,         Validation Reconstruction Loss = 0.017615610164157143


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:20<00:00, 140.62it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 127.93it/s]

Epoch: 5/5, Train Reconstruction Loss = 0.019973666228054684,         Validation Reconstruction Loss = 0.017561830864970093





# Test Model

In [158]:
model.eval()

AutoEncoderWithoutHiddenLayer(
  (encoder): Linear(in_features=300, out_features=200, bias=True)
  (encoder_activation): Tanh()
  (decoder): Linear(in_features=200, out_features=300, bias=True)
  (decoder_activation): Tanh()
)

In [159]:
reconstruction_loss = 0

# Testing Loop
with torch.no_grad():
    for iteration, batch in enumerate(tqdm(test_dataloader)):
        # Move batch to device
        batch = batch.to(device)
        
        # Run our model & get outputs
        outputs = model(batch)

        # Calculate reconstruction loss
        batch_loss = criterion(outputs, batch)

        # Add the batch's loss to the total loss for the epoch
        reconstruction_loss += batch_loss.item()

# Compute the average losses for this epoch
reconstruction_loss = reconstruction_loss / len(test_dataloader)

# Print Metrics
print(
    f"Test Reconstruction Loss = {reconstruction_loss}"
)

100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 135.55it/s]

Test Reconstruction Loss = 0.01756591818467447





# Generate Latent Embeddings

In [160]:
latent_vectors = {}
with torch.no_grad():
    for i, (word, vector) in enumerate(tqdm(zip(words, vectors))):
        latent_vectors[word] = model.encode(vector)

400000it [00:20, 19450.47it/s]


In [161]:
len(latent_vectors)

400000

In [162]:
latent_vectors['the']

tensor([-0.0491, -0.7057,  0.5585,  0.4505,  0.1197,  0.8171,  0.2767,  0.1517,
         0.2609, -0.3418, -0.5732, -0.2670, -0.5859, -0.6228, -0.8439,  0.5550,
         0.4134, -0.2099,  0.0208,  0.4179, -0.0282, -0.3089,  0.1710, -0.6101,
        -0.1411, -0.1068,  0.1119,  0.4446, -0.0538, -0.1984,  0.4466, -0.0773,
        -0.1837, -0.0393,  0.0667,  0.5724, -0.0938, -0.2321, -0.1937, -0.0537,
         0.4824, -0.6321,  0.6465,  0.0949, -0.6390, -0.2557, -0.7237,  0.3523,
         0.5180, -0.2043, -0.2086, -0.5812,  0.3521, -0.4286,  0.0879, -0.7394,
        -0.6898, -0.2148,  0.3370,  0.0734, -0.7275,  0.2699,  0.2641,  0.3486,
        -0.1092, -0.4515, -0.3972, -0.3416,  0.4906,  0.1302, -0.1905, -0.1336,
         0.7853, -0.0809,  0.5394, -0.2317,  0.0281, -0.1255,  0.4968,  0.4325,
        -0.3001, -0.0125,  0.0547,  0.0385,  0.4382, -0.2669, -0.6018, -0.1798,
         0.4208,  0.0494, -0.0710, -0.0635, -0.5309,  0.6894,  0.0994, -0.7257,
        -0.3527, -0.1521, -0.5692, -0.06

# Save Model & Embeddings

In [163]:
torch.save(model.state_dict(), f"models/{experiment_name}.pt")

In [164]:
# Need to convert the latent embeddings into the glove format
# word dim1 dim2 dim3 dim4 ... dimX
lines = []
for i, (word, vector) in tqdm(enumerate(latent_vectors.items())):
    line = [word] + [str(x) for x in vector.tolist()]
    lines.append(' '.join(line))

400000it [00:57, 6906.56it/s]


In [165]:
with open(f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(lines))