In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Config

In [4]:
test_split = 0.05
validation_split = 0.05

learning_rate = 1e-3
epochs = 5
batch_size = 128

input_size = 300
bottleneck_size = 200

alpha = 1 # How much the teacher should influence the latent representation

experiment_name = f"autoencoder_{input_size}to{bottleneck_size}_v4"
dataset = f"data/embeddings/base/clipped.glove.6B.{input_size}d.txt"
teacher_dataset = f"data/embeddings/base/clipped.glove.6B.{bottleneck_size}d.txt"

In [5]:
gpu = torch.cuda.is_available()
device = torch.device("cuda" if gpu else "cpu")

In [6]:
print(gpu, device)

False cpu


# Load & Prepare Embeddings for Training

In [8]:
def load_embeddings(dataset):
    words = []
    vectors = []
    with open(dataset, "r", encoding='utf8') as fp:
        for line in fp:
            line = line.split()
            word = line[0]
            vector = np.asarray(line[1:], 'float32')
            words.append(word)
            vectors.append(vector)
    vectors = torch.as_tensor(vectors)
    return words, vectors

In [9]:
input_words, input_vectors = load_embeddings(dataset)

In [10]:
teacher_words, teacher_vectors = load_embeddings(teacher_dataset)

In [11]:
test_split = int(test_split * len(input_words))
validation_split = int(validation_split* len(input_words))

In [22]:
print(len(input_words), input_vectors.shape)
print(len(teacher_words), teacher_vectors.shape)
print(test_split, validation_split)

400000 torch.Size([400000, 300])
400000 torch.Size([400000, 200])
20000 20000


In [23]:
input_train_vectors, input_test_vectors, input_train_words, input_test_words = train_test_split(
    input_vectors, input_words, test_size=test_split, random_state=seed
)

In [24]:
input_train_vectors, input_validation_vectors, input_train_words, input_validation_words = train_test_split(
    input_train_vectors, input_train_words, test_size=test_split, random_state=seed
)

In [25]:
teacher_train_vectors, teacher_test_vectors, teacher_train_words, teacher_test_words = train_test_split(
    teacher_vectors, teacher_words, test_size=test_split, random_state=seed
)

In [26]:
teacher_train_vectors, teacher_validation_vectors, teacher_train_words, teacher_validation_words = train_test_split(
    teacher_train_vectors, teacher_train_words, test_size=validation_split, random_state=seed
)

In [29]:
input_train_vectors.shape, input_validation_vectors.shape, input_test_vectors.shape

(torch.Size([360000, 300]), torch.Size([20000, 300]), torch.Size([20000, 300]))

In [28]:
teacher_train_vectors.shape, teacher_validation_vectors.shape, teacher_test_vectors.shape

(torch.Size([360000, 200]), torch.Size([20000, 200]), torch.Size([20000, 200]))

In [30]:
# Note: We don't actually use these words since the model doesn't care about them.
# We just compute them in case we want to check some particular word or something.
len(input_train_words), len(input_validation_words), len(input_test_words)

(360000, 20000, 20000)

In [31]:
train_dataset = torch.utils.data.TensorDataset(input_train_vectors, teacher_train_vectors)
validation_dataset = torch.utils.data.TensorDataset(input_validation_vectors, teacher_validation_vectors)
test_dataset = torch.utils.data.TensorDataset(input_test_vectors, teacher_test_vectors)

In [32]:
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)
validation_dataloader = torch.utils.data.DataLoader(
    validation_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=gpu
)

# Model Time

In [33]:
class AutoEncoderWithoutHiddenLayer(nn.Module):
    def __init__(self, input_size, bottleneck_size):
        super().__init__()
        self.encoder = nn.Linear(in_features=input_size, out_features=bottleneck_size)
        self.encoder_activation = nn.Tanh()
        
        self.decoder = nn.Linear(in_features=bottleneck_size, out_features=input_size)
        self.decoder_activation = nn.Tanh()
        
        self.decoder.weight = nn.Parameter(self.encoder.weight.transpose(0,1))
    
    def forward(self, features):
        latent_representation = self.encoder_activation(self.encoder(features))
        reconstructed_input = self.decoder_activation(self.decoder(latent_representation))
        return reconstructed_input, latent_representation
    
    def encode(self, features):
        return self.encoder_activation(self.encoder(features))

In [34]:
model = AutoEncoderWithoutHiddenLayer(input_size, bottleneck_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

In [35]:
model

AutoEncoderWithoutHiddenLayer(
  (encoder): Linear(in_features=300, out_features=200, bias=True)
  (encoder_activation): Tanh()
  (decoder): Linear(in_features=200, out_features=300, bias=True)
  (decoder_activation): Tanh()
)

In [36]:
for epoch in range(epochs):
    train_loss = 0
    train_reconstruction_loss = 0
    train_teacher_loss = 0
    validation_loss = 0
    validation_reconstruction_loss = 0
    validation_teacher_loss = 0
    
    # Training Loop
    for iteration, batch in enumerate(tqdm(train_dataloader)):
        # Reset gradients back to zero for this iteration
        optimizer.zero_grad()
        
        # Get our inputs & our teacher representation
        input_batch, teacher_batch = batch
        
        # Move batch to device
        input_batch = input_batch.to(device)

        # Run our model & get outputs
        outputs, latent_representation = model(input_batch)

        # Calculate reconstruction loss
        reconstruction_loss = criterion(outputs, input_batch)
        teacher_loss = alpha * criterion(latent_representation, teacher_batch)
        batch_loss = reconstruction_loss + teacher_loss 
                  
        # Backprop
        batch_loss.backward()
        
        # Update our optimizer parameters
        optimizer.step()
        
        # Add the batch's loss to the total loss for the epoch
        train_loss += batch_loss.item()
        train_reconstruction_loss += reconstruction_loss.item()
        train_teacher_loss += teacher_loss.item()
        
    # Validation Loop
    with torch.no_grad():
        for iteration, batch in enumerate(tqdm(validation_dataloader)):
            # Get our inputs & our teacher representation
            input_batch, teacher_batch = batch
            
            # Move batch to device
            input_batch = input_batch.to(device)

            # Run our model & get outputs
            outputs, latent_representation = model(input_batch)

            # Calculate reconstruction loss
            reconstruction_loss = criterion(outputs, input_batch)
            teacher_loss = alpha * criterion(latent_representation, teacher_batch)
            batch_loss = reconstruction_loss + teacher_loss

            # Add the batch's loss to the total loss for the epoch
            validation_loss += batch_loss.item()
            validation_reconstruction_loss += reconstruction_loss.item()
            validation_teacher_loss += teacher_loss.item()
    
    # Compute the average losses for this epoch
    train_loss = train_loss / len(train_dataloader)
    train_reconstruction_loss = train_reconstruction_loss / len(train_dataloader)
    train_teacher_loss = train_teacher_loss / len(train_dataloader)
    
    validation_loss = validation_loss / len(validation_dataloader)
    validation_reconstruction_loss = validation_reconstruction_loss / len(validation_dataloader)
    validation_teacher_loss = validation_teacher_loss / len(validation_dataloader)
    
    # Print Metrics
    print(
        f"Epoch: {epoch+1}/{epochs}, \
        \nTrain Reconstruction Loss = {train_loss} = {train_reconstruction_loss} + {train_teacher_loss}, \
        \nValidation Reconstruction Loss = {validation_loss} = {validation_reconstruction_loss} + {validation_teacher_loss}"
    )

100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:13<00:00, 212.18it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 129.62it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 1/5,         
Train Reconstruction Loss = 0.06494576979522555 = 0.023195221450032815 + 0.04175054836836821,         
Validation Reconstruction Loss = 0.05879058902430686 = 0.02140855298015722 + 0.03738203602042168


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:12<00:00, 229.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 140.15it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 2/5,         
Train Reconstruction Loss = 0.058855424989034726 = 0.021528047710455824 + 0.0373273772242821,         
Validation Reconstruction Loss = 0.05876534130827636 = 0.021567174310612072 + 0.03719816691461642


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:12<00:00, 226.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 144.77it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 3/5,         
Train Reconstruction Loss = 0.05883989539004033 = 0.021569619803476537 + 0.037270275586563796,         
Validation Reconstruction Loss = 0.05867983441159224 = 0.02155939209613071 + 0.037120442303597546


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:12<00:00, 224.29it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 143.09it/s]
  0%|                                                                                         | 0/2813 [00:00<?, ?it/s]

Epoch: 4/5,         
Train Reconstruction Loss = 0.05883965088624654 = 0.021570523713450182 + 0.03726912714564796,         
Validation Reconstruction Loss = 0.05876012184437673 = 0.02156622135999856 + 0.037193900555562064


100%|█████████████████████████████████████████████████████████████████████████████| 2813/2813 [00:12<00:00, 227.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 138.66it/s]

Epoch: 5/5,         
Train Reconstruction Loss = 0.058842258131243434 = 0.02157196946775854 + 0.037270288660174115,         
Validation Reconstruction Loss = 0.05876836446440144 = 0.021565937514232983 + 0.03720242705694429





# Test Model

In [37]:
model.eval()

AutoEncoderWithoutHiddenLayer(
  (encoder): Linear(in_features=300, out_features=200, bias=True)
  (encoder_activation): Tanh()
  (decoder): Linear(in_features=200, out_features=300, bias=True)
  (decoder_activation): Tanh()
)

In [38]:
test_loss = 0
test_reconstruction_loss = 0
test_teacher_loss = 0

# Testing Loop
with torch.no_grad():
    for iteration, batch in enumerate(tqdm(test_dataloader)):
        # Get our inputs & our teacher representation
        input_batch, teacher_batch = batch

        # Move batch to device
        input_batch = input_batch.to(device)

        # Run our model & get outputs
        outputs, latent_representation = model(input_batch)

        # Calculate reconstruction loss
        reconstruction_loss = criterion(outputs, input_batch)
        teacher_loss = alpha * criterion(latent_representation, teacher_batch)
        batch_loss = reconstruction_loss + teacher_loss

        # Add the batch's loss to the total loss for the epoch
        test_loss += batch_loss.item()
        test_reconstruction_loss += reconstruction_loss.item()
        test_teacher_loss += teacher_loss.item()

# Compute the average losses for this epoch
test_loss = test_loss / len(test_dataloader)
test_reconstruction_loss = test_reconstruction_loss / len(test_dataloader)
test_teacher_loss = test_teacher_loss / len(test_dataloader)

# Print Metrics
print(
        f"Train Reconstruction Loss = {test_loss} = {test_reconstruction_loss} + {test_teacher_loss}"
    )

100%|███████████████████████████████████████████████████████████████████████████████| 157/157 [00:01<00:00, 123.50it/s]

Train Reconstruction Loss = 0.05871252117642931 = 0.02152089415129962 + 0.03719162698953774





# Generate Latent Embeddings

In [39]:
latent_vectors = {}
with torch.no_grad():
    for i, (word, vector) in enumerate(tqdm(zip(input_words, input_vectors))):
        latent_vectors[word] = model.encode(vector)

400000it [00:20, 19277.60it/s]


In [40]:
len(latent_vectors)

400000

In [41]:
latent_vectors['the']

tensor([-1.9597e-01, -6.6826e-03, -1.7408e-02, -5.6985e-02,  1.1396e-01,
         2.7846e-01, -3.6510e-01, -1.9054e-01,  9.0300e-02, -3.4559e-01,
         1.4674e-01,  1.4102e-01, -2.0345e-01,  3.2667e-01,  5.7640e-01,
        -9.3950e-02,  2.4402e-01,  2.7191e-01, -4.8436e-01,  8.8162e-03,
         5.0870e-01,  9.8735e-01, -9.6114e-02,  9.3112e-03, -7.3997e-02,
         1.6817e-01, -1.7894e-01,  2.7836e-01, -4.7498e-02,  3.3836e-01,
         2.8852e-01, -3.6578e-01, -9.2604e-02,  1.4476e-01,  7.1821e-02,
        -9.9298e-02, -7.1037e-01, -2.4348e-01,  1.0096e-01, -6.0697e-01,
        -1.1344e-01, -4.2397e-01, -8.5139e-02,  2.6791e-01, -3.4610e-01,
         5.7919e-01,  2.2274e-01,  3.6622e-01,  2.1909e-02,  6.4547e-01,
        -2.9479e-01,  1.7762e-01,  4.9113e-01, -4.0611e-03, -4.1202e-01,
         5.5324e-02,  1.3703e-01,  1.3798e-01, -2.6209e-01, -2.2337e-01,
         7.1571e-02, -6.9467e-02, -6.5866e-01, -2.8166e-01,  3.1617e-01,
         1.7332e-01, -1.4714e-02, -8.1437e-02, -4.0

# Save Model & Embeddings

In [42]:
torch.save(model.state_dict(), f"models/{experiment_name}.pt")

In [43]:
# Need to convert the latent embeddings into the glove format
# word dim1 dim2 dim3 dim4 ... dimX
lines = []
for i, (word, vector) in tqdm(enumerate(latent_vectors.items())):
    line = [word] + [str(x) for x in vector.tolist()]
    lines.append(' '.join(line))

400000it [00:57, 6925.12it/s]


In [44]:
with open(f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(lines))