In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Colab-only step: mount Google Drive at /content/drive/ so files stored there can be read and written.
from google.colab import drive
drive.mount("/content/drive/")

In [None]:
# Seed both random number generators used in this notebook (NumPy and PyTorch)
# with the same value.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)

# Config

In [None]:
# Project root, relative to the working directory of the notebook.
colab_prefix = "drive/MyDrive/CMPUT651_DL4NLP/"

# Optimisation hyper-parameters.
learning_rate = 1e-3
epochs = 25
batch_size = 128

# Width of the vectors being reconstructed (source) and of the learned
# embedding table (target).
source_embedding_size = 300
target_embedding_size = 100
model_version = 1

# When True, the embedding table is initialised from pretrained vectors of the
# target width instead of randomly.
distill_knowledge = False

if distill_knowledge:
    model_type = "supervised_distilled"
    target_embedding = colab_prefix + f"data/embeddings/base/clipped.glove.6B.{target_embedding_size}d.txt"
else:
    model_type = "supervised"
    # Define the name in both branches so any later reference cannot raise NameError.
    target_embedding = None

source_embedding = colab_prefix + f"data/embeddings/base/clipped.glove.6B.{source_embedding_size}d.txt"
experiment_name = f"{model_type}_{source_embedding_size}to{target_embedding_size}_v{model_version}"

# Despite the "_dir" suffix, both values below are file paths.
# NOTE(review): the "300d" suffix is hard-coded, while the vectors written later come
# from the embedding table (width `target_embedding_size`). Left unchanged because the
# path may be consumed elsewhere - confirm which size the suffix is meant to denote.
embedding_output_dir = colab_prefix + f"data/embeddings/trained/{experiment_name}.glove.6B.300d.txt"
model_output_dir = colab_prefix + f"models/{experiment_name}.pt"

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
gpu = torch.cuda.is_available()
if gpu:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Show whether CUDA was detected and which device was selected.
print(gpu, device)

# Load & Prepare Embeddings for Training

In [None]:
def get_word_embeddings(dataset):
    """Read a whitespace-separated embedding text file.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank or whitespace-only lines (for example a trailing
    newline at the end of the file) are skipped instead of raising IndexError.

    Args:
        dataset: Path of the text file to read (opened as UTF-8).

    Returns:
        A tuple ``(words, vectors)``: ``words`` is the list of first tokens in
        file order and ``vectors`` is a tensor built from the float32 rows, one
        row per word.
    """
    words = []
    vectors = []
    with open(dataset, "r", encoding='utf8') as fp:
        for raw_line in fp:
            tokens = raw_line.split()
            if not tokens:
                # Nothing on this line; indexing tokens[0] would fail.
                continue
            words.append(tokens[0])
            vectors.append(np.asarray(tokens[1:], 'float32'))
    vectors = torch.from_numpy(np.asarray(vectors))
    return words, vectors

In [None]:
# Source vectors are the regression targets for every word in the vocabulary.
words, vectors = get_word_embeddings(source_embedding)
if distill_knowledge:
    target_words, target_vectors = get_word_embeddings(target_embedding)
    # The pretrained target rows are later looked up with indices derived from
    # `words`, so row i of both files must describe the same word.
    if target_words[:len(words)] != words:
        raise ValueError(
            "Source and target embedding files are not aligned: "
            "row i of the target file must hold the same word as row i of the source file."
        )

In [None]:
# No train/val/test split: we want a learned embedding for every word in the vocabulary.
# Print the vocabulary size and tensor shape as a sanity check.
print(len(words), vectors.shape)
if distill_knowledge:
    print(len(target_words), target_vectors.shape)

In [None]:
# Lookups in both directions: word -> row index and row index -> word.
word2index = dict(zip(words, range(len(words))))
index2word = {idx: token for token, idx in word2index.items()}

In [None]:
# Targets are the loaded source vectors; inputs are the matching word indices.
output_data = vectors
input_data = torch.as_tensor(list(map(word2index.__getitem__, words)))

In [None]:
# Display the shapes of the index tensor and of the target vectors.
input_data.shape, output_data.shape

In [None]:
# Number of rows the embedding table needs: one per input index.
num_embeddings = input_data.size(0)

In [None]:
# Pair each word index with its target vector so batches come out as (indices, vectors).
train_dataset = torch.utils.data.TensorDataset(input_data, output_data)

In [None]:
# Shuffled mini-batches of `batch_size`; host memory is pinned only when a GPU was detected.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=gpu
)

# Model Time

In [None]:
class SupervisedModel(nn.Module):
    """Embedding table followed by a linear layer and a tanh activation.

    The forward pass looks up an embedding of width ``target_embedding_size``
    for each index and projects it to ``source_embedding_size`` values.

    Args:
        num_embeddings: Number of rows in the embedding table (used only when
            ``target_embedding`` is not given).
        source_embedding_size: Output width of the linear layer.
        target_embedding_size: Width of the embedding table and input width of
            the linear layer.
        target_embedding: Optional 2-D float tensor used to initialise the
            embedding table; the table stays trainable.

    Raises:
        ValueError: If ``target_embedding`` is given and its width differs from
            ``target_embedding_size``.
    """

    def __init__(self, num_embeddings, source_embedding_size, target_embedding_size, target_embedding=None):
        super().__init__()
        if target_embedding is not None:
            # from_pretrained uses the given tensor itself as the trainable weight, so
            # training would overwrite the caller's tensor in place. Work on a copy.
            pretrained = target_embedding.detach().clone()
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=False)
            if self.embedding.embedding_dim != target_embedding_size:
                # Fail at construction rather than with a shape error in forward().
                raise ValueError(
                    f"target_embedding has width {self.embedding.embedding_dim}, "
                    f"expected target_embedding_size={target_embedding_size}"
                )
        else:
            self.embedding = nn.Embedding(num_embeddings, target_embedding_size)
        self.linear = nn.Linear(in_features=target_embedding_size, out_features=source_embedding_size)
        self.activation = nn.Tanh()

    def forward(self, features):
        """Map a tensor of indices to tanh(linear(embedding(indices)))."""
        embedding = self.embedding(features)
        return self.activation(self.linear(embedding))

In [None]:
# Mean-squared error between the model output and the source vectors.
criterion = nn.MSELoss()
# The pretrained target vectors are passed only in distillation mode; otherwise the
# embedding table is freshly initialised.
model = SupervisedModel(
    num_embeddings,
    source_embedding_size,
    target_embedding_size,
    target_vectors if distill_knowledge else None,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Display the module structure.
model

In [None]:
# Average training loss of every completed epoch, in order.
all_train_loss = []

# Put the model in training mode explicitly so re-running this cell after the
# `model.eval()` cell further down does not train in eval mode.
model.train()

for epoch in range(epochs):
    train_loss = 0

    # Training Loop
    for batch in tqdm(train_dataloader):
        # Reset gradients back to zero for this iteration
        optimizer.zero_grad()

        # Get inputs and outputs, and move them to the device
        batch_inputs, batch_outputs = batch
        batch_inputs = batch_inputs.to(device)
        batch_outputs = batch_outputs.to(device)

        # Run our model & get outputs
        outputs = model(batch_inputs)

        # Calculate reconstruction loss
        batch_loss = criterion(outputs, batch_outputs)

        # Backprop
        batch_loss.backward()

        # Apply the parameter update
        optimizer.step()

        # Add the batch's loss to the total loss for the epoch
        train_loss += batch_loss.item()

    # Compute the average loss over the batches of this epoch
    train_loss = train_loss / len(train_dataloader)
    all_train_loss.append(train_loss)

    # Print Metrics
    print(
        f"\nEpoch: {epoch+1}/{epochs}\nTrain Loss = {train_loss}"
    )

# Plot Loss

In [None]:
plt.rcParams["font.size"] = 10
fig, ax = plt.subplots(figsize=(8, 6), dpi=100)
# Plot against the number of recorded epochs, not the configured `epochs`, so the
# figure still renders if training was stopped early.
ax.plot(range(len(all_train_loss)), all_train_loss, label="Train Loss")
ax.set_xlim(0, epochs)
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
fig.tight_layout()

# Generate Embeddings

In [None]:
# Switch the model to evaluation mode before reading out the embeddings.
model.eval()

In [None]:
# Look up every embedding in a single batched call instead of one call (and one
# device round-trip) per word.
with torch.no_grad():
    all_latents = model.embedding(input_data.to(device)).detach().cpu()

# Map each word to its learned vector (a 1-D row of `all_latents`).
latent_vectors = {
    index2word[idx]: row
    for idx, row in zip(input_data.tolist(), all_latents.unbind(0))
}

In [None]:
# Number of words that received a learned vector.
len(latent_vectors)

In [None]:
# Spot-check one learned vector (raises KeyError if 'the' is not in the vocabulary).
latent_vectors['the']

# Save Model & Embeddings

In [None]:
# Persist the model's state_dict (its parameters) to the configured path.
torch.save(model.state_dict(), model_output_dir)

In [None]:
# Need to convert the embeddings into the glove format
# word dim1 dim2 dim3 dim4 ... dimX
# Iterating the dict items directly (no enumerate) lets tqdm see the length and
# show a real progress total.
lines = [
    ' '.join([word, *(str(x) for x in vector.tolist())])
    for word, vector in tqdm(latent_vectors.items())
]

In [None]:
# Write one "word v1 v2 ..." row per line; rows are newline-joined, so the file
# has no trailing newline.
with open(embedding_output_dir, "w", encoding="utf-8") as fp:
    fp.write("\n".join(lines))

In [None]:
# Display the experiment name used in the output file names.
experiment_name