<a href="https://colab.research.google.com/github/alexandrelombard/ai54-notebooks/blob/master/Tutorial_3_Word2Vec_Correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Basic neural networks

In [1]:
import torch
from torch import nn
from torch.nn import functional as F
import pandas as pd

In [2]:
df = pd.read_csv('height_weight_sex_training_set.csv')
df.head()

Unnamed: 0,Height,Weight,Sex
0,165.65,35.41,Female
1,148.53,74.45,Female
2,167.04,81.22,Male
3,161.54,71.47,Male
4,174.31,78.18,Male


In [3]:
# Preparing the inputs (we want a list of vectors of size 2)
heights = torch.tensor(df['Height'], dtype=torch.float32).unsqueeze(1)
weights = torch.tensor(df['Weight'], dtype=torch.float32).unsqueeze(1)
inputs = torch.cat((heights, weights), dim=1)

In [4]:
# Preparing the outputs (we want 1-hot encoded values for the two possible classes)
outputs = F.one_hot(torch.tensor(df['Sex'].replace('Female', 0).replace('Male', 1))).float()

  outputs = F.one_hot(torch.tensor(df['Sex'].replace('Female', 0).replace('Male', 1))).float()


In [5]:
# Defining the model (i.e. the neural network)
model = nn.Sequential(
    nn.Linear(2, 16),
    nn.Linear(16, 2)
)

In [6]:
# Training the model
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

epochs = 2000
for epoch in range(1, epochs + 1):
  logits = model(inputs)                # The model is applied on all the inputs
  loss = criterion(logits, outputs)     # The error is computed for all the predictions (logits) according to expected outputs

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  # Every 10 step we print the epoch and the loss so we can see the training
  if epoch % 10 == 0:
    print(f"Epoch: {epoch}, Loss: {loss}")

Epoch: 10, Loss: 4.1945085525512695
Epoch: 20, Loss: 0.7244903445243835
Epoch: 30, Loss: 0.8891106843948364
Epoch: 40, Loss: 0.811125636100769
Epoch: 50, Loss: 0.7132465243339539
Epoch: 60, Loss: 0.7075966000556946
Epoch: 70, Loss: 0.6959497928619385
Epoch: 80, Loss: 0.6986805200576782
Epoch: 90, Loss: 0.6912343502044678
Epoch: 100, Loss: 0.6892827749252319
Epoch: 110, Loss: 0.686430037021637
Epoch: 120, Loss: 0.6835747957229614
Epoch: 130, Loss: 0.6807171106338501
Epoch: 140, Loss: 0.677827000617981
Epoch: 150, Loss: 0.6748422384262085
Epoch: 160, Loss: 0.671753466129303
Epoch: 170, Loss: 0.6685654520988464
Epoch: 180, Loss: 0.6652814745903015
Epoch: 190, Loss: 0.6619141697883606
Epoch: 200, Loss: 0.6584964990615845
Epoch: 210, Loss: 0.6550701260566711
Epoch: 220, Loss: 0.6516693830490112
Epoch: 230, Loss: 0.6483179330825806
Epoch: 240, Loss: 0.6450304985046387
Epoch: 250, Loss: 0.6418138742446899
Epoch: 260, Loss: 0.6386656761169434
Epoch: 270, Loss: 0.6355686783790588
Epoch: 280, Lo

In [7]:
# Test on a single example, just to check
if model(torch.tensor([150.0, 60.0])).argmax().item() == 0:
  print('Female')
else:
  print('Male')

Female


# Word2Vec CBOW

In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import random

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
# Selecting the right device can speed up the computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use the GPU if available
print(device)

cuda


In [10]:
with open('romeo_and_juliet.txt') as file:
  content = file.read()

In [11]:
# Pre-processing of the text
content = content.lower()
tokens = word_tokenize(content)

# Remove punctuation
tokens = [word for word in tokens if word.isalpha()]

print(tokens[0:100])

['the', 'tragedy', 'of', 'romeo', 'and', 'juliet', 'by', 'william', 'shakespeare', 'dramatis', 'personae', 'chorus', 'escalus', 'prince', 'of', 'verona', 'paris', 'a', 'young', 'count', 'kinsman', 'to', 'the', 'prince', 'montague', 'heads', 'of', 'two', 'houses', 'at', 'variance', 'with', 'each', 'other', 'capulet', 'heads', 'of', 'two', 'houses', 'at', 'variance', 'with', 'each', 'other', 'an', 'old', 'man', 'of', 'the', 'capulet', 'family', 'romeo', 'son', 'to', 'montague', 'tybalt', 'nephew', 'to', 'lady', 'capulet', 'mercutio', 'kinsman', 'to', 'the', 'prince', 'and', 'friend', 'to', 'romeo', 'benvolio', 'nephew', 'to', 'montague', 'and', 'friend', 'to', 'romeo', 'tybalt', 'nephew', 'to', 'lady', 'capulet', 'friar', 'laurence', 'franciscan', 'friar', 'john', 'franciscan', 'balthasar', 'servant', 'to', 'romeo', 'abram', 'servant', 'to', 'montague', 'sampson', 'servant', 'to', 'capulet']


In [12]:
# Build a vocabulary and a dictionary so we have indices for each word
vocabulary = list(set(tokens))

word2idx = {}
for i in range(len(vocabulary)):
  word2idx[vocabulary[i]] = i

print(f"Vocabulary size: {len(vocabulary)}")

Vocabulary size: 3464


In [13]:
# Build the dataset
target_word_ids = []
context_words_ids = []

for position in range(2, len(tokens) - 2):
  target_word_ids.append(word2idx[tokens[position]])
  context_words_ids.append([
      word2idx[tokens[position-2]],
      word2idx[tokens[position-1]],
      word2idx[tokens[position+1]],
      word2idx[tokens[position+2]]
    ])

In [14]:
# Build the Word2Vec CBOW module
class Word2VecCBOW(nn.Module):
  def __init__(self, vocabulary_size, embedding_dim):
    super(Word2VecCBOW, self).__init__()
    # An embedding layer, to reduce the size of vectors from vocabulary size to the embedding dimension
    self.embeddings = nn.Embedding(vocabulary_size, embedding_dim)
    # An output layer, to have the probabilities for each target words from the embedding
    self.linear = nn.Linear(embedding_dim, vocabulary_size, bias=False)

  def forward(self, context):
    # Computing the embedding for the context words
    embed = self.embeddings(context)
    # Make an aggregation
    sum_embed = torch.sum(embed, dim=1)
    # Compute the output
    out = self.linear(sum_embed)

    return out

word2vec_cbow = Word2VecCBOW(len(vocabulary), 128).to(device)

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(word2vec_cbow.parameters())

In [20]:
# Train the model (it can be really slow, there is no optimization here except training with batches)
losses = []

for epoch in range(100):
  batch_size = 2000 # We compute the loss on batches of 2000 elements, to speed up the training process
  for position in range(0, len(tokens) - batch_size, batch_size):
    batch_input = context_words_ids[position:position + batch_size]
    batch_output = target_word_ids[position:position + batch_size]

    prediction = word2vec_cbow(torch.tensor(batch_input, device=device))
    loss = criterion(prediction, torch.tensor(batch_output, device=device))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

  print(f'Epoch #{epoch}, avg loss {torch.mean(torch.tensor(losses)).item()}')
  losses.clear()

Epoch #0, avg loss 1.3378357887268066
Epoch #1, avg loss 1.325903058052063
Epoch #2, avg loss 1.314109444618225
Epoch #3, avg loss 1.3024535179138184
Epoch #4, avg loss 1.2909343242645264
Epoch #5, avg loss 1.279550313949585
Epoch #6, avg loss 1.2683005332946777
Epoch #7, avg loss 1.2571839094161987
Epoch #8, avg loss 1.2461987733840942
Epoch #9, avg loss 1.2353442907333374
Epoch #10, avg loss 1.2246193885803223
Epoch #11, avg loss 1.2140225172042847
Epoch #12, avg loss 1.2035528421401978
Epoch #13, avg loss 1.1932088136672974
Epoch #14, avg loss 1.1829897165298462
Epoch #15, avg loss 1.1728936433792114
Epoch #16, avg loss 1.1629201173782349
Epoch #17, avg loss 1.1530675888061523
Epoch #18, avg loss 1.143335223197937
Epoch #19, avg loss 1.1337225437164307
Epoch #20, avg loss 1.1242305040359497
Epoch #21, avg loss 1.1148566007614136
Epoch #22, avg loss 1.105603814125061
Epoch #23, avg loss 1.096430778503418
Epoch #24, avg loss 1.0873879194259644
Epoch #25, avg loss 1.0784589052200317
Ep

In [22]:
with torch.no_grad(): # Disable the computation of gradients (useful for evaluation)
  # The weights of the embedding matrix are the embeddings of all words (the line #0 is embedding of word #0, etc.)
  embeddings = word2vec_cbow.embeddings.weight.detach()
  # Here we normalize the embeddings to be able to compute the cosine similarity by taking the dot product of embeddings
  normalized_embeddings = embeddings / torch.norm(embeddings, p=2, dim=1, keepdim=True)

  embedding = normalized_embeddings[word2idx['man']]

  # Efficiently compute the dot product of all lines of normalized_embeddings with embedding
  similarities = torch.mv(normalized_embeddings, embedding)

  # Get the 10 top indices (discard the values)
  _, top10 = torch.topk(similarities, 10, largest=True, sorted=True)

  print([vocabulary[idx.item()] for idx in top10])

['man', 'streaks', 'collars', 'sycamore', 'wings', 'italy', 'guest', 'our', 'drunk', 'desirest']
