# NLP Project - Word Embeddings

In [2]:
from datasets import load_dataset
import gensim.downloader as api
import gensim
import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import wandb

In [7]:
# Load the BoolQ dataset
dataset = load_dataset('google/boolq')
train_data = load_dataset("google/boolq", split="train[:-1000]")
validation_data = load_dataset("google/boolq", split="train[-1000:]")
test_data = load_dataset("google/boolq", split="validation")

print(train_data[0])
print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(validation_data)}")
print(f"Number of validation samples: {len(test_data)}")

{'question': 'do iran and afghanistan speak the same language', 'answer': True, 'passage': 'Persian (/ˈpɜːrʒən, -ʃən/), also known by its endonym Farsi (فارسی fārsi (fɒːɾˈsiː) ( listen)), is one of the Western Iranian languages within the Indo-Iranian branch of the Indo-European language family. It is primarily spoken in Iran, Afghanistan (officially known as Dari since 1958), and Tajikistan (officially known as Tajiki since the Soviet era), and some other regions which historically were Persianate societies and considered part of Greater Iran. It is written in the Persian alphabet, a modified variant of the Arabic script, which itself evolved from the Aramaic alphabet.'}
Number of training samples: 8427 | 4213.5%
Number of validation samples: 1000 | 500.0%
Number of validation samples: 3270 | 1635.0%


In [3]:
# Define model and training parameters
model_name = "word2vec-google-news-300"
model_path = "word2vec_google_news_300.model"
batch_size = 10
n_epochs = 1
learning_rate = 0.0001
MAX_SEQ_LENGTH = 100

In [4]:
# Check if the model file exists
try:
    # Load the model if it exists locally
    word2vec_model = gensim.models.KeyedVectors.load(model_path)
    print("Model loaded from local storage.")
except FileNotFoundError:
    # Download and save the model if it doesn't exist
    print("Downloading Word2Vec model...")
    word2vec_model = api.load(model_name)
    word2vec_model.save(model_path)  # Save the model locally
    print("Model downloaded and saved to local storage.")


Model loaded from local storage.


In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [5]:
# Preprocessing Functions
def preprocess_text(text):
    return text.lower()

def tokenize_text(text):
    return nltk.word_tokenize(text)

def handle_oov_tokens(tokens, word2vec_model, oov_token="<UNK>"):
    """Handle out-of-vocabulary tokens by replacing them with a specified token."""
    return [token if token in word2vec_model else oov_token for token in tokens]

def pad_or_truncate(tokens, max_length, pad_token="<PAD>"):
    """Pad or truncate the list of tokens to the specified maximum length."""
    if len(tokens) > max_length:
        return tokens[:max_length]
    else:
        return tokens + [pad_token] * (max_length - len(tokens))

def preprocess_pipeline(text, word2vec_model):
    """Pipeline for preprocessing text: cleaning, tokenizing, handling OOV tokens, and padding."""
    text = preprocess_text(text)
    tokens = tokenize_text(text)
    tokens = handle_oov_tokens(tokens, word2vec_model)
    tokens = pad_or_truncate(tokens, MAX_SEQ_LENGTH)
    return tokens


In [6]:
# Convert tokens to embeddings
def tokens_to_embeddings(tokens, word2vec_model, embedding_dim=300):
    """Convert a list of tokens to their corresponding embeddings."""
    embeddings = []
    for token in tokens:
        if token in word2vec_model:
            embeddings.append(word2vec_model[token])
        else:
            embeddings.append(np.zeros(embedding_dim))  # Use zero vector for OOV
    return np.array(embeddings)


In [20]:
class BoolQDataset(Dataset):
    def __init__(self, data, word2vec_model, max_seq_length=MAX_SEQ_LENGTH):
        self.data = data
        self.word2vec_model = word2vec_model
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data[idx]['question']
        passage = self.data[idx]['passage']
        
        # Use 'answer' instead of 'label'
        label = 1 if self.data[idx]['answer'] else 0  # Convert boolean to binary

        # Preprocess the question and passage
        question_tokens = preprocess_pipeline(question, self.word2vec_model)
        passage_tokens = preprocess_pipeline(passage, self.word2vec_model)

        # Convert tokens to embeddings
        question_embeddings = tokens_to_embeddings(question_tokens, self.word2vec_model)
        passage_embeddings = tokens_to_embeddings(passage_tokens, self.word2vec_model)

        # Concatenate embeddings
        embeddings = np.concatenate((question_embeddings, passage_embeddings), axis=0)

        # Ensure correct shape for input (1, 60000) if MAX_SEQ_LENGTH is 10
        return torch.tensor(embeddings, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


In [8]:
# Create DataLoaders
train_dataset = BoolQDataset(train_data, word2vec_model)
val_dataset = BoolQDataset(validation_data, word2vec_model)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [9]:
# Initialize WandB
wandb.init(project='nlp-word-embeddings', name='test-run-1')
wandb.config.learning_rate = learning_rate
wandb.config.epochs = n_epochs
wandb.config.batch_size = batch_size


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33maintnoair[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
# Define the neural network model
class TwoLayerNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TwoLayerNN, self).__init__()

        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [11]:
# Model Parameters
embedding_dim = 300
sequence_length = MAX_SEQ_LENGTH * 2  # Concatenate question and passage
input_dim = sequence_length * embedding_dim  # Adjusted input dimension
hidden_dim = 128
output_dim = 2  # Binary classification


In [12]:
# Instantiate the model and move it to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = TwoLayerNN(input_dim, hidden_dim, output_dim).to(device)


Using device: cuda


In [13]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Lower learning rate


In [14]:
# Modify the training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    model.train()
    
    for epoch in range(epochs):
        running_loss = 0.0
        
        # Training
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data
            
            # Flatten inputs
            inputs = inputs.view(inputs.size(0), -1).to(device)  # Reshape to (batch_size, 30000)
            labels = labels.to(device)  # Move labels to GPU
            
            # Zero the parameter gradients
            optimizer.zero_grad()
            
            # Forward pass
            outputs = model(inputs)  # Now inputs should be of shape (batch_size, 30000)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimize
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()

            # Log the loss to WandB
            if i % 10 == 0:  # Log every 10 steps
                wandb.log({"loss": loss.item()})
                print(f"Step [{i}], Loss: {loss.item():.4f}")
        
        # Log average loss for the epoch
        wandb.log({"epoch": epoch + 1, "average_loss": running_loss / len(train_loader)})
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")
    
    print("Finished Training")

In [15]:
# Evaluation Function
def evaluate_model(model, val_loader):
    model.eval()
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move to device
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = correct / total
    print(f'Validation Accuracy: {accuracy * 100:.2f}%')
    wandb.log({"validation_accuracy": accuracy})  # Log validation accuracy to WandB


In [16]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, epochs=n_epochs)


Step [0], Loss: 0.6934
Step [10], Loss: 0.6859
Step [20], Loss: 0.6344
Step [30], Loss: 0.6474
Step [40], Loss: 0.5504
Step [50], Loss: 0.6313
Step [60], Loss: 0.7589
Step [70], Loss: 0.7376
Step [80], Loss: 0.6591
Step [90], Loss: 0.6491
Step [100], Loss: 0.5686
Step [110], Loss: 0.8231
Step [120], Loss: 0.7227
Step [130], Loss: 0.5911
Step [140], Loss: 0.7782
Step [150], Loss: 0.5761
Step [160], Loss: 0.7498
Step [170], Loss: 0.5494
Step [180], Loss: 0.7995
Step [190], Loss: 0.5931
Step [200], Loss: 0.4515
Step [210], Loss: 0.5080
Step [220], Loss: 0.7135
Step [230], Loss: 0.8707
Step [240], Loss: 0.6733
Step [250], Loss: 0.5771
Step [260], Loss: 0.6548
Step [270], Loss: 0.6609
Step [280], Loss: 0.5912
Step [290], Loss: 0.6627
Step [300], Loss: 0.7234
Step [310], Loss: 0.8221
Step [320], Loss: 0.6096
Step [330], Loss: 0.8084
Step [340], Loss: 0.5453
Step [350], Loss: 0.5535
Step [360], Loss: 0.6402
Step [370], Loss: 0.5939
Step [380], Loss: 0.4990
Step [390], Loss: 0.4730
Step [400],

In [19]:
# Evaluate the model
evaluate_model(model, val_loader)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2000x300 and 60000x128)

In [21]:
# Finish the WandB run
wandb.finish()


VBox(children=(Label(value='0.009 MB of 0.009 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
average_loss,▁
epoch,▁
loss,▅▄▃▆▃▆▃▂▇▃▃▅▆▃▃▃▂▂▃▄▇▄▃▆▅▅▄▃▄▅▃▃▃▂▃▁█▂▃▂

0,1
average_loss,0.64045
epoch,1.0
loss,0.46833
