# Massive Text Embedding Benchmark


https://huggingface.co/datasets/mteb/tweet_sentiment_extraction

In [None]:
from datasets import load_dataset
# Load the dataset by its Hugging Face Hub id.
# If access is refused, log in first, e.g. with `huggingface-cli login`.
ds = load_dataset("mteb/tweet_sentiment_extraction")

In [None]:
# Display the loaded dataset object to see what was downloaded.
ds

In [None]:
# Explore the dataset: look at the first example of the 'train' split.
ds['train'][0]

In [None]:
# Map each stringified label id to its text by walking the 'label' and
# 'label_text' columns of the train split in parallel.
label_map = dict(zip(map(str, ds['train']['label']), ds['train']['label_text']))
label_map

# Training a SentencePiece Tokenizer

In [None]:
import json
# Write every training text to corpus.txt, each followed by a newline;
# this file is the input of the tokenizer training below.
with open("corpus.txt", "w", encoding="utf-8") as corpus_file:
    corpus_file.writelines(example['text'] + "\n" for example in ds["train"])

In [None]:
import sentencepiece as spm
import json
import os


# Training configuration for the SentencePiece tokenizer:
# BPE model, 2048-piece vocabulary, byte fallback enabled, trained on corpus.txt.
# os.cpu_count() returns None when the CPU count cannot be determined, so fall
# back to a single thread instead of handing None to the trainer.
options = dict(
  input="corpus.txt",
  input_format="text",
  model_prefix="simple_nn_tok",
  model_type="bpe",
  vocab_size=2048,
  byte_fallback=True,
  num_threads=os.cpu_count() or 1
)

# Trailing semicolon suppresses the cell's output value.
spm.SentencePieceTrainer.train(**options);



In [None]:
# Load the tokenizer model produced by the training step.
sp = spm.SentencePieceProcessor()
sp.load('simple_nn_tok.model')

# Pair every vocabulary piece with its integer id, in id order.
piece_count = sp.get_piece_size()
vocab = [[sp.id_to_piece(piece_id), piece_id] for piece_id in range(piece_count)]

# Peek at a slice from the middle of the vocabulary.
vocab[1000:1020]

# Train Test Split

In [None]:
# Hold out the last N_VAL examples of the 'train' split for validation and
# train on everything before them. Text and label slices use the same bounds,
# so every text stays paired with its own label and the two sets are disjoint.
N_VAL = 1000
train_text = ds['train']['text'][:-N_VAL]
train_labels = ds['train']['label'][:-N_VAL]
val_text = ds['train']['text'][-N_VAL:]
val_labels = ds['train']['label'][-N_VAL:]

In [None]:
# Encode every text with the SentencePiece tokenizer, one result per text.
tokenized_train_text = list(map(sp.encode, train_text))
tokenized_val_text = list(map(sp.encode, val_text))

In [None]:
# Size of the tokenizer (len(sp)); used below as the number of embedding rows.
nwords = len(sp)
ntags = 3 # Number of output classes: the labels are 0, 1 and 2

# Create the Embedding Layer

First we will look at one-hot encoding; then we will create the embedding layer.

In [None]:
# Shell escape: install PyTorch with pip (no version is pinned here).
!pip install torch

In [None]:
import torch

In [None]:
# Show the token ids of the first training example.
print(tokenized_train_text[0])

In [None]:
import torch.nn.functional as F
# Turn the first tokenized example into a 1-D tensor of token ids (length T),
# then expand every id into a one-hot row with nwords columns.
x = torch.tensor(tokenized_train_text[0], dtype=torch.long)  # (T,)
one_hot = F.one_hot(x, num_classes=nwords).float()            # (T, nwords)
print(one_hot)

In [None]:
import torch.nn as nn

# Trainable matrix drawn from a standard normal: one 64-dimensional row per vocabulary entry.
weight = nn.Parameter(torch.randn(nwords, 64))
weight

In [None]:
# Shape of the weight matrix: (nwords, 64).
weight.shape

In [None]:
# One-hot rows for (at most) the first five token ids of x; slicing the ids
# first yields the same rows without encoding the rest of the sequence.
xs = F.one_hot(x[:5], num_classes=nwords)

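`x` is a vector of token IDs like [273, 1989, 1974, 356, 339, ...] and `nwords` is the vocabulary size (2048 for the SentencePiece tokenizer trained above), so `xs` holds the one-hot rows of the first five tokens of `x`, with shape `(5, nwords)`.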

In [None]:
# Shape of the one-hot matrix: (number of selected tokens, nwords).
xs.shape

In [None]:
# Multiplying one-hot rows by the weight matrix picks out the matching rows of `weight`,
# giving one 64-dimensional vector per token.
torch.matmul(xs.float(), weight).shape

# Equivalent (and faster) form

You can skip the one-hot and directly use nn.Embedding:

In [None]:
import torch.nn as nn

# nn.Embedding is indexed directly with token ids, so no one-hot matrix is built.
embedding = nn.Embedding(num_embeddings=nwords, embedding_dim=64)
out = embedding(x[:5])
print(out.shape)  # torch.Size([5, 64]) when x has at least five tokens


In [None]:
class Simple_NN(torch.nn.Module):
    """Bag-of-tokens classifier without a hidden layer.

    Every token id is mapped to a vector of per-label scores, and the scores
    of all tokens in the input are summed to form the logits.
    """

    def __init__(self, vocab_size, num_labels):
        """Create the score table: one row per vocabulary entry, one column per label."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, num_labels)
        # Replace the default embedding init with Xavier-uniform.
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, tokens):
        """Return logits of shape [1, num_labels] for a 1-D tensor of token ids."""
        per_token_scores = self.embedding(tokens)   # [len(tokens) x num_labels]
        summed_scores = per_token_scores.sum(dim=0)  # [num_labels]
        return summed_scores.view(1, -1)



In [None]:
# Cross Entropy Loss Function
def ce_loss(logits, target):
    """Per-example cross-entropy (negative log-likelihood of the target class).

    Args:
        logits: float tensor of shape [batch, num_classes] with unnormalised scores.
        target: class indices, either a 1-D sequence/tensor with one index per
            row of ``logits`` or a single scalar index applied to every row.

    Returns:
        Tensor of shape [batch, 1] for 1-D targets (so [1, 1] for a single
        example), or shape [batch] for a scalar target.
    """
    log_probs = torch.nn.functional.log_softmax(logits, dim=1)
    target = torch.as_tensor(target, dtype=torch.long, device=log_probs.device)
    if target.dim() == 0:
        # Scalar index: select that class column for every row.
        return -log_probs[:, target]
    # Pick, for each row, the log-probability of that row's own target class.
    # Plain `log_probs[:, target]` would instead pair every row with every
    # target and produce a [batch, batch] matrix once batch > 1.
    return -log_probs.gather(1, target.reshape(-1, 1))

You can use the SGD (Stochastic Gradient Descent) optimizer that was introduced in class, or Adam, an optimizer that typically works better (we'll see it in a later class).

In [None]:
# Pair each tokenized text with its label as (token_ids, label) tuples.
# NOTE: zip stops at the shorter input, so the text and label lists must have equal length.
train_data = list(zip(tokenized_train_text, train_labels))
val_data   = list(zip(tokenized_val_text,   val_labels))

In [None]:
import random
import time

# Train Simple_NN one example at a time for 5 passes over train_data, reporting
# the mean training loss and the validation accuracy after every pass.

# initialize the model
model = Simple_NN(nwords, ntags)
criterion = ce_loss
# optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for ITER in range(5):
    # Perform training
    model.train()
    random.shuffle(train_data)  # shuffles the list in place
    train_loss = 0.0
    start = time.time()
    # Loop variables are named token_ids/label (not x/y) so they do not
    # overwrite the demo tensor `x` that earlier cells rely on.
    for token_ids, label in train_data:
        inputs = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor([label])
        logits = model(inputs)
        loss = criterion(logits, target)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (
                ITER, train_loss/len(train_data), time.time()-start))
    # Perform validation
    model.eval()
    test_correct = 0.0
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for token_ids, label in val_data:
            inputs = torch.tensor(token_ids, dtype=torch.long)
            predict = model(inputs)[0].argmax().item()
            if predict == label:
                test_correct += 1
    print("iter %r: valid acc=%.4f" % (ITER, test_correct/len(val_data)))



In [None]:
import tiktoken

# Fetch the tiktoken encoding registered under the name "gpt2".
enc = tiktoken.get_encoding("gpt2")

In [None]:
# Encode every text of the train split with tiktoken and pair it with its label
# as (token_ids, label) tuples.
data=list(zip([enc.encode(text) for text in ds['train']['text']], ds['train']['label']))

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed random_state.
# NOTE: this rebinds train_data / val_data, replacing the SentencePiece-based lists defined earlier.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# The vocabulary size now comes from the tiktoken encoding.
nwords = enc.n_vocab
ntags = 3 # Keep in mind that we have 3 different labels: 0, 1, 2


In [None]:

# Train a fresh Simple_NN on the current train_data for 5 passes, reporting the
# mean training loss and the validation accuracy after every pass.

# initialize the model
model = Simple_NN(nwords, ntags)
criterion = ce_loss
# optimizer = torch.optim.SGD(model.parameters(), lr=5e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for ITER in range(5):
    # Perform training
    model.train()
    random.shuffle(train_data)  # shuffles the list in place
    train_loss = 0.0
    start = time.time()
    # Loop variables are named token_ids/label (not x/y) so they do not
    # overwrite the demo tensor `x` that earlier cells rely on.
    for token_ids, label in train_data:
        inputs = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor([label])
        logits = model(inputs)
        loss = criterion(logits, target)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (
                ITER, train_loss/len(train_data), time.time()-start))
    # Perform validation
    model.eval()
    test_correct = 0.0
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for token_ids, label in val_data:
            inputs = torch.tensor(token_ids, dtype=torch.long)
            predict = model(inputs)[0].argmax().item()
            if predict == label:
                test_correct += 1
    print("iter %r: valid acc=%.4f" % (ITER, test_correct/len(val_data)))

# CBOW model for word2vec

In [None]:
class CBoW(torch.nn.Module):
    """Continuous bag-of-words classifier.

    Token embeddings are summed into a single vector, which a linear layer
    maps to one logit per label.
    """

    def __init__(self, vocab_size, num_labels, emb_size):
        """Build the embedding table and the linear output layer."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.output_layer = nn.Linear(emb_size, num_labels)

        # Xavier-uniform init for both weight matrices, embedding first;
        # the linear layer's bias keeps its default initialisation.
        for layer in (self.embedding, self.output_layer):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, tokens):
        """Return logits of shape [1 x num_labels] for a 1-D tensor of token ids."""
        pooled = self.embedding(tokens).sum(dim=0)      # [emb_size]
        return self.output_layer(pooled.view(1, -1))    # [1 x num_labels]

In [None]:
# Train the CBoW classifier one example at a time for 5 passes over train_data,
# reporting the mean training loss and the validation accuracy after every pass.
EMB_SIZE=32
model = CBoW(nwords, ntags, EMB_SIZE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

for ITER in range(5):
    random.shuffle(train_data)  # shuffles the list in place
    train_loss = 0.0
    start = time.time()
    model.train()
    # Loop variables are named token_ids/label (not x/y) so they do not
    # overwrite the demo tensor `x` that earlier cells rely on.
    for token_ids, label in train_data:
        inputs = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor([label])
        logits = model(inputs)
        loss = criterion(logits, target)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (
                ITER, train_loss/len(train_data), time.time()-start))
    model.eval()
    # Perform validation
    test_correct = 0.0
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        for token_ids, label in val_data:
            inputs = torch.tensor(token_ids, dtype=torch.long)
            predict = model(inputs)[0].argmax().item()
            if predict == label:
                test_correct += 1
    print("iter %r: validation acc=%.4f" % (ITER, test_correct/len(val_data)))

In [None]:
# Classify a single hand-written tweet with the most recently trained model.
tweet = "I hate NLP!"
# Encode with the tiktoken encoder, the same one used to build the training data.
tokens = torch.tensor(enc.encode(tweet), dtype=torch.long)
# model(...) returns [1 x num_labels]; take row 0 and drop it from the autograd graph.
logits = model(tokens)[0].detach()
# Predicted label id = index of the largest logit.
predict = logits.argmax().item()
predict

In [None]:
# Translate the predicted label id into its text; label_map is keyed by str(label id).
label_map[str(predict)]

# Visualizing embeddings (fixed, PyTorch + tiktoken)

This will:

- Grab the trained embedding matrix from the PyTorch model.

- Reduce it to 2D with PCA.

- Plot the most frequent tokens found in your training data (to keep the plot readable).

- Decode tokens with tiktoken (note: GPT-2 BPE tokens are often subword pieces; many start with a leading space).

In [None]:
import numpy as np

In [None]:
import math, string
from collections import Counter, defaultdict

# --- define stopword set and helper function ---
# Lower-case words that is_meaningful() rejects. Besides common function words,
# the set also lists "good", "too" and "like".
stop_words = {
    "the","a","an","and","or","to","of","in","on","at","for","is","it","this",
    "that","with","as","by","from","are","was","be","were","so","if","but","not",
    "i","you","he","she","we","they","me","my","your","our","their", "good", "too", "like"
}

def is_meaningful(tok):
    """Tell whether a token id decodes to something worth keeping.

    The token is decoded with ``enc``, stripped and lower-cased. It is rejected
    when the result is empty, consists only of punctuation characters, or
    appears in ``stop_words``; otherwise it is accepted.
    """
    piece = enc.decode([tok]).strip().lower()
    if piece == "":
        return False
    only_punctuation = all(ch in string.punctuation for ch in piece)
    return not only_punctuation and piece not in stop_words

In [None]:
# --- Pure PyTorch PCA + Matplotlib (no .numpy())
import matplotlib.pyplot as plt
from collections import Counter
import torch
import math
from collections import defaultdict
# 1) count how often each token id occurs across the training examples
freq = Counter()
for x_ids, _ in train_data:
    freq.update(x_ids)

# take top-N among most-common after simple filter:
# scan the 1000 most frequent token ids, keep those accepted by is_meaningful,
# and retain the first N of them
N = 30
top_tokens = [tok for tok, _ in freq.most_common(1000) if is_meaningful(tok)][:N]

# 2) PCA with pure torch (no numpy/sklearn)
with torch.no_grad():
    E = model.embedding.weight.detach().cpu()            # [vocab, emb_dim]
    # Subtract the per-dimension mean before projecting.
    E_centered = E - E.mean(dim=0, keepdim=True)
    # Only V (the projection directions) is used below; U and S are unused.
    # NOTE(review): torch.pca_lowrank centers its input by default, so the explicit
    # centering above is presumably redundant — confirm against the PyTorch docs.
    U, S, V = torch.pca_lowrank(E_centered, q=2)
    reduced = E_centered @ V[:, :2]                      # [vocab, 2]

# 3) plot
# 2-D coordinates of the selected tokens only.
xs = reduced[top_tokens, 0].tolist()
ys = reduced[top_tokens, 1].tolist()

plt.figure(figsize=(8, 8))
plt.scatter(xs, ys, s=10)

# Annotate each point with its decoded token text; when the text is empty after
# stripping, fall back to a placeholder built from the token id.
for tok, x, y in zip(top_tokens, xs, ys):
    label = enc.decode([tok]).replace("\n", " ").strip() or f"␀{tok}"
    plt.annotate(label, xy=(x, y), xytext=(3, 2), textcoords="offset points")

plt.title("CBOW Embeddings (Top Non-Stopword Tokens)")
plt.tight_layout()
plt.show()