# Embedding Comparison for Multi-label Text Classification

This notebook trains a simple PyTorch feed-forward network on dense embeddings
from four methods: **Word2Vec**, **GloVe**, **FastText**, and **BERT (Sentence-Transformer)**.

In [None]:
import os
import re
import random
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

import gensim
from gensim.models import Word2Vec, KeyedVectors, FastText
from sentence_transformers import SentenceTransformer

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Config
DATA_CSV = "C:\\Deep learning Lab\\Dataset\\Consumer Review of Clothing Product\\data_amazon.xlsx - Sheet1.csv"  # <-- put Kaggle CSV here
GLOVE_PATH = "Dataset\\glove.6B.100d.txt" # optional
WORD2VEC_BIN = None   # path to GoogleNews binary if you have it
FASTTEXT_VEC = None  # path to fasttext .vec if you have it
SENT_TRANS_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
BATCH_SIZE = 64
EPOCHS = 6
LR = 1e-3
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)


Using device: cpu


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# ---------- Load data ----------
if not os.path.exists(DATA_CSV):
    raise FileNotFoundError(f"Dataset not found at {DATA_CSV}. Download from Kaggle and place the CSV there.")
df = pd.read_csv(DATA_CSV)
print('Dataset shape:', df.shape)
print('Columns:', df.columns.tolist())

if 'Review Text' in df.columns:
    df['text'] = df['Review Text'].astype(str)
elif 'Review' in df.columns:
    df['text'] = df['Review'].astype(str)
else:
    # try common alternatives
    possible = [c for c in df.columns if 'review' in c.lower()]
    if len(possible) > 0:
        df['text'] = df[possible[0]].astype(str)
        print('Using column', possible[0], 'as text')
    else:
        raise ValueError('Cannot find a review text column in the CSV.')

df = df.dropna(subset=['text']).reset_index(drop=True)
print('After dropna, rows:', len(df))


Dataset shape: (49338, 9)
Columns: ['Title', 'Review', 'Cons_rating', 'Cloth_class', 'Materials', 'Construction', 'Color', 'Finishing', 'Durability']
After dropna, rows: 49338


In [15]:
# ---------- Build multi-label targets ----------
labels_list = []
for _, row in df.iterrows():
    lbls = []
    if 'Recommended IND' in df.columns:
        try:
            if int(row.get('Recommended IND', 0)) == 1:
                lbls.append('RECOMMEND')
        except Exception:
            pass
    if 'Department Name' in df.columns:
        lbls.append('DEPT__' + str(row.get('Department Name', 'UNKNOWN')))
    if 'Class Name' in df.columns:
        lbls.append('CLASS__' + str(row.get('Class Name', 'UNKNOWN')))
    labels_list.append(lbls)

mlb = MultiLabelBinarizer(sparse_output=False)
Y = mlb.fit_transform(labels_list)
print('Number of labels:', Y.shape[1])
print('Some label classes:', mlb.classes_[:10])


Number of labels: 0
Some label classes: []


In [16]:
# ---------- Train/Val/Test split & preprocessing ----------
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=SEED)
train_idx, val_idx = train_test_split(train_idx, test_size=0.125, random_state=SEED)
print('Split sizes:', len(train_idx), len(val_idx), len(test_idx))

texts = df['text'].tolist()

def simple_clean(s):
    s = str(s).lower()
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    return s.strip()

tokens = [word_tokenize(simple_clean(t)) for t in texts]
print('Sample tokenized text:', tokens[0][:20])


Split sizes: 34536 4934 9868
Sample tokenized text: ['absolutely', 'wonderful', 'silky', 'and', 'sexy', 'and', 'comfortable']


In [17]:
# ---------- Embedding utilities ----------
def average_vector(tokens_list, model_dict_or_kv, dim):
    vecs = []
    for w in tokens_list:
        if w in model_dict_or_kv:
            vecs.append(model_dict_or_kv[w])
    if len(vecs) == 0:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vecs, axis=0)

def load_glove(glove_path):
    emb = {}
    with open(glove_path, 'r', encoding='utf8', errors='ignore') as f:
        for line in tqdm(f, desc='Loading GloVe'):
            parts = line.strip().split()
            word = parts[0]
            vec = np.array(parts[1:]).astype(np.float32)
            emb[word] = vec
    dim = len(next(iter(emb.values())))
    print('Loaded GloVe dim:', dim)
    return emb, dim

def build_glove_embeddings(glove_path):
    glove_emb, dim = load_glove(glove_path)
    embs = np.vstack([average_vector(tok, glove_emb, dim) for tok in tokens])
    return embs, dim

def train_word2vec(tokens_corpus, size=300, window=5, min_count=2, epochs=10):
    model = Word2Vec(sentences=tokens_corpus, vector_size=size, window=window, min_count=min_count, workers=4, epochs=epochs, seed=SEED)
    return model

def build_word2vec_embeddings(pretrained_bin_path=None, train_on_corpus=True, size=300):
    if pretrained_bin_path and os.path.exists(pretrained_bin_path):
        print('Loading pretrained Word2Vec KeyedVectors ...')
        kv = KeyedVectors.load_word2vec_format(pretrained_bin_path, binary=True)
        dim = kv.vector_size
        embs = np.vstack([average_vector(tok, kv, dim) for tok in tokens])
        return embs, dim
    else:
        print('Training Word2Vec on corpus...')
        w2v = train_word2vec(tokens, size=size)
        dim = size
        embs = np.vstack([average_vector(tok, w2v.wv, dim) for tok in tokens])
        return embs, dim

def train_fasttext(tokens_corpus, size=300, window=5, min_count=2, epochs=10):
    model = FastText(sentences=tokens_corpus, vector_size=size, window=window, min_count=min_count, workers=4, epochs=epochs, seed=SEED)
    return model

def build_fasttext_embeddings(pretrained_vec_path=None, train_on_corpus=True, size=300):
    if pretrained_vec_path and os.path.exists(pretrained_vec_path):
        print('Loading pretrained FastText vectors ...')
        kv = KeyedVectors.load_word2vec_format(pretrained_vec_path, binary=False)
        dim = kv.vector_size
        embs = np.vstack([average_vector(tok, kv, dim) for tok in tokens])
        return embs, dim
    else:
        print('Training FastText on corpus...')
        ft = train_fasttext(tokens, size=size)
        dim = size
        embs = np.vstack([average_vector(tok, ft.wv, dim) for tok in tokens])
        return embs, dim

def build_bert_embeddings(model_name=SENT_TRANS_MODEL, batch_size=128):
    print('Loading SentenceTransformer:', model_name)
    sbert = SentenceTransformer(model_name, device=DEVICE)
    embeddings = sbert.encode(texts, show_progress_bar=True, batch_size=batch_size, convert_to_numpy=True)
    return embeddings, embeddings.shape[1]


In [18]:
# ---------- PyTorch dataset & model ----------
class EmbeddingDataset(Dataset):
    def __init__(self, X_emb, Y, idxs):
        self.X = X_emb[idxs]
        self.Y = Y[idxs].astype(np.float32)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        return torch.from_numpy(self.X[i]).float(), torch.from_numpy(self.Y[i]).float()

class FeedForwardMLP(nn.Module):
    def __init__(self, input_dim, hidden_dims=[512,256], output_dim=1, dropout=0.3):
        super().__init__()
        layers = []
        cur = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(cur, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            cur = h
        layers.append(nn.Linear(cur, output_dim))
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(x)


In [19]:
# ---------- Training & evaluation helpers ----------
from sklearn.metrics import f1_score, precision_score, recall_score

def train_epoch(model, loader, opt, loss_fn):
    model.train()
    total_loss = 0.0
    for Xb, Yb in loader:
        Xb, Yb = Xb.to(DEVICE), Yb.to(DEVICE)
        opt.zero_grad()
        logits = model(Xb)
        loss = loss_fn(logits, Yb)
        loss.backward()
        opt.step()
        total_loss += loss.item() * Xb.size(0)
    return total_loss / len(loader.dataset)

def eval_model(model, loader, threshold=0.5):
    model.eval()
    Y_true = []
    Y_pred = []
    Y_scores = []
    with torch.no_grad():
        for Xb, Yb in loader:
            Xb = Xb.to(DEVICE)
            logits = model(Xb)
            probs = torch.sigmoid(logits).cpu().numpy()
            Y_scores.append(probs)
            preds = (probs >= threshold).astype(int)
            Y_pred.append(preds)
            Y_true.append(Yb.numpy().astype(int))
    Y_true = np.vstack(Y_true)
    Y_pred = np.vstack(Y_pred)
    Y_scores = np.vstack(Y_scores)
    f1_micro = f1_score(Y_true, Y_pred, average='micro', zero_division=0)
    f1_macro = f1_score(Y_true, Y_pred, average='macro', zero_division=0)
    precision = precision_score(Y_true, Y_pred, average='micro', zero_division=0)
    recall = recall_score(Y_true, Y_pred, average='micro', zero_division=0)
    aucs = []
    for i in range(Y_true.shape[1]):
        try:
            auc = roc_auc_score(Y_true[:,i], Y_scores[:,i])
        except Exception:
            auc = np.nan
        aucs.append(auc)
    mean_auc = np.nanmean(aucs)
    return dict(f1_micro=f1_micro, f1_macro=f1_macro, precision=precision, recall=recall, mean_auc=mean_auc)

def run_experiment(X_emb, emb_name='EMB', input_dim=None, epochs=EPOCHS):
    if input_dim is None:
        input_dim = X_emb.shape[1]
    train_ds = EmbeddingDataset(X_emb, Y, train_idx)
    val_ds = EmbeddingDataset(X_emb, Y, val_idx)
    test_ds = EmbeddingDataset(X_emb, Y, test_idx)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = FeedForwardMLP(input_dim=input_dim, hidden_dims=[512,256], output_dim=Y.shape[1], dropout=0.3).to(DEVICE)
    opt = torch.optim.Adam(model.parameters(), lr=LR)
    loss_fn = nn.BCEWithLogitsLoss()
    best_val = -np.inf
    best_state = None
    for ep in range(1, epochs+1):
        tr_loss = train_epoch(model, train_loader, opt, loss_fn)
        metrics_val = eval_model(model, val_loader)
        print(f"[{emb_name}] Epoch {ep}/{epochs} train_loss={tr_loss:.4f} val_f1={metrics_val['f1_micro']:.4f} val_auc={metrics_val['mean_auc']:.4f}")
        if metrics_val['f1_micro'] > best_val:
            best_val = metrics_val['f1_micro']
            best_state = model.state_dict()
    model.load_state_dict(best_state)
    metrics_test = eval_model(model, test_loader)
    print(f"[{emb_name}] TEST: f1_micro={metrics_test['f1_micro']:.4f}, f1_macro={metrics_test['f1_macro']:.4f}, mean_auc={metrics_test['mean_auc']:.4f}")
    return metrics_test


In [22]:
# ---------- Run experiments (GloVe, Word2Vec, FastText, BERT) ----------
results = []

# If Y has zero columns (no labels were created earlier), try to build sensible labels
if Y.shape[1] == 0:
    print("Y has zero columns. Rebuilding labels from available dataframe columns ('Cloth_class' and 'Cons_rating')...")
    labels_list = []
    for _, row in df.iterrows():
        lbls = []
        if 'Cloth_class' in df.columns and pd.notna(row.get('Cloth_class')):
            lbls.append('CLASS__' + str(row.get('Cloth_class')))
        if 'Cons_rating' in df.columns and pd.notna(row.get('Cons_rating')):
            try:
                if float(row.get('Cons_rating')) >= 4.0:
                    lbls.append('RATING_GE4')
            except Exception:
                pass
        # ensure at least one label per sample to avoid empty rows for MultiLabelBinarizer
        if len(lbls) == 0:
            lbls.append('NO_LABEL')
        labels_list.append(lbls)
    mlb = MultiLabelBinarizer(sparse_output=False)
    Y = mlb.fit_transform(labels_list)
    print('Rebuilt labels. Number of labels:', Y.shape[1], 'Some classes:', mlb.classes_[:10])

    # recompute splits because label matrix changed (optional but safer)
    train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=SEED)
    train_idx, val_idx = train_test_split(train_idx, test_size=0.125, random_state=SEED)
    print('Recomputed split sizes:', len(train_idx), len(val_idx), len(test_idx))

# Helper to safely try an experiment and continue on error
def try_run(name, build_fn, *build_args, **build_kwargs):
    try:
        X_emb, dim = build_fn(*build_args, **build_kwargs)
        metrics = run_experiment(X_emb, emb_name=name, input_dim=dim)
        results.append({'embedding': name, 'dim': dim, **metrics})
    except Exception as e:
        print(f"[{name}] Failed: {e}")

# 1) GloVe (optional)
if os.path.exists(GLOVE_PATH):
    print('Building GloVe embeddings...')
    try_run('GloVe', build_glove_embeddings, GLOVE_PATH)
else:
    print('GloVe not found at', GLOVE_PATH)

# 2) Word2Vec (train on corpus or load if provided)
print('\nWord2Vec (trained on corpus or loaded pretrained if path set)')
try:
    X_w2v, dim_w2v = build_word2vec_embeddings(pretrained_bin_path=WORD2VEC_BIN, train_on_corpus=True, size=300)
    res_w2v = run_experiment(X_w2v, emb_name='Word2Vec', input_dim=dim_w2v)
    results.append({'embedding':'Word2Vec','dim':dim_w2v, **res_w2v})
except Exception as e:
    print('[Word2Vec] Failed:', e)

# 3) FastText (train on corpus or load if provided)
print('\nFastText (trained on corpus or loaded pretrained if path set)')
try:
    X_ft, dim_ft = build_fasttext_embeddings(pretrained_vec_path=FASTTEXT_VEC, train_on_corpus=True, size=300)
    res_ft = run_experiment(X_ft, emb_name='FastText', input_dim=dim_ft)
    results.append({'embedding':'FastText','dim':dim_ft, **res_ft})
except Exception as e:
    print('[FastText] Failed:', e)

# 4) BERT / SentenceTransformer
print('\nBERT / SentenceTransformer')
try:
    X_bert, dim_bert = build_bert_embeddings(batch_size=64)
    res_bert = run_experiment(X_bert, emb_name='BERT-SBERT', input_dim=dim_bert)
    results.append({'embedding':'BERT-SBERT','dim':dim_bert, **res_bert})
except Exception as e:
    print('[BERT-SBERT] Failed:', e)

# Summarize results (if any succeeded)
if len(results) > 0:
    df_res = pd.DataFrame(results)
    print('\nResults summary:')
    print(df_res.sort_values('f1_micro', ascending=False))
    df_res.to_csv('embedding_comparison_results.csv', index=False)
    print('\nSaved results to embedding_comparison_results.csv')
else:
    print('No experiment completed successfully. Check earlier errors.')


Y has zero columns. Rebuilding labels from available dataframe columns ('Cloth_class' and 'Cons_rating')...
Rebuilt labels. Number of labels: 26 Some classes: ['CLASS__Blazer' 'CLASS__Blouses' 'CLASS__Casual bottoms'
 'CLASS__Chemises' 'CLASS__Dress' 'CLASS__Dresses' 'CLASS__Fine gauge'
 'CLASS__Intimates' 'CLASS__Jackets' 'CLASS__Jeans']
Recomputed split sizes: 34536 4934 9868
GloVe not found at Dataset\glove.6B.100d.txt

Word2Vec (trained on corpus or loaded pretrained if path set)
Training Word2Vec on corpus...




[Word2Vec] Epoch 1/6 train_loss=0.1367 val_f1=0.6495 val_auc=0.8226




[Word2Vec] Epoch 2/6 train_loss=0.1098 val_f1=0.6654 val_auc=0.8471




[Word2Vec] Epoch 3/6 train_loss=0.1059 val_f1=0.6697 val_auc=0.8671




[Word2Vec] Epoch 4/6 train_loss=0.1036 val_f1=0.6797 val_auc=0.8694




[Word2Vec] Epoch 5/6 train_loss=0.1016 val_f1=0.6837 val_auc=0.8783




[Word2Vec] Epoch 6/6 train_loss=0.1004 val_f1=0.6851 val_auc=0.8912
[Word2Vec] TEST: f1_micro=0.6809, f1_macro=0.2488, mean_auc=0.8465

FastText (trained on corpus or loaded pretrained if path set)
Training FastText on corpus...




[FastText] Epoch 1/6 train_loss=0.1452 val_f1=0.6113 val_auc=0.8154




[FastText] Epoch 2/6 train_loss=0.1186 val_f1=0.6287 val_auc=0.8453




[FastText] Epoch 3/6 train_loss=0.1140 val_f1=0.6396 val_auc=0.8457




[FastText] Epoch 4/6 train_loss=0.1114 val_f1=0.6469 val_auc=0.8666




[FastText] Epoch 5/6 train_loss=0.1095 val_f1=0.6596 val_auc=0.8741




[FastText] Epoch 6/6 train_loss=0.1078 val_f1=0.6591 val_auc=0.8806
[FastText] TEST: f1_micro=0.6591, f1_macro=0.1733, mean_auc=0.8323

BERT / SentenceTransformer
Loading SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 771/771 [07:22<00:00,  1.74it/s]


[BERT-SBERT] Epoch 1/6 train_loss=0.1161 val_f1=0.7299 val_auc=0.8860




[BERT-SBERT] Epoch 2/6 train_loss=0.0904 val_f1=0.7352 val_auc=0.9015




[BERT-SBERT] Epoch 3/6 train_loss=0.0867 val_f1=0.7379 val_auc=0.9143




[BERT-SBERT] Epoch 4/6 train_loss=0.0841 val_f1=0.7391 val_auc=0.9171




[BERT-SBERT] Epoch 5/6 train_loss=0.0821 val_f1=0.7422 val_auc=0.9286




[BERT-SBERT] Epoch 6/6 train_loss=0.0799 val_f1=0.7432 val_auc=0.9279
[BERT-SBERT] TEST: f1_micro=0.7419, f1_macro=0.4051, mean_auc=0.8861

Results summary:
    embedding  dim  f1_micro  f1_macro  precision    recall  mean_auc
2  BERT-SBERT  384  0.741893  0.405114   0.830700  0.670241  0.886084
0    Word2Vec  300  0.680893  0.248843   0.854617  0.565865  0.846452
1    FastText  300  0.659110  0.173262   0.856032  0.535845  0.832269

Saved results to embedding_comparison_results.csv
