# Use trained embeddings in downstream tasks
You can use the trained Word2Vec / fastText / GloVe embeddings as input features for various downstream NLP tasks such as text classification, named entity recognition, or machine translation. Here's a simple example of how to use the embeddings for text classification with a logistic regression model.

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize


Go to the Kaggle website and download the file "Reviews.csv" from the "Amazon Fine Food Reviews" dataset: https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews

In [None]:
import pandas as pd
# Load the reviews file into a DataFrame; the relative path means the CSV is
# looked up in the current working directory.
rev = pd.read_csv("Reviews.csv")
# Print the first rows as a quick look at the loaded columns.
print(rev.head())

# We create the list of words in our corpus

In [None]:
# Join the first 1000 reviews with a real newline. The separator used to be
# the letter 'n', which glued neighbouring reviews together and injected a
# stray character into the text. astype(str) keeps the join from failing on
# non-string cells (e.g. missing values).
corpus_text = '\n'.join(rev[:1000]['Text'].astype(str))

# One list of lower-cased word tokens per sentence of the corpus.
data = []
for sentence in sent_tokenize(corpus_text):
    data.append([token.lower() for token in word_tokenize(sentence)])

In [None]:
import re
from tqdm import tqdm
import numpy as np
from pathlib import Path


In [None]:
import kagglehub

# Download latest version
# The value returned by kagglehub is kept in `path` and printed below as the
# location of the dataset files.
path = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")

print("Path to dataset files:", path)

In [None]:
# Notebook display: show the current value of `path`.
path

In [None]:
# Search the download directory recursively for the Word2Vec binary.
# next(..., None) takes the first match (the same element list(...)[0] would
# pick) and lets us fail with an explicit message instead of a bare
# IndexError when the file is not there.
emb_path = next(Path(path).rglob("GoogleNews-vectors-negative300.bin"), None)
if emb_path is None:
    raise FileNotFoundError(
        f"GoogleNews-vectors-negative300.bin not found under {path}"
    )
print("Embedding file:", emb_path)

In [None]:
def load_word2vec_binary(path, limit=None):
    """Load GoogleNews .bin Word2Vec file (no gensim).

    The file is read as: one text header line ``"<vocab_size> <dim>"``,
    followed by one entry per word made of the UTF-8 word, a single space and
    ``dim`` little-endian float32 values. A newline between entries is
    accepted but not required.

    Args:
        path: Path of the binary embedding file.
        limit: Maximum number of words to load. ``None`` loads every word
            announced by the header; ``0`` loads nothing.

    Returns:
        A ``(word_vectors, dim)`` tuple where ``word_vectors`` maps each word
        to a read-only float32 array of length ``dim``.

    Raises:
        ValueError: If the header line does not hold exactly two integers.
    """
    # The progress bar is cosmetic: fall back to plain iteration when tqdm is
    # not installed so the loader itself has no hard dependency on it.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    with open(path, "rb") as f:
        header = f.readline()
        vocab_size, dim = map(int, header.split())
        print(f"Header: {vocab_size} words, dim={dim}")

        # `limit is None` (not `not limit`) so that limit=0 means "no words".
        n_words = vocab_size if limit is None else min(limit, vocab_size)
        vec_bytes = dim * 4
        float_le = np.dtype("<f4")  # the format stores little-endian float32

        word_vectors = {}
        for _ in progress(range(n_words)):
            # Read the word up to the separating space. Newlines are skipped
            # here rather than consumed unconditionally after the vector:
            # files without a newline between entries would otherwise lose
            # the first byte of every following word.
            chars = []
            ch = f.read(1)
            while ch not in (b" ", b""):
                if ch != b"\n":
                    chars.append(ch)
                ch = f.read(1)

            raw = f.read(vec_bytes) if ch else b""
            if not ch or len(raw) != vec_bytes:
                # The file ended before the announced number of entries:
                # keep what was read instead of storing empty/short vectors.
                print(
                    "Warning: file ended early; loaded "
                    f"{len(word_vectors)} of {n_words} requested words"
                )
                break

            word = b"".join(chars).decode("utf-8", errors="ignore")
            word_vectors[word] = np.frombuffer(raw, dtype=float_le)
        return word_vectors, dim

In [None]:
# Demo setting: read only the first 200,000 entries of the embedding file
# to keep loading time and memory manageable.
w2v, dim = load_word2vec_binary(emb_path, limit=200_000)
print("Loaded words:", len(w2v))

# The simplest representation of a document is the average of its word vectors

In [None]:
def tokenize(text):
    """Lower-case *text* and return its runs of letters/apostrophes as a list.

    Anything that is not an ASCII letter or an apostrophe acts as a
    separator, so digits and punctuation never appear in the result.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[A-Za-z']+", lowered)]


In [None]:
# Work on the first 1000 reviews; astype(str) guards against non-string cells.
reviews = rev.loc[:999, 'Text'].astype(str)
corpus_text = '\n'.join(reviews)

# Tokenize every review on its own so that data[i] is the token list of
# review i. Sentence-splitting the joined corpus yielded one entry per
# *sentence*, which did not line up with the per-review labels below.
# word_tokenize already splits the text into sentences internally.
data = [[token.lower() for token in word_tokenize(text)] for text in reviews]

# --- 4. Labels ---
# One label per review: the 'Score' column rendered as a string class name.
labels = rev.loc[:999, 'Score'].astype(str).tolist()
print("Number of samples:", len(data))
print("Number of labels:", len(labels))

In [None]:
def doc_vector(tokens, w2v, dim):
    """Return the element-wise mean of the embeddings of the known tokens.

    Tokens missing from ``w2v`` are ignored. When no token is known (or
    ``tokens`` is empty) a zero vector of length ``dim`` is returned.
    """
    known = []
    for token in tokens:
        if token in w2v:
            known.append(w2v[token])

    if known:
        return np.mean(known, axis=0)
    return np.zeros(dim)

In [None]:
# Feature matrix: one averaged embedding per entry of `data` (at most the
# first 1000 entries), stacked row-wise.
# NOTE(review): row i of X is paired with labels[i] below, so `data` must hold
# one token list per labelled review - confirm against the cell that builds it.
X = np.vstack([doc_vector(doc_tokens, w2v, dim) for doc_tokens in data[:1000]])
# Labels as a NumPy array.
y = np.asarray(labels)

print("Feature matrix shape:", X.shape)

In [None]:
# Notebook display: number of entries in the label array.
len(y)

# Example classification task

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing, with a fixed seed for
# reproducibility. Stratify by label only when more than one class is present.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=None if len(set(y)) <= 1 else y,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


# Fit a logistic-regression classifier on the training split and predict the
# held-out split.
clf = LogisticRegression(max_iter=200, n_jobs=-1).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report over every label that occurs in the true or the predicted labels,
# in sorted order; zero_division=0 reports 0 for undefined metrics.
labels_for_cm = sorted(set(y_test).union(y_pred))

print(
    classification_report(
        y_test,
        y_pred,
        labels=labels_for_cm,
        zero_division=0,
    )
)
