<a href="https://colab.research.google.com/github/anupamamnair/anupamamnair.github.io/blob/master/UntitledTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
!pip install torch torchtext scikit-learn
!pip install gensim



In [36]:
import torch
import sys
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import gensim.downloader as api
from gensim.models import Word2Vec, FastText

In [37]:
# Training split, fetched with the 'headers', 'footers' and 'quotes' parts removed.
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Show the first training document together with its numeric label and category name.
sample_index = 0
sample_document = train_data.data[sample_index]
sample_label = train_data.target[sample_index]
sample_category_name = train_data.target_names[sample_label]

print(
    "\n--- Sample Document ---",
    f"\nDocument: {sample_document[:500]}...",
    f"\nNumeric Label: {sample_label}",
    f"\nCategory Name: {sample_category_name}",
    sep="\n",
)

# Test split, fetched with the same parts removed.
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# Raw texts and integer targets used by the rest of the notebook.
X_train_text, y_train = train_data.data, train_data.target
X_test_text, y_test = test_data.data, test_data.target


--- Sample Document ---

Document: I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail....

Numeric Label: 7

Category Name: rec.autos


In [38]:
# Set built from scikit-learn's ENGLISH_STOP_WORDS; preprocess() drops any token found in it.
stopwords = set(ENGLISH_STOP_WORDS)

def preprocess(text):
    """Turn a raw document into a list of cleaned tokens.

    The text is lower-cased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and the result is split on whitespace.
    Tokens that appear in the module-level ``stopwords`` set, or that are two
    characters or shorter, are discarded.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    kept = []
    for token in cleaned.split():
        if token not in stopwords and len(token) > 2:
            kept.append(token)
    return kept


# def preprocess(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z\s]', ' ', text)
#     tokens = text.split()
#     tokens = [w for w in tokens if w not in stopwords]
#     return tokens

# Both splits go through the same preprocess() function, so every model is
# compared on identically prepared tokens.

train_tokens = list(map(preprocess, X_train_text))
test_tokens = list(map(preprocess, X_test_text))

In [39]:
# Progress banner for the TF-IDF feature cell.
print("Creating features for TF-IDF")
# NOTE(review): TfidfVectorizer is already imported in the notebook's import cell,
# so this re-import is redundant.
from sklearn.feature_extraction.text import TfidfVectorizer

def identity(x):
    """Return the argument unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor`` so
    that documents which are already token lists pass through untouched.
    """
    return x

# TF-IDF over the pre-tokenised documents, capped at 20000 features.
# tokenizer/preprocessor are identity functions because the input is already a
# list of tokens per document; token_pattern=None tells scikit-learn that the
# regex tokeniser is intentionally unused.
# dtype=np.float32: the matrices are densified below and later cast to float32
# for PyTorch, so building them as float32 halves the memory of the dense arrays
# compared with the float64 default.
vectorizer = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=identity,
    token_pattern=None,
    lowercase=False,
    max_features=20000,
    dtype=np.float32
)

# Fit on the training tokens only; the test split is transformed with the
# vocabulary and IDF weights learned from training data.
X_train_tfidf = vectorizer.fit_transform(train_tokens).toarray()
X_test_tfidf  = vectorizer.transform(test_tokens).toarray()

Creating features for TF-IDF


In [40]:
# Separate TF-IDF fit with no max_features cap, used only to obtain an IDF weight
# for every term in the training tokens (consumed by avg_vec via idf_dict).
# Uses the named identity() helper for tokenizer and preprocessor, and
# token_pattern=None, so scikit-learn does not warn about the unused default
# token pattern and the configuration matches the feature vectorizer.
tfidf_temp = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=identity,
    token_pattern=None,
    lowercase=False
)
tfidf_temp.fit(train_tokens)

# term -> IDF weight learned from the training split
idf_dict = dict(zip(tfidf_temp.get_feature_names_out(), tfidf_temp.idf_))


def avg_vec(tokens, model, dim):
    """Compute the IDF-weighted mean embedding of a tokenised document.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    model : mapping-like
        Object supporting ``token in model`` and ``model[token]``; tokens that
        are not in it are skipped.
    dim : int
        Length of the zero vector returned when no token is found in ``model``.

    Returns
    -------
    numpy.ndarray
        Weighted average of the found token vectors. Each weight is taken from
        the module-level ``idf_dict`` and defaults to 1.0 for tokens missing
        from it. Returns ``np.zeros(dim)`` when no token is found in ``model``.
    """
    known = [tok for tok in tokens if tok in model]
    if not known:
        return np.zeros(dim)

    embeddings = np.array([model[tok] for tok in known])
    # Column vector so each weight scales one full embedding row.
    idf_weights = np.array([idf_dict.get(tok, 1.0) for tok in known]).reshape(-1, 1)

    weighted_total = np.sum(embeddings * idf_weights, axis=0)
    return weighted_total / np.sum(idf_weights)




In [41]:
print("Creating features for Word2Vec")

# Embedding width; the GloVe and FastText cells also read this value.
dim = 300

# Word2Vec trained on the training tokens only.
w2v = Word2Vec(
    sentences=train_tokens,
    vector_size=dim,
    window=5,
    min_count=2,
)

# One IDF-weighted average vector per document.
X_train_w2v = np.array([avg_vec(doc, w2v.wv, dim) for doc in train_tokens])
X_test_w2v = np.array([avg_vec(doc, w2v.wv, dim) for doc in test_tokens])


Creating features for Word2Vec


In [42]:
print("Loading GloVe embeddings...")

# Pre-trained vectors fetched through gensim's downloader.
glove = api.load("glove-wiki-gigaword-300")

# Take the vector width from the loaded model instead of the `dim` variable set in
# the Word2Vec cell, so the zero-vector fallback in avg_vec always matches the
# GloVe vectors even if `dim` is changed for the trained models.
glove_dim = glove.vector_size

# One IDF-weighted average vector per document.
X_train_glove = np.array([avg_vec(t, glove, glove_dim) for t in train_tokens])
X_test_glove  = np.array([avg_vec(t, glove, glove_dim) for t in test_tokens])

Loading GloVe embeddings...


In [43]:
print("Creating features for FastText")

# FastText trained on the training tokens only, with the same width, window and
# min_count as the Word2Vec model.
ft = FastText(
    sentences=train_tokens,
    vector_size=dim,
    window=5,
    min_count=2,
)

# One IDF-weighted average vector per document.
X_train_ft = np.array([avg_vec(doc, ft.wv, dim) for doc in train_tokens])
X_test_ft = np.array([avg_vec(doc, ft.wv, dim) for doc in test_tokens])

Creating features for FastText


In [44]:
class FFNN(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Architecture: ``Linear(input_dim, hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim, num_classes)``. The forward pass returns raw class
    scores; no softmax is applied.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden_dim : int
        Width of the hidden layer.
    num_classes : int
        Number of output scores per example.
    dropout : float, default 0.5
        Dropout probability applied after the ReLU.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, dropout=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        """Return the class scores for a batch ``x`` whose last dimension is ``input_dim``."""
        return self.net(x)


In [45]:
# Adding existing code to a method to reuse the neural network training and evaluation

def run_model(X_train_vec, X_test_vec, name, epochs=5, hidden_dim=256,
              lr=1e-3, batch_size=64, seed=None, device=None):
    """Train a fresh FFNN on one feature representation and score it on the test split.

    Labels are read from the notebook-level ``y_train`` and ``y_test`` arrays, so
    the rows of ``X_train_vec`` / ``X_test_vec`` must be in the same order as
    those labels.

    Parameters
    ----------
    X_train_vec, X_test_vec : array-like, 2-D
        Feature matrices (one row per document) for the train and test splits.
    name : str
        Label used in the progress output and in the returned row.
    epochs : int, default 5
        Number of passes over the training data.
    hidden_dim : int, default 256
        Hidden-layer width passed to ``FFNN``.
    lr : float, default 1e-3
        Adam learning rate.
    batch_size : int, default 64
        Mini-batch size for both loaders.
    seed : int or None, default None
        When given, ``torch.manual_seed(seed)`` is called first so that weight
        initialisation, shuffling and dropout are repeatable.
    device : str, torch.device or None, default None
        Where to run the model. ``None`` selects CUDA when available, else CPU.

    Returns
    -------
    list
        ``[name, accuracy, precision, recall, f1]`` where precision, recall and
        F1 are weighted averages over the classes.
    """
    print(f"\nRunning {name}...")
    print("======================================")
    print("Training model...")

    if seed is not None:
        torch.manual_seed(seed)

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # as_tensor avoids an extra copy when the input is already a float32 / int64 array.
    X_train_run = torch.as_tensor(X_train_vec, dtype=torch.float32)
    y_train_run = torch.as_tensor(y_train, dtype=torch.long)

    X_test_run = torch.as_tensor(X_test_vec, dtype=torch.float32)
    y_test_run = torch.as_tensor(y_test, dtype=torch.long)

    train_dataset = TensorDataset(X_train_run, y_train_run)
    test_dataset = TensorDataset(X_test_run, y_test_run)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    input_dim = X_train_run.shape[1]
    # Largest label + 1 so every training label is a valid output index, even if
    # the label ids are not contiguous.
    num_classes = int(y_train_run.max()) + 1

    model = FFNN(input_dim, hidden_dim, num_classes).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    all_preds = []
    all_true = []

    with torch.no_grad():
        for xb, yb in test_loader:
            logits = model(xb.to(device))
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_true.extend(yb.tolist())

    acc = accuracy_score(all_true, all_preds)
    # zero_division=0 keeps the metric values of the default behaviour but does
    # not emit a warning when some class is never predicted.
    prec, rec, f1, _ = precision_recall_fscore_support(
        all_true, all_preds, average='weighted', zero_division=0
    )

    print("Results : ")
    print(f"{name} -> Acc:{acc:.4f}  F1:{f1:.4f}")

    return [name, acc, prec, rec, f1]

In [None]:
# (train features, test features, display name) for each representation, in run order.
feature_sets = [
    (X_train_tfidf, X_test_tfidf, "TF-IDF"),
    (X_train_w2v, X_test_w2v, "Word2Vec"),
    (X_train_glove, X_test_glove, "GloVe"),
    (X_train_ft, X_test_ft, "FastText"),
]

# Train and evaluate one model per representation, collecting one result row each.
results = []
for train_features, test_features, rep_name in feature_sets:
    results.append(run_model(train_features, test_features, rep_name))

metric_columns = ["Representation", "Accuracy", "Precision", "Recall", "F1-score"]
df = pd.DataFrame(results, columns=metric_columns)

print("\n========== FINAL COMPARISON ==========")
print(df)


Running TF-IDF...
Training model...
