# Feature Engineering

1. Traditional approaches:
   - Bag of Words (BoW)
   - Term Frequency-Inverse Document Frequency (TF-IDF)

2. Word embeddings:
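   - Word2Vec (run 02_Word2Vec.ipynb before this notebook: the Word2Vec features are not computed here, only loaded from the `word2vec_features_train.npy` file it is expected to save)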
   - GloVe

3. Contextual embeddings:
   - BERT

## Setup and Imports

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from ast import literal_eval

# Traditional NLP feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# For BERT
import torch
from transformers import BertTokenizer, BertModel

# Visualization settings
# Start from matplotlib's built-in 'ggplot' style sheet.
plt.style.use('ggplot')
# Then apply seaborn's 'whitegrid' style.
# NOTE(review): this runs after the ggplot style, so it likely overrides any
# overlapping settings from the line above — confirm which look is intended.
sns.set(style='whitegrid')
# IPython line magic (not plain Python): render figures inline in cell output.
%matplotlib inline

## Load Cleaned Dataset

In [None]:
# Mount Google Drive so the cleaned CSV files are reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the three cleaned splits into DataFrames
train_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_training_clean.csv'
)
val_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_validation_clean.csv'
)
test_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_testing_clean.csv'
)

# Report how many rows each split contains
for split_label, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
    print(f"{split_label} set: {len(split_df)} samples")

# Preview the training data (last expression, so the notebook renders it)
train_df.head()

Mounted at /content/drive
Train set: 108000 samples
Validation set: 22107 samples
Test set: 22107 samples


Unnamed: 0,content,cleaned_content,tokens,entity,sentiment
0,Hey Hey @verizonfios @verizon Why is my intern...,hey hey [USER] [USER] why is my internet speed...,"['hey', 'hey', '[', 'USER', ']', '[', 'USER', ...",Verizon,Negative
1,Gotta hate when open csgo on the wrong account...,gotta hate when open csgo on the wrong account...,"['got', 'ta', 'hate', 'when', 'open', 'csgo', ...",CS-GO,Negative
2,@EAMaddenNFL does anyone else’s speed even mat...,[USER] does anyone else’s speed even matter ot...,"['[', 'USER', ']', 'does', 'anyone', 'else', '...",MaddenNFL,Negative
3,@Rainbow6Game the attempt to kick is the most ...,[USER] the attempt to kick is the most stupid ...,"['[', 'USER', ']', 'the', 'attempt', 'to', 'ki...",TomClancysRainbowSix,Negative
4,Imagine. This is a disgrace @Beluba @Ronnie2k ...,imagine. this is a disgrace [USER] [USER] [USE...,"['imagine', '.', 'this', 'is', 'a', 'disgrace'...",NBA2K,Negative


## Create Output Directories

In [None]:
# Output locations on Drive: one directory for engineered features, one for
# fitted vectorizers/encoders. Both are created if they do not exist yet.
features_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features'
models_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/models'
for output_dir in (features_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# Base file paths per feature type. Later cells derive the per-split file names
# from these by inserting a _train/_val/_test suffix before the extension.
bow_path = os.path.join(features_dir, 'bow_features.npz')
tfidf_path = os.path.join(features_dir, 'tfidf_features.npz')
word2vec_path = os.path.join(features_dir, 'word2vec_features.npy')
glove_path = os.path.join(features_dir, 'glove_features.npy')
bert_path = os.path.join(features_dir, 'bert_features.npy')

# Pickle destinations for the fitted vectorizers
bow_vectorizer_path = os.path.join(models_dir, 'bow_vectorizer.pkl')
tfidf_vectorizer_path = os.path.join(models_dir, 'tfidf_vectorizer.pkl')

## Prepare Labels for Model Training

In [None]:
# Turn the sentiment strings into integer class ids
from sklearn.preprocessing import LabelEncoder

# Learn the class list from the training split, then encode that same split
label_encoder = LabelEncoder()
label_encoder.fit(train_df['sentiment'])
y = label_encoder.transform(train_df['sentiment'])

# Show which integer each sentiment class was assigned
print("Label Encoding:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} -> {class_id}")

# Persist the fitted encoder next to the other saved models
label_encoder_file = os.path.join(models_dir, 'label_encoder.pkl')
with open(label_encoder_file, 'wb') as encoder_out:
    pickle.dump(label_encoder, encoder_out)

Label Encoding:
Irrelevant -> 0
Negative -> 1
Neutral -> 2
Positive -> 3


## Feature Engineering Approaches

### Bag of Words (BoW)

In [None]:
# Bag of Words: vocabulary capped at 5000 terms (max_features=5000); terms with
# a training document frequency below 5 are dropped (min_df=5).
print("Creating Bag of Words features...")
bow_vectorizer = CountVectorizer(max_features=5000, min_df=5)

# Learn the vocabulary from the training split only, then apply the fitted
# vectorizer unchanged to the validation and test splits.
X_train_bow = bow_vectorizer.fit_transform(train_df['cleaned_content'])
X_val_bow = bow_vectorizer.transform(val_df['cleaned_content'])
X_test_bow = bow_vectorizer.transform(test_df['cleaned_content'])

print(f"BoW features shape (train): {X_train_bow.shape}")
print(f"BoW features shape (val): {X_val_bow.shape}")
print(f"BoW features shape (test): {X_test_bow.shape}")

# Save one sparse matrix per split. The files on disk carry a _train/_val/_test
# suffix, so log those real paths; the un-suffixed `bow_path` is never written.
import scipy.sparse as sp
bow_outputs = (
    (bow_path.replace('.npz', '_train.npz'), X_train_bow),
    (bow_path.replace('.npz', '_val.npz'), X_val_bow),
    (bow_path.replace('.npz', '_test.npz'), X_test_bow),
)
for bow_file, bow_matrix in bow_outputs:
    sp.save_npz(bow_file, bow_matrix)
    print(f"BoW features saved to {bow_file}")

# Save the fitted vectorizer so the same vocabulary can be reused later
with open(bow_vectorizer_path, 'wb') as f:
    pickle.dump(bow_vectorizer, f)
print(f"BoW vectorizer saved to {bow_vectorizer_path}")

Creating Bag of Words features...
BoW features shape (train): (108000, 5000)
BoW features shape (val): (22107, 5000)
BoW features shape (test): (22107, 5000)
BoW features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/bow_features.npz
BoW vectorizer saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/bow_vectorizer.pkl


### TF-IDF

In [None]:
# TF-IDF: vocabulary capped at 5000 terms (max_features=5000); terms with a
# training document frequency below 5 are dropped (min_df=5).
print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5)

# Learn vocabulary and IDF weights from the training split only, then apply the
# fitted vectorizer unchanged to the validation and test splits.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['cleaned_content'])
X_val_tfidf = tfidf_vectorizer.transform(val_df['cleaned_content'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['cleaned_content'])

print(f"TF-IDF features shape (train): {X_train_tfidf.shape}")
print(f"TF-IDF features shape (val): {X_val_tfidf.shape}")
print(f"TF-IDF features shape (test): {X_test_tfidf.shape}")

# Save one sparse matrix per split. The files on disk carry a _train/_val/_test
# suffix, so log those real paths; the un-suffixed `tfidf_path` is never written.
# NOTE: `sp` (scipy.sparse) is imported in the Bag of Words cell, which must
# have been run before this one.
tfidf_outputs = (
    (tfidf_path.replace('.npz', '_train.npz'), X_train_tfidf),
    (tfidf_path.replace('.npz', '_val.npz'), X_val_tfidf),
    (tfidf_path.replace('.npz', '_test.npz'), X_test_tfidf),
)
for tfidf_file, tfidf_matrix in tfidf_outputs:
    sp.save_npz(tfidf_file, tfidf_matrix)
    print(f"TF-IDF features saved to {tfidf_file}")

# Save the fitted vectorizer so the same vocabulary/IDF weights can be reused
with open(tfidf_vectorizer_path, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print(f"TF-IDF vectorizer saved to {tfidf_vectorizer_path}")

Creating TF-IDF features...
TF-IDF features shape (train): (108000, 5000)
TF-IDF features shape (val): (22107, 5000)
TF-IDF features shape (test): (22107, 5000)
TF-IDF features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/tfidf_features.npz
TF-IDF vectorizer saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/tfidf_vectorizer.pkl


### GloVe Embeddings (Pre-trained)

In [None]:
# Download GloVe embeddings if needed
import os
import requests
from zipfile import ZipFile

# NOTE(review): this is relative to the current working directory, unlike the
# Drive paths used for the other artifacts — on Colab it is presumably not
# persisted between sessions, so the download would repeat. Confirm intent.
glove_dir = '../data/glove'
os.makedirs(glove_dir, exist_ok=True)
glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')

# Download GloVe if not already present
if not os.path.exists(glove_file):
    print("Downloading GloVe embeddings...")
    glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
    glove_zip = os.path.join(glove_dir, 'glove.6B.zip')

    try:
        # Stream the archive to disk in chunks instead of buffering the whole
        # response body in memory, and fail loudly on an HTTP error so an
        # error page is never written out as the "zip".
        with requests.get(glove_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(glove_zip, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Extract the zip file
        with ZipFile(glove_zip, 'r') as zip_ref:
            zip_ref.extractall(glove_dir)
    finally:
        # Remove the zip file to save space (also cleans up a partial download)
        if os.path.exists(glove_zip):
            os.remove(glove_zip)
    print("GloVe embeddings downloaded and extracted.")
else:
    print("GloVe embeddings already downloaded.")

# Load GloVe embeddings into a {word: float32 vector} dict. Each line is split
# on whitespace: the first field is the word, the remaining fields its vector.
print("Loading GloVe embeddings...")
glove_embeddings = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

print(f"Loaded {len(glove_embeddings)} GloVe word vectors.")

# Function to create document vectors using GloVe
def get_glove_vector(tokens, embeddings, vector_size=100):
    """Average the embeddings of a document's tokens into one vector.

    Args:
        tokens: The document's tokens, either as a list of strings or as the
            string representation of such a list (e.g. "['hey', 'you']").
            The string form is what a list column looks like after a CSV
            round-trip; without parsing it, iteration would walk the string
            character by character and average letter/punctuation vectors.
        embeddings: Mapping from token to its embedding vector.
        vector_size: Length of the embedding vectors (and of the result).

    Returns:
        A NumPy array of length `vector_size`: the mean of the embeddings of
        all tokens found in `embeddings`, or all zeros if none were found.

    Raises:
        ValueError or SyntaxError: if `tokens` is a string that is not a valid
            Python literal (raised by `ast.literal_eval`).
    """
    # Recover the real token list from its CSV string form
    if isinstance(tokens, str):
        tokens = literal_eval(tokens)

    doc_vector = np.zeros(vector_size)
    count = 0

    # Sum the vectors of in-vocabulary tokens; out-of-vocabulary ones are skipped
    for token in tokens:
        if token in embeddings:
            doc_vector += embeddings[token]
            count += 1

    # Guard against division by zero when no token had an embedding
    if count > 0:
        doc_vector /= count

    return doc_vector

# Create document vectors for each dataset using GloVe: one averaged vector per
# row of the `tokens` column.
print("Generating document vectors from GloVe...")
X_train_glove = np.array([get_glove_vector(tokens, glove_embeddings) for tokens in train_df['tokens']])
X_val_glove = np.array([get_glove_vector(tokens, glove_embeddings) for tokens in val_df['tokens']])
X_test_glove = np.array([get_glove_vector(tokens, glove_embeddings) for tokens in test_df['tokens']])

print(f"GloVe features shape (train): {X_train_glove.shape}")
print(f"GloVe features shape (val): {X_val_glove.shape}")
print(f"GloVe features shape (test): {X_test_glove.shape}")

# Save one dense array per split. The files on disk carry a _train/_val/_test
# suffix, so log those real paths; the un-suffixed `glove_path` is never written.
glove_outputs = (
    (glove_path.replace('.npy', '_train.npy'), X_train_glove),
    (glove_path.replace('.npy', '_val.npy'), X_val_glove),
    (glove_path.replace('.npy', '_test.npy'), X_test_glove),
)
for glove_out_file, glove_matrix in glove_outputs:
    np.save(glove_out_file, glove_matrix)
    print(f"GloVe features saved to {glove_out_file}")

Downloading GloVe embeddings...
GloVe embeddings downloaded and extracted.
Loading GloVe embeddings...
Loaded 400000 GloVe word vectors.
Generating document vectors from GloVe...
GloVe features shape (train): (108000, 100)
GloVe features shape (val): (22107, 100)
GloVe features shape (test): (22107, 100)
GloVe features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/glove_features.npy


### BERT Embeddings

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Fetch the pre-trained tokenizer and encoder weights for the same checkpoint
print("Loading BERT model and tokenizer...")
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# Move the weights to the chosen device and switch to evaluation mode;
# the model is only used for feature extraction here
model = model.to(device)
model.eval()

# Function to get BERT embeddings for a batch of texts
def get_bert_embeddings(texts, batch_size=32):
    """Embed texts with the module-level BERT `model` and `tokenizer`.

    Each text is represented by the last-layer hidden state at position 0 of
    its token sequence (the [CLS] token), computed without gradients on the
    module-level `device`.

    Args:
        texts: Sequence of strings to embed (list, tuple, pandas Series, ...).
        batch_size: Number of texts tokenized and encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input an
        array of shape (0, hidden_size) is returned instead of raising.
    """
    # Materialize as a list so slicing yields what the tokenizer expects,
    # whatever sequence type the caller passed in
    texts = list(texts)

    # np.vstack([]) raises, so handle the empty case explicitly.
    # float32 presumably matches the model's default output dtype — TODO confirm.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]

        # Tokenize and convert to tensors; inputs are padded to the longest
        # text in the batch and truncated to at most 128 tokens
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=128, return_tensors='pt')
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        # Compute token embeddings with no gradient computation
        with torch.no_grad():
            outputs = model(**encoded_input)

        # Use the CLS token representation as the sentence embedding
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)

        # Print progress every 10 batches
        if (i // batch_size) % 10 == 0:
            print(f"Processed {i}/{len(texts)} texts...")

    return np.vstack(all_embeddings)

# Generate BERT embeddings for each dataset from the cleaned text
print("Generating BERT embeddings...")
X_train_bert = get_bert_embeddings(train_df['cleaned_content'].tolist())
X_val_bert = get_bert_embeddings(val_df['cleaned_content'].tolist())
X_test_bert = get_bert_embeddings(test_df['cleaned_content'].tolist())

print(f"BERT features shape (train): {X_train_bert.shape}")
print(f"BERT features shape (val): {X_val_bert.shape}")
print(f"BERT features shape (test): {X_test_bert.shape}")

# Save one dense array per split. The files on disk carry a _train/_val/_test
# suffix, so log those real paths; the un-suffixed `bert_path` is never written.
bert_outputs = (
    (bert_path.replace('.npy', '_train.npy'), X_train_bert),
    (bert_path.replace('.npy', '_val.npy'), X_val_bert),
    (bert_path.replace('.npy', '_test.npy'), X_test_bert),
)
for bert_out_file, bert_matrix in bert_outputs:
    np.save(bert_out_file, bert_matrix)
    print(f"BERT features saved to {bert_out_file}")

Using device: cuda
Loading BERT model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT embeddings...
Processed 0/108000 texts...
Processed 320/108000 texts...
Processed 640/108000 texts...
Processed 960/108000 texts...
Processed 1280/108000 texts...
Processed 1600/108000 texts...
Processed 1920/108000 texts...
Processed 2240/108000 texts...
Processed 2560/108000 texts...
Processed 2880/108000 texts...
Processed 3200/108000 texts...
Processed 3520/108000 texts...
Processed 3840/108000 texts...
Processed 4160/108000 texts...
Processed 4480/108000 texts...
Processed 4800/108000 texts...
Processed 5120/108000 texts...
Processed 5440/108000 texts...
Processed 5760/108000 texts...
Processed 6080/108000 texts...
Processed 6400/108000 texts...
Processed 6720/108000 texts...
Processed 7040/108000 texts...
Processed 7360/108000 texts...
Processed 7680/108000 texts...
Processed 8000/108000 texts...
Processed 8320/108000 texts...
Processed 8640/108000 texts...
Processed 8960/108000 texts...
Processed 9280/108000 texts...
Processed 9600/108000 texts...
Processed 9920/

## Compare Feature Dimensionality

In [None]:
# Word2Vec features are not computed in this notebook; read the saved
# training-split array from disk
X_train_word2vec = np.load(word2vec_path.replace('.npy', '_train.npy'))

# One (name, number of columns, storage kind) record per feature set
feature_rows = [
    ('Bag of Words', X_train_bow.shape[1], 'Sparse'),
    ('TF-IDF', X_train_tfidf.shape[1], 'Sparse'),
    ('Word2Vec', X_train_word2vec.shape[1], 'Dense'),
    ('GloVe', X_train_glove.shape[1], 'Dense'),
    ('BERT', X_train_bert.shape[1], 'Dense'),
]
feature_names, feature_dims, feature_kinds = zip(*feature_rows)

# Column-oriented view of the same records
feature_info = {
    'Feature Type': list(feature_names),
    'Dimensions': list(feature_dims),
    'Data Type': list(feature_kinds),
}

# Render the comparison as a table (last expression, so the notebook shows it)
feature_df = pd.DataFrame(feature_info)
feature_df

Unnamed: 0,Feature Type,Dimensions,Data Type
0,Bag of Words,5000,Sparse
1,TF-IDF,5000,Sparse
2,Word2Vec,100,Dense
3,GloVe,100,Dense
4,BERT,768,Dense


## Saving

In [None]:
# Save the encoded labels for the train/val/test splits.
# Training labels keep their original file name (labels.npy).
labels_path = os.path.join(features_dir, 'labels.npy')
np.save(labels_path, y)
print(f"Labels saved to {labels_path}")

# Validation and test labels are encoded with the encoder fitted on the
# training labels, so class ids agree across splits. `transform` raises if a
# split contains a sentiment value the encoder never saw.
for split_name, split_df in (('val', val_df), ('test', test_df)):
    # NOTE(review): only the training frame's columns are shown in this
    # notebook; skip gracefully if a split has no sentiment column.
    if 'sentiment' not in split_df.columns:
        print(f"No 'sentiment' column in {split_name} split; labels not saved.")
        continue
    split_labels_path = os.path.join(features_dir, f'labels_{split_name}.npy')
    np.save(split_labels_path, label_encoder.transform(split_df['sentiment']))
    print(f"Labels saved to {split_labels_path}")

Labels saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/labels.npy
