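# Twitter Sentiment Analysis - Feature Engineering

This notebook turns the cleaned tweets into numeric feature sets for sentiment classification, using three families of text representation:

1. Traditional approaches: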
   - Bag of Words (BoW)
   - Term Frequency-Inverse Document Frequency (TF-IDF)

2. Word embeddings:
   - Word2Vec (in 02_Word2Vec.ipynb)
   - GloVe

3. Contextual embeddings:
   - BERT

## 1. Setup and Imports

In [3]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from ast import literal_eval

# Traditional NLP feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Word embeddings
# import gensim
# from gensim.models import Word2Vec

# For BERT
import torch
from transformers import BertTokenizer, BertModel

# Visualization settings
# NOTE(review): sns.set() applies seaborn's own theme after the 'ggplot' style has
# been loaded, so it likely overrides most of 'ggplot' — confirm which look is intended.
plt.style.use('ggplot')
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 2. Load Cleaned Dataset

In [4]:
# Mount Google Drive so the cleaned dataset written by the preprocessing
# notebook is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_training_clean.csv'

# Load dataset
try:
    df = pd.read_csv(dataset_path)
    print(f"Cleaned dataset loaded with shape: {df.shape}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please update the dataset path or ensure the preprocessing notebook has been run.")
    # Re-raise after printing the hint: every later cell needs `df`, so carrying
    # on would only defer the failure to a less helpful NameError.
    raise

Mounted at /content/drive
Cleaned dataset loaded with shape: (71255, 5)


In [5]:
def _parse_token_list(value):
    """Evaluate `value` as a Python literal when it is a string; otherwise return it unchanged."""
    if isinstance(value, str):
        return literal_eval(value)
    return value


# The token lists arrive as their string representation; turn them back into lists.
# Non-string entries are passed through untouched, so re-running this cell is harmless.
df['tokens'] = df['tokens'].apply(_parse_token_list)

# Preview the first few rows
df.head()

Unnamed: 0,content,cleaned_content,tokens,entity,sentiment
0,I am coming to the borders and I will kill you...,coming border kill,"[coming, border, kill]",Borderlands,Positive
1,im getting on borderlands and i will kill you ...,im getting borderland kill,"[im, getting, borderland, kill]",Borderlands,Positive
2,im coming on borderlands and i will murder you...,im coming borderland murder,"[im, coming, borderland, murder]",Borderlands,Positive
3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder,"[im, getting, borderland, 2, murder]",Borderlands,Positive
4,im getting into borderlands and i can murder y...,im getting borderland murder,"[im, getting, borderland, murder]",Borderlands,Positive


## 3. Create Output Directories

In [6]:
# Output locations on Drive: feature matrices go under data/features,
# fitted vectorizers/models under models. Both directories are created if missing.
features_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features'
models_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/models'
for output_dir in (features_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# One file per feature type, all inside features_dir
bow_path, tfidf_path, word2vec_path, glove_path, bert_path = [
    os.path.join(features_dir, file_name)
    for file_name in (
        'bow_features.npz',
        'tfidf_features.npz',
        'word2vec_features.npy',
        'glove_features.npy',
        'bert_features.npy',
    )
]

# Fitted vectorizers and the Word2Vec model, all inside models_dir
bow_vectorizer_path, tfidf_vectorizer_path, word2vec_model_path = [
    os.path.join(models_dir, file_name)
    for file_name in (
        'bow_vectorizer.pkl',
        'tfidf_vectorizer.pkl',
        'word2vec_model',
    )
]

## 4. Prepare Labels for Model Training

In [7]:
# Map the sentiment strings to integer class ids
from sklearn.preprocessing import LabelEncoder

# Fit on the sentiment column, then encode that same column
label_encoder = LabelEncoder().fit(df['sentiment'])
y = label_encoder.transform(df['sentiment'])

# Show which integer each class name was assigned (position in classes_)
print("Label Encoding:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} -> {class_id}")

# Persist the fitted encoder alongside the other saved models
label_encoder_path = os.path.join(models_dir, 'label_encoder.pkl')
with open(label_encoder_path, 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

Label Encoding:
Irrelevant -> 0
Negative -> 1
Neutral -> 2
Positive -> 3


## 5. Feature Engineering Approaches

### 5.1 Bag of Words (BoW)

In [8]:
# Initialize CountVectorizer (vocabulary capped at 5000 terms; terms must occur in >= 5 documents)
print("Creating Bag of Words features...")
bow_vectorizer = CountVectorizer(max_features=5000, min_df=5)

# Sanity check: number of missing values in the raw and the cleaned text columns
print(df['content'].isna().sum())
print(df['cleaned_content'].isna().sum())

# Rows whose cleaned text is missing. Only a preview is printed so an unexpectedly
# large number of such rows cannot flood the cell output.
df_filtered = df[df['cleaned_content'].isna()].copy()

print(df_filtered.head())

# Fit and transform the cleaned tweets.
# The vectorizer rejects missing values, so they are replaced with empty strings.
# This keeps one row per tweet (aligned with the labels in `y`); such rows simply
# get an all-zero feature vector.
X_bow = bow_vectorizer.fit_transform(df['cleaned_content'].fillna(''))

print(f"BoW features shape: {X_bow.shape}")
print(f"Vocabulary size: {len(bow_vectorizer.vocabulary_)}")

# Save BoW features and vectorizer
# NOTE: the TF-IDF cell below also uses this `sp` alias.
import scipy.sparse as sp
sp.save_npz(bow_path, X_bow)
with open(bow_vectorizer_path, 'wb') as f:
    pickle.dump(bow_vectorizer, f)

print(f"BoW features saved to {bow_path}")
print(f"BoW vectorizer saved to {bow_vectorizer_path}")

Creating Bag of Words features...
0
0
Empty DataFrame
Columns: [content, cleaned_content, tokens, entity, sentiment]
Index: []
BoW features shape: (71255, 5000)
Vocabulary size: 5000
BoW features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/bow_features.npz
BoW vectorizer saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/bow_vectorizer.pkl


### 5.2 TF-IDF

In [9]:
# Initialize TF-IDF Vectorizer (vocabulary capped at 5000 terms; terms must occur in >= 5 documents)
print("Creating TF-IDF features...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=5)

# Fit and transform the cleaned tweets.
# The vectorizer rejects missing values, so they are replaced with empty strings.
# This keeps one row per tweet (aligned with the labels in `y`); such rows simply
# get an all-zero feature vector.
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_content'].fillna(''))

print(f"TF-IDF features shape: {X_tfidf.shape}")
print(f"Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")

# Save TF-IDF features and vectorizer
# NOTE: `sp` (scipy.sparse) is imported in the Bag-of-Words cell, which must run first.
sp.save_npz(tfidf_path, X_tfidf)
with open(tfidf_vectorizer_path, 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

print(f"TF-IDF features saved to {tfidf_path}")
print(f"TF-IDF vectorizer saved to {tfidf_vectorizer_path}")

Creating TF-IDF features...
TF-IDF features shape: (71255, 5000)
Vocabulary size: 5000
TF-IDF features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/tfidf_features.npz
TF-IDF vectorizer saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/tfidf_vectorizer.pkl


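### 5.3 Word2Vec Embeddings

Word2Vec features are not generated in this notebook; see `02_Word2Vec.ipynb`. The saved `word2vec_features.npy` file is loaded in Section 6.1.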

### 5.4 GloVe Embeddings (Pre-trained)

In [16]:
# Download GloVe embeddings if needed
import os
import requests
from zipfile import ZipFile

# NOTE(review): unlike the feature/model directories, this is a relative path and
# not a Drive location — confirm it persists between sessions, otherwise the
# archive is downloaded again on every fresh runtime.
glove_dir = '../data/glove'
os.makedirs(glove_dir, exist_ok=True)
glove_file = os.path.join(glove_dir, 'glove.6B.100d.txt')

# Download GloVe if not already present
if not os.path.exists(glove_file):
    print("Downloading GloVe embeddings...")
    glove_url = "https://nlp.stanford.edu/data/glove.6B.zip"
    glove_zip = os.path.join(glove_dir, 'glove.6B.zip')

    # Stream the archive to disk in chunks instead of holding the whole response
    # body in memory, and fail on an HTTP error before anything is written
    # (otherwise an error page would be saved and only fail later as a bad zip).
    with requests.get(glove_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(glove_zip, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    # Extract the zip file
    with ZipFile(glove_zip, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)

    # Remove the zip file to save space
    os.remove(glove_zip)
    print("GloVe embeddings downloaded and extracted.")
else:
    print("GloVe embeddings already downloaded.")

# Load GloVe embeddings into a {word: float32 vector} dictionary.
# Each line of the text file is "<word> <v1> <v2> ..." separated by whitespace.
print("Loading GloVe embeddings...")
glove_embeddings = {}
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.strip().split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

print(f"Loaded {len(glove_embeddings)} GloVe word vectors.")

# Function to create document vectors using GloVe
def get_glove_vector(tokens, embeddings, vector_size=100):
    """Average the embedding vectors of a document's tokens.

    Args:
        tokens: Iterable of token strings for one document. ``None`` or a float
            NaN (a missing cell in the DataFrame) is treated as an empty document.
        embeddings: Mapping from token to a 1-D vector of length ``vector_size``;
            must support ``token in embeddings`` and ``embeddings[token]``.
        vector_size: Length of the embedding vectors.

    Returns:
        A NumPy array of shape ``(vector_size,)`` holding the mean of the vectors
        of all tokens found in ``embeddings``. Tokens without an embedding are
        ignored; if none are found the result is all zeros.
    """
    doc_vector = np.zeros(vector_size)

    # A missing token list would otherwise fail in the loop below with a
    # TypeError (None / float is not iterable).
    if tokens is None or (isinstance(tokens, float) and np.isnan(tokens)):
        return doc_vector

    count = 0
    for token in tokens:
        if token in embeddings:
            doc_vector += embeddings[token]
            count += 1

    # Avoid division by zero when no token has an embedding
    if count > 0:
        doc_vector /= count

    return doc_vector

# Build one averaged GloVe vector per tweet and stack them into a 2-D array
print("Generating document vectors from GloVe...")
X_glove = np.array([
    get_glove_vector(tweet_tokens, glove_embeddings)
    for tweet_tokens in df['tokens'].tolist()
])

print(f"GloVe features shape: {X_glove.shape}")

# Write the dense matrix to the GloVe feature file
np.save(glove_path, X_glove)
print(f"GloVe features saved to {glove_path}")

Downloading GloVe embeddings...
GloVe embeddings downloaded and extracted.
Loading GloVe embeddings...
Loaded 400000 GloVe word vectors.
Generating document vectors from GloVe...
GloVe features shape: (71255, 100)
GloVe features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/glove_features.npy


In [10]:
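# NOTE: This cell is disabled (fully commented out). Word2Vec is handled in
# 02_Word2Vec.ipynb; the code below is kept for reference only.
# # Train Word2Vec model on our corpus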
# print("Training Word2Vec model...")
# word2vec_model = Word2Vec(df['tokens'].tolist(),
#                           vector_size=100,
#                           window=5,
#                           min_count=5,
#                           workers=4,
#                           sg=1) # Skip-gram model

# # Save the model for future use
# word2vec_model.save(word2vec_model_path)
# print(f"Word2Vec model saved to {word2vec_model_path}")

# # Function to create document vectors by averaging word vectors
# def get_doc_vector(tokens, model, vector_size=100):
#     # Initialize an empty array
#     doc_vector = np.zeros(vector_size)
#     count = 0

#     # Average the word vectors for each token in the document
#     for token in tokens:
#         if token in model.wv:
#             doc_vector += model.wv[token]
#             count += 1

#     # Avoid division by zero
#     if count > 0:
#         doc_vector /= count

#     return doc_vector

# # Create document vectors for each tweet
# print("Generating document vectors from Word2Vec...")
# X_word2vec = np.array([get_doc_vector(tokens, word2vec_model) for tokens in df['tokens']])

# print(f"Word2Vec features shape: {X_word2vec.shape}")

# # Save Word2Vec features
# np.save(word2vec_path, X_word2vec)
# print(f"Word2Vec features saved to {word2vec_path}")

### 5.5 BERT Embeddings

In [13]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Fetch the pre-trained 'bert-base-uncased' tokenizer and encoder
print("Loading BERT model and tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)
# Evaluation mode: the model is only used for feature extraction here
model.eval()

# Function to get BERT embeddings for a batch of texts
def get_bert_embeddings(texts, batch_size=32, max_length=128, progress_every=10):
    """Encode texts with BERT and return one [CLS] vector per text.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings to embed (any iterable; it is copied into a list).
        batch_size: Number of texts sent through the model per forward pass.
        max_length: Maximum number of tokens per text; longer inputs are truncated.
        progress_every: Print a progress line every this many batches;
            0 or None disables progress output.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. Each row
        is the last-layer hidden state at position 0 (the [CLS] token). For empty
        input an array of shape ``(0, model.config.hidden_size)`` is returned.
    """
    # Plain list so slicing yields lists the tokenizer accepts, whatever sequence
    # type the caller passed.
    texts = list(texts)

    # np.vstack raises on an empty list, so handle "nothing to embed" explicitly.
    # dtype assumes the model runs in its default float32 precision — TODO confirm.
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    all_embeddings = []

    for batch_index, start in enumerate(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]

        # Tokenize, pad to the longest text in the batch, and move tensors to the model's device
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True,
                                  max_length=max_length, return_tensors='pt')
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}

        # Inference only: skip gradient tracking
        with torch.no_grad():
            outputs = model(**encoded_input)

        # Use the CLS token representation as the sentence embedding
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)

        # Print progress
        if progress_every and batch_index % progress_every == 0:
            print(f"Processed {start}/{len(texts)} texts...")

    return np.vstack(all_embeddings)

# Embed every tweet's cleaned text with BERT.
# NOTE(review): the input is the preprocessed `cleaned_content` column; contextual
# models are usually fed natural text — consider whether raw `content` fits better.
print("Generating BERT embeddings...")
cleaned_texts = df['cleaned_content'].tolist()
X_bert = get_bert_embeddings(cleaned_texts)

print(f"BERT features shape: {X_bert.shape}")

# Write the dense matrix to the BERT feature file
np.save(bert_path, X_bert)
print(f"BERT features saved to {bert_path}")

Using device: cuda
Loading BERT model and tokenizer...
Generating BERT embeddings...
Processed 0/71255 texts...
Processed 320/71255 texts...
Processed 640/71255 texts...
Processed 960/71255 texts...
Processed 1280/71255 texts...
Processed 1600/71255 texts...
Processed 1920/71255 texts...
Processed 2240/71255 texts...
Processed 2560/71255 texts...
Processed 2880/71255 texts...
Processed 3200/71255 texts...
Processed 3520/71255 texts...
Processed 3840/71255 texts...
Processed 4160/71255 texts...
Processed 4480/71255 texts...
Processed 4800/71255 texts...
Processed 5120/71255 texts...
Processed 5440/71255 texts...
Processed 5760/71255 texts...
Processed 6080/71255 texts...
Processed 6400/71255 texts...
Processed 6720/71255 texts...
Processed 7040/71255 texts...
Processed 7360/71255 texts...
Processed 7680/71255 texts...
Processed 8000/71255 texts...
Processed 8320/71255 texts...
Processed 8640/71255 texts...
Processed 8960/71255 texts...
Processed 9280/71255 texts...
Processed 9600/71255 

## 6. Feature Analysis

### 6.1 Compare Feature Dimensionality

In [22]:
# Word2Vec features are not computed in this notebook; read them from the saved file
X_word2vec = np.load(word2vec_path)

# Feature matrices in display order, keyed by their human-readable name
feature_matrices = {
    'Bag of Words': X_bow,
    'TF-IDF': X_tfidf,
    'Word2Vec': X_word2vec,
    'GloVe': X_glove,
    'BERT': X_bert,
}

# Column-oriented summary: name, number of feature columns, storage kind
feature_info = {
    'Feature Type': list(feature_matrices),
    'Dimensions': [matrix.shape[1] for matrix in feature_matrices.values()],
    'Data Type': ['Sparse', 'Sparse', 'Dense', 'Dense', 'Dense']
}

# Render the summary as a table (last expression of the cell)
feature_df = pd.DataFrame(feature_info)
feature_df

Unnamed: 0,Feature Type,Dimensions,Data Type
0,Bag of Words,5000,Sparse
1,TF-IDF,5000,Sparse
2,Word2Vec,100,Dense
3,GloVe,100,Dense
4,BERT,768,Dense


### 6.2 Visualize Word2Vec Embeddings

In [None]:
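# NOTE: This cell is disabled. It needs `word2vec_model`, which is not created in
# this notebook (the Word2Vec training cell above is commented out).
# # Visualize Word2Vec embeddings using t-SNE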
# from sklearn.manifold import TSNE

# # Get vocabulary and vectors
# vocab = list(word2vec_model.wv.index_to_key)
# vectors = [word2vec_model.wv[word] for word in vocab]

# # Limit to 1000 most frequent words for visualization
# top_n = min(1000, len(vocab))
# vocab = vocab[:top_n]
# vectors = vectors[:top_n]

# # Apply t-SNE for dimensionality reduction
# tsne = TSNE(n_components=2, random_state=42)
# vectors_2d = tsne.fit_transform(vectors)

# # Plot
# plt.figure(figsize=(14, 10))
# plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], s=10, alpha=0.5)

# # Annotate some interesting words if they exist in the vocabulary
# interesting_words = ['good', 'bad', 'awesome', 'terrible', 'happy', 'sad', 'game', 'play', 'win', 'lose']
# for word in interesting_words:
#     if word in vocab:
#         idx = vocab.index(word)
#         plt.annotate(word, vectors_2d[idx], fontsize=12)

# plt.title('t-SNE Visualization of Word2Vec Embeddings')
# plt.xlabel('Dimension 1')
# plt.ylabel('Dimension 2')
# plt.show()

### 6.3 Visualize Document-Level Embeddings

In [25]:
# Compare distributions of different embeddings
from sklearn.decomposition import PCA

# Function to reduce dimensions and plot
def plot_embedding_distribution(embeddings, title, target=y, random_state=42):
    """Project embeddings to 2-D with PCA and scatter-plot them coloured by sentiment.

    Relies on the module-level ``label_encoder`` to turn class ids into names.

    Args:
        embeddings: 2-D array of document vectors, one row per document.
        title: Name of the embedding type; inserted into the plot title.
        target: Integer class id per row of ``embeddings``. Defaults to the
            module-level ``y`` as it exists when this function is defined.
        random_state: Seed passed to PCA so the projection is reproducible when
            scikit-learn selects a randomized solver.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Reduce to 2D with PCA
    pca = PCA(n_components=2, random_state=random_state)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Create DataFrame for easy plotting
    df_plot = pd.DataFrame({
        'x': reduced_embeddings[:, 0],
        'y': reduced_embeddings[:, 1],
        'sentiment': [label_encoder.classes_[i] for i in target]
    })

    # Plot on an explicit figure/axes pair rather than pyplot's implicit current figure
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(data=df_plot, x='x', y='y', hue='sentiment', alpha=0.6, ax=ax)
    ax.set_title(f'PCA Visualization of {title}')
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')

    # Caption with the share of variance captured by each plotted component
    variance = pca.explained_variance_ratio_
    fig.text(0.5, 0.01, f'Explained variance: PC1 {variance[0]:.2%}, PC2 {variance[1]:.2%}', ha='center')
    fig.tight_layout()
    plt.show()

# Draw one PCA scatter per dense embedding type: Word2Vec, then GloVe, then BERT.
# Every row of each matrix is plotted; no subsampling is applied (BERT included).
for embedding_matrix, embedding_name in (
    (X_word2vec, 'Word2Vec Embeddings'),
    (X_glove, 'GloVe Embeddings'),
    (X_bert, 'BERT Embeddings'),
):
    plot_embedding_distribution(embedding_matrix, embedding_name)

Output hidden; open in https://colab.research.google.com to view.

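## 7. Summary and Next Steps

This notebook saved Bag-of-Words, TF-IDF, GloVe, and BERT features to the `features` directory, and the fitted vectorizers and label encoder to the `models` directory. The cell below also saves the encoded sentiment labels for use in model training.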

In [24]:
# Persist the encoded sentiment labels next to the feature matrices.
# Only the labels are written here; no train/val/test split indices are saved.
labels_path = os.path.join(features_dir, 'labels.npy')
np.save(labels_path, y)
print(f"Labels saved to {labels_path}")

Labels saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/labels.npy
