# Feature Engineering (Word2Vec)

## Setup and Imports

- The Word2Vec implementation is kept separate from the other features because it needs an earlier version of NumPy.
- You might need to restart the session after the installation.

In [None]:
!pip install gensim
!pip install --force-reinstall -v "numpy==1.24.2"

Using pip 24.1.2 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)
  Link requires a different Python (3.11.12 not in: '>=3.7,<3.11'): https://files.pythonhosted.org/packages/3a/be/650f9c091ef71cb01d735775d554e068752d3ff63d7943b26316dc401749/numpy-1.21.2.zip (from https://pypi.org/simple/numpy/) (requires-python:>=3.7,<3.11)
  Link requires a different Python (3.11.12 not in: '>=3.7,<3.11'): https://files.pythonhosted.org/packages/5f/d6/ad58ded26556eaeaa8c971e08b6466f17c4ac4d786cd3d800e26ce59cc01/numpy-1.21.3.zip (from https://pypi.org/simple/numpy/) (requires-python:>=3.7,<3.11)
  Link requires a different Python (3.11.12 not in: '>=3.7,<3.11'): https://files.pythonhosted.org/packages/fb/48/b0708ebd7718a8933f0d3937513ef8ef2f4f04529f1f66ca86d873043921/numpy-1.21.4.zip (from https://pypi.org/simple/numpy/) (requires-python:>=3.7,<3.11)
  Link requires a different Python (3.11.12 not in: '>=3.7,<3.11'): https://files.pythonhosted.org/packages/c2/a8/a924a09492bdfee8c2ec3094d0

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from ast import literal_eval

# Word embeddings
import gensim
from gensim.models import Word2Vec

# Visualization settings
# The ggplot matplotlib style is applied first and seaborn's whitegrid style second.
# NOTE(review): the later sns.set call presumably overrides overlapping ggplot
# settings — confirm which look is intended.
plt.style.use('ggplot')
sns.set(style='whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Cleaned Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')
train_path = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_training_clean.csv'
val_path = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_validation_clean.csv'
test_path = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_testing_clean.csv'


def parse_tokens(cell):
    """Convert one raw ``tokens`` CSV cell into a list of token strings.

    ``pd.read_csv`` returns text columns as plain strings, so a token list that
    was written to CSV comes back as text such as ``"['good', 'game']"``.
    Iterating over such a string yields single characters instead of words,
    which is what the Word2Vec training and the document-vector averaging
    would otherwise operate on.

    Args:
        cell: Raw cell text handed over by ``pd.read_csv`` (an empty string
            for a missing value).

    Returns:
        list[str]: The tokens of the document; an empty list for an empty cell.
    """
    text = str(cell).strip()
    if not text:
        return []
    if text.startswith('[') and text.endswith(']'):
        try:
            parsed = literal_eval(text)
        except (ValueError, SyntaxError):
            # Looks like a list but is not a valid Python literal; fall through.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            return [str(token) for token in parsed]
    # NOTE(review): assumes a non-list cell holds whitespace-separated tokens —
    # confirm against the notebook that wrote the cleaned CSV files.
    return text.split()


# Load pre-split datasets, decoding the `tokens` column into real Python lists
token_converters = {'tokens': parse_tokens}
train_df = pd.read_csv(train_path, converters=token_converters)
val_df = pd.read_csv(val_path, converters=token_converters)
test_df = pd.read_csv(test_path, converters=token_converters)

print(f"Train set: {train_df.shape[0]} samples")
print(f"Validation set: {val_df.shape[0]} samples")
print(f"Test set: {test_df.shape[0]} samples")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train set: 108000 samples
Validation set: 22107 samples
Test set: 22107 samples


## Create Output Directories

In [None]:
# Drive folders that receive the generated feature arrays and the fitted models
features_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features'
models_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/models'

# Make sure both folders exist; nothing happens if they are already there
for output_dir in (features_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# File path used for the Word2Vec feature arrays
word2vec_path = os.path.join(features_dir, 'word2vec_features.npy')

# File path used for the trained Word2Vec model
word2vec_model_path = os.path.join(models_dir, 'word2vec_model')

## Prepare Labels for Model Training

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training sentiments and turn them into integer codes
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train_df['sentiment'])

# Show which integer code each sentiment class received
print("Label Encoding:")
mapping_lines = [
    f"{class_name} -> {code}"
    for code, class_name in enumerate(label_encoder.classes_)
]
for mapping_line in mapping_lines:
    print(mapping_line)

Label Encoding:
Irrelevant -> 0
Negative -> 1
Neutral -> 2
Positive -> 3


### Word2Vec Embeddings

In [None]:
# Fit a skip-gram (sg=1) Word2Vec model on the training split only
print("Training Word2Vec model...")
training_sentences = train_df['tokens'].tolist()
word2vec_model = Word2Vec(
    sentences=training_sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

# Persist the trained model so it can be reloaded later
word2vec_model.save(word2vec_model_path)
print(f"Word2Vec model saved to {word2vec_model_path}")

# Document embedding = mean of the vectors of its in-vocabulary tokens
def get_doc_vector(tokens, model, vector_size=100):
    """Average the word vectors of a document's tokens.

    Args:
        tokens: Iterable of tokens making up one document.
        model: Object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups (e.g. a trained Word2Vec model).
        vector_size: Length of the returned vector; must match the length of
            the vectors stored in ``model.wv``.

    Returns:
        A NumPy array of length ``vector_size`` holding the mean of the
        vectors of all tokens found in ``model.wv``. Tokens missing from the
        vocabulary are ignored; if none are found, the result is all zeros.
    """
    keyed_vectors = model.wv
    hits = [keyed_vectors[word] for word in tokens if word in keyed_vectors]

    mean_vector = np.zeros(vector_size)
    if not hits:
        # No known token: return the zero vector instead of dividing by zero
        return mean_vector

    for word_vector in hits:
        mean_vector += word_vector
    mean_vector /= len(hits)
    return mean_vector

def build_doc_matrix(token_lists, model):
    """Stack one averaged Word2Vec vector per document into a single array.

    Args:
        token_lists: Iterable with one token collection per document.
        model: Trained Word2Vec model used for the vector lookups.

    Returns:
        A NumPy array with one row per document.
    """
    # Take the width from the model so it cannot drift from the training settings
    return np.array([
        get_doc_vector(tokens, model, vector_size=model.wv.vector_size)
        for tokens in token_lists
    ])


# Generate Word2Vec features for each dataset
print("Generating Word2Vec features for train, validation, and test datasets...")
X_train_word2vec = build_doc_matrix(train_df['tokens'], word2vec_model)
X_val_word2vec = build_doc_matrix(val_df['tokens'], word2vec_model)
X_test_word2vec = build_doc_matrix(test_df['tokens'], word2vec_model)

print(f"Word2Vec features shape (train): {X_train_word2vec.shape}")
print(f"Word2Vec features shape (val): {X_val_word2vec.shape}")
print(f"Word2Vec features shape (test): {X_test_word2vec.shape}")

# Save Word2Vec features: one file per split, named <word2vec_path stem>_<split>.npy
feature_root, feature_ext = os.path.splitext(word2vec_path)
for split_name, split_matrix in (
    ('train', X_train_word2vec),
    ('val', X_val_word2vec),
    ('test', X_test_word2vec),
):
    split_path = f"{feature_root}_{split_name}{feature_ext}"
    np.save(split_path, split_matrix)
    # Report the file that was actually written, not the unsuffixed base path
    print(f"Word2Vec {split_name} features saved to {split_path}")



Training Word2Vec model...
Word2Vec model saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/word2vec_model
Generating Word2Vec features for train, validation, and test datasets...
Word2Vec features shape (train): (108000, 100)
Word2Vec features shape (val): (22107, 100)
Word2Vec features shape (test): (22107, 100)
Word2Vec features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/word2vec_features.npy
