## 1. Setup and Imports

In [1]:
# Install the notebook's dependencies into the environment of the running kernel.
# %pip (instead of !pip) targets the interpreter this notebook is executing in.
# gensim is pinned to the release this notebook was last executed with, so a
# re-run resolves to the same version; -q keeps the install log short.
%pip install -q "gensim==4.3.3"
# Force NumPy to a fixed version after gensim has been installed, replacing
# whichever NumPy build the gensim install pulled in.
# NOTE(review): a runtime restart may be required if NumPy was already imported
# before this cell ran — confirm on a fresh runtime.
%pip install -q --force-reinstall "numpy==1.24.2"

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [2]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from ast import literal_eval

# Word embeddings
import gensim
from gensim.models import Word2Vec

# Visualization settings
# Select matplotlib's 'ggplot' style sheet as the base look for figures.
plt.style.use('ggplot')
# Apply seaborn's 'whitegrid' theme.
# NOTE(review): sns.set() also writes matplotlib rcParams, so it presumably
# overrides part of the 'ggplot' style chosen on the previous line — confirm
# which of the two looks is actually intended.
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 2. Load Cleaned Dataset

In [3]:
from google.colab import drive

# Mount Google Drive so the cleaned CSV stored under MyDrive is reachable.
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/twitter_training_clean.csv'

# Load dataset
try:
    df = pd.read_csv(dataset_path)
except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Please update the dataset path or ensure the preprocessing notebook has been run.")
    # Re-raise instead of continuing: every later cell needs `df`, so swallowing
    # the error here would only defer the failure to a less helpful NameError.
    raise
else:
    print(f"Cleaned dataset loaded with shape: {df.shape}")

Mounted at /content/drive
Cleaned dataset loaded with shape: (71255, 5)


In [4]:
# The CSV holds each token list as its string representation; turn those
# strings back into Python lists. Non-string values are passed through as-is.
df['tokens'] = df['tokens'].apply(
    lambda raw: raw if not isinstance(raw, str) else literal_eval(raw)
)

# Preview the top rows of the frame
df.head()

Unnamed: 0,content,cleaned_content,tokens,entity,sentiment
0,I am coming to the borders and I will kill you...,coming border kill,"[coming, border, kill]",Borderlands,Positive
1,im getting on borderlands and i will kill you ...,im getting borderland kill,"[im, getting, borderland, kill]",Borderlands,Positive
2,im coming on borderlands and i will murder you...,im coming borderland murder,"[im, coming, borderland, murder]",Borderlands,Positive
3,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder,"[im, getting, borderland, 2, murder]",Borderlands,Positive
4,im getting into borderlands and i can murder y...,im getting borderland murder,"[im, getting, borderland, murder]",Borderlands,Positive


## 3. Create Output Directories

In [5]:
# Locations on the mounted Drive: one folder for engineered features,
# one folder for trained models.
features_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features'
models_dir = '/content/drive/MyDrive/Colab Notebooks/is5126/final-project/models'

# Make sure both folders exist (no error if they are already there)
for output_dir in (features_dir, models_dir):
    os.makedirs(output_dir, exist_ok=True)

# File the Word2Vec document-feature matrix is written to
word2vec_path = os.path.join(features_dir, 'word2vec_features.npy')

# File the trained Word2Vec model is written to
word2vec_model_path = os.path.join(models_dir, 'word2vec_model')

## 4. Prepare Labels for Model Training

In [8]:
from sklearn.preprocessing import LabelEncoder

# Turn the sentiment strings into integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])

# Print which integer id was assigned to each sentiment class
print("Label Encoding:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} -> {class_id}")

Label Encoding:
Irrelevant -> 0
Negative -> 1
Neutral -> 2
Positive -> 3


## 5. Feature Engineering Approaches

### 5.3 Word2Vec Embeddings

In [9]:
# Fit a Word2Vec model on the tokenized tweets
print("Training Word2Vec model...")
# NOTE(review): no seed is passed, so the learned embeddings presumably differ
# between runs — confirm whether reproducibility matters here.
w2v_params = dict(
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,  # Skip-gram model
)
word2vec_model = Word2Vec(df['tokens'].tolist(), **w2v_params)

# Persist the trained model so it can be reloaded later
word2vec_model.save(word2vec_model_path)
print(f"Word2Vec model saved to {word2vec_model_path}")

# Function to create document vectors by averaging word vectors
def get_doc_vector(tokens, model, vector_size=None):
    """Return the mean word vector of the tokens that the model knows.

    Parameters
    ----------
    tokens : iterable
        Tokens of one document. Tokens for which ``token in model.wv`` is
        false are skipped.
    model : object
        Trained embedding model; only ``model.wv`` is used (membership test,
        lookup by token and, optionally, its ``vector_size`` attribute).
    vector_size : int, optional
        Length of the returned vector. When omitted it is taken from
        ``model.wv.vector_size`` so this helper stays in sync with whatever
        dimensionality the model was trained with; if that attribute is
        missing, the previous default of 100 is used.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``vector_size``: the average of the known
        tokens' vectors, or all zeros when no token is in the vocabulary.
    """
    # Derive the dimensionality from the model rather than hard-coding it, so
    # changing the training hyperparameter cannot cause a shape mismatch here.
    if vector_size is None:
        vector_size = getattr(model.wv, 'vector_size', 100)

    # Running sum of the vectors of in-vocabulary tokens
    doc_vector = np.zeros(vector_size)
    count = 0

    # Accumulate the word vector of every token the model has an embedding for
    for token in tokens:
        if token in model.wv:
            doc_vector += model.wv[token]
            count += 1

    # Avoid division by zero: with no known tokens the zero vector is returned
    if count > 0:
        doc_vector /= count

    return doc_vector

# Build one averaged embedding per tweet and stack them into a single matrix
print("Generating document vectors from Word2Vec...")
doc_vectors = [get_doc_vector(tweet_tokens, word2vec_model) for tweet_tokens in df['tokens']]
X_word2vec = np.array(doc_vectors)

print(f"Word2Vec features shape: {X_word2vec.shape}")

# Write the feature matrix to disk
np.save(word2vec_path, X_word2vec)
print(f"Word2Vec features saved to {word2vec_path}")

Training Word2Vec model...
Word2Vec model saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/models/word2vec_model
Generating document vectors from Word2Vec...
Word2Vec features shape: (71255, 100)
Word2Vec features saved to /content/drive/MyDrive/Colab Notebooks/is5126/final-project/data/features/word2vec_features.npy
