# ELMo-LSA-SVM Model
- Uses GPU if available
- Creates pickle files for the ELMo embeddings under "embeddings/ELMo/" (relative to the working directory), in separate "train/", "val/" and "test/" subfolders

---

**TODO**
- LSA Step
- SVM Step
- Metrics

In [1]:
## turn off warnings for cleaner execution
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

## IMPORTS
# ELMo
import tensorflow as tf
import tensorflow_hub as hub
import pickle
import time
import os
from sklearn.model_selection import train_test_split

# Preprocessing
import csv
import chardet
import numpy as np
import re
import pandas as pd
import demoji


import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

2024-11-23 01:38:04.212151: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732297084.247298    1609 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732297084.257258    1609 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/walnuts/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Report how many GPUs TensorFlow can see, and whether one is available
num_gpu = len(tf.config.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpu)

if num_gpu > 0:
    print("Using GPU")
else:
    print("Not Using GPU")

Num GPUs Available:  1
Using GPU


In [3]:
# Read the tweet dataset from CSV into `df`
# The CSV must be saved as UTF-8 if the tweets contain emojis

## Troubleshooting helper: uncomment to check that the CSV can be opened
# with open('Eng_Tweets.csv') as csv_file:
#   csvFile = csv.reader(csv_file, delimiter=',')
#   for row in csv_file:
#     print(row)

# Do not truncate long tweet text when a dataframe is displayed
pd.set_option('display.max_colwidth', None)

csv_path = 'Eng_Tweets.csv'
df = pd.read_csv(csv_path)

print(f"total rows: {len(df)}")
df.head() ## emojis only show up when the frame is printed with print()

total rows: 2147


Unnamed: 0,Username,Tweet,Date posted,Label
0,@fudaishii,i'm genuinely going to attempt tonight i can't do this anymore i can't handle all this stress i wish i was never born bro,9/11/24,1
1,@yourdystiny,"Becoming less reactive is a huge part of growth & decreasing stress. If you let everything get you worked up, you’ll damage your mind, body & spirit.",9/8/24,0
2,@ocenhxu,me ??? tired ??? stressed ??? exhausted ??? i wanna cry ??? yes.,9/8/24,1
3,@ifeelgoodto,"skipping meals, irregular sleeping habits, overthinking, stress, tired and drained. that's me, that's my everyday life",9/10/24,0
4,@ysuckme,"you deserve to be happy. not confused, not hurt, not stressed, just happy.",9/8/24,0


## Preprocessing
- Remove mentions (@)
- Remove hashtags (#)
- Remove URLs
- Replace emojis with textual description (Using demoji)
- Remove stop words

In [4]:
### PREPROCESSING STEPS
def clean_text(text):
  """Strip tweet noise and turn every emoji into a single ``:shortcode:`` token.

  Steps, in order: remove @mentions, remove #hashtags, remove URLs, replace
  each emoji with its demoji description, then trim surrounding whitespace.

  Only the descriptions inserted by demoji are rewritten. Ordinary text that
  happens to sit between two colons (e.g. "note: see this: ok") is left
  untouched, and descriptions containing characters other than letters and
  spaces are still collapsed into one token.

  Args:
    text: Raw tweet text (str).

  Returns:
    The cleaned text. Each emoji appears as ``:description:`` with the spaces
    inside the description replaced by underscores and one space added on
    either side of the shortcode.
  """
  # Sentinel used instead of ':' to delimit the inserted descriptions, so they
  # can be told apart from colons the author typed.
  marker = '\x1f'

  # Remove mentions
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)
  # Remove hashtags
  text = re.sub(r'#\w+', '', text)
  # Remove URLs
  text = re.sub(r'http\S+|www\S+|https\S+', '', text)

  # Drop stray sentinel characters so they cannot be mistaken for delimiters
  text = text.replace(marker, '')
  # Replace emoji with textual descriptions, wrapped in the sentinel
  text = demoji.replace_with_desc(text, sep=marker)

  # Convert each sentinel-wrapped description into ' :short_code: '
  wrapped_desc = re.escape(marker) + r'([^' + re.escape(marker) + r']*)' + re.escape(marker)
  text = re.sub(
    wrapped_desc,
    lambda match: ' :' + match.group(1).replace(' ', '_') + ': ',
    text,
  )
  return text.strip()

# English stop words from NLTK, held in a set for the membership test in remove_stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
  """Return `text` with every stop word removed.

  The text is split on whitespace; a token is dropped when its lowercase form
  is in the module-level `stop_words` set. The remaining tokens keep their
  original casing and are joined back with single spaces.
  """
  kept = []
  for token in text.split():
    if token.lower() in stop_words:
      continue
    kept.append(token)
  return ' '.join(kept)

# Clean every tweet first, then strip stop words from the cleaned text
df['Tweet'] = df['Tweet'].apply(clean_text).apply(remove_stopwords)

In [5]:
# df.head()

In [6]:
# print(df.loc[24])

## Run ELMo in batches
- Splits data into train-test-validate (70%-20%-10%)
- Uses the default ELMo model
- Pads every embedding to a fixed maximum sequence length (`MAX_SEQ_LEN`) so all saved batches have the same shape
- Done in batches to avoid computational overload

In [7]:
# Function to split the dataset into train, validation, and test
def split_data(df, train_size=0.7, val_size=0.1, test_size=0.2, random_state=42, stratify_col=None):
    """Split a dataframe into train, validation and test subsets.

    The split is done in two passes: `train_size` of the rows go to the
    training set, and the remainder is divided between validation and test
    in the ratio `val_size : test_size`.

    Args:
        df: Dataframe to split.
        train_size: Fraction of `df` used for training.
        val_size: Validation share; only its ratio to `test_size` matters.
        test_size: Test share; only its ratio to `val_size` matters.
        random_state: Seed passed to both splits so the result is reproducible.
        stratify_col: Optional column name. When given, both splits are
            stratified on that column so each subset keeps its value
            proportions. `None` (default) performs a plain random split.

    Returns:
        Tuple `(train_df, val_df, test_df)`.
    """
    # First pass: training set vs. everything else
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state,
        stratify=df[stratify_col] if stratify_col is not None else None,
    )

    # Second pass: divide the remainder into validation and test
    val_fraction = val_size / (val_size + test_size)
    val_df, test_df = train_test_split(
        temp_df,
        train_size=val_fraction,
        random_state=random_state,
        stratify=temp_df[stratify_col] if stratify_col is not None else None,
    )

    return train_df, val_df, test_df

In [8]:
# Batch processing elmo with padding
def process_in_batches(df, batch_size=50, max_seq_length=280, pickle_dir="embeddings/ELMo/"):
    """Embed the "Tweet" column with ELMo batch by batch and pickle each batch.

    For every slice of `batch_size` rows the module-level `elmo` callable is
    run on the tweets. The resulting (batch, seq, dim) tensor is brought to
    exactly `max_seq_length` along the sequence axis and written to
    `<pickle_dir>/batch_<start_idx>_embeddings.pkl` as a NumPy array.

    Args:
        df: Dataframe with a "Tweet" column of strings.
        batch_size: Number of rows embedded per ELMo call.
        max_seq_length: Length of the sequence axis in every saved array.
            Shorter batches are zero-padded at the end; longer batches are
            truncated (with a printed warning) instead of failing.
        pickle_dir: Output directory; created if it does not exist.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    num_rows = len(df)

    # Ensure the directory exists for saving pickled files
    os.makedirs(pickle_dir, exist_ok=True)

    # Iterate over the dataframe in chunks of batch_size
    for start_idx in range(0, num_rows, batch_size):
        end_idx = min(start_idx + batch_size, num_rows)
        batch = df["Tweet"].iloc[start_idx:end_idx]

        # Measure time for each batch
        start = time.time()
        embeddings_tensor = elmo(tf.constant(batch))["elmo"]
        end = time.time()

        length = end - start
        print(f"Processed rows {start_idx} to {end_idx - 1}")
        print("Time Taken for batch: ", length, "seconds")
        print("Embedding Shape for batch: ", embeddings_tensor.shape, "\n")

        # Every row of a batch shares the same sequence length, so the whole
        # batch is resized at once instead of padding row by row.
        seq_length = embeddings_tensor.shape[1]
        if seq_length > max_seq_length:
            # A negative pad amount would make tf.pad raise, so cut the
            # sequence axis down to the requested length first.
            print(f"Warning: truncating batch {start_idx} from {seq_length} to {max_seq_length} positions")
            embeddings_tensor = embeddings_tensor[:, :max_seq_length, :]
            seq_length = max_seq_length

        padding_needed = max_seq_length - seq_length
        # Zero-pad the end of the sequence axis only
        padded_batch_embeddings = tf.pad(
            embeddings_tensor,
            [[0, 0], [0, padding_needed], [0, 0]],
            mode='CONSTANT',
        )

        # Save the embeddings for this batch to a pickle file
        pickle_file = os.path.join(pickle_dir, f"batch_{start_idx}_embeddings.pkl")
        with open(pickle_file, 'wb') as f:
            pickle.dump(padded_batch_embeddings.numpy(), f)

        print(f"Saved embeddings for batch {start_idx} to {pickle_file}")

In [9]:
# Load ELMo default model
print("Downloading elmo model at: \"https://tfhub.dev/google/elmo/2\"")
elmo = hub.load("https://tfhub.dev/google/elmo/2").signatures["default"]
print("Loaded elmo model")

# Fixed length of the sequence axis in every saved batch
# (should be >= the longest sequence ELMo returns for any batch)
MAX_SEQ_LEN = 280

# Assuming 'df' is the dataframe with your dataset and the column containing text is "Tweet"
train_df, val_df, test_df = split_data(df, train_size=0.7, val_size=0.1, test_size=0.2)

# (progress label, summary label, dataframe, output directory) for each split
splits = [
    ("train", "Training", train_df, "embeddings/ELMo/train/"),
    ("validation", "Validation", val_df, "embeddings/ELMo/val/"),
    ("test", "Test", test_df, "embeddings/ELMo/test/"),
]

# Accumulate the time of all three splits; the per-split timer is reset on
# every iteration, so it cannot be reused for the overall total.
total_time = 0.0
for split_name, split_title, split_df, split_dir in splits:
    print(f"Processing {split_name} dataset...")
    start_total = time.time()
    process_in_batches(split_df, batch_size=50, max_seq_length=MAX_SEQ_LEN, pickle_dir=split_dir)
    end_total = time.time()
    print(f"Total Time for {split_title} Set: {end_total - start_total} seconds")
    total_time += end_total - start_total

print("Generated Embeddings for entire dataset")
print("Total Time Taken: ", total_time, "seconds")


Downloading elmo model at: "https://tfhub.dev/google/elmo/2"


I0000 00:00:1732297091.162046    1609 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


Loaded elmo model
Processing train dataset...


I0000 00:00:1732297093.107365    1676 cuda_dnn.cc:529] Loaded cuDNN version 90300


Processed rows 0 to 49
Time Taken for batch:  3.0125675201416016 seconds
Embedding Shape for batch:  (50, 37, 1024) 

Saved embeddings for batch 0 to embeddings/ELMo/train/batch_0_embeddings.pkl
Processed rows 50 to 99
Time Taken for batch:  1.1658337116241455 seconds
Embedding Shape for batch:  (50, 49, 1024) 

Saved embeddings for batch 50 to embeddings/ELMo/train/batch_50_embeddings.pkl
Processed rows 100 to 149
Time Taken for batch:  0.7889902591705322 seconds
Embedding Shape for batch:  (50, 30, 1024) 

Saved embeddings for batch 100 to embeddings/ELMo/train/batch_100_embeddings.pkl
Processed rows 150 to 199
Time Taken for batch:  0.7993190288543701 seconds
Embedding Shape for batch:  (50, 31, 1024) 

Saved embeddings for batch 150 to embeddings/ELMo/train/batch_150_embeddings.pkl
Processed rows 200 to 249
Time Taken for batch:  0.862567663192749 seconds
Embedding Shape for batch:  (50, 32, 1024) 

Saved embeddings for batch 200 to embeddings/ELMo/train/batch_200_embeddings.pkl
Pr