CS 7643 Project
Georgia Institute of Technology

Authors: Erick Ordonez and Daniel Solon

# Preprocessing the 2024 US Elections Dataset Using GloVe Embeddings

## Import libraries

In [1]:
import os

# Torch 
import torch

# Reading Data
import pandas as pd

# NLP
from nltk.tokenize import word_tokenize
import numpy as np
import nltk

import emoji

## Download Tokenizer

In [153]:
# Fetch the Punkt tokenizer data packages used by nltk's word_tokenize.
# nltk.download skips the transfer when the package is already up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\djsal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\djsal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Tokenization and Embedding Helper Functions

In [154]:
def load_glove_embeddings(glove_file_path):
    """Load GloVe vectors from a whitespace-separated text file.

    Every non-blank line is split on whitespace; the first field is the
    token and the remaining fields are parsed as its vector components.

    Args:
        glove_file_path: Path to a UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each token (str) to a 1-D float32 numpy array.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. an extra newline at the end of the file)
            # yields no fields; indexing values[0] would raise IndexError.
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings


def is_float(s):
    """Return True when ``s`` parses as a finite floating-point number.

    ``float()`` also accepts the spellings "nan", "inf" and "infinity"
    (any case, optional sign). In tweet text those are ordinary words, so
    they are rejected here instead of being reported as numbers.

    Args:
        s: Candidate token, normally a str.

    Returns:
        bool: True for finite numeric text such as "3.14", "-2" or "1e3";
        False for anything else, including None and other non-parsable
        objects.
    """
    try:
        value = float(s)
    except (TypeError, ValueError):
        return False
    # NaN fails every comparison and +/-inf fails one side of the range,
    # so only finite values pass.
    return float('-inf') < value < float('inf')
    
    
def embed_text(text, glove_embeddings):
    """Return the mean GloVe vector of the tokens in ``text``.

    The text is tokenized with ``word_tokenize``. Each token is then
    normalised with the placeholder rules below before lookup:

    * ``#`` -> ``<hashtag>``, ``@`` -> ``<user>``
    * ``https`` / ``HTTPS`` and tokens starting with ``//t`` -> ``<url>``
    * digit-only or float-parsable tokens -> ``<number>``
    * emoji -> ``<name>`` built from ``emoji.demojize`` with its first and
      last character (the delimiters) stripped
    * tokens for which ``str.isupper()`` is true contribute two vectors:
      the lower-cased token and ``<allcaps>``

    Tokens missing from ``glove_embeddings`` contribute a zero vector.

    NOTE(review): the ``<...>`` placeholders follow the GloVe Twitter
    preprocessing convention; confirm the loaded vocabulary actually
    contains them, otherwise they only add zero vectors to the mean.

    Args:
        text: Tweet text (str).
        glove_embeddings: dict mapping token -> 1-D numpy vector. All
            vectors are assumed to share one length.

    Returns:
        1-D numpy array: the element-wise mean over all collected vectors,
        or a zero vector when ``text`` produces no tokens.
    """
    # Take the dimensionality from the table instead of assuming 200, so
    # other GloVe sizes (50/100/300) do not produce ragged input for
    # np.mean. An empty table falls back to the previous default of 200.
    sample_vector = next(iter(glove_embeddings.values()), None)
    dim = len(sample_vector) if sample_vector is not None else 200

    def vector_for(token):
        # Out-of-vocabulary tokens are represented by a zero vector.
        if token in glove_embeddings:
            return glove_embeddings[token]
        return np.zeros(dim)

    vectors = []
    for word in word_tokenize(text):

        # Applying Kudugunta's Rules
        if word == '#':
            word = "<hashtag>"
        elif word == '@':
            word = "<user>"
        elif word == "https" or word == "HTTPS":
            word = "<url>"
        elif word[0:3] == "//t":
            word = "<url>"
        elif word.isdigit() or is_float(word):
            word = "<number>"

        # Replacing emojis: demojize gives a delimited name; swap the
        # delimiters for angle brackets.
        if emoji.is_emoji(word):
            word = emoji.demojize(word)
            word = "<" + word[1:-1] + ">"

        # Evaluate the all-caps test on the token before lower-casing it.
        is_all_caps = word.isupper()
        vectors.append(vector_for(word.lower()))
        if is_all_caps:
            vectors.append(vector_for("<allcaps>"))

    if vectors:
        return np.mean(vectors, axis=0)
    # No tokens at all (e.g. empty text).
    return np.zeros(dim)

## Data Processing Helper Functions

In [155]:
def _extract_user_field(s, key):
    """Return the stripped raw text of the scalar stored under ``key`` in ``s``."""
    marker = "'" + key + "':"
    start = s.find(marker)
    if start == -1:
        raise ValueError(f"field {key!r} not found in user info: {s[:80]!r}")
    start += len(marker)
    # The fields read here are scalars (ints / True / False), so the value
    # ends at the next comma, or at the closing brace for the last entry.
    stops = [pos for pos in (s.find(",", start), s.find("}", start)) if pos != -1]
    end = min(stops) if stops else len(s)
    return s[start:end].strip()


def process_user_info(s):
    """Extract the numeric counters and verified flag from a user-dict string.

    ``s`` is the string form of a user dict, e.g.
    ``"{'id': 1, ..., 'followersCount': 12, ..., 'verified': False, ...}"``.
    Each field is located by its own quoted key, so the result does not
    depend on which key happens to follow it. Slicing up to the next
    expected key silently produced garbage when that neighbour was missing
    or reordered.

    Args:
        s: String representation of the user dict.

    Returns:
        dict with int values for 'followersCount', 'friendsCount',
        'favouritesCount' and 'listedCount', and the raw text of
        'verified' (e.g. "False").

    Raises:
        ValueError: if a required key is absent, or a counter is not an
            integer literal.
    """
    result = {}
    for key in ("followersCount", "friendsCount", "favouritesCount", "listedCount"):
        result[key] = int(_extract_user_field(s, key))
    # Kept as text; encode_verified turns it into 0/1 later.
    result["verified"] = _extract_user_field(s, "verified")
    return result


def get_followers(user_info):
    """Return the value stored under 'followersCount' in ``user_info``."""
    followers = user_info['followersCount']
    return followers


def get_friends(user_info):
    """Return the value stored under 'friendsCount' in ``user_info``."""
    friends = user_info['friendsCount']
    return friends


def get_favorites(user_info):
    """Return the value stored under 'favouritesCount' in ``user_info``."""
    favourites = user_info['favouritesCount']
    return favourites


def get_listed(user_info):
    """Return the value stored under 'listedCount' in ``user_info``."""
    listed = user_info['listedCount']
    return listed


def get_verified(user_info):
    """Return the value stored under 'verified' in ``user_info``."""
    verified = user_info['verified']
    return verified


def encode_verified(row):
    """Encode a verified flag as 1 (verified) or 0 (not verified).

    Only an explicit "True" counts as verified. The earlier rule mapped
    everything except the exact string "False" to 1, which marked
    missing values ("None", "nan") and the boolean ``False`` as verified.

    Args:
        row: The flag, normally the text "True" or "False"; booleans and
            other objects are compared through their ``str()`` form.

    Returns:
        int: 1 when the flag reads "True", otherwise 0.
    """
    return 1 if str(row).strip() == "True" else 0

## Reading in Data

Read elections data for August 2024 from random days:

In [119]:
# Load one gzip-compressed August chunk, then count rows per date to check which day(s) it covers.
df_aug12 = pd.read_csv("../Data/aug_chunk_20.csv.gz",  compression='gzip')
df_aug12[["date"]].value_counts()

date      
2024-08-12    50000
Name: count, dtype: int64

In [118]:
# Second August chunk; the per-date row count is a check on which day(s) it covers.
df_aug18 = pd.read_csv("../Data/aug_chunk_10.csv.gz",  compression='gzip')
df_aug18[["date"]].value_counts()

date      
2024-08-18    50000
Name: count, dtype: int64

In [117]:
# Third August chunk; the per-date row count is a check on which day(s) it covers.
df_aug21 = pd.read_csv("../Data/aug_chunk_40.csv.gz",  compression='gzip')
df_aug21[["date"]].value_counts()

date      
2024-08-21    50000
Name: count, dtype: int64

Read elections data for September 2024 from random days:

In [116]:
# First September chunk; the per-date row count is a check on which day(s) it covers.
df_sept4 = pd.read_csv("../Data/september_chunk_1.csv.gz",  compression='gzip')
df_sept4[["date"]].value_counts()

date      
2024-09-04    50000
Name: count, dtype: int64

In [115]:
# Second September chunk; the per-date row count is a check on which day(s) it covers.
df_sept17 = pd.read_csv("../Data/september_chunk_30.csv.gz",  compression='gzip')
df_sept17[["date"]].value_counts()

date      
2024-09-17    50000
Name: count, dtype: int64

In [114]:
# Third September chunk; the per-date row count is a check on which day(s) it covers.
df_sept20 = pd.read_csv("../Data/september_chunk_20.csv.gz",  compression='gzip')
df_sept20[["date"]].value_counts()

date      
2024-09-20    50000
Name: count, dtype: int64

Read elections data for October 2024 from random days:

In [113]:
# First October chunk. NOTE: despite the variable name, the recorded output shows this
# chunk spans two days (2024-10-03 and 2024-10-02).
df_oct3 = pd.read_csv("../Data/october_chunk_15.csv.gz",  compression='gzip')
df_oct3[["date"]].value_counts()

date      
2024-10-03    46882
2024-10-02     3118
Name: count, dtype: int64

In [111]:
# Second October chunk. NOTE: despite the variable name, the recorded output shows this
# chunk spans two days (2024-10-15 and 2024-10-12).
df_oct15 = pd.read_csv("../Data/october_chunk_10.csv.gz",  compression='gzip')
df_oct15[["date"]].value_counts()

date      
2024-10-15    47537
2024-10-12     2463
Name: count, dtype: int64

In [112]:
# Third October chunk. NOTE: despite the variable name, the recorded output shows this
# chunk spans two days (2024-10-31 and 2024-10-29).
df_oct31 = pd.read_csv("../Data/october_chunk_21.csv.gz",  compression='gzip')
df_oct31[["date"]].value_counts()

date      
2024-10-31    46949
2024-10-29     3051
Name: count, dtype: int64

Read elections data for November 2024 (post-elections) from random days:

In [120]:
# First November chunk; the per-date row count is a check on which day(s) it covers.
df_nov7 = pd.read_csv("../Data/november_chunk_10.csv.gz",  compression='gzip')
df_nov7[["date"]].value_counts()

date      
2024-11-07    50000
Name: count, dtype: int64

In [121]:
# Second November chunk; the per-date row count is a check on which day(s) it covers.
df_nov24 = pd.read_csv("../Data/november_chunk_20.csv.gz",  compression='gzip')
df_nov24[["date"]].value_counts()

date      
2024-11-24    50000
Name: count, dtype: int64

In [122]:
# Third November chunk; the per-date row count is a check on which day(s) it covers.
df_nov29 = pd.read_csv("../Data/november_chunk_30.csv.gz",  compression='gzip')
df_nov29[["date"]].value_counts()

date      
2024-11-29    50000
Name: count, dtype: int64

Concatenate random days data for each month:

In [128]:
# Stack the three sampled chunks of each month row-wise. ignore_index=True
# gives every row a unique label; without it each chunk keeps its own
# 0..N-1 labels, so the combined frame has each label three times and any
# later label-based selection or alignment becomes ambiguous.
df_aug = pd.concat([df_aug12, df_aug18, df_aug21], axis=0, ignore_index=True)
df_sept = pd.concat([df_sept4, df_sept17, df_sept20], axis=0, ignore_index=True)
df_oct = pd.concat([df_oct3, df_oct15, df_oct31], axis=0, ignore_index=True)
df_nov = pd.concat([df_nov7, df_nov24, df_nov29], axis=0, ignore_index=True)

In [129]:
# Rows per date in the combined August frame.
df_aug[["date"]].value_counts()

date      
2024-08-12    50000
2024-08-18    50000
2024-08-21    50000
Name: count, dtype: int64

In [130]:
# Rows per date in the combined September frame.
df_sept[["date"]].value_counts()

date      
2024-09-04    50000
2024-09-17    50000
2024-09-20    50000
Name: count, dtype: int64

In [142]:
# Rows per date in the combined October frame, followed by the total row count
# (the October chunks each span two days, so six dates appear).
display(df_oct[["date"]].value_counts())
df_oct[["date"]].value_counts().sum()

date      
2024-10-15    47537
2024-10-31    46949
2024-10-03    46882
2024-10-02     3118
2024-10-29     3051
2024-10-12     2463
Name: count, dtype: int64

150000

In [136]:
# Rows per date in the combined November frame.
df_nov[["date"]].value_counts()

date      
2024-11-07    50000
2024-11-24    50000
2024-11-29    50000
Name: count, dtype: int64

Extract relevant data:

In [143]:
# Keep only the tweet text and the raw user-profile column for each month.
relevant_columns = ['text', 'user']
df_aug_relevant = df_aug.loc[:, relevant_columns]
df_sept_relevant = df_sept.loc[:, relevant_columns]
df_oct_relevant = df_oct.loc[:, relevant_columns]
df_nov_relevant = df_nov.loc[:, relevant_columns]

In [150]:
# Preview the first rows of each month's text/user frame, in calendar order.
for month_frame in (df_aug_relevant, df_sept_relevant, df_oct_relevant, df_nov_relevant):
    display(month_frame.head())

Unnamed: 0,text,user
0,@FoxNews You’re running a crooked GOP candidat...,"{'id': 1087605901417156608, 'id_str': '1087605..."
1,@TheBadgerCzar @BubblegumOut @jaybleft All qui...,"{'id': 1579037977354727425, 'id_str': '1579037..."
2,@EFCJamesLynch @BrutalNz Thanks. Puts a bit mo...,"{'id': 1232939726396542977, 'id_str': '1232939..."
3,@redleg1066 @Bonderant2 @shipwreckedcrew Trump...,"{'id': 3269133127, 'id_str': '3269133127', 'ur..."
4,Dan Pena predict Donald Trump 2024 election wi...,"{'id': 1487411247024848901, 'id_str': '1487411..."


Unnamed: 0,text,user
0,@UHN_Plus Lo malo es que luego la admi istraci...,"{'id': 1434722499053379586, 'id_str': '1434722..."
1,@RossKneeDeep Because they legitimately care a...,"{'id': 1615180209422041092, 'id_str': '1615180..."
2,@JazzyJa78308052 @marklevinshow Christ?...you ...,"{'id': 31494130, 'id_str': '31494130', 'url': ..."
3,@realDonaldTrump will always be my president. ...,"{'id': 899742746688278528, 'id_str': '89974274..."
4,@MSNBC Where is Biden 🤔,"{'id': 1688299498903044096, 'id_str': '1688299..."


Unnamed: 0,text,user
0,@ndelriego Actually you should have respected ...,"{'id': 1809634223256682496, 'id_str': '1809634..."
1,@TheGrayRider @Eb67735B @KidRocker76 @TomaisMa...,"{'id': 1818287928747163648, 'id_str': '1818287..."
2,After #SCOTUS ruled presidents can't be charge...,"{'id': 1603804200336498688, 'id_str': '1603804..."
3,@PunchingCat Donald Trump? Is it you?,"{'id': 717874436, 'id_str': '717874436', 'url'..."
4,We are trying not to 🤬🤬🤬🤬,"{'id': 1920840390, 'id_str': '1920840390', 'ur..."


Unnamed: 0,text,user
0,"Barack Obama Congratulates Donald Trump, Break...","{'id': 2262532860, 'id_str': '2262532860', 'ur..."
1,@ivankatrumpo Yes\nIn far does that try to kil...,"{'id': 1591147489763131392, 'id_str': '1591147..."
2,"Over time, Southern whites increasingly shifte...","{'id': 1507777504370593795, 'id_str': '1507777..."
3,"This led to a ""Southern Strategy,"" where the R...","{'id': 1507777504370593795, 'id_str': '1507777..."
4,"@MarioNawfal A new coalition, a new Republican...","{'id': 1831664841976688640, 'id_str': '1831664..."


## Getting Tensors (tweet embeddings, metadata)

In [151]:
def process_data(df, glove_embeddings=None, glove_path='../Data/glove.6B.200d.txt'):
    """Turn a frame of tweets into an embedding tensor and a metadata tensor.

    Steps:
      1. Parse the stringified ``user`` column with ``process_user_info`` and
         expand it into ``followers_count``, ``friends_count``,
         ``favourites_count``, ``listed_count`` and a 0/1 ``verified`` column.
      2. Embed every ``text`` value with ``embed_text`` (mean GloVe vector).
      3. Stack the results into tensors.

    Args:
        df: DataFrame with at least the columns ``text`` and ``user``.
            It is not modified.
        glove_embeddings: Optional pre-loaded token -> vector dict. Pass it
            when calling this function repeatedly to avoid re-reading the
            GloVe file on every call.
        glove_path: File loaded with ``load_glove_embeddings`` when
            ``glove_embeddings`` is None.
            NOTE(review): ``embed_text`` emits Twitter-style placeholders
            such as ``<user>`` / ``<allcaps>``; confirm this vocabulary
            contains them.

    Returns:
        tuple ``(tweet_glove_embeddings, metadata_tensor)``:
        a tensor with one embedding row per tweet, and a tensor built from
        every remaining non-text column (for a frame holding only ``text``
        and ``user`` these are the five columns listed in step 1).
    """
    # Getting user info from user col (work on a copy so the caller's frame is untouched)
    relevant = df.copy()
    relevant['user'] = relevant['user'].astype(str)
    user_dicts = relevant['user'].apply(process_user_info)

    relevant['followers_count'] = user_dicts.apply(get_followers)
    relevant['friends_count'] = user_dicts.apply(get_friends)
    relevant['favourites_count'] = user_dicts.apply(get_favorites)
    relevant['listed_count'] = user_dicts.apply(get_listed)

    # Getting correct data types: verified is compared as text by encode_verified
    relevant['verified'] = (
        user_dicts.apply(get_verified).astype(str).apply(encode_verified)
    )
    relevant['text'] = relevant['text'].astype(str)

    # Dropping unused columns
    relevant = relevant.drop(columns=['user'])

    # Loading GloVe embeddings only when the caller did not supply them
    if glove_embeddings is None:
        glove_embeddings = load_glove_embeddings(glove_path)

    # Tokenizing and getting GloVe embeddings for tweets
    tweet_vectors = relevant['text'].apply(lambda x: embed_text(x, glove_embeddings))
    embedding_list = np.vstack(tweet_vectors.values)

    # EMBEDDINGS TENSOR
    tweet_glove_embeddings = torch.tensor(embedding_list)

    # METADATA TENSOR: everything except the raw text
    df_num_cat = relevant.drop(columns=['text'])
    metadata_tensor = torch.tensor(df_num_cat.values)

    return tweet_glove_embeddings, metadata_tensor

In [156]:
# Build the (embeddings, metadata) tensor pair for each month.
# Each call made with only a DataFrame loads the GloVe file inside process_data.
aug_tweet_embeddings, aug_metadata = process_data(df_aug_relevant)
sept_tweet_embeddings, sept_metadata = process_data(df_sept_relevant)
oct_tweet_embeddings, oct_metadata = process_data(df_oct_relevant)
nov_tweet_embeddings, nov_metadata = process_data(df_nov_relevant)

In [157]:
# Report the embedding and metadata tensor shapes for each month, in calendar order.
for embeddings, metadata in (
    (aug_tweet_embeddings, aug_metadata),
    (sept_tweet_embeddings, sept_metadata),
    (oct_tweet_embeddings, oct_metadata),
    (nov_tweet_embeddings, nov_metadata),
):
    print(embeddings.shape, metadata.shape)

torch.Size([150000, 200]) torch.Size([150000, 5])
torch.Size([150000, 200]) torch.Size([150000, 5])
torch.Size([150000, 200]) torch.Size([150000, 5])
torch.Size([150000, 200]) torch.Size([150000, 5])


## Save Embeddings

In [158]:
newpath = r"../Data/Processed_Data"

# Create the output directory on first run; otherwise just report that it is present.
if os.path.exists(newpath):
    print(f"Directory already exists: {newpath}")
else:
    os.makedirs(newpath)
    print(f"Directory created: {newpath}")

Directory already exists: ../Data/Processed_Data


In [159]:
# Persist every tensor to its own file: the four embedding tensors first,
# then the four metadata tensors.
tensor_outputs = [
    (aug_tweet_embeddings, '../Data/Processed_Data/aug_elections_tweet_emb.pth'),
    (sept_tweet_embeddings, '../Data/Processed_Data/sept_elections_tweet_emb.pth'),
    (oct_tweet_embeddings, '../Data/Processed_Data/oct_elections_tweet_emb.pth'),
    (nov_tweet_embeddings, '../Data/Processed_Data/nov_elections_tweet_emb.pth'),
    (aug_metadata, '../Data/Processed_Data/aug_elections_metadata.pth'),
    (sept_metadata, '../Data/Processed_Data/sept_elections_metadata.pth'),
    (oct_metadata, '../Data/Processed_Data/oct_elections_metadata.pth'),
    (nov_metadata, '../Data/Processed_Data/nov_elections_metadata.pth'),
]
for tensor, target_path in tensor_outputs:
    torch.save(tensor, target_path)