# Imports

In [89]:
# Standard library
import json
import re

# Reading Data / numerics
import numpy as np
import pandas as pd

# Torch
import torch
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

# NLP
import emoji
import nltk
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import word_tokenize

In [90]:
# Download the NLTK 'punkt' and 'punkt_tab' resources (a no-op when already
# up to date). Presumably required by word_tokenize used below -- confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/erickordonez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/erickordonez/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Tokenization and Embedding Helper Functions

In [91]:
def load_glove_embeddings(glove_file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is split on whitespace; the first field is used as the
    token and the remaining fields are parsed as the vector components.

    Args:
        glove_file_path: Path of the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each token (str) to a 1-D ``float32`` numpy array.
    """
    embeddings = {}
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a stray newline at EOF)
            # has no token; skip it instead of raising IndexError on values[0].
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

def is_float(s):
    """Report whether ``float(s)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError; any other exception propagates to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
    
def embed_text(text, glove_embeddings, embedding_dim=None):
    """Embed a piece of text as the mean of its per-token GloVe vectors.

    The text is tokenized with ``word_tokenize``; each token is normalised
    (special-token substitution, emoji naming, lower-casing) and looked up in
    ``glove_embeddings``. Tokens that are missing contribute a zero vector.
    Upper-case tokens additionally contribute the vector of ``"<allcaps>"``.

    Args:
        text: The string to embed.
        glove_embeddings: Mapping from token to vector (supports ``in`` and
            item lookup).
        embedding_dim: Length of the zero vector used for unknown tokens and
            for empty input. When None it is inferred from the first vector in
            ``glove_embeddings``, falling back to 200 if that is not possible.

    Returns:
        A 1-D numpy array: the element-wise mean over all collected vectors,
        or a zero vector of length ``embedding_dim`` when no token was produced.
    """
    if embedding_dim is None:
        # Previously the size was hard-coded to 200, which made np.mean fail on
        # ragged input whenever vectors of another dimensionality were used.
        try:
            embedding_dim = len(next(iter(glove_embeddings.values())))
        except (AttributeError, StopIteration, TypeError):
            embedding_dim = 200

    def lookup(token):
        # Out-of-vocabulary tokens are represented by a zero vector.
        if token in glove_embeddings:
            return glove_embeddings[token]
        return np.zeros(embedding_dim)

    embedding = []

    for word in word_tokenize(text):

        # Applying Kudugunta's Rules
        if word == '#':
            word = "<hashtag>"
        elif word == '@':
            word = "<user>"
        elif word == "https" or word == "HTTPS":
            word = "<url>"
        elif word[0:3] == "//t":
            word = "<url>"
        elif word.isdigit() or is_float(word):
            word = "<number>"

        # Replacing emojis: demojize yields a delimited name; the first and
        # last characters are stripped and the name is wrapped in <...>.
        if emoji.is_emoji(word):
            word = emoji.demojize(word)
            word = "<" + word[1:-1] + ">"

        # Case must be inspected before lower-casing the token.
        is_all_caps = word.isupper()

        embedding.append(lookup(word.lower()))
        if is_all_caps:
            # Words in all caps get an extra "<allcaps>" marker vector.
            embedding.append(lookup("<allcaps>"))

    # Return the average embedding for the entire text.
    if embedding:
        return np.mean(embedding, axis=0)
    return np.zeros(embedding_dim)  # Default when tokenization yields nothing

# Data Processing Helper Functions

In [None]:
def process_user_info(s):
    """Extract selected account fields from the string form of a user record.

    ``s`` is expected to contain ``'<key>': <value>`` pairs written with
    single-quoted keys (as in the ``user`` column shown in this notebook).
    Each field is located independently, so the result does not depend on
    the order in which the keys appear or on which key follows which.

    Args:
        s: String representation of the user record.

    Returns:
        dict with the integer entries ``followersCount``, ``friendsCount``,
        ``favouritesCount`` and ``listedCount``, plus ``verified`` kept as the
        raw text found in ``s`` (e.g. ``"False"``).

    Raises:
        ValueError: If any of the required fields cannot be found in ``s``.
    """
    numerical = ["followersCount", "friendsCount", "favouritesCount", "listedCount"]

    result = {}

    for key in numerical:
        # An integer value that is terminated by ',' or '}' or end of string.
        found = re.search(r"'" + key + r"':\s*(-?\d+)\s*(?=[,}]|$)", s)
        if found is None:
            raise ValueError(
                "could not find integer field %r in user info: %.80r" % (key, s)
            )
        result[key] = int(found.group(1))

    # 'verified' is kept as text; it is encoded to 0/1 by a later step.
    found = re.search(r"'verified':\s*([^,}\s]+)", s)
    if found is None:
        raise ValueError("could not find field 'verified' in user info: %.80r" % (s,))
    result["verified"] = found.group(1)

    return result
def get_followers(user_info):
    """Return the value stored under 'followersCount' in ``user_info``."""
    followers = user_info['followersCount']
    return followers

def get_friends(user_info):
    """Return the value stored under 'friendsCount' in ``user_info``."""
    friends = user_info['friendsCount']
    return friends

def get_favorites(user_info):
    """Return the value stored under 'favouritesCount' in ``user_info``."""
    favourites = user_info['favouritesCount']
    return favourites

def get_listed(user_info):
    """Return the value stored under 'listedCount' in ``user_info``."""
    listed = user_info['listedCount']
    return listed

def get_verified(user_info):
    """Return the value stored under 'verified' in ``user_info``."""
    verified = user_info['verified']
    return verified

def encode_verified(row):
    """Encode a 'verified' flag as an integer.

    Args:
        row: The flag value, typically the string "False" or "True". A boolean
            or a string with different casing / surrounding whitespace is
            accepted as well.

    Returns:
        0 when ``row`` reads as "false" (case-insensitive, whitespace
        ignored), otherwise 1.
    """
    # Comparing the normalised text means a real boolean False, " False " or
    # "false" is no longer mis-encoded as 1.
    if str(row).strip().lower() == "false":
        return 0
    return 1

# Reading in Data

In [93]:
# Read the gzip-compressed CSV chunk into a DataFrame; the path is relative
# to the notebook's working directory.
df = pd.read_csv("may_july_chunk_1.csv.gz",  compression='gzip')

In [94]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,id,text,url,epoch,media,retweetedTweet,retweetedTweetID,retweetedUserID,id_str,...,quotedTweet,in_reply_to_screen_name,in_reply_to_status_id_str,in_reply_to_user_id_str,location,cash_app_handle,user,date,_type,type
0,0,1801041792923578484,@lukepbeasley I cant imagine anyone actually f...,https://twitter.com/orgneyezedchaos/status/180...,1718237000.0,[],False,,,1801041792923578484,...,False,lukepbeasley,1.800681e+18,1.483596e+18,,,"{'id': 942869257108455424, 'id_str': '94286925...",2024-06-12,,tweet-
1,1,1801041792630227173,Voters can also sway me away from voting for ...,https://twitter.com/Brandon62294232/status/180...,1718237000.0,[],False,,,1801041792630227173,...,False,,,,,,"{'id': 1461100431329796100, 'id_str': '1461100...",2024-06-12,,tweet-
2,2,1801041792592224521,@PoodleHead57 @BobOnderMO Can you name that am...,https://twitter.com/JohnRMBR911/status/1801041...,1718237000.0,[],False,,,1801041792592224521,...,False,PoodleHead57,1.80103e+18,1.480159e+18,,,"{'id': 1655734665955737600, 'id_str': '1655734...",2024-06-12,,tweet-
3,3,1801041791463866688,@Morning_Joe @JoeNBC The fact remains that Joe...,https://twitter.com/andy_leq/status/1801041791...,1718237000.0,[],False,,,1801041791463866688,...,False,Morning_Joe,1.800873e+18,254117400.0,,,"{'id': 1771777682587713536, 'id_str': '1771777...",2024-06-12,,tweet-
4,4,1801041790952231228,@BidenHQ That's funny you're obviously trying ...,https://twitter.com/Ranchhandlb7/status/180104...,1718237000.0,[],False,,,1801041790952231228,...,False,KamalaHQ,1.800703e+18,3315265000.0,,,"{'id': 874708668, 'id_str': '874708668', 'url'...",2024-06-12,,tweet-


In [95]:
# Keep only the two columns consumed by process_data: the tweet text and the
# stringified user record.
relevant = df[['text', 'user']]

In [96]:
# Preview the first rows of the reduced frame.
relevant.head()

Unnamed: 0,text,user
0,@lukepbeasley I cant imagine anyone actually f...,"{'id': 942869257108455424, 'id_str': '94286925..."
1,Voters can also sway me away from voting for ...,"{'id': 1461100431329796100, 'id_str': '1461100..."
2,@PoodleHead57 @BobOnderMO Can you name that am...,"{'id': 1655734665955737600, 'id_str': '1655734..."
3,@Morning_Joe @JoeNBC The fact remains that Joe...,"{'id': 1771777682587713536, 'id_str': '1771777..."
4,@BidenHQ That's funny you're obviously trying ...,"{'id': 874708668, 'id_str': '874708668', 'url'..."


In [101]:
def process_data(df, glove_file_path='glove.6B.200d.txt', glove_embeddings=None):
    """Turn a frame of tweets into an embedding tensor and a metadata tensor.

    Args:
        df: DataFrame with at least a ``text`` column and a ``user`` column
            holding the string form of the user record. It is not modified.
        glove_file_path: GloVe text file to load when ``glove_embeddings`` is
            not supplied. Defaults to the file this notebook has always used.
        glove_embeddings: Optional pre-loaded token -> vector mapping. Passing
            it avoids re-reading the GloVe file on every call.

    Returns:
        Tuple ``(tweet_glove_embeddings, metadata_tensor)``:
        the first holds one row per tweet with its averaged GloVe vector; the
        second holds the remaining columns after ``user``, ``user_dict`` and
        ``text`` are dropped (for a ``text``/``user`` input these are
        followers, friends, favourites, listed counts and the 0/1 verified
        flag, in that order).
    """
    # Getting user info from user col
    relevant = df.copy()
    relevant['user'] = relevant['user'].astype(str)
    relevant['user_dict'] = relevant['user'].apply(process_user_info)

    relevant['followers_count'] = relevant['user_dict'].apply(get_followers)
    relevant['friends_count'] = relevant['user_dict'].apply(get_friends)
    relevant['favourites_count'] = relevant['user_dict'].apply(get_favorites)
    relevant['listed_count'] = relevant['user_dict'].apply(get_listed)
    relevant['verified'] = relevant['user_dict'].apply(get_verified)

    # Getting correct data types
    relevant['verified'] = relevant['verified'].astype(str).apply(encode_verified)
    relevant['text'] = relevant['text'].astype(str)

    # Dropping unused columns
    relevant = relevant.drop(columns=['user', 'user_dict'])

    # Loading GloVe embeddings only when the caller did not supply them.
    # NOTE(review): embed_text emits tokens such as "<hashtag>", "<user>" and
    # "<allcaps>"; confirm the chosen GloVe file contains them, otherwise they
    # all fall back to zero vectors.
    if glove_embeddings is None:
        glove_embeddings = load_glove_embeddings(glove_file_path)

    # Tokenizing and getting GloVe embeddings for tweets
    tweet_vectors = relevant['text'].apply(lambda x: embed_text(x, glove_embeddings))
    embedding_list = np.vstack(tweet_vectors.values)

    # EMBEDDINGS TENSOR
    tweet_glove_embeddings = torch.tensor(embedding_list)

    # METADATA TENSOR: everything that is left once the text is removed
    df_num_cat = relevant.drop(columns=['text'])
    metadata_tensor = torch.tensor(df_num_cat.values)

    return tweet_glove_embeddings, metadata_tensor

# Getting Tensors (tweet embeddings, metadata)

In [102]:
# Build the tweet-embedding tensor and the account-metadata tensor from the
# text/user frame.
tweet_embeddings, metadata = process_data(relevant)

In [104]:
# Sanity check: both tensors should have the same number of rows.
print(tweet_embeddings.shape, metadata.shape)

torch.Size([50000, 200]) torch.Size([50000, 5])


In [None]:
# Saving tensors to files. Paths are relative to the notebook's working
# directory; NOTE(review): the target directory is assumed to exist already.

torch.save(tweet_embeddings, '../Data/Processed_Data/elections_tweet_emb.pth')
torch.save(metadata, '../Data/Processed_Data/elections_metadata.pth')