In [1]:
import pandas as pd
import os
import json
import re
import ast


### 1. Preprocessing

In [2]:
es_data = pd.DataFrame()
en_data = pd.DataFrame()

for part in range(1, 21):
    folder = f"part_{part}/"
    files = os.listdir(folder)
    
    for file in files:
        if file.endswith(".csv"):  
            filepath = os.path.join(folder, file)
            data_part = pd.read_csv(filepath, low_memory=False)
            
            es_part = data_part[data_part['lang'] == 'es']
            es_size = es_part.shape[0]
            
            if es_size > 0:
                en_part = data_part[data_part['lang'] == 'en'].sample(es_size, random_state=442)
                
                es_data = pd.concat([es_data, es_part], axis=0) 
                en_data = pd.concat([en_data, en_part], axis=0)

es_data.reset_index(drop=True, inplace=True)
en_data.reset_index(drop=True, inplace=True)

  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)
  es_data = pd.concat([es_data, es_part], axis=0)
  en_data = pd.concat([en_data, en_part], axis=0)


In [3]:
print("English tweets raw count: ", en_data.shape[0], "\nSpanish tweets raw count: ", es_data.shape[0])

English tweets raw count:  372957 
Spanish tweets raw count:  372957


In [12]:
en_data.retweetedUserID.isna().sum()

370443

In [13]:
es_data.retweetedUserID.isna().sum()

371113

In [11]:
es_data.columns

Index(['id', 'text', 'url', 'epoch', 'media', 'retweetedTweet',
       'retweetedTweetID', 'retweetedUserID', 'id_str', 'lang', 'rawContent',
       'replyCount', 'retweetCount', 'likeCount', 'quoteCount',
       'conversationId', 'conversationIdStr', 'hashtags', 'mentionedUsers',
       'links', 'viewCount', 'quotedTweet', 'in_reply_to_screen_name',
       'in_reply_to_status_id_str', 'in_reply_to_user_id_str', 'location',
       'cash_app_handle', 'user', 'date', '_type'],
      dtype='object')

In [9]:
# Keep relevant and non-empty columns
columns_to_keep = [
    'id', 'text', 'lang', 'epoch', 'hashtags', 'links', 
    'replyCount', 'retweetCount', 'likeCount', 'quoteCount', 
    'conversationId', 'mentionedUsers', 'user'
]

In [10]:
en_data = en_data[columns_to_keep]
es_data = es_data[columns_to_keep]


In [11]:
def user_to_dict(user):
    try:
        # User metadata interested in
        user_metadata = ['id_str', 'followersCount', 'friendsCount', 'statusesCount']
        
        metadata_dict = {}
        
        for metadata in user_metadata:
            pattern = rf"'{metadata}':\s*('[^']*'|\d+|True|False|None)"
            match = re.search(pattern, user)
            
            if match:
                metadata_dict[metadata] = match.group(1)  # Extract the matched value
            else:
                metadata_dict[metadata] = None
        
        return metadata_dict
    
    except Exception as e:
        print("Error occurred:", e)
        print("User data:", user)
        raise

In [12]:
en_data['user'] = en_data['user'].apply(user_to_dict)
es_data['user'] = es_data['user'].apply(user_to_dict)


In [13]:
# Flatten user metadata into separate columns
user_metadata = ['id_str', 'followersCount', 'friendsCount', 'statusesCount']
for meta in user_metadata:
    en_data[meta] = en_data['user'].apply(lambda x: x.get(meta))
    es_data[meta] = es_data['user'].apply(lambda x: x.get(meta))



In [14]:
en_data.drop(columns=['user'], inplace=True)
es_data.drop(columns=['user'], inplace=True)


In [15]:
def convert_format(field):
    return ast.literal_eval(field)

# Convert string representation of lists into lists
en_data['hashtags'] = en_data['hashtags'].apply(convert_format)
en_data['links'] = en_data['links'].apply(convert_format)
en_data['mentionedUsers'] = en_data['mentionedUsers'].apply(convert_format)

es_data['hashtags'] = es_data['hashtags'].apply(convert_format)
es_data['links'] = es_data['links'].apply(convert_format)
es_data['mentionedUsers'] = es_data['mentionedUsers'].apply(convert_format)


In [31]:
en_data['hashtags'][372956]

[{'indices': [28, 47], 'text': 'IStandWithKejriwal'}]

### 2. Data exploration

#### 2.1 Popular hashtags

In [17]:
# en_data = pd.read_csv("en.csv")
# es_data = pd.read_csv("es.csv")
# def convert_format(field):
#     return ast.literal_eval(field)

# en_data['hashtags'] = en_data['hashtags'].apply(convert_format)
# en_data['links'] = en_data['links'].apply(convert_format)
# en_data['mentionedUsers'] = en_data['mentionedUsers'].apply(convert_format)

# es_data['hashtags'] = es_data['hashtags'].apply(convert_format)
# es_data['links'] = es_data['links'].apply(convert_format)
# es_data['mentionedUsers'] = es_data['mentionedUsers'].apply(convert_format)


In [18]:
from collections import Counter

en_tags = []
es_tags = []

for hashtags in en_data['hashtags']:
    if len(hashtags):
        en_tags.extend(tag['text'] for tag in hashtags if 'text' in tag)
        
for hashtags in es_data['hashtags']:
    if len(hashtags):
        es_tags.extend(tag['text'] for tag in hashtags if 'text' in tag)
        
en_tag_freq = Counter(en_tags)
es_tag_freq = Counter(es_tags)

In [23]:
sum(en_tag_freq.values()) 

71955

In [22]:
sum(es_tag_freq.values()) 

109529

In [24]:
# total_en_tags = sum(en_tag_freq.values())  
# total_es_tags = sum(es_tag_freq.values())  

# en_tag_percent = Counter({tag: (count / total_en_tags) * 100 for tag, count in en_tag_freq.items()})
# es_tag_percent = Counter({tag: (count / total_es_tags) * 100 for tag, count in es_tag_freq.items()})

print("\nTop 10 English Tags by Frequency:")
for tag, frequency in en_tag_freq.most_common(10):
    print(f"{tag} {frequency}")

print("\nTop 10 Spanish Tags by Frequency:")
for tag, frequency in es_tag_freq.most_common(10):
    print(f"{tag} {frequency}")


Top 10 English Tags by Frequency:
Trump2024 5180
MAGA 4516
Trump 2664
Biden 2265
BidenHarris2024 1351
DonaldTrump 667
maga 595
Biden2024 555
trump 515
GOP 504

Top 10 Spanish Tags by Frequency:
Biden 4945
Trump 3972
Trump2024 3236
EEUU 2892
MAGA 1912
Mundo 1795
DonaldTrump 1766
EstadosUnidos 1470
Internacional 1402
JoeBiden 1269


#### 2.2 Popular links

In [25]:
en_links = []
es_links = []

for links in en_data['links']:
    if len(links):
        en_links.extend(link['expanded_url'] for link in links if 'expanded_url' in link)
        
for links in es_data['links']:
    if len(links):
        es_links.extend(link['expanded_url'] for link in links if 'expanded_url' in link)
        
en_link_freq = Counter(en_links)
es_link_freq = Counter(es_links)

In [27]:
total_en_links = sum(en_link_freq.values())  
total_es_links = sum(es_link_freq.values())  
print(f"Total links shared in English data: {total_en_links}. Total links shared in Spanish data: {total_es_links}")

Total links shared in English data: 37842. Total links shared in Spanish data: 85300


In [30]:
# total_en_links = sum(en_link_freq.values())  
# total_es_links = sum(es_link_freq.values())  

# en_link_percent = Counter({link: (count / total_en_links) * 100 for link, count in en_link_freq.items()})
# es_link_percent = Counter({link: (count / total_es_links) * 100 for link, count in es_link_freq.items()})

print("\nTop 10 English Links by Frequency:")
for link, freq in en_link_freq.most_common(10):
    print(f"{link} {freq}")

print("\nTop 10 Spanish Links by Frequency:")
for link, freq in es_link_freq.most_common(10):
    print(f"{link} {freq}")


Top 10 English Links by Frequency:
https://www.politico.com/live-updates/2024/07/08/congress/defiant-biden-tells-donors-were-done-with-the-debate-00166834 43
https://x.com/DonaldToTheMoon/status/1814940269604684154 38
https://dailym.ai/ios 37
http://msn.com 24
https://www.donaldjtrump.com/agenda47 22
https://secure.actblue.com/donate/social-bfp-july-2024-v2 22
https://www.foxnews.com/politics/federal-judge-blocks-biden-title-ix-rule-4-states-abuse-power 18
http://bit.ly/news-net 17
https://opr.as/share 17
https://newsvendor.com 17

Top 10 Spanish Links by Frequency:
http://nmas.com.mx/foro-tv 95
https://linktr.ee/whitehouse 90
https://www.clarin.com/mundo/millonaria-batalla-donaciones-sacude-duelo-joe-biden-donald-trump_0_lTageeKP3H.html 84
https://www.clarin.com/mundo/atentado-donald-trump-impacto-ataque-puede-tener-elecciones-presidenciales-noviembre_0_XkoXa4yAQa.html 50
https://fbvo.short.gy/fAhuoh 44
https://fbvo.short.gy/ANrsQI 43
https://www.clarin.com/mundo/trump-retomo-campana

In [20]:
print("Unqiue users for downsampled English data:",len(set(en_data['id_str']) ))

Unqiue users for downsampled English data: 227414


In [21]:
print("Unqiue users for downsampled English data:",len(set(es_data['id_str'])))

Unqiue users for downsampled English data: 125909


#### Check the amount of bilingual posters (case study)

In [23]:
users_en = set(en_data['id_str'])  
users_es = set(es_data['id_str'])

bilingual_posters = users_en.intersection(users_es)

bilingual_posters = list(bilingual_posters)



In [24]:
print(len(bilingual_posters), "users posted in English and Spanish")

7317 users posted in English and Spanish


In [25]:
en_data.to_csv("en.csv", index=False)
es_data.to_csv("es.csv", index=False)