In [122]:
from dataclasses import dataclass, field  # Import field
import csv
import os
import random
import pandas as pd
import ast
import numpy as np
import re

In [123]:
#Remove tweets talking about the train derailment in East Palestine, OH
def not_palestine_OH(text: str, window_size: int = 50) -> bool:
    """
    Check if keywords appear near each other in text.
    Returns True if at least two keywords are found within window_size words.
    """
    keywords = ["east palestine", "oh", "train", "derailment", "e. palestine"]
    text = text.lower()
    
    words = text.split()
    for i in range(len(words)):
        window = ' '.join(words[i:i + window_size])
        found = sum(1 for k in keywords if re.search(r'\b' + re.escape(k) + r'\b', window))
        if found >= 1:
            return False
            
    return True

#Remove any potential duplication in tweets (because of multiple scraping sessions)
def remove_duplicate_rows(df, columns_to_check):
    """
    Removes duplicate rows from a DataFrame, keeping only the latest instance.
    
    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    columns_to_check (list): A list of column names to check for duplicates.
    
    Returns:
    pandas.DataFrame: The DataFrame with duplicate rows removed.
    """
    # Sort the DataFrame by the columns to check, in descending order
    df = df.sort_values(by=columns_to_check, ascending=False)
    
    # Drop duplicate rows, keeping the first occurrence
    df = df.drop_duplicates(subset=columns_to_check, keep='first')
    
    return df

Step 1: CONCATENATE ALL FILES IN PARSED DATA FOLDER
The output will be the Pre-processing.csv, which will be used to Step 2 below. 

In [124]:
@dataclass
class Influencer:
    name: str
    affiliation: str
    no_followers: int = 0
    before_corpus: list = field(default_factory=list) 
    before_likes: list = field(default_factory=list)
    before_retweets: list = field(default_factory=list)
    after_corpus: list = field(default_factory=list)
    after_likes: list = field(default_factory=list)
    after_retweets: list = field(default_factory=list)

data = []
account_list = []

#Loading File paths
supplementary_folder = "Supplementary Materials"
influencers_path = os.path.join(supplementary_folder, "Followers List & Categories - Accounts Kept.csv")

#Population the data file with initial data of the available influencers
with open(influencers_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #skip header
    for line in reader:
        name = line[0]
        account_list.append(name[1:]) #Creating a list of influencers account names

        affiliation = line[1]
        # if affiliation == " Libertarian Party":
        #     affiliation = affiliation[1:]

        followers = line[2]
        if affiliation or followers:
            data.append(Influencer(name[1:], affiliation, followers))

parsed_before_folder = os.path.join("Parsed Data", "Before")
parsed_after_folder = os.path.join("Parsed Data", "After")

before_files = [f for f in os.listdir(parsed_before_folder) if f.endswith('.csv')] #Getting all before files
after_files = [f for f in os.listdir(parsed_after_folder) if f.endswith('.csv')] #Getting all after files

def getting_values(files: list, path: str, after: bool):
    for f in files:
        with open(f"{path}/{f}", 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader) #skip header
            
            for line in reader:
                if not line:  # Skip empty lines
                    continue
                if line[0] == "|RUN STATISTICS|": # End of file, move on to next file
                    break
            
                name = line[0].strip() if line else ""
                tweet = line[2].strip()
                date = line[1]
                like = int(line[3].strip())
                retweet = int(line[4].strip())

                for i in data:
                    if name == i.name:
                        if after and (tweet not in c for c in i.after_corpus) and like >1000 and not_palestine_OH(tweet):
                                i.after_corpus.append(tweet)
                                i.after_likes.append(like)
                                i.after_retweets.append(retweet)
                        else:
                            if (tweet not in c for c in i.before_corpus) and like > 1000 and not_palestine_OH(tweet):
                                i.before_corpus.append(tweet)
                                i.before_likes.append(like)
                                i.before_retweets.append(retweet)

getting_values(before_files, parsed_before_folder, False)
getting_values(after_files, parsed_after_folder, True)

# Check results for each influencer
for i in data:
    print(f"\nInfluencer: {i.name}")
    print(f"Before corpus size: {len(i.before_corpus)}")
    # print(f"Before likes list: {(i.before_likes)}")
    # print(f"Before retweets list: {(i.before_retweets)}")
    print(f"After corpus size: {len(i.after_corpus)}")
    # print(f"After likes list: {(i.after_likes)}")
    # print(f"After retweets list: {(i.after_retweets)}")



Influencer: SabbySabs2
Before corpus size: 0
After corpus size: 2

Influencer: MsLaToshaBrown
Before corpus size: 0
After corpus size: 1

Influencer: RonFilipkowski
Before corpus size: 3
After corpus size: 20

Influencer: KyleKulinski
Before corpus size: 0
After corpus size: 12

Influencer: funder
Before corpus size: 0
After corpus size: 4

Influencer: mmpadellan
Before corpus size: 0
After corpus size: 12

Influencer: krystalball
Before corpus size: 0
After corpus size: 8

Influencer: SteveSchmidtSES
Before corpus size: 4
After corpus size: 4

Influencer: robreiner
Before corpus size: 1
After corpus size: 0

Influencer: marceelias
Before corpus size: 0
After corpus size: 0

Influencer: TheRickWilson
Before corpus size: 5
After corpus size: 8

Influencer: davidsirota
Before corpus size: 0
After corpus size: 2

Influencer: TristanSnell
Before corpus size: 0
After corpus size: 16

Influencer: KyleClark
Before corpus size: 1
After corpus size: 0

Influencer: PatrickSvitek
Before corpus s

In [125]:
with open("Supplementary Materials/Pre-processing.csv", 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Name','Affiliation', 'Number of Followers', "Before Corpus", "Before List of Likes", "Before List of Retweets", "After Corpus", "After List of Likes", "After List of Retweets"])
    
        for i in data:
            if i.before_corpus and i.after_corpus:
                writer.writerow([i.name, i.affiliation, i.no_followers, i.before_corpus, i.before_likes, i.before_retweets, i.after_corpus, i.after_likes, i.after_retweets])

STEP 2: SUBJECT ANONYMIZATION

In [126]:
df = pd.read_csv("Supplementary Materials/Pre-processing.csv")

# Generate random IDs for all subjects - uncomment to generate initial Ledger file
# random_ids = random.sample(range(10, 99), len(df))  # Modified to match DataFrame length

# # Assign random IDs directly to the Subject ID column
# df["Subject ID"] = random_ids

# #Use to create new ledger file
# df_nametoID = df[["Name", "Subject ID"]]
# df_nametoID.head()
# df_nametoID.to_csv('Supplementary Materials/Subject Ledger.csv', index=False) 


# Load in existing Ledger file
ledger = pd.read_csv("Supplementary Materials/Subject Ledger.csv") 
ledger_records = ledger.to_dict(orient='records')

ledger_dict = {}

for l in ledger_records:
    ledger_dict[l['Name']] = l['Subject ID']

df["Subject ID"] = df["Name"].map(ledger_dict)

df.head()

Unnamed: 0,Name,Affiliation,Number of Followers,Before Corpus,Before List of Likes,Before List of Retweets,After Corpus,After List of Likes,After List of Retweets,Subject ID
0,RonFilipkowski,Democratic Party,1000000.0,['Dershowitz said Trump asked him at dinner wh...,"[1337, 7377, 7417]","[303, 1882, 1888]",['How odd for Trump to Blame America First for...,"[1520, 2370, 1244, 1440, 2369, 34866, 3056, 15...","[363, 703, 229, 476, 702, 18715, 597, 363, 477...",39
1,SteveSchmidtSES,Democratic Party,1500000.0,['A direct threat against American Jews by a d...,"[1223, 1420, 1420, 1238]","[251, 459, 459, 254]","['""The corruption of one man in Israel has bro...","[4262, 4259, 13223, 4230]","[1588, 1586, 4037, 1580]",92
2,TheRickWilson,Democratic Party,1600000.0,"['The purpose of terrorism is to terrorize.', ...","[6283, 6283, 6242, 6289, 3502]","[732, 732, 730, 732, 460]",['If you think its bad thing that the leaders ...,"[3334, 1634, 2311, 3333, 1633, 2308, 3309, 3046]","[315, 304, 218, 317, 304, 218, 316, 658]",60
3,natsechobbyist,Democratic Party,477600.0,['My daughter is at Hebrew school this morning...,[3510],[83],['I despise Bibi. I don’t think it’s a genoci...,"[1568, 2340, 4235]","[595, 540, 683]",77
4,anthonyzenkus,Democratic Party,88800.0,['You cannot say you care about women in Iran ...,"[1471, 1464]","[642, 642]",['They release this the day we find out that I...,"[1739, 1505, 1305, 5419, 1016, 1008]","[190, 619, 488, 2199, 741, 733]",63


In [127]:
df_noName = df[["Subject ID", "Affiliation", "Number of Followers", "Before Corpus", "Before List of Likes", "Before List of Retweets", "After Corpus", "After List of Likes", "After List of Retweets"]]

df_noName.to_csv("Cleaned Data/All_NN_Cleaned.csv", index=False)

CONTRAST CODING

In [128]:
df_noName["Contrast"] = 0.0

affiliation_dict = {
    "Republican Party": -0.5,
    "Democratic Party": 0.5
    # "Other": 0.0
}

df_noName["Contrast"] = df_noName["Affiliation"].map(affiliation_dict)


In [129]:
#Before
df_noName_B = df_noName[["Subject ID", "Affiliation", "Number of Followers", "Before Corpus", "Before List of Likes", "Before List of Retweets", "Contrast"]]
#After
df_noName_A = df_noName[["Subject ID", "Affiliation", "Number of Followers", "After Corpus", "After List of Likes", "After List of Retweets", "Contrast"]]

#Convert strings representation of list into list
df_noName_A.loc[:, "After Corpus"] = df_noName_A['After Corpus'].apply(ast.literal_eval)
df_noName_A.loc[:, "After List of Likes"] = df_noName_A['After List of Likes'].apply(ast.literal_eval)
df_noName_A.loc[:, "After List of Retweets"] = df_noName_A['After List of Retweets'].apply(ast.literal_eval)

df_noName_B.loc[:, "Before Corpus"] = df_noName_B['Before Corpus'].apply(ast.literal_eval)
df_noName_B.loc[:, "Before List of Likes"] = df_noName_B['Before List of Likes'].apply(ast.literal_eval)
df_noName_B.loc[:, "Before List of Retweets"] = df_noName_B['Before List of Retweets'].apply(ast.literal_eval)

#Explode all three columns
df_noName_A = df_noName_A.explode(['After Corpus', 'After List of Likes', 'After List of Retweets']).reset_index(drop=True)
df_noName_B = df_noName_B.explode(['Before Corpus', 'Before List of Likes', 'Before List of Retweets']).reset_index(drop=True)

df_noName_A = remove_duplicate_rows(df_noName_A, "After Corpus")
df_noName_B = remove_duplicate_rows(df_noName_B, "Before Corpus")


In [130]:
# # Generate random IDs for all tweets - uncomment to generate initial Ledger file
# tweets_random_ids = random.sample(range(100, 999), len(df_noName_A) + len(df_noName_B))  # Modified to match DataFrame length

# # Assign random IDs directly to the Subject ID column
# df_noName_A["Tweet ID"] = tweets_random_ids[:len(df_noName_A)]
# df_noName_B["Tweet ID"] = tweets_random_ids[len(df_noName_A):]

# # Select and rename columns to have a common column name "Corpus" 
# df_tweet_before = df_noName_B[["Tweet ID", "Before Corpus"]].rename(columns={"Before Corpus": "Corpus"}) 
# df_tweet_after = df_noName_A[["Tweet ID", "After Corpus"]].rename(columns={"After Corpus": "Corpus"}) 

# # Add a new column to indicate the source of the corpus 
# df_tweet_before["Source"] = "Before" 
# df_tweet_after["Source"] = "After" 

# # Concatenate the DataFrames 
# df_tweet_ledger = pd.concat([df_tweet_before, df_tweet_after], ignore_index=True)
# df_tweet_ledger.head()
# df_tweet_ledger.to_csv('Supplementary Materials/Tweets Ledger.csv', index=False) #Export to csv

In [131]:
# Load in existing Ledger file
tweet_ledger = pd.read_csv("Supplementary Materials/Tweets Ledger.csv") 
tweet_ledger_records = tweet_ledger.to_dict(orient='records')
tweet_ledger_dict = {}

for l in tweet_ledger_records:
    tweet_ledger_dict[l['Corpus']] = l['Tweet ID']

df_noName_A["Tweet ID"] = df_noName_A["After Corpus"].map(tweet_ledger_dict)
df_noName_B["Tweet ID"] = df_noName_B["Before Corpus"].map(tweet_ledger_dict)

# df_noName_B.head()

#Reordering the columns
df_noName_B = df_noName_B[["Subject ID", "Affiliation", "Number of Followers", "Tweet ID", "Contrast", "Before Corpus", "Before List of Likes", "Before List of Retweets"]]
df_noName_A = df_noName_A[["Subject ID", "Affiliation", "Number of Followers", "Tweet ID", "Contrast", "After Corpus", "After List of Likes", "After List of Retweets"]]

#Exporting the DF to csv
df_noName_A.to_csv('Cleaned Data/After_NN_Cleaned.csv', index=False) 
df_noName_B.to_csv('Cleaned Data/Before_NN_Cleaned.csv', index=False) 

In [134]:
print("|STATISTICS|")
print(f"Before Tweets: {len(df_noName_B)}")
print(f"After Tweets: {len(df_noName_A)}")
print(f"Number of Democratic Influencer: {sum(df_noName['Affiliation'] == 'Democratic Party')}")
print(f"Number of Republican Influencer: {sum(df_noName['Affiliation'] == 'Republican Party')}")
print(f"Number of Democrats Tweets After: {sum(df_noName_A['Affiliation'] == 'Democratic Party')}")
print(f"Number of Republican Tweets After: {sum(df_noName_A['Affiliation'] == 'Republican Party')}")
print(f"Number of Democrats Tweets Before: {sum(df_noName_B['Affiliation'] == 'Democratic Party')}")
print(f"Number of Republican Tweets Before: {sum(df_noName_B['Affiliation'] == 'Republican Party')}")

|STATISTICS|
Before Tweets: 64
After Tweets: 154
Number of Democratic Influencer: 13
Number of Republican Influencer: 14
Number of Democrats Tweets After: 67
Number of Republican Tweets After: 87
Number of Democrats Tweets Before: 30
Number of Republican Tweets Before: 34
