In [1]:
from dataclasses import dataclass, field  # Import field
import csv
import os
import random
import pandas as pd
import ast
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Step 1: CONCATENATE ALL FILES IN PARSED DATA FOLDER
The output will be the Pre-processing.csv, which will be used to Step 2 below. 

In [2]:
@dataclass
class Influencer:
    name: str
    affiliation: str
    no_followers: int = 0
    before_corpus: list = field(default_factory=list) 
    before_likes: list = field(default_factory=list)
    before_retweets: list = field(default_factory=list)
    after_corpus: list = field(default_factory=list)
    after_likes: list = field(default_factory=list)
    after_retweets: list = field(default_factory=list)

data = []
account_list = []

#Loading File paths
supplementary_folder = "Supplementary Materials"
influencers_path = os.path.join(supplementary_folder, "Followers List & Categories - Accounts Kept.csv")

#Population the data file with initial data of the available influencers
with open(influencers_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #skip header
    for line in reader:
        name = line[0]
        account_list.append(name[1:]) #Creating a list of influencers account names

        affiliation = line[1]
        # if affiliation == " Libertarian Party":
        #     affiliation = affiliation[1:]

        followers = line[2]
        if affiliation or followers:
            data.append(Influencer(name[1:], affiliation, followers))

parsed_before_folder = os.path.join("Parsed Data", "Before")
parsed_after_folder = os.path.join("Parsed Data", "After")

before_files = [f for f in os.listdir(parsed_before_folder) if f.endswith('.csv')] #Getting all before files
after_files = [f for f in os.listdir(parsed_after_folder) if f.endswith('.csv')] #Getting all after files

def getting_values(files: list, path: str, after: bool):
    for f in files:
        with open(f"{path}/{f}", 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader) #skip header
            
            for line in reader:
                if not line:  # Skip empty lines
                    continue
                if line[0] == "|RUN STATISTICS|": # End of file, move on to next file
                    break
            
                name = line[0].strip() if line else ""
                tweet = line[2].strip()
                date = line[1]
                like = int(line[3].strip())
                retweet = int(line[4].strip())

                for i in data:
                    if name == i.name:
                        if after and (tweet not in c for c in i.after_corpus):
                                i.after_corpus.append(tweet)
                                i.after_likes.append(like)
                                i.after_retweets.append(retweet)
                        else:
                            if (tweet not in c for c in i.before_corpus):
                                i.before_corpus.append(tweet)
                                i.before_likes.append(like)
                                i.before_retweets.append(retweet)

getting_values(before_files, parsed_before_folder, False)
getting_values(after_files, parsed_after_folder, True)


# Check results for each influencer
for i in data:
    print(f"\nInfluencer: {i.name}")
    print(f"Before corpus size: {len(i.before_corpus)}")
    # print(f"Before likes list: {(i.before_likes)}")
    # print(f"Before retweets list: {(i.before_retweets)}")
    print(f"After corpus size: {len(i.after_corpus)}")
    # print(f"After likes list: {(i.after_likes)}")
    # print(f"After retweets list: {(i.after_retweets)}")



Influencer: SabbySabs2
Before corpus size: 1
After corpus size: 2

Influencer: MsLaToshaBrown
Before corpus size: 0
After corpus size: 1

Influencer: RonFilipkowski
Before corpus size: 12
After corpus size: 20

Influencer: KyleKulinski
Before corpus size: 0
After corpus size: 12

Influencer: funder
Before corpus size: 0
After corpus size: 4

Influencer: mmpadellan
Before corpus size: 4
After corpus size: 12

Influencer: krystalball
Before corpus size: 5
After corpus size: 8

Influencer: SteveSchmidtSES
Before corpus size: 4
After corpus size: 4

Influencer: robreiner
Before corpus size: 1
After corpus size: 0

Influencer: marceelias
Before corpus size: 0
After corpus size: 0

Influencer: TheRickWilson
Before corpus size: 5
After corpus size: 8

Influencer: davidsirota
Before corpus size: 2
After corpus size: 2

Influencer: TristanSnell
Before corpus size: 6
After corpus size: 16

Influencer: KyleClark
Before corpus size: 1
After corpus size: 0

Influencer: PatrickSvitek
Before corpus 

In [3]:
with open("Supplementary Materials/Pre-processing.csv", 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Name','Affiliation', 'Number of Followers', "Before Corpus", "Before List of Likes", "Before List of Retweets", "After Corpus", "After List of Likes", "After List of Retweets"])
    
        for i in data:
            if i.before_corpus and i.after_corpus:
                writer.writerow([i.name, i.affiliation, i.no_followers, i.before_corpus, i.before_likes, i.before_retweets, i.after_corpus, i.after_likes, i.after_retweets])

STEP 2: SUBJECT ANONYMIZATION

In [119]:
df = pd.read_csv("Supplementary Materials/Pre-processing.csv")

# random_ids = random.sample(range(10, 99), 41) Can be used to create Random IDs

ledger = pd.read_csv("Supplementary Materials/Ledger.csv") # Load in existing Ledger file
ledger_records = ledger.to_dict(orient='records')

ledger_dict = {}

for l in ledger_records:
    ledger_dict[l['Name']] = l['Subject ID']

df["Subject ID"] = df["Name"].map(ledger_dict)

df.head()



Unnamed: 0,Name,Affiliation,Number of Followers,Before Corpus,Before List of Likes,Before List of Retweets,After Corpus,After List of Likes,After List of Retweets,Subject ID
0,SabbySabs2,Democratic Party,75700,"['Every resident in East Palestine, OH should ...",[1322],[488],"['Ilhan Omar: ""If you really wanted a ceasefir...","[1275, 2264]","[429, 681]",97
1,RonFilipkowski,Democratic Party,1000000,['Dershowitz said Trump asked him at dinner wh...,"[1337, 7409, 1012, 25397, 7377, 1331, 1578, 25...","[303, 1887, 208, 6635, 1882, 305, 420, 6617, 1...",['How odd for Trump to Blame America First for...,"[1520, 2370, 1244, 1440, 2369, 34866, 3056, 15...","[363, 703, 229, 476, 702, 18715, 597, 363, 477...",92
2,mmpadellan,Democratic Party,1300000,['Even Fox News knows that the deregulation by...,"[1069, 24343, 1078, 24557]","[314, 16978, 315, 17047]",['While Democrats push for ceasefire and human...,"[3900, 5003, 29181, 3901, 5852, 15884, 5008, 2...","[2146, 1190, 7332, 2149, 612, 2544, 1192, 7337...",52
3,krystalball,Democratic Party,587700,['CNN had a literal lobbyist for Norfolk South...,"[4215, 1991, 14437, 4236, 2000]","[1211, 457, 3108, 1220, 459]","['The targeting of Al Shifa hospital, is also ...","[1753, 10846, 3408, 1704, 8826, 1747, 1266, 2145]","[670, 3168, 591, 520, 2242, 666, 480, 717]",87
4,SteveSchmidtSES,Democratic Party,1500000,['A direct threat against American Jews by a d...,"[1223, 1420, 1420, 1238]","[251, 459, 459, 254]","['""The corruption of one man in Israel has bro...","[4262, 4259, 13223, 4230]","[1588, 1586, 4037, 1580]",62


In [120]:
# Use to create new ledger file
# df_nametoID = df[["Name", "Subject ID"]]
# df_nametoID.to_csv('Supplementary Materials/Ledger.csv', index=False) 

df_noName = df[["Subject ID", "Affiliation", "Number of Followers", "Before Corpus", "Before List of Likes", "Before List of Retweets", "After Corpus", "After List of Likes", "After List of Retweets"]]

df_noName.to_csv("Cleaned Data/All_NN_Cleaned.csv", index=False)

CONTRAST CODING

In [121]:
df_noName["Contrast"] = 0.0

affiliation_dict = {
    "Republican Party": -0.5,
    "Democratic Party": 0.5,
    "Other": 0.0
}

df_noName["Contrast"] = df_noName["Affiliation"].map(affiliation_dict)


In [122]:
df_noName_B = df_noName[["Subject ID", "Affiliation", "Number of Followers", "Before Corpus", "Before List of Likes", "Before List of Retweets"]]

df_noName_A = df_noName[["Subject ID", "Affiliation", "Number of Followers", "After Corpus", "After List of Likes", "After List of Retweets"]]


#Convert strings representation of list into list
df_noName_A.loc[:, "After Corpus"] = df_noName_A['After Corpus'].apply(ast.literal_eval)
df_noName_A.loc[:, "After List of Likes"] = df_noName_A['After List of Likes'].apply(ast.literal_eval)
df_noName_A.loc[:, "After List of Retweets"] = df_noName_A['After List of Retweets'].apply(ast.literal_eval)

df_noName_B.loc[:, "Before Corpus"] = df_noName_B['Before Corpus'].apply(ast.literal_eval)
df_noName_B.loc[:, "Before List of Likes"] = df_noName_B['Before List of Likes'].apply(ast.literal_eval)
df_noName_B.loc[:, "Before List of Retweets"] = df_noName_B['Before List of Retweets'].apply(ast.literal_eval)

#Explode all three columns
df_noName_A = df_noName_A.explode(['After Corpus', 'After List of Likes', 'After List of Retweets']).reset_index(drop=True)

df_noName_B = df_noName_B.explode(['Before Corpus', 'Before List of Likes', 'Before List of Retweets']).reset_index(drop=True)

#Remove any potential duplication in tweets (because of multiple scraping sessions)
def remove_duplicate_rows(df, columns_to_check):
    """
    Removes duplicate rows from a DataFrame, keeping only the latest instance.
    
    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    columns_to_check (list): A list of column names to check for duplicates.
    
    Returns:
    pandas.DataFrame: The DataFrame with duplicate rows removed.
    """
    # Sort the DataFrame by the columns to check, in descending order
    df = df.sort_values(by=columns_to_check, ascending=False)
    
    # Drop duplicate rows, keeping the first occurrence
    df = df.drop_duplicates(subset=columns_to_check, keep='first')
    
    return df

df_noName_A = remove_duplicate_rows(df_noName_A, "After Corpus")

df_noName_B = remove_duplicate_rows(df_noName_B, "Before Corpus")


#Exporting the DF to csv
df_noName_A.to_csv('Cleaned Data/After_NN_Cleaned.csv', index=False) 
df_noName_B.to_csv('Cleaned Data/Before_NN_Cleaned.csv', index=False) 
