In [86]:
from dataclasses import dataclass, field  # Import field
import csv
import os
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import random
import pandas as pd


In [94]:
@dataclass
class Influencer:
    name: str
    affiliation: str
    no_followers: int = 0
    before_corpus: list = field(default_factory=list) 
    before_likes: list = field(default_factory=list)
    before_retweets: list = field(default_factory=list)
    after_corpus: list = field(default_factory=list)
    after_likes: list = field(default_factory=list)
    after_retweets: list = field(default_factory=list)

data = []
account_list = []

#Loading File paths
supplementary_folder = "Supplementary Materials"
influencers_path = os.path.join(supplementary_folder, "Followers List & Categories - Accounts Kept.csv")

#Population the data file with initial data of the available influencers
with open(influencers_path, newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) #skip header
    for line in reader:
        name = line[0]
        account_list.append(name[1:]) #Creating a list of influencers account names

        affiliation = line[1]
        if affiliation == " Libertarian Party":
            affiliation = affiliation[1:]

        followers = line[2]
        if affiliation or followers:
            data.append(Influencer(name[1:], affiliation, followers))

parsed_before_folder = os.path.join("Parsed Data", "Before")
parsed_after_folder = os.path.join("Parsed Data", "After")

before_files = [f for f in os.listdir(parsed_before_folder) if f.endswith('.csv')] #Getting all before files
after_files = [f for f in os.listdir(parsed_after_folder) if f.endswith('.csv')] #Getting all after files

def getting_values(files: list, path: str, after: bool):
    for f in files:
        with open(f"{path}/{f}", 'r') as csvfile:
            reader = csv.reader(csvfile)
            next(reader) #skip header
            
            for line in reader:
                if not line:  # Skip empty lines
                    continue
                if line[0] == "|RUN STATISTICS|": # End of file, move on to next file
                    break
            
                name = line[0].strip() if line else ""
                tweet = line[2].strip()
                date = line[1]
                like = int(line[3].strip())
                retweet = int(line[4].strip())

                for i in data:
                    if name == i.name:
                        if after and (tweet not in c for c in i.after_corpus):
                                i.after_corpus.append(tweet)
                                i.after_likes.append(like)
                                i.after_retweets.append(retweet)
                        else:
                            if (tweet not in c for c in i.before_corpus):
                                i.before_corpus.append(tweet)
                                i.before_likes.append(like)
                                i.before_retweets.append(retweet)

getting_values(before_files, parsed_before_folder, False)
getting_values(after_files, parsed_after_folder, True)


# Check results for each influencer
for i in data:
    print(f"\nInfluencer: {i.name}")
    print(f"Before corpus size: {len(i.before_corpus)}")
    print(f"Before likes list: {(i.before_likes)}")
    print(f"Before retweets list: {(i.before_retweets)}")
    print(f"After corpus size: {len(i.after_corpus)}")
    print(f"After likes list: {(i.after_likes)}")
    print(f"After retweets list: {(i.after_retweets)}")



Influencer: SabbySabs2
Before corpus size: 2
Before likes list: [934, 934]
Before retweets list: [343, 343]
After corpus size: 26
After likes list: [694, 1275, 201, 638, 2264, 256, 366, 579, 271, 508, 676, 226, 206, 273, 389, 588, 275, 472, 401, 322, 235, 311, 579, 509, 270, 676]
After retweets list: [223, 429, 14, 166, 681, 110, 24, 255, 161, 194, 300, 111, 78, 115, 112, 212, 135, 162, 186, 142, 78, 136, 257, 194, 158, 299]

Influencer: MsLaToshaBrown
Before corpus size: 0
Before likes list: []
Before retweets list: []
After corpus size: 2
After likes list: [290, 3726]
After retweets list: [76, 501]

Influencer: RonFilipkowski
Before corpus size: 8
Before likes list: [1337, 7409, 1012, 25397, 7417, 1338, 2024, 25418]
Before retweets list: [303, 1887, 208, 6635, 1888, 305, 485, 6637]
After corpus size: 24
After likes list: [813, 1520, 2370, 1244, 1440, 2369, 34866, 3056, 1523, 813, 1442, 34702, 3040, 2359, 4799, 1106, 2005, 5266, 1402, 1236, 1049, 954, 780, 5784]
After retweets list: 

In [None]:
with open("Pre-processing.csv", 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Name','Affiliation', 'Number of Followers', "Before Corpus", "After Corpus"])
    
        for i in data:
            if i.before_corpus and i.after_corpus:
                writer.writerow([i.name, i.affiliation, i.no_followers, i.before_corpus, i.after_corpus])

SUBJECT ANONYMIZATION

In [None]:
df = pd.read_csv("Pre-processing.csv")

random_ids = random.sample(range(10, 99), 31)

df["Subject ID"] = random_ids

df.head()


Unnamed: 0,Name,Affiliation,Number of Followers,Before Corpus,After Corpus,Subject ID
0,SabbySabs2,Democratic Party,75700,"Every resident in East Palestine, OH should re...","Israeli host threatened genocide of Gaza, Leba...",69
1,RonFilipkowski,Democratic Party,1000000,Dershowitz said Trump asked him at dinner why ...,The Republican Party wants to give Benjamin Ne...,13
2,mmpadellan,Democratic Party,1300000,Even Fox News knows that the deregulation by t...,While Democrats push for ceasefire and humanit...,81
3,krystalball,Democratic Party,587700,CNN had a literal lobbyist for Norfolk Souther...,"“With Gods help, children in Gaza will die tha...",76
4,SteveSchmidtSES,Democratic Party,1500000,"On The Warning podcast this week, I spoke with...",More than 100 people participated in a flash h...,19


In [None]:
df_nametoID = df[["Name", "Subject ID"]]
df_nametoID.to_csv('Ledger.csv', index=False) 

df_noName = df[["Subject ID", "Affiliation", "Number of Followers", "Before Corpus", "After Corpus"]]
df_noName.to_csv("DF_Cleaned", index=False)

CONTRAST CODING

In [None]:
df_noName["Contrast"] = 0.0

affiliation_dict = {
    "Republican Party": -0.5,
    "Democratic Party": 0.5,
    "Other": 0.0
}

# df_noName.loc[df_noName['Affiliation'] == "Republican Party", 'Contrast'] += -0.5
# df_noName.loc[df_noName['Affiliation'] == "Democratic Party", 'Contrast'] += 0.5
# df_noName.loc[df['Affiliation'] == "Other", 'Contrast'] = 0

df_noName["Contrast"] = df_noName["Affiliation"].map(affiliation_dict)

df_noName.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_noName["Contrast"] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_noName["Contrast"] = df_noName["Affiliation"].map(affiliation_dict)


Unnamed: 0,Subject ID,Affiliation,Number of Followers,Before Corpus,After Corpus,Contrast
0,69,Democratic Party,75700,"Every resident in East Palestine, OH should re...","Israeli host threatened genocide of Gaza, Leba...",0.5
1,13,Democratic Party,1000000,Dershowitz said Trump asked him at dinner why ...,The Republican Party wants to give Benjamin Ne...,0.5
2,81,Democratic Party,1300000,Even Fox News knows that the deregulation by t...,While Democrats push for ceasefire and humanit...,0.5
3,76,Democratic Party,587700,CNN had a literal lobbyist for Norfolk Souther...,"“With Gods help, children in Gaza will die tha...",0.5
4,19,Democratic Party,1500000,"On The Warning podcast this week, I spoke with...",More than 100 people participated in a flash h...,0.5
