### Imports

In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

### Process Politician Accounts / Party Membership

In [2]:
# Read the followed-accounts JSON into a DataFrame. The context manager makes
# sure the file handle is closed as soon as parsing is done.
with open("followed-accounts.json") as accounts_file:
    accs = pd.json_normalize(json.load(accounts_file))

# Keep a CSV copy so the table can be reloaded without the JSON source.
accs.to_csv("accounts.csv", index=False)

# alternatively: accs = pd.read_csv("accounts.csv", converters = {x: pd.eval for x in range(5)})
# Flatten the cells of the first row into one array of account names
# (assumes each cell holds a list of screen names — TODO confirm against the JSON).
politicians = np.concatenate(accs.values[0])

def extract_party(mention_str):
    """Map every account mentioned in a tweet to its party.

    Parameters
    ----------
    mention_str : str
        Python-literal text of a list of mention dicts, each with a
        ``"screen_name"`` key (presumably the ``entities.user_mentions``
        column after a CSV round trip — confirm against the caller).

    Returns
    -------
    list of str
        One entry per mention, in input order: the value stored in
        ``acc_dict`` for that screen name, or ``"Neutral"`` when the screen
        name is not a key of ``acc_dict``.

    Raises
    ------
    ValueError, SyntaxError
        If ``mention_str`` is not a plain Python literal.
    """
    # SECURITY: this text comes from a data file, so it is parsed strictly as
    # a literal. The earlier eval() call would have executed any expression
    # embedded in the data.
    mentions = ast.literal_eval(mention_str)
    return [acc_dict.get(mention["screen_name"], "Neutral") for mention in mentions]

# Lookup table from account name to the column (party) it is listed under.
# Columns are visited in order, so an account listed under several columns
# ends up with the last one.
acc_dict = {
    screen_name: party_name
    for party_name in accs
    for screen_name in accs[party_name].values[0]
}

### Load Data

In [3]:
def load_first_n_chunks(n):
    """Load the first ``n`` JSON files from ``chunks/`` into one DataFrame.

    Parameters
    ----------
    n : int
        Number of chunk files to read.

    Returns
    -------
    pandas.DataFrame
        The ``pd.json_normalize`` output of every selected file, concatenated
        in file-name order. The original per-chunk index is kept.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no file is selected (``n == 0`` or an
        empty directory).
    """
    # os.listdir returns entries in arbitrary order; sorting makes "first n"
    # select the same files on every run and every machine.
    chunk_names = sorted(os.listdir("chunks"))[:n]

    frames = []
    for chunk_name in tqdm(chunk_names):
        # Close each file right after parsing instead of leaking the handle.
        with open(os.path.join("chunks", chunk_name)) as chunk_file:
            frames.append(pd.json_normalize(json.load(chunk_file)))
    return pd.concat(frames)

# Read ten chunks and keep a single row per (text, user.id) pair, i.e. discard
# tweets whose content and author are both repeated.
raw = load_first_n_chunks(10).drop_duplicates(subset=["text", "user.id"])
# alternatively: raw = pd.read_csv("raw_10.csv")
raw.to_csv("raw_10.csv", index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.14it/s]


### Generate Condensed Version

In [5]:
# Columns of the raw tweet table that are carried into the condensed export.
condensed_columns = [
    "id", "created_at", "user.id", "user.screen_name", "user.followers_count",
    "text", "in_reply_to_status_id",
    "entities.hashtags", "entities.user_mentions",
    "quoted_status_id", "retweeted_status.id", "retweet_count", "favorite_count",
]

# Add two derived columns in a single step:
#   is_politician - True when the author's screen name is in `politicians`
#   author_party  - the author's entry in `acc_dict`, or the string "None"
#                   when the author is not a key of `acc_dict`
condensed = raw[condensed_columns].assign(
    is_politician=raw["user.screen_name"].isin(politicians),
    author_party=raw["user.screen_name"].apply(lambda name: acc_dict.get(name, "None")),
)
condensed.to_csv("condensed_10.csv", index=False)

### Enrich Sentiment CSV for Graph Interpretation

In [6]:
# Load the sentiment table. A "Party" column, when present, is parsed with pd.eval.
# NOTE(review): the parsed values are replaced by the next statement, so this
# converter looks redundant — confirm before removing.
sent = pd.read_csv("sentiment_10.csv", converters={"Party": pd.eval})

# Derive, for every row, the list of parties of the mentioned accounts.
sent["Party"] = sent["entities.user_mentions"].apply(extract_party)

# Write the enriched table back over the input file. This happens before the
# "weight" column is added, so the CSV on disk does not contain it.
sent.to_csv("sentiment_10.csv", index=False)

# Numeric weight for each sentiment label; an unknown label raises KeyError.
sentiment_weights = {"positive": 1, "neutral": 0, "negative": -1}
sent["weight"] = sent["sentiment"].apply(lambda label: sentiment_weights[label])

# One row per (user.id, Party) pair with the summed weight of that user's
# mentions of that party; explode gives each mentioned party its own row.
user_party = (
    sent
    .explode("Party")
    .groupby(["user.id", "Party"], as_index=False)[["weight"]]
    .sum()
)

user_party.to_csv("user_party.csv", index=False)