### Imports

In [1]:
import json
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

### Process Politician Accounts / Party Membership

In [2]:
# Load the followed-accounts JSON and flatten it into a DataFrame.
# The context manager closes the file handle as soon as parsing is done
# (the bare open() call used before never closed it).
with open("followed-accounts.json") as accounts_file:
    accs = pd.json_normalize(json.load(accounts_file))

# Keep a CSV copy of the flattened table.
accs.to_csv("accounts.csv", index=False)

# alternatively: accs = pd.read_csv("accounts.csv", converters = {x: pd.eval for x in range(5)})
# Flatten the first row (one sequence of accounts per column) into a single array.
# NOTE(review): assumes every cell of row 0 is a list of screen names — confirm against the JSON.
politicians = np.concatenate(accs.values[0])

def extract_party(mention_str):
    """Return the party label of every account mentioned in ``mention_str``.

    ``mention_str`` is evaluated as a Python expression and must yield an
    iterable of mappings that each carry a ``"screen_name"`` key. Screen names
    that are not keys of the module-level ``acc_dict`` get the label
    ``"Neutral"``. The result is a list with one label per mention, in order.
    """
    # NOTE(review): eval() runs arbitrary code found in the string. Only pass
    # strings produced by this pipeline; consider ast.literal_eval if the
    # stored format allows it.
    mentions = eval(mention_str)
    parties = []
    for mention in mentions:
        parties.append(acc_dict.get(mention["screen_name"], "Neutral"))
    return parties

# Reverse lookup: account name -> name of the column (party) it is listed under.
# Columns are visited in order, so an account listed under several columns
# ends up with the last one.
acc_dict = {
    member: party
    for party in accs.columns
    for member in accs[party].values[0]
}

### Load Data

In [3]:
def load_first_n_chunks(n, chunk_dir="chunks"):
    """Load the first ``n`` JSON chunk files of a directory into one DataFrame.

    Parameters
    ----------
    n : int
        Number of chunk files to load.
    chunk_dir : str, optional
        Directory that holds the chunk files. Defaults to ``"chunks"``.

    Returns
    -------
    pandas.DataFrame
        The ``pd.json_normalize`` output of every selected file, concatenated
        row-wise with ``pd.concat`` (per-file index labels are kept).

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no file is selected, e.g. ``n == 0`` or
        an empty directory.
    """
    # os.listdir returns entries in arbitrary order; sorting makes
    # "the first n chunks" the same set of files on every run and machine.
    chunk_names = sorted(os.listdir(chunk_dir))[0:n]

    frames = []
    for chunk_name in tqdm(chunk_names):
        # The context manager closes each file right after it is parsed.
        with open(os.path.join(chunk_dir, chunk_name)) as chunk_file:
            frames.append(pd.json_normalize(json.load(chunk_file)))
    return pd.concat(frames)

# Read ten chunks and keep a single row per (text, user.id) pair, i.e. drop
# tweets whose content and author are both identical.
raw = load_first_n_chunks(10).drop_duplicates(subset=["text", "user.id"])
# alternatively: raw = pd.read_csv("raw_10.csv")
# Store the de-duplicated tweets as CSV.
raw.to_csv("raw_10.csv", index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.15it/s]


### Generate Condensed Version

In [4]:
# Columns carried over from the raw tweets into the condensed table.
condensed_columns = [
    "id", "created_at", "user.id", "user.screen_name", "user.followers_count",
    "text", "in_reply_to_status_id",
    "entities.hashtags", "entities.user_mentions",
    "quoted_status_id", "retweeted_status.id", "retweet_count", "favorite_count",
]

condensed = raw[condensed_columns].assign(
    # True when the author's screen name is one of the followed accounts.
    is_politician=lambda df: df["user.screen_name"].isin(politicians),
    # Party of the author; the string "None" marks authors missing from acc_dict.
    # The earlier `.loc[is_politician, 'author_party'] = 1` write was dropped:
    # this assignment replaced the whole column right after it, so it never
    # influenced the result.
    author_party=lambda df: df["user.screen_name"].apply(
        lambda screen_name: acc_dict.get(screen_name, "None")
    ),
)
condensed.to_csv("condensed_10.csv", index=False)

### Enrich with sentiment

In [5]:
from germansentiment import SentimentModel

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Create the germansentiment model once; the prediction cell below reuses it.
model = SentimentModel()

In [None]:
# Reload the condensed table from its CSV export, replacing the in-memory frame.
# NOTE(review): presumably lets this section start from the cached file without
# re-running the cells above — confirm.
condensed = pd.read_csv("condensed_10.csv")

In [None]:
# Predicting all texts in a single call crashes with a memory error, hence the chunked loop in the next cell.
#sentiment = pd.DataFrame({"text": condensed.text.drop_duplicates().head(100), "sentiment": model.predict_sentiment(condensed.text.drop_duplicates()).head(100)})

In [8]:
# Predict the sentiment of every distinct tweet text in batches.
text = condensed.text.drop_duplicates()
num_chunks = 15
# Ceiling division (never below 1): with floor division the last
# len(text) % num_chunks texts were never predicted, and fewer than
# num_chunks texts produced a chunk size of 0, i.e. no predictions at all.
chunk_size = max(1, -(-len(text) // num_chunks))
textcontent = []
sentiment = []
# Step through the texts by position so the final, possibly shorter,
# batch is included as well.
for start in tqdm(range(0, len(text), chunk_size)):
    batch = list(text.iloc[start:start + chunk_size])
    textcontent += batch
    # One label per input text is expected, keeping both lists aligned.
    sentiment += model.predict_sentiment(batch)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [04:10<00:00, 16.71s/it]


In [9]:
# Pair every predicted label with its text, then attach the labels to the
# tweets by joining on the tweet text (merge's default inner join).
df_sent = condensed.merge(
    pd.DataFrame({"text": textcontent, "sentiment": sentiment}),
    on="text",
)

In [None]:
# Store the sentiment-enriched tweets without the index column, like the other
# CSV exports in this notebook. The previous second argument `inde` was an
# undefined name and would have been taken as the separator.
df_sent.to_csv("sentiment_10.csv", index=False)