### Imports

In [None]:
import json
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

### Process Politician Accounts / Party Membership

In [None]:
# Read the followed-accounts file inside a context manager so the handle is
# closed deterministically instead of being left to the garbage collector.
with open("followed-accounts.json") as accounts_file:
    accs = pd.json_normalize(json.load(accounts_file))

# Keep a CSV copy of the normalized table.
accs.to_csv("accounts.csv", index=False)

# alternatively: accs = pd.read_csv("accounts.csv", converters = {x: pd.eval for x in range(5)})
# Row 0 holds one sequence of screen names per column; concatenate them into a
# single flat array of all politician screen names.
politicians = np.concatenate(accs.values[0])

def extract_party(mention_str):
    """Return the party label for every mention encoded in ``mention_str``.

    ``mention_str`` is evaluated as a Python expression and must yield an
    iterable of mappings that each have a ``"screen_name"`` key. A screen name
    found in the module-level ``acc_dict`` is replaced by its party; any other
    screen name is labelled ``"Neutral"``.

    NOTE(review): ``eval`` executes arbitrary code. Only feed this function
    strings that come from trusted files; consider ``ast.literal_eval`` if the
    inputs are plain literals.
    """
    mentions = eval(mention_str)
    labels = []
    for mention in mentions:
        labels.append(acc_dict.get(mention["screen_name"], "Neutral"))
    return labels

# Reverse lookup from screen name to party: every column label of `accs` is a
# party and row 0 of that column lists its members. If a name is listed under
# several columns, the last column iterated wins.
acc_dict = {
    member: party
    for party in accs
    for member in accs[party].values[0]
}

### Load Data

In [None]:
def load_first_n_chunks(n):
    """Load the first ``n`` JSON files from ``chunks/`` into one DataFrame.

    File names are sorted before slicing because ``os.listdir`` returns entries
    in arbitrary order; without sorting, "the first n chunks" could differ
    between runs and machines.

    Parameters
    ----------
    n : int or None
        Number of chunk files to load (a slice bound, so ``None`` loads all).

    Returns
    -------
    pandas.DataFrame
        The row-wise concatenation of ``pd.json_normalize`` applied to each
        file. Each chunk keeps its own row index, so index labels may repeat.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when no chunk file is selected.
    """
    chunk_names = sorted(os.listdir("chunks"))[0:n]
    frames = []
    for chunk in tqdm(chunk_names):
        # Close every file as soon as it is parsed instead of leaking handles.
        with open(os.path.join("chunks", chunk)) as chunk_file:
            frames.append(pd.json_normalize(json.load(chunk_file)))
    return pd.concat(frames)

# Load ten chunks and keep only one row per (text, user) combination, i.e.
# discard tweets whose content and author are both identical.
raw = load_first_n_chunks(10).drop_duplicates(subset=["text", "user.id"])
# alternatively: raw = pd.read_csv("raw_10.csv")
raw.to_csv("raw_10.csv", index=False)

### Generate Condensed Version

In [None]:
# Keep only the columns needed downstream and flag tweets whose author is one
# of the followed politician accounts.
condensed = raw[["id", "created_at", "user.id", "user.screen_name", "user.followers_count",
     "text", "in_reply_to_status_id", "in_reply_to_user_id",
     "entities.hashtags", "entities.user_mentions",
     "quoted_status_id", "quoted_status.user.id", "retweeted_status.user.id", "retweeted_status.id"]]\
    .assign(is_politician=raw["user.screen_name"].isin(politicians))

# Party of the tweet author, or the string "None" for accounts that are not in
# acc_dict. (A previous intermediate step wrote the number 1 into this column
# for politicians and was immediately overwritten by this assignment; that
# dead write has been removed.)
condensed["author_party"] = condensed["user.screen_name"].apply(
    lambda screen_name: acc_dict.get(screen_name, "None")
)
# NOTE(review): the sentiment section reloads "condensed_10.csv" with
# pd.read_csv, so that file must already exist while this export stays disabled.
#condensed.to_csv("condensed_10.csv", index=False)

### Enrich with sentiment

In [None]:
from germansentiment import SentimentModel

In [None]:
# Instantiate the sentiment classifier once; the chunked prediction loop
# further down calls model.predict_sentiment on it.
model = SentimentModel()

In [None]:
# Reload the condensed table from disk; this rebinds `condensed`, replacing
# any frame of that name built earlier in the session.
condensed = pd.read_csv("condensed_10.csv")

In [None]:
# Predicting all texts in a single call (commented out below) crashes with a memory error; the next cell predicts in chunks instead.
#sentiment = pd.DataFrame({"text": condensed.text.drop_duplicates().head(100), "sentiment": model.predict_sentiment(condensed.text.drop_duplicates()).head(100)})

In [None]:
# Predict sentiment once per distinct text, in batches, to bound memory use.
text = condensed.text.drop_duplicates()
num_chunks = 15
# Ceiling division so that the batches cover every text. Flooring the size and
# running exactly `num_chunks` iterations skipped the last
# len(text) % num_chunks texts (and everything when len(text) < num_chunks).
# max(1, ...) keeps the range step valid for an empty input.
chunk_size = max(1, -(-len(text) // num_chunks))
textcontent = []
sentiment = []
for start in tqdm(range(0, len(text), chunk_size)):
    batch = text.iloc[start:start + chunk_size]
    textcontent += list(batch)
    sentiment += model.predict_sentiment(batch)

In [None]:
# Pair each predicted label with its text and attach the label to every
# condensed row carrying that text (default inner join on "text").
df_sent = condensed.merge(
    pd.DataFrame({"text": textcontent, "sentiment": sentiment}),
    on="text",
)

In [None]:
# Persist the sentiment-enriched table without the row index.
df_sent.to_csv("sentiment_10.csv", index=False)

### Enrich with Engagement

In [None]:
# Per-tweet engagement counters produced by this cell.
engagement_columns = ["num_replies", "num_quotes", "num_retweets"]

# Remove counters left over from an earlier run (or from a reloaded, already
# enriched CSV). Otherwise the merges below would emit suffixed duplicates
# (num_replies_x / num_replies_y) and the fillna would fail with a KeyError.
condensed = condensed.drop(columns=engagement_columns, errors="ignore")

# For each referenced tweet id, count how many rows reply to / quote / retweet it.
replies = condensed.groupby("in_reply_to_status_id")[["id"]].count().rename(columns={"id":"num_replies"})
quotes = condensed.groupby("quoted_status_id")[["id"]].count().rename(columns={"id":"num_quotes"})
retweets = condensed.groupby("retweeted_status.id")[["id"]].count().rename(columns={"id":"num_retweets"})

# Left joins keep every tweet; tweets that are never referenced get NaN, which
# is turned into a count of 0 afterwards.
condensed = condensed\
    .merge(replies, how="left", left_on="id", right_index=True)\
    .merge(quotes, how="left", left_on="id", right_index=True)\
    .merge(retweets, how="left", left_on="id", right_index=True)
condensed[engagement_columns] = condensed[engagement_columns].fillna(0)

### Enrich with Party Reference of Tweet

In [None]:
def get_refs_user(column, frame=None):
    """Look up the party of the user referenced in ``column`` for every row.

    For each row, the value in ``column`` (cast to float) is matched against
    ``frame["user.id"]`` (also cast to float), and the ``author_party`` of the
    first matching row is returned. Rows whose reference is NaN, or whose
    referenced user does not occur in ``frame``, yield ``np.nan``.

    Parameters
    ----------
    column : str
        Name of the column holding the referenced user ids.
    frame : pandas.DataFrame, optional
        Table with ``"user.id"``, ``"author_party"`` and ``column``. Defaults
        to the module-level ``condensed`` frame.

    Returns
    -------
    list
        One entry per row of ``frame``: a party label or ``np.nan``.
    """
    if frame is None:
        frame = condensed

    # Build the user id -> party table once instead of scanning the whole
    # frame for every row. setdefault keeps the first occurrence per user.
    first_party = {}
    for user_id, party in zip(frame["user.id"].astype("float"), frame["author_party"]):
        if user_id == user_id:  # skip NaN ids; they can never match a lookup
            first_party.setdefault(user_id, party)

    # NaN references never equal a stored key, so they fall through to np.nan
    # just like unknown users do.
    return [first_party.get(ref_id, np.nan) for ref_id in frame[column].astype("float")]

In [None]:
# One party-reference column per kind of reference: reply, quote, retweet.
for target, source in (
    ('party_ref', 'in_reply_to_user_id'),
    ('party_ref2', 'quoted_status.user.id'),
    ('party_ref3', 'retweeted_status.user.id'),
):
    condensed[target] = get_refs_user(source)

In [None]:
# Fold the quote (party_ref2) and retweet (party_ref3) references into
# party_ref. Columns are processed in order, so a value in party_ref3
# overrides party_ref2, which in turn overrides the reply-based party_ref.
parties=['Grüne','None','Linke','SPD','CDU/CSU','FDP','AfD']
columns = ['party_ref2','party_ref3']
for col in columns:
    for par in parties:
        condensed.loc[condensed[col] == par, 'party_ref'] = par
# The helper columns live on `condensed`; the previous code tried to drop them
# from an undefined name `df`, so the drop never took effect.
condensed = condensed.drop(columns=columns)

In [None]:
# Write the enriched table (no row index) to the same file that the sentiment
# section reads back with pd.read_csv.
condensed.to_csv("condensed_10.csv", index=False)