In [2]:
import pandas as pd
import numpy as np
import liwc
import re
from matplotlib import pyplot as plt
from nltk.tokenize import TweetTokenizer
from collections import Counter

import warnings
# Silences every warning for the rest of the session: no category, module or
# message filter is given, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides pandas and deprecation warnings that may point
# at real problems - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [64]:
#load data and dictionaries
# df holds the tweets read from the CSV file; the cells below read its
# 'created_at', 'reference_type', 'text' and 'lang' columns.
df = pd.read_csv ('IchBinHanna.csv')
# load_token_parser returns a pair: a parse callable (called with a single token
# further down, yielding the categories that token belongs to) and a second value
# bound to *_categories - presumably the dictionary's category names; confirm
# against the liwc package documentation.
german_parse, german_categories = liwc.load_token_parser('LIWC2007_German.dic')
english_parse, english_categories = liwc.load_token_parser('LIWC2015_English.dic')

In [65]:
#prepare data, drop retweets and split it into german and english tweets
def rem_url(tweet):
    """Strip URLs and non-alphanumeric characters from a tweet.

    Every URL of the form ``scheme://...`` and every character that is not an
    ASCII letter, a digit, a German umlaut (upper or lower case), ``ß``, a
    space or a tab is replaced by a space. Runs of whitespace are then
    collapsed into single spaces.

    Args:
        tweet: Raw tweet text as a ``str``.

    Returns:
        The cleaned text, words separated by single spaces, without leading
        or trailing whitespace.
    """
    # Raw string so the regex escapes (\w, \S, \t) reach `re` unchanged instead
    # of relying on Python passing unknown string escapes through.
    # Upper-case umlauts are part of the allowed set: this function runs before
    # the text is lower-cased, and without them "Über" would be cut to "ber".
    cleaned = re.sub(r"[^0-9A-Za-zÄÖÜäöüß \t]|\w+://\S+", " ", tweet)
    return " ".join(cleaned.split())
#only keep tweets from 01/06/2021 - 30/09/2021 (both boundary days fully included)
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
# The timestamps are zero-padded 'YYYY-MM-DD HH:MM:SS' strings, so comparing them
# as strings orders them chronologically. `>=` keeps a tweet posted exactly at
# midnight on 1 June, matching the inclusive upper bound.
df = df.loc[(df['new_date'] >= '2021-06-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]
# drop retweets; .copy() makes the filtered frame independent of the one it was
# sliced from, so the column assignments below do not write to a slice
df = df.loc[df['reference_type'] != 'retweeted'].copy()
# Make every text a str before cleaning: rem_url passes the value to re.sub,
# which raises TypeError on a missing (NaN) entry. Missing texts become ''.
df['text'] = df['text'].fillna('').astype(str)
# remove literal backslash-n sequences (a backslash followed by the letter n)
df['text'] = df['text'].replace(r'\\n',  ' ', regex=True)
df['text'] = df['text'].apply(rem_url)
#tokenize tweets
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['text'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
# independent per-language frames; later cells add a 'categories' column to each
df_ger = df.loc[df['lang'] == "de"].copy()
df_en = df.loc[df['lang'] == "en"].copy()

In [54]:
def _count_categories(tokens, parse):
    """Tally how often each category is hit by the given tokens.

    ``parse`` is called once per token; every category it yields adds one to
    that category's count in the returned Counter.
    """
    hits = Counter()
    for token in tokens:
        for category in parse(token):
            hits[category] += 1
    return hits

# per tweet: Counter of the categories matched by its tokens, using the German
# parser for df_ger and the English parser for df_en
df_ger['categories'] = df_ger['tokenized'].apply(lambda tokens: _count_categories(tokens, german_parse))
df_en['categories'] = df_en['tokenized'].apply(lambda tokens: _count_categories(tokens, english_parse))

In [50]:
#relative score of each category: its count divided by the number of tokens
def get_relative(cats, tokens):
    """Convert absolute category counts into shares of the token count.

    Args:
        cats: Mapping from category name to its count.
        tokens: The token sequence the counts were taken from; only its
            length is used.

    Returns:
        A new dict with the same keys, each value being
        ``count / len(tokens)`` rounded to four decimal places.
    """
    return {name: round(hits / len(tokens), 4) for name, hits in cats.items()}

In [60]:
# NOTE: this cell reads the 'categories' column and overwrites it, so running it
# a second time would divide the already-relative scores by the token count again.

# step 1: replace each Counter by a plain dict
df_en['categories'] = df_en['categories'].apply(lambda counts: dict(counts))
df_ger['categories'] = df_ger['categories'].apply(lambda counts: dict(counts))

# step 2: turn the counts into shares of the tweet's token count
df_en['categories'] = [
    get_relative(counts, tokens)
    for counts, tokens in zip(df_en['categories'], df_en['tokenized'])
]
df_ger['categories'] = [
    get_relative(counts, tokens)
    for counts, tokens in zip(df_ger['categories'], df_ger['tokenized'])
]