In [1]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht

nltk.download('wordnet')
nltk.download("stopwords")
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modelling

For topic modelling the same data set as for the sentiment analysis is used, which means that tweets not directly containing the hashtag, as well as retweets are removed. 

In [2]:
df = pd.read_csv ('data/tweets/IchBinHanna.csv')

In [3]:
#get the correct data
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
df = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]
df = df.loc[df['reference_type'] != 'retweeted']
df = df[df['text'].str.contains("#ichbinhanna", case = False)]
df['processed'] = df['text'].astype(str)
df = df.loc[(df['lang'] == 'de') | (df['lang'] == 'en')]

In [4]:
#clean the data (remove URLs, emojis and line breaks)
def rem_url(tweet):
     return " ".join(re.sub("([^0-9A-Za-zßäöü \t])|(\w+:\/\/\S+)", " ", tweet).split())
df['processed'] = df['processed'].apply(rem_url)
def rem_emojis(text):
    emojis = [x for x in text if x in emoji.UNICODE_EMOJI]
    cleaned = ' '.join([str for str in text.split() if not any(i in str for i in emojis)])
    return cleaned
df['processed'] = df['processed'].apply(lambda x: rem_emojis(x))
df['processed'] = df['text'].replace(r'\\n',  ' ', regex=True)
df['processed'] = df['processed'].astype(str)

In [5]:
#preprocessing (tokenization, stop word removal, stemming)
german_stop = set(stopwords.words('german'))
english_stop = set(stopwords.words('english'))
add_stop_all = ["ichbinhanna","#ichbinhanna", "hanna", "mehr", "innen", "#wisszeitvg", "#ichbinhannah", "@amreibahr", "amreibahr", "@bmf_bund","bmf_bund" "@drkeichhorn", "@bmbf_bund", "mehr", "innen", "schon", "gehen", "jahr","wissenschaft", "wissenschaftler", "kommen","academia", "academic", "year", "machen", "sagen", "sein","geben", "also", "werden", "german", "germany","gut", "haben", "geht", "gibt", "viele", "seit", "wäre"]
german_stop.update(set(add_stop_all))
english_stop.update(set(add_stop_all))
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['processed'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in english_stop]).split() if x['lang'] == 'en' else ' '.join([word for word in x['tokenized'] if word not in german_stop]).split(),axis=1)
#remove all words consisting of 3 characters or less to shorten the dictionary of unique words
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if len(word) > 3])
lemmatizer = WordNetLemmatizer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: [lemmatizer.lemmatize(word).lower() for word in x['tokenized']] if x['lang'] == 'en' else [hannover.analyze(word)[0].lower() for word in x['tokenized']] ,axis=1)

In [6]:
def perform_LDA(tokens):
    #create the dictionary of lemmatized tokens
    dic = Dictionary(tokens)
    print(len(dic))
    #remove low and high frequent terms
    dic.filter_extremes(no_below=2, no_above=.99)
    print(len(dic))
    #create the bag of words 
    corpus = [dic.doc2bow(d) for d in tokens]
    #build LDA model 
    LDA = LdaMulticore(corpus= corpus, num_topics=5, id2word= dic, workers=12, passes=5)
    words = [re.findall(r'"([^"]*)"',t[1]) for t in LDA.print_topics()]
    #create topics
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): 
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # Compute Perplexity
    perplexity = LDA.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity) 
    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA, texts=tokens, 
                                   dictionary=dic, coherence='c_v')
    coherence_lda_model = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)
    

In [7]:
#perform topic modelling by applying LDA on the whole data set, and on the german/english only sets (guided by: https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2)
#full data set
perform_LDA(df['tokenized'])

47228
15469
------ Topic 0 ------
machen arbeit forschung gut @drkeichhorn haben aktuell finden müssen zeigen

------ Topic 1 ------
jahr gut werden befristet stelle @drkeichhorn gerade bekommen brauchen stellen

------ Topic 2 ------
@anjakarliczek #hannaimbundestag immer #ichbinreyhan gut frage klar @gew_bund ganz bleiben

------ Topic 3 ------
jahr problem warum @anjakarliczek hashtag gut wichtig ganz arbeit thema

------ Topic 4 ------
jahr arbeit system wissen stellen befristet vertrag immer forschung gut


Perplexity:  -8.734351923861636

Coherence Score:  0.17035760239900616


In [8]:
#only german tweets
df_ger = df.loc[df['lang'] == "de"]
df_en = df.loc[df['lang'] == "en"]
perform_LDA(df_ger['tokenized'])

39811
12601
------ Topic 0 ------
stellen befristet vertrag gut unbefristet machen wissen system wichtig gerade

------ Topic 1 ------
gut jahr arbeit @anjakarliczek forschung haben problem sehen eigentlich prekär

------ Topic 2 ------
müssen heute gut #hannaimbundestag stunde hashtag letzter aktuell arbeitsbedingung richtig

------ Topic 3 ------
machen system @drkeichhorn gut wissenschaftlich wissen sagen jahr @sebastiankubon @anjakarliczek

------ Topic 4 ------
jahr arbeit brauchen immer werden #ichbinreyhan #hannaimbundestag befristet promotion @anjakarliczek


Perplexity:  -8.47576839732385

Coherence Score:  0.16229792343764796


In [9]:
#only english tweets
perform_LDA(df_en['tokenized'])

9256
3328
------ Topic 0 ------
contract thread need #ichbinreyhan condition system working career problem work

------ Topic 1 ------
year position system contract research time researcher permanent work career

------ Topic 2 ------
thread english system work academic would university contract debate position

------ Topic 3 ------
many research career working condition work important precarious system think

------ Topic 4 ------
like system year scholar people researcher university contract many student


Perplexity:  -7.802982577701386

Coherence Score:  0.19886803045807555


In [10]:
#get topics by month
df_june = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-06-30 23:59:59')]
df_july = df.loc[(df['new_date'] > '2021-07-01 00:00:00') & (df['new_date'] <= '2021-07-31 23:59:59')]
df_august = df.loc[(df['new_date'] > '2021-08-01 00:00:00') & (df['new_date'] <= '2021-08-31 23:59:59')]
df_september = df.loc[(df['new_date'] > '2021-09-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]

In [11]:
#tweets from june
perform_LDA(df_june['tokenized'])

31199
10502
------ Topic 0 ------
gut @anjakarliczek hashtag thread müssen danken warum jahr #hannaimbundestag letzter

------ Topic 1 ------
jahr arbeit #hannaimbundestag befristet promotion wissenschaftlich immer einfach @anjakarliczek prekär

------ Topic 2 ------
system gut #hannaimbundestag aktuell forschung sehen arbeit jahr haben @anjakarliczek

------ Topic 3 ------
jahr @drkeichhorn unbefristet gut forschung #hannaimbundestag @sebastiankubon innovation #95vswisszeitvg ganz

------ Topic 4 ------
vertrag befristet problem @anjakarliczek arbeit wissen forschung jahr lehre gerade


Perplexity:  -8.49678942834381

Coherence Score:  0.21886630566117052


In [12]:
#tweets from july
perform_LDA(df_july['tokenized'])

16767
5780
------ Topic 0 ------
@anjakarliczek ganz #ichbinreyhan @drkeichhorn @tagesthemen kommentar sprechen danken haben heute

------ Topic 1 ------
gut #ichbinreyhan jahr arbeitsbedingung brauchen lehre arbeit @gew_bund gerade perspektive

------ Topic 2 ------
jahr stellen befristet gut gerade arbeit machen immer @anjakarliczek beitrag

------ Topic 3 ------
problem wichtig system sagen gut immer lehre forschung work eigentlich

------ Topic 4 ------
#ichbinreyhan system immer condition working thread sehen arbeit @gew_bund machen


Perplexity:  -8.384196301857767

Coherence Score:  0.32438354833603916


In [13]:
#tweets from august
perform_LDA(df_august['tokenized'])

9918
3233
------ Topic 0 ------
#ichbinreyhan kommen @karolinedoering finden @richterhedwig @klios_spiegel @tinido @achimlandwehr @esteinhauer @christinaholzel

------ Topic 1 ------
gut #wissenschaft immer zeit @drkeichhorn #ichbinreyhan promotion system befristet thread

------ Topic 2 ------
machen #ichbinreyhan system stellen uni heute arbeit wichtig wissen ganz

------ Topic 3 ------
#ichbinreyhan forschung jahr gut arbeit hochschule frage aktuell #dauerstell @anjakarliczek

------ Topic 4 ------
@drkeichhorn arbeit system stellen gut eigentlich @sebastiankubon ganz müssen warum


Perplexity:  -8.069289995163897

Coherence Score:  0.3309997800843364


In [14]:
#tweets from september
perform_LDA(df_september['tokenized'])

9231
3015
------ Topic 0 ------
monat vertrag machen jahr gut #hannainzahlen lehre system @swh_hb @andreasbovensc1

------ Topic 1 ------
arbeit berlin @gew_bund #ichbinreyhan @drkeichhorn neu #dauerstell kommen wichtig gut

------ Topic 2 ------
#ichbinreyhan postdocs dürfen arbeit neu gut system immer bleiben werden

------ Topic 3 ------
stellen #ichbinreyhan hochschule gut neu arbeit unbefristet immer stelle befristet

------ Topic 4 ------
@gew_bund jahr #ichbinreyhan problem gut #dauerstell heute #entfristethanna stellen uni


Perplexity:  -8.037735602361812

Coherence Score:  0.2768987915876285


In [15]:
#Visualization of topic distance for whole data set
full_dic = Dictionary(df['tokenized'])
full_dic.filter_extremes(no_below=2, no_above=.99)
#create the bag of words 
full_corpus = [full_dic.doc2bow(d) for d in df['tokenized']]
#build LDA model for full data 
full_LDA = LdaMulticore(corpus= full_corpus, num_topics=5, id2word= full_dic, workers=12, passes=5)
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(full_LDA, full_corpus, full_dic)