In [1]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from matplotlib import pyplot as plt
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht


nltk.download('wordnet')
nltk.download("stopwords")
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modelling

For topic modelling the same data set as for the sentiment analysis is used, which means that tweets not directly containing the hashtag, as well as retweets are removed. 

In [2]:
df = pd.read_csv ('data/tweets/IchBinHanna.csv')

In [3]:
#get the correct data
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
df = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]
df = df.loc[df['reference_type'] != 'retweeted']
df = df[df['text'].str.contains("#ichbinhanna", case = False)]
df['processed'] = df['text'].astype(str)
df = df.loc[(df['lang'] == 'de') | (df['lang'] == 'en')]

In [4]:
#clean the data (remove URLs, emojis and line breaks)
def rem_url(tweet):
     return " ".join(re.sub("([^0-9A-Za-zßäöü \t])|(\w+:\/\/\S+)", " ", tweet).split())
df['processed'] = df['processed'].apply(rem_url)
def rem_emojis(text):
    emojis = [x for x in text if x in emoji.UNICODE_EMOJI]
    cleaned = ' '.join([str for str in text.split() if not any(i in str for i in emojis)])
    return cleaned
df['processed'] = df['processed'].apply(lambda x: rem_emojis(x))
df['processed'] = df['text'].replace(r'\\n',  ' ', regex=True)
df['processed'] = df['processed'].astype(str)

In [5]:
#preprocessing (tokenization, stop word removal, stemming)
german_stop = set(stopwords.words('german'))
english_stop = set(stopwords.words('english'))
add_stop_all = ["ichbinhanna","#ichbinhanna", "hanna", "mehr", "innen", "#wisszeitvg", "#ichbinhannah", "@amreibahr", "amreibahr", "@bmf_bund","bmf_bund" "@drkeichhorn", "@bmbf_bund", "mehr", "innen", "schon", "gehen", "jahr","wissenschaft", "wissenschaftler", "kommen","academia", "academic", "year", "machen", "sagen", "sein","geben", "also", "werden", "german", "germany","gut", "haben", "geht", "gibt", "viele", "seit", "wäre"]
german_stop.update(set(add_stop_all))
english_stop.update(set(add_stop_all))
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['processed'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in english_stop]).split() if x['lang'] == 'en' else ' '.join([word for word in x['tokenized'] if word not in german_stop]).split(),axis=1)
#remove all words consisting of 3 characters or less to shorten the dictionary of unique words
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if len(word) > 3])
lemmatizer = WordNetLemmatizer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: [lemmatizer.lemmatize(word).lower() for word in x['tokenized']] if x['lang'] == 'en' else [hannover.analyze(word)[0].lower() for word in x['tokenized']] ,axis=1)

In [43]:
def perform_LDA(tokens, topics=5, passes =5, alpha = 'symmetric', decay = 0.5):
    #create the dictionary of lemmatized tokens
    dic = Dictionary(tokens)
    print(len(dic))
    #remove low and high frequent terms
    dic.filter_extremes(no_below=2, no_above=.99)
    print(len(dic))
    #create the bag of words 
    corpus = [dic.doc2bow(d) for d in tokens]
    #build LDA model 
    LDA = LdaMulticore(corpus= corpus, num_topics=topics, id2word= dic, workers=12, passes=passes, alpha = alpha, decay = decay)
    words = [re.findall(r'"([^"]*)"',t[1]) for t in LDA.print_topics()]
    #create topics
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): 
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # Compute Perplexity
    perplexity = LDA.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity) 
    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA, texts=tokens, 
                                   dictionary=dic, coherence='c_v')
    coherence_lda_model = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)
    return LDA

In [7]:
#perform topic modelling by applying LDA on the whole data set, and on the german/english only sets (guided by: https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2)
#full data set
full_model = perform_LDA(df['tokenized'])

47228
15469
------ Topic 0 ------
werden machen problem contract system finden mensch immer scholar ganz

------ Topic 1 ------
jahr gut machen immer arbeitsbedingung forschung werden brauchen prekär finden

------ Topic 2 ------
jahr gut frage befristet #hannaimbundestag wichtig arbeit neu eigentlich zeit

------ Topic 3 ------
jahr arbeit gut befristet problem müssen warum stellen vertrag @anjakarliczek

------ Topic 4 ------
@anjakarliczek system wissen arbeit #ichbinreyhan problem aktuell research @drkeichhorn #hannaimbundestag


Perplexity:  -8.672711893652302

Coherence Score:  0.25304525502446684


In [8]:
#only german tweets
df_ger = df.loc[df['lang'] == "de"]
df_en = df.loc[df['lang'] == "en"]
ger_model = perform_LDA(df_ger['tokenized'])

39811
12601
------ Topic 0 ------
gut jahr machen befristet system arbeit wissen müssen vielen haben

------ Topic 1 ------
wichtig arbeit wissen ganz forschung wissenschaftlich befristet deutschland problem lehre

------ Topic 2 ------
gut @anjakarliczek aktuell @gew_bund eigentlich hochschule arbeit werden stunde jahr

------ Topic 3 ------
jahr vertrag befristet @anjakarliczek heute zeit promotion groß arbeit neu

------ Topic 4 ------
machen stellen jahr immer forschung haben unbefristet @anjakarliczek gerade werden


Perplexity:  -8.477697358283951

Coherence Score:  0.166008948057337


In [9]:
#only english tweets
en_model = perform_LDA(df_en['tokenized'])

9256
3328
------ Topic 0 ------
researcher contract precarious scholar research many condition career #ichbinreyhan discussion

------ Topic 1 ------
system much read need know research student thread people work

------ Topic 2 ------
year system research contract work academic position career postdoc university

------ Topic 3 ------
working condition university system problem time contract like english permanent

------ Topic 4 ------
thread contract story science share position want university people need


Perplexity:  -7.8100907723873645

Coherence Score:  0.22015641867679295


In [10]:
#get topics by month
df_june = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-06-30 23:59:59')]
df_july = df.loc[(df['new_date'] > '2021-07-01 00:00:00') & (df['new_date'] <= '2021-07-31 23:59:59')]
df_august = df.loc[(df['new_date'] > '2021-08-01 00:00:00') & (df['new_date'] <= '2021-08-31 23:59:59')]
df_september = df.loc[(df['new_date'] > '2021-09-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]

In [11]:
#tweets from june
june_model = perform_LDA(df_june['tokenized'])

31199
10502
------ Topic 0 ------
jahr arbeit befristet wissenschaftlich #hannaimbundestag promotion finden problem stellen forschung

------ Topic 1 ------
system video deutsch #bmbf hochschule ganz gerade #hannaimbundestag statt career

------ Topic 2 ------
jahr forschung befristet machen lehre vertrag arbeit haben gut immer

------ Topic 3 ------
jahr wissen @anjakarliczek #hannaimbundestag gut hashtag immer letzter thread arbeit

------ Topic 4 ------
#hannaimbundestag @anjakarliczek gut befristet stelle müssen arbeit problem machen heute


Perplexity:  -8.492904243419366

Coherence Score:  0.2266602332809593


In [12]:
#tweets from july
july_model = perform_LDA(df_july['tokenized'])

16767
5780
------ Topic 0 ------
arbeit jahr müssen gut @drkeichhorn wissenschaftlich mensch sprechen promotion sollen

------ Topic 1 ------
@gew_bund problem story gerade #ichbinreyhan einfach #dauerstell klar werden gut

------ Topic 2 ------
gut @anjakarliczek immer wissen nachwuchs ganz arbeit system wichtig machen

------ Topic 3 ------
#ichbinreyhan @tagesthemen zeit @anjakarliczek forschung machen befristet stellen danken immer

------ Topic 4 ------
gut #ichbinreyhan jahr brauchen system @anjakarliczek arbeitsbedingung thema @gew_bund diskussion


Perplexity:  -8.39668779293529

Coherence Score:  0.26668444384190404


In [13]:
#tweets from august
august_model = perform_LDA(df_august['tokenized'])

9918
3233
------ Topic 0 ------
arbeit @drkeichhorn #ichbinreyhan @anjakarliczek forschung gut aktuell sprechen brauchen wissen

------ Topic 1 ------
#ichbinreyhan gut immer stellen bleiben machen gerade werden perspektive @drkeichhorn

------ Topic 2 ------
arbeit system @drkeichhorn #wissenschaft gut #ichbinreyhan haben year heute @heutejournal

------ Topic 3 ------
#dauerstell @gew_bund #ichbinreyhan befristet wissen ganz problem finden sollen @akellergew

------ Topic 4 ------
#ichbinreyhan machen system gut @anjakarliczek wichtig stellen müssen ganz hochschule


Perplexity:  -8.065780770680187

Coherence Score:  0.28093361137968315


In [14]:
#tweets from september
september_model = perform_LDA(df_september['tokenized'])

9231
3015
------ Topic 0 ------
#ichbinreyhan @gew_bund hochschule jahr #entfristethanna #dauerstell prekär gut #hannaorganisiertsich arbeit

------ Topic 1 ------
@gew_bund arbeit machen vertrag befristet #ichbinreyhan immer müssen hochschule jahr

------ Topic 2 ------
machen heute immer @andreasbovensc1 #ichbinreyhan ganz @swh_hb sehen gut @spdlandbremen

------ Topic 3 ------
#ichbinreyhan gut monat jahr vertrag wissen arbeit #hannainzahlen @drkeichhorn haben

------ Topic 4 ------
neu stellen #ichbinreyhan gut forschung #btw21 system lehre #berlhg jahr


Perplexity:  -8.020997995103459

Coherence Score:  0.29005676156280746


In [69]:
def visualize_topics(tokens):
    #Visualization of topic distance 
    vis_dic = Dictionary(tokens)
    vis_dic.filter_extremes(no_below=2, no_above=.99)
    #create the bag of words 
    vis_corpus = [vis_dic.doc2bow(d) for d in tokens]
    #build LDA model a 
    vis_LDA = LdaMulticore(corpus= vis_corpus, num_topics=5, id2word= vis_dic, workers=12, passes=5, random_state = 1)
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(vis_LDA, vis_corpus, vis_dic)
def visualize_model(model, tokens):
    vis_dic = Dictionary(tokens)
    vis_dic.filter_extremes(no_below=2, no_above=.99)
    vis_corpus = [vis_dic.doc2bow(d) for d in tokens]
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(model, vis_corpus, vis_dic)

In [61]:
visualize_topics(df['tokenized'])

In [40]:
#get optimal number of topics for each (sub)set
def compute_coherence_values_topics(tokens, limit=10, start=2, step=1):
    
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, random_state = 1)
        model_list_topic.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    print(model_list_topic, coherence_values_topic)
    
def compute_coherence_values_passes(tokens,num_topics):
    
    passes = [5,10,15,20]
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_passes = []
    model_list_passes = []
    for num_pass in passes:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = num_pass, random_state = 1)
        model_list_passes.append(num_pass)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_passes.append(coherencemodel.get_coherence())

    print(model_list_passes, coherence_values_passes)
    
def compute_coherence_values_alpha(tokens,num_topics, passes):
    
    alpha = ['symmetric','asymmetric']
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_alpha = []
    model_list_alpha = []
    for a in alpha:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = a, random_state = 1)
        model_list_alpha.append(a)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_alpha.append(coherencemodel.get_coherence())

    print(model_list_alpha, coherence_values_alpha)
    
def compute_coherence_values_decay(tokens,num_topics, passes, alpha):
    
    decay = [0.5,0.7,0.9]
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_decay = []
    model_list_decay = []
    for d in decay:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = alpha, random_state = 1, decay =d)
        model_list_decay.append(d)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_decay.append(coherencemodel.get_coherence())

    print(model_list_decay, coherence_values_decay)

In [28]:
#full data
compute_coherence_values_topics(df['tokenized'])
#english data
compute_coherence_values_topics(df_en['tokenized'])
#german data
compute_coherence_values_topics(df_ger['tokenized'])
#june data
compute_coherence_values_topics(df_june['tokenized'])
#july data
compute_coherence_values_topics(df_july['tokenized'])
#august data
compute_coherence_values_topics(df_august['tokenized'])
#september data
compute_coherence_values_topics(df_september['tokenized'])

[2, 3, 4, 5, 6, 7, 8, 9] [0.12389419281098238, 0.11705462478800728, 0.12330710455576704, 0.15429370641598833, 0.21100178941858486, 0.16638388526021494, 0.15783920306806049, 0.20062631867865002]
[2, 3, 4, 5, 6, 7, 8, 9] [0.1668900092320914, 0.1692360171278822, 0.19683454198902256, 0.1966770684631832, 0.20727713470776118, 0.22494371825845016, 0.23695433706018854, 0.21506778494477674]
[2, 3, 4, 5, 6, 7, 8, 9] [0.09583580231313711, 0.10958211640962456, 0.1379404450229763, 0.11690489842666756, 0.15478297101880575, 0.15800747727594208, 0.1803250250191899, 0.16931867258229377]
[2, 3, 4, 5, 6, 7, 8, 9] [0.12837922840473015, 0.12517048445142975, 0.14824822735800955, 0.16128090424783265, 0.1900565105234718, 0.19104573664257887, 0.2213348855479249, 0.19995845461644024]
[2, 3, 4, 5, 6, 7, 8, 9] [0.19961240061184116, 0.16329878859792116, 0.2012796133778894, 0.19263729552448028, 0.21258579087603127, 0.22960642961105915, 0.2792235085771986, 0.30649709139589126]
[2, 3, 4, 5, 6, 7, 8, 9] [0.25179550503

In [31]:
#full data
compute_coherence_values_passes(df['tokenized'], 6)
#english data
compute_coherence_values_passes(df_en['tokenized'],8)
#german data
compute_coherence_values_passes(df_ger['tokenized'],8)
#june data
compute_coherence_values_passes(df_june['tokenized'],8)
#july data
compute_coherence_values_passes(df_july['tokenized'],9)
#august data
compute_coherence_values_passes(df_august['tokenized'],9)
#september data
compute_coherence_values_passes(df_september['tokenized'],8)

[5, 10, 15, 20] [0.21769261620423222, 0.2690152610411907, 0.3228316499418125, 0.3268002840348955]
[5, 10, 15, 20] [0.24915991509879745, 0.2612628445116889, 0.26380476019690424, 0.271753624443677]
[5, 10, 15, 20] [0.19513052324024732, 0.21424691226268477, 0.23010446959167213, 0.23881264878405628]
[5, 10, 15, 20] [0.24474511422358788, 0.2820813185439542, 0.32473946611873705, 0.3381229427915676]
[5, 10, 15, 20] [0.2554959019382797, 0.24402870026977952, 0.2534215279594163, 0.2621248722547103]
[5, 10, 15, 20] [0.2824516647033783, 0.29574032006188355, 0.29849572880663794, 0.30912229640979666]
[5, 10, 15, 20] [0.32174582869474205, 0.31718306381680816, 0.3142800230028888, 0.32047759338204074]


In [37]:
#full data
compute_coherence_values_alpha(df['tokenized'], 6,20)
#english data
compute_coherence_values_alpha(df_en['tokenized'],8,20)
#german data
compute_coherence_values_alpha(df_ger['tokenized'],8,20)
#june data
compute_coherence_values_alpha(df_june['tokenized'],8,20)
#july data
compute_coherence_values_alpha(df_july['tokenized'],9,20)
#august data
compute_coherence_values_alpha(df_august['tokenized'],9,20)
#september data
compute_coherence_values_alpha(df_september['tokenized'],8,5)

['symmetric', 'asymmetric'] [0.3268002840348955, 0.37571561994425395]
['symmetric', 'asymmetric'] [0.271753624443677, 0.2538859498789313]
['symmetric', 'asymmetric'] [0.23678522027750526, 0.21145266638112842]
['symmetric', 'asymmetric'] [0.3421015846690872, 0.39217997909674135]
['symmetric', 'asymmetric'] [0.2621248722547103, 0.35178133343358603]
['symmetric', 'asymmetric'] [0.30912229640979666, 0.36407715047510764]
['symmetric', 'asymmetric'] [0.32174582869474205, 0.3279922890145045]


In [41]:
#full data
compute_coherence_values_decay(df['tokenized'], 6,20,'asymmetric')
#english data
compute_coherence_values_decay(df_en['tokenized'],8,20,'symmetric')
#german data
compute_coherence_values_decay(df_ger['tokenized'],8,20,'symmetric')
#june data
compute_coherence_values_decay(df_june['tokenized'],8,20,'asymmetric')
#july data
compute_coherence_values_decay(df_july['tokenized'],9,20,'asymmetric')
#august data
compute_coherence_values_decay(df_august['tokenized'],9,20,'asymmetric')
#september data
compute_coherence_values_decay(df_september['tokenized'],8,5,'asymmetric')

[0.5, 0.7, 0.9] [0.3808379631849212, 0.2963046085250385, 0.2671810731763273]
[0.5, 0.7, 0.9] [0.271753624443677, 0.26643195527369967, 0.24938472551264299]
[0.5, 0.7, 0.9] [0.23701694936258688, 0.21988486126282533, 0.1972238567213]
[0.5, 0.7, 0.9] [0.3916118752563964, 0.3606002405298114, 0.29765623633544497]
[0.5, 0.7, 0.9] [0.35178133343358603, 0.31524882276942967, 0.30781519531133594]
[0.5, 0.7, 0.9] [0.36407715047510764, 0.34814232942355183, 0.3292749688157396]
[0.5, 0.7, 0.9] [0.3279922890145045, 0.3288853538609816, 0.33193391268330086]


In [45]:
#get the optimal models
optimal_full = perform_LDA(df['tokenized'],6,20,'asymmetric',0.5)
optimal_en = perform_LDA(df_en['tokenized'],8,20,'symmetric',0.5)
optimal_ger = perform_LDA(df_ger['tokenized'],8,20,'symmetric',0.5)
optimal_june = perform_LDA(df_june['tokenized'],8,20,'asymmetric',0.5)
optimal_july = perform_LDA(df_july['tokenized'],9,20,'asymmetric',0.5)
optimal_august = perform_LDA(df_august['tokenized'],9,20,'asymmetric',0.5)
optimal_september = perform_LDA(df_september['tokenized'],8,5,'asymmetric',0.9)

47228
15469
------ Topic 0 ------
jahr @anjakarliczek befristet gut forschung immer arbeit werden machen haben

------ Topic 1 ------
system contract research work @gew_bund year position career researcher thread

------ Topic 2 ------
stellen unbefristet prekär befristet müssen arbeit gut system heute promotion

------ Topic 3 ------
gut arbeit jahr ganz forschung sehen wissen stellen brauchen müssen

------ Topic 4 ------
#hannaimbundestag @anjakarliczek stattfinden hochschule wählen gut gerade sagen hoffen studierend

------ Topic 5 ------
machen studium eigentlich jahr @anjakarliczek wissen hochschule #ichbinreyhan immer kennen


Perplexity:  -8.486853137726243

Coherence Score:  0.2846346894950325
9256
3328
------ Topic 0 ------
year career tweet research contract english many like working time

------ Topic 1 ------
contract condition visa precarious working research colleague country academic make

------ Topic 2 ------
#ichbinreyhan researcher permanent many problem position pr

In [70]:
visualize_model(optimal_full, df['tokenized'])

In [71]:
visualize_model(optimal_en, df_en['tokenized'])

In [72]:
visualize_model(optimal_ger, df_ger['tokenized'])

In [73]:
visualize_model(optimal_june, df_june['tokenized'])

In [74]:
visualize_model(optimal_july, df_july['tokenized'])

In [75]:
visualize_model(optimal_august, df_august['tokenized'])

In [76]:
visualize_model(optimal_september, df_september['tokenized'])