In [1]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from matplotlib import pyplot as plt
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht


nltk.download('wordnet')
nltk.download("stopwords")
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Topic Modelling

For topic modelling the same data set as for the sentiment analysis is used, which means that tweets not directly containing the hashtag, as well as retweets are removed. 

In [2]:
df = pd.read_csv ('data/tweets/IchBinHanna.csv')

In [3]:
#get the correct data
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
df = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]
df = df.loc[df['reference_type'] != 'retweeted']
df = df[df['text'].str.contains("#ichbinhanna", case = False)]
df['processed'] = df['text'].astype(str)
df = df.loc[(df['lang'] == 'de') | (df['lang'] == 'en')]

In [4]:
#clean the data (remove URLs, emojis and line breaks)
def rem_url(tweet):
     return " ".join(re.sub("([^0-9A-Za-zßäöü \t])|(\w+:\/\/\S+)", " ", tweet).split())
df['processed'] = df['processed'].apply(rem_url)
def rem_emojis(text):
    emojis = [x for x in text if x in emoji.UNICODE_EMOJI]
    cleaned = ' '.join([str for str in text.split() if not any(i in str for i in emojis)])
    return cleaned
df['processed'] = df['processed'].apply(lambda x: rem_emojis(x))
df['processed'] = df['text'].replace(r'\\n',  ' ', regex=True)
df['processed'] = df['processed'].astype(str)

In [5]:
#preprocessing (tokenization, stop word removal, stemming)
german_stop = set(stopwords.words('german'))
english_stop = set(stopwords.words('english'))
add_stop_all = ["ichbinhanna","#ichbinhanna", "hanna", "mehr", "innen", "#wisszeitvg", "#ichbinhannah", "@amreibahr", "amreibahr", "@bmf_bund","bmf_bund" "@drkeichhorn", "@bmbf_bund", "mehr", "innen", "schon", "gehen", "jahr","wissenschaft", "wissenschaftler", "kommen","academia", "academic", "year", "machen", "sagen", "sein","geben", "also", "werden", "german", "germany","gut", "haben", "geht", "gibt", "viele", "seit", "wäre"]
german_stop.update(set(add_stop_all))
english_stop.update(set(add_stop_all))
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['processed'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in english_stop]).split() if x['lang'] == 'en' else ' '.join([word for word in x['tokenized'] if word not in german_stop]).split(),axis=1)
#remove all words consisting of 3 characters or less to shorten the dictionary of unique words
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if len(word) > 3])
lemmatizer = WordNetLemmatizer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: [lemmatizer.lemmatize(word).lower() for word in x['tokenized']] if x['lang'] == 'en' else [hannover.analyze(word)[0].lower() for word in x['tokenized']] ,axis=1)

In [6]:
def perform_LDA(tokens, topics=5, passes =5, alpha = 'symmetric', decay = 0.5):
    #create the dictionary of lemmatized tokens
    dic = Dictionary(tokens)
    print(len(dic))
    #remove low and high frequent terms
    dic.filter_extremes(no_below=2, no_above=.99)
    print(len(dic))
    #create the bag of words 
    corpus = [dic.doc2bow(d) for d in tokens]
    #build LDA model 
    LDA = LdaMulticore(corpus= corpus, num_topics=topics, id2word= dic, workers=12, passes=passes, alpha = alpha, decay = decay)
    words = [re.findall(r'"([^"]*)"',t[1]) for t in LDA.print_topics()]
    #create topics
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): 
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # Compute Perplexity
    perplexity = LDA.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity) 
    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA, texts=tokens, 
                                   dictionary=dic, coherence='c_v')
    coherence_lda_model = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)
    return LDA

In [7]:
#perform topic modelling by applying LDA on the whole data set, and on the german/english only sets (guided by: https://towardsdatascience.com/twitter-topic-modeling-e0e3315b12e2)
#full data set
full_model = perform_LDA(df['tokenized'])

47228
15469
------ Topic 0 ------
gut finden lehre @anjakarliczek machen arbeit ganz arbeitsbedingung wissen forschung

------ Topic 1 ------
jahr befristet forschung arbeit vertrag gut promotion warum bekommen müssen

------ Topic 2 ------
gut @drkeichhorn #ichbinreyhan wichtig stelle wissen thread heute @sebastiankubon werden

------ Topic 3 ------
@anjakarliczek jahr system #hannaimbundestag @gew_bund problem machen aktuell perspektive immer

------ Topic 4 ------
arbeit hochschule gut stellen machen @anjakarliczek ganz immer heute befristet


Perplexity:  -8.701089009012147

Coherence Score:  0.18270245018946196


In [8]:
#only german tweets
df_ger = df.loc[df['lang'] == "de"]
df_en = df.loc[df['lang'] == "en"]
ger_model = perform_LDA(df_ger['tokenized'])

39811
12601
------ Topic 0 ------
#hannaimbundestag warum gut wichtig jahr werden @anjakarliczek hochschule deutsch wissen

------ Topic 1 ------
arbeit problem befristet jahr deutschland wissen stellen @anjakarliczek brauchen vertrag

------ Topic 2 ------
jahr befristet machen gut arbeit @gew_bund sehen immer ganz vertrag

------ Topic 3 ------
gut @anjakarliczek machen arbeitsbedingung @drkeichhorn uni system zeit forschung sprechen

------ Topic 4 ------
jahr arbeit forschung müssen stellen wissenschaftlich finden promotion heute werden


Perplexity:  -8.478811404902615

Coherence Score:  0.1422894176632658


In [9]:
#only english tweets
en_model = perform_LDA(df_en['tokenized'])

9256
3328
------ Topic 0 ------
contract @mahaelhissy many scholar position story work precarity academic issue

------ Topic 1 ------
work need year time scholar contract well system academic people

------ Topic 2 ------
researcher #ichbinreyhan working research condition permanent career precarious thread need

------ Topic 3 ------
thread system year contract many english university like career need

------ Topic 4 ------
system contract research work career student time university people change


Perplexity:  -7.812316900022977

Coherence Score:  0.22324389120296756


In [10]:
#get topics by month
df_june = df.loc[(df['new_date'] > '2021-06-01 00:00:00') & (df['new_date'] <= '2021-06-30 23:59:59')]
df_july = df.loc[(df['new_date'] > '2021-07-01 00:00:00') & (df['new_date'] <= '2021-07-31 23:59:59')]
df_august = df.loc[(df['new_date'] > '2021-08-01 00:00:00') & (df['new_date'] <= '2021-08-31 23:59:59')]
df_september = df.loc[(df['new_date'] > '2021-09-01 00:00:00') & (df['new_date'] <= '2021-09-30 23:59:59')]

In [11]:
#tweets from june
june_model = perform_LDA(df_june['tokenized'])

31199
10502
------ Topic 0 ------
jahr machen arbeit problem sehen forschung deutschland letzter universität hashtag

------ Topic 1 ------
@anjakarliczek #hannaimbundestag arbeit jahr hochschule befristet gut gerade aktuell brauchen

------ Topic 2 ------
gut arbeit jahr wissen vertrag forschung einfach haben befristet sagen

------ Topic 3 ------
jahr #hannaimbundestag stelle gut @anjakarliczek befristet system wissen eigentlich promotion

------ Topic 4 ------
thread system jahr #hannaimbundestag mensch @drkeichhorn neu @sebastiankubon befristet zeit


Perplexity:  -8.48554840120984

Coherence Score:  0.19214638929115802


In [12]:
#tweets from july
july_model = perform_LDA(df_july['tokenized'])

16767
5780
------ Topic 0 ------
deutsch @anjakarliczek hochschule forschung haben #hannabeidergew @drkeichhorn eigentlich university gerade

------ Topic 1 ------
#ichbinreyhan machen gut system @gew_bund wichtig danken problem jahr werden

------ Topic 2 ------
gut arbeit #ichbinreyhan @drkeichhorn stellen dafür jahr immer @anjakarliczek befristet

------ Topic 3 ------
jahr @anjakarliczek sehen stellen brauchen forschung beschäftigt stelle ganz @gew_bund

------ Topic 4 ------
#ichbinreyhan müssen @drkeichhorn befristet arbeit problem #hannabeidergew brauchen wissen @tagesthemen


Perplexity:  -8.403935488894236

Coherence Score:  0.25290280229756024


In [13]:
#tweets from august
august_model = perform_LDA(df_august['tokenized'])

9918
3233
------ Topic 0 ------
stellen #ichbinreyhan finden frage @drkeichhorn problem @karolinedoering bleiben system arbeit

------ Topic 1 ------
#ichbinreyhan arbeit machen heute sehen werden immer hochschule system warum

------ Topic 2 ------
gut @gew_bund zeit @anjakarliczek müssen @drkeichhorn wirklich #dauerstell neu einfach

------ Topic 3 ------
sprechen jahr aktuell denken gut arbeit #ichbinreyhan immer @drkeichhorn folge

------ Topic 4 ------
#ichbinreyhan wissen #waspostdocswoll ganz kommen akademisch gut #dauerstell arbeit system


Perplexity:  -8.061990560067759

Coherence Score:  0.30948090042844856


In [14]:
#tweets from september
september_model = perform_LDA(df_september['tokenized'])

9231
3015
------ Topic 0 ------
gut @gew_bund lehre immer jahr #ichbinreyhan promotion sehen heute forschung

------ Topic 1 ------
arbeit #ichbinreyhan neu stellen befristet vertrag heute jahr #frististfrust gut

------ Topic 2 ------
@gew_bund jahr ganz monat hochschule vertrag #ichbinreyhan aktuell #hannainzahlen #dauerstell

------ Topic 3 ------
gut #ichbinreyhan machen arbeit haben thema wissenschaftlich sprechen wählen heute

------ Topic 4 ------
#ichbinreyhan problem dürfen forschung system uni #btw21 wichtig neu groß


Perplexity:  -8.0318888456595

Coherence Score:  0.3422135791985527


In [15]:
def visualize_topics(tokens):
    #Visualization of topic distance 
    vis_dic = Dictionary(tokens)
    vis_dic.filter_extremes(no_below=2, no_above=.99)
    #create the bag of words 
    vis_corpus = [vis_dic.doc2bow(d) for d in tokens]
    #build LDA model a 
    vis_LDA = LdaMulticore(corpus= vis_corpus, num_topics=5, id2word= vis_dic, workers=12, passes=5, random_state = 1)
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(vis_LDA, vis_corpus, vis_dic)
def visualize_model(model, tokens):
    vis_dic = Dictionary(tokens)
    vis_dic.filter_extremes(no_below=2, no_above=.99)
    vis_corpus = [vis_dic.doc2bow(d) for d in tokens]
    pyLDAvis.enable_notebook()
    return pyLDAvis.gensim_models.prepare(model, vis_corpus, vis_dic)

In [16]:
visualize_topics(df['tokenized'])

In [17]:
#get optimal number of topics for each (sub)set
def compute_coherence_values_topics(tokens, limit=10, start=2, step=1):
    
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, random_state = 1)
        model_list_topic.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    print(model_list_topic, coherence_values_topic)
    
def compute_coherence_values_passes(tokens,num_topics):
    
    passes = [5,10,15,20]
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_passes = []
    model_list_passes = []
    for num_pass in passes:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = num_pass, random_state = 1)
        model_list_passes.append(num_pass)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_passes.append(coherencemodel.get_coherence())

    print(model_list_passes, coherence_values_passes)
    
def compute_coherence_values_alpha(tokens,num_topics, passes):
    
    alpha = ['symmetric','asymmetric']
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_alpha = []
    model_list_alpha = []
    for a in alpha:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = a, random_state = 1)
        model_list_alpha.append(a)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_alpha.append(coherencemodel.get_coherence())

    print(model_list_alpha, coherence_values_alpha)
    
def compute_coherence_values_decay(tokens,num_topics, passes, alpha):
    
    decay = [0.5,0.7,0.9]
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    coherence_values_decay = []
    model_list_decay = []
    for d in decay:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = alpha, random_state = 1, decay =d)
        model_list_decay.append(d)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_decay.append(coherencemodel.get_coherence())

    print(model_list_decay, coherence_values_decay)

In [18]:
#full data
compute_coherence_values_topics(df['tokenized'])
#english data
compute_coherence_values_topics(df_en['tokenized'])
#german data
compute_coherence_values_topics(df_ger['tokenized'])
#june data
compute_coherence_values_topics(df_june['tokenized'])
#july data
compute_coherence_values_topics(df_july['tokenized'])
#august data
compute_coherence_values_topics(df_august['tokenized'])
#september data
compute_coherence_values_topics(df_september['tokenized'])

[2, 3, 4, 5, 6, 7, 8, 9] [0.10456537301003273, 0.10933232716209958, 0.13983522052814684, 0.14401792967681504, 0.16532023615923622, 0.16484462935964542, 0.16624047616664991, 0.18046979098848526]
[2, 3, 4, 5, 6, 7, 8, 9] [0.1713934940401889, 0.1926387289590242, 0.20639512438015184, 0.22768174688364207, 0.2288697700012765, 0.23042403767341899, 0.2391635468428444, 0.2387414925840346]
[2, 3, 4, 5, 6, 7, 8, 9] [0.08968424088169547, 0.11143055442979366, 0.13090652216222032, 0.16296584062709765, 0.1683182159697678, 0.18246422016127548, 0.17465228169757455, 0.18142981907986136]
[2, 3, 4, 5, 6, 7, 8, 9] [0.10305715932532085, 0.12963539272547195, 0.13969566033120212, 0.17104263228258018, 0.1684735785092272, 0.20518778318038336, 0.1944392273439256, 0.19994246539845353]
[2, 3, 4, 5, 6, 7, 8, 9] [0.1596968713141924, 0.18342441323817812, 0.2316848091945012, 0.23033421945001115, 0.21452232034663432, 0.21691375176596392, 0.22270465611598422, 0.20162710324766536]
[2, 3, 4, 5, 6, 7, 8, 9] [0.218210534108

In [19]:
#full data
compute_coherence_values_passes(df['tokenized'], 6)
#english data
compute_coherence_values_passes(df_en['tokenized'],8)
#german data
compute_coherence_values_passes(df_ger['tokenized'],8)
#june data
compute_coherence_values_passes(df_june['tokenized'],8)
#july data
compute_coherence_values_passes(df_july['tokenized'],9)
#august data
compute_coherence_values_passes(df_august['tokenized'],9)
#september data
compute_coherence_values_passes(df_september['tokenized'],8)

[5, 10, 15, 20] [0.2232769219268248, 0.2690152610411907, 0.3228316499418125, 0.3268002840348955]
[5, 10, 15, 20] [0.24915991509879745, 0.2612628445116889, 0.26380476019690424, 0.271753624443677]
[5, 10, 15, 20] [0.1939851769211308, 0.21568031194880405, 0.2322086647919751, 0.23678522027750526]
[5, 10, 15, 20] [0.24474511422358788, 0.2869218881627225, 0.3260379225338824, 0.3397137101961771]
[5, 10, 15, 20] [0.2554959019382797, 0.24402870026977952, 0.2534215279594163, 0.2621248722547103]
[5, 10, 15, 20] [0.2824516647033783, 0.29574032006188355, 0.29849572880663794, 0.30912229640979666]
[5, 10, 15, 20] [0.32174582869474205, 0.31718306381680816, 0.3142800230028888, 0.32047759338204074]


In [20]:
#full data
compute_coherence_values_alpha(df['tokenized'], 6,20)
#english data
compute_coherence_values_alpha(df_en['tokenized'],8,20)
#german data
compute_coherence_values_alpha(df_ger['tokenized'],8,20)
#june data
compute_coherence_values_alpha(df_june['tokenized'],8,20)
#july data
compute_coherence_values_alpha(df_july['tokenized'],9,20)
#august data
compute_coherence_values_alpha(df_august['tokenized'],9,20)
#september data
compute_coherence_values_alpha(df_september['tokenized'],8,5)

['symmetric', 'asymmetric'] [0.3268002840348955, 0.3809749124568735]
['symmetric', 'asymmetric'] [0.271753624443677, 0.2538859498789313]
['symmetric', 'asymmetric'] [0.23701694936258688, 0.20978025739110578]
['symmetric', 'asymmetric'] [0.34050205707609915, 0.38977301977262047]
['symmetric', 'asymmetric'] [0.2621248722547103, 0.35178133343358603]
['symmetric', 'asymmetric'] [0.30912229640979666, 0.36407715047510764]
['symmetric', 'asymmetric'] [0.32174582869474205, 0.3279922890145045]


In [21]:
#full data
compute_coherence_values_decay(df['tokenized'], 6,20,'asymmetric')
#english data
compute_coherence_values_decay(df_en['tokenized'],8,20,'symmetric')
#german data
compute_coherence_values_decay(df_ger['tokenized'],8,20,'symmetric')
#june data
compute_coherence_values_decay(df_june['tokenized'],8,20,'asymmetric')
#july data
compute_coherence_values_decay(df_july['tokenized'],9,20,'asymmetric')
#august data
compute_coherence_values_decay(df_august['tokenized'],9,20,'asymmetric')
#september data
compute_coherence_values_decay(df_september['tokenized'],8,5,'asymmetric')

[0.5, 0.7, 0.9] [0.3809749124568735, 0.2963046085250385, 0.26718107317632733]
[0.5, 0.7, 0.9] [0.271753624443677, 0.26643195527369967, 0.24938472551264299]
[0.5, 0.7, 0.9] [0.2385812567691823, 0.21842055543396516, 0.19684653149208034]
[0.5, 0.7, 0.9] [0.3916118752563964, 0.3606002405298114, 0.29765623633544497]
[0.5, 0.7, 0.9] [0.35178133343358603, 0.31524882276942967, 0.30781519531133594]
[0.5, 0.7, 0.9] [0.36407715047510764, 0.34814232942355183, 0.3292749688157396]
[0.5, 0.7, 0.9] [0.3279922890145045, 0.3288853538609816, 0.33193391268330086]


In [22]:
#get the optimal models
optimal_full = perform_LDA(df['tokenized'],6,20,'asymmetric',0.5)
optimal_en = perform_LDA(df_en['tokenized'],8,20,'symmetric',0.5)
optimal_ger = perform_LDA(df_ger['tokenized'],8,20,'symmetric',0.5)
optimal_june = perform_LDA(df_june['tokenized'],8,20,'asymmetric',0.5)
optimal_july = perform_LDA(df_july['tokenized'],9,20,'asymmetric',0.5)
optimal_august = perform_LDA(df_august['tokenized'],9,20,'asymmetric',0.5)
optimal_september = perform_LDA(df_september['tokenized'],8,5,'asymmetric',0.9)

47228
15469
------ Topic 0 ------
@anjakarliczek arbeit gut forschung #hannaimbundestag wissen machen finden jahr hochschule

------ Topic 1 ------
jahr gut @drkeichhorn immer promotion haben @sebastiankubon befristet stellen stelle

------ Topic 2 ------
system contract research work year career position university researcher many

------ Topic 3 ------
jahr vertrag befristet kind monat @anjakarliczek arbeit #hannaimbundestag werden zeit

------ Topic 4 ------
erster workshop #wissenschaftler bekommen #karrieren schreiben #wissenschaftlerinnen sprechen wissenschaftlich story

------ Topic 5 ------
befristet universität promotion wissenschaftlich problem deutschland machen gut ganz promovieren


Perplexity:  -8.467444119864194

Coherence Score:  0.3548593330998684
9256
3328
------ Topic 0 ------
system science many tweet #ichbinreyhan think contract scientist like time

------ Topic 1 ------
thread system academic researcher permanent research work position university contract

------ 

In [23]:
visualize_model(optimal_full, df['tokenized'])

In [24]:
visualize_model(optimal_en, df_en['tokenized'])

In [25]:
visualize_model(optimal_ger, df_ger['tokenized'])

In [26]:
visualize_model(optimal_june, df_june['tokenized'])

In [27]:
visualize_model(optimal_july, df_july['tokenized'])

In [28]:
visualize_model(optimal_august, df_august['tokenized'])

In [29]:
visualize_model(optimal_september, df_september['tokenized'])