In [1]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk

from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht
%matplotlib inline

nltk.download('wordnet')
nltk.download("stopwords")
nltk.download('omw-1.4')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Load the new data additionally containing tweets from October 2021 until April 2022

In [2]:
#remove hashtags and only keep tweets with unique texts (keep oldest tweets)
df = pd.read_csv ('data/tweets/IchBinHanna_updated.csv')
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
print(len(df))
#sort by date to ensure duplicate removal keeps oldest tweet
df = df.sort_values(by='new_date')
df = df.drop_duplicates(subset=['text'], keep='first')
print(len(df))
df = df.loc[df['reference_type'] != 'retweeted']
print(len(df))
#set random seed
seed = 1337

97899
39668
31110


In [3]:
#clean the data (remove URLs, emojis and line breaks)
df['processed'] = df['text'].astype(str)
df['processed'] = df['processed'].replace(r'\\n',  ' ', regex=True)
pat1 = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
pat2 = r'www.[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
split_pattern = re.compile(r'\b('  + r')\b')
def tweet_cleaner(demo):
    soup = BeautifulSoup(demo, 'lxml') # HTML
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    return stripped
df['processed'] = [tweet_cleaner(t) for t in df['processed']]
def rem_emojis(text):
    emojis = [x for x in text if x in emoji.UNICODE_EMOJI]
    cleaned = ' '.join([str for str in text.split() if not any(i in str for i in emojis)])
    return cleaned
df['processed'] = df['processed'].apply(lambda x: rem_emojis(x))
df['processed'] = df['processed'].astype(str)

## Inspect different stop word lists

In [4]:
#preprocessing (tokenization, stop word removal, lemmatizing)
german_stop = set(stopwords.words('german'))
english_stop = set(stopwords.words('english'))
add_stop_all = ["ichbinhanna","#ichbinhanna","#ichbinreyhan", "hanna", "mehr", "innen", "#wisszeitvg", "#ichbinhannah", "@amreibahr", "amreibahr", "@bmf_bund","bmf_bund", "@drkeichhorn", "drkeichhorn", "@sebastiankubon", "sebastiankubon", "@bmbf_bund", "mehr", "innen", "schon", "gehen", "jahr","wissenschaft", "wissenschaftler", "kommen","academia", "academic", "year", "machen", "sagen", "sein","geben", "also", "werden", "german", "germany","gut", "haben", "geht", "gibt", "viele", "seit", "wäre", "sehen", "ganz","bekommen","!!!","???","..."]
german_stop.update(set(add_stop_all))
english_stop.update(set(add_stop_all))
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['processed'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
dic = Dictionary(df['tokenized'])
print('Unique tokens without stopword removal:' ,len(dic))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in english_stop]).split() if x['lang'] == 'en' else ' '.join([word for word in x['tokenized'] if word not in german_stop]).split(),axis=1)
dic_stop = Dictionary(df['tokenized'])
print('Unique tokens with initial stopword removal:' ,len(dic_stop))
#add the german_stopwords list
with open('data/stopwords/german_stopwords.txt', 'r',encoding='utf8') as file:
    german_stopwords=[file.read().replace('\n', ',')]
    german_stopwords=german_stopwords[0].split(",")
print(len(german_stop))
add_german_stop = german_stop.copy()
add_german_stop.update(set(german_stopwords))
print(len(german_stopwords))
print(len(add_german_stop))
df['tokenized_ger'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_german_stop]).split(),axis=1)
dic_ger = Dictionary(df['tokenized_ger'])
print('Unique tokens with initial stopword removal + german_stopwords:' ,len(dic_ger))
#add the snowball stopword list
with open('data/stopwords/snowball.txt', 'r',encoding='utf8') as file:
    snowball_stopwords=[file.read().replace('\n', ',')]
    snowball_stopwords=snowball_stopwords[0].split(",")
add_snowball_stop = german_stop.copy()
add_snowball_stop.update(set(snowball_stopwords))
print(len(snowball_stopwords))
print(len(add_snowball_stop))
df['tokenized_snow'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_snowball_stop]).split(),axis=1)
dic_snow = Dictionary(df['tokenized_snow'])
print('Unique tokens with initial stopword removal + snowball stopwords:' ,len(dic_snow))
#remove both additional stopword lists
add_german_stop.update(set(snowball_stopwords))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_german_stop]).split(),axis=1)
dic = Dictionary(df['tokenized'])
print('Unique tokens with initial stopword removal + both additional lists:' ,len(dic))

Unique tokens without stopword removal: 64038
Unique tokens with initial stopword removal: 63806
273
1853
1879
Unique tokens with initial stopword removal + german_stopwords: 62744
275
414
Unique tokens with initial stopword removal + snowball stopwords: 63728
Unique tokens with initial stopword removal + both additional lists: 62744


In [5]:
set(german_stopwords)-german_stop

{'dorther',
 'irgendwo',
 'äusserstem',
 'zB',
 'solltet',
 'getrennt',
 'gratulierte',
 'jaehrigem',
 'denkbare',
 'ehester',
 'womit',
 'for',
 'kürzlichst',
 'heutiger',
 'letztendlich',
 'lichten',
 'folgend',
 'startet',
 'wichtig',
 'möglich',
 'machte',
 'eröffnetes',
 'mittig',
 'angesetzten',
 'dagegen',
 'allgemeinste',
 'irgendwas',
 'direkt',
 'mithin',
 'sei',
 'überll',
 'worin',
 'unerhoerter',
 'etlichem',
 'beträchtliches',
 'author',
 'sieben',
 'daher',
 'neun',
 'dahingehendes',
 'ploetzlichem',
 'unerhörte',
 'jährige',
 'ähnlichen',
 'häufigem',
 'persoenlich',
 'weiterem',
 'geworden',
 'letztes',
 'nachher',
 'allgemeiner',
 'damals',
 'ebenfalls',
 'deshalb',
 'augenscheinlichsten',
 'jährigem',
 'somit',
 'versorgte',
 'weitere',
 'damaliges',
 'ueber',
 'naturgemäss',
 'richtiggehender',
 'mehrfach',
 'tut',
 'seien',
 'allgemeinem',
 'muessen',
 'gängiger',
 'immerwaehrendem',
 'vorherig',
 'hierbei',
 'bezgl.',
 'irgend',
 'eigentlich',
 'anstatt',
 'nötige

In [6]:
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if len(word) > 2])
lemmatizer = WordNetLemmatizer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
df['lemmatized'] = df[['tokenized','lang']].apply(lambda x: [lemmatizer.lemmatize(word).lower() for word in x['tokenized']] if x['lang'] == 'en' else [hannover.analyze(word)[0].lower() for word in x['tokenized']] ,axis=1)

## Perform previously used LDA approach on full, german, english and per month data

In [7]:
def perform_LDA(tokens, topics=5, passes =5, alpha = 'symmetric', decay = 0.5):
    #create the dictionary of lemmatized tokens
    dic = Dictionary(tokens)
    #print(len(dic))
    #remove low and high frequent terms
    dic.filter_extremes(no_below=2, no_above=.99)
    #print(len(dic))
    #create the bag of words 
    corpus = [dic.doc2bow(d) for d in tokens]
    #build LDA model 
    LDA = LdaMulticore(corpus= corpus, num_topics=topics, id2word= dic, workers=12, passes=passes, alpha = alpha, decay = decay)
    words = [re.findall(r'"([^"]*)"',t[1]) for t in LDA.print_topics()]
    #create topics
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): 
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # Compute Perplexity
    perplexity = LDA.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity) 
    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA, texts=tokens, 
                                   dictionary=dic, coherence='c_v')
    coherence_lda_model = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)
    return LDA

In [8]:
full_model = perform_LDA(df['lemmatized'])

------ Topic 0 ------
#ichbinreyhan system thread research @gew_bund problem year frage arbeit contract

------ Topic 1 ------
#ichbinreyhan uni hochschule lehre forschung #frististfrust jahr beschäftigt zeit richtig

------ Topic 2 ------
stellen wissenschaftlich befristet zeit promotion stelle arbeit woche uni problem

------ Topic 3 ------
stellen befristet jahr wissen kind problem @anjakarliczek #hannaimbundestag #ichbinreyhan arbeitsbedingung

------ Topic 4 ------
#ichbinreyhan jahr #wisssystemfehler uni forschung system arbeit problem thema @gew_bund


Perplexity:  -9.016812370242793

Coherence Score:  0.2001953135888185


In [9]:
#only german tweets
df_ger = df.loc[df['lang'] == "de"]
df_en = df.loc[df['lang'] == "en"]
ger_model = perform_LDA(df_ger['lemmatized'])

------ Topic 0 ------
uni problem zeit arbeitsbedingung prekär deutsch wissenschaftlich thema #ichbinreyhan vertrag

------ Topic 1 ------
jahr forschung buch problem zeit arbeitsbedingung wissenschaftlich lehre #ichbinreyhan promotion

------ Topic 2 ------
#ichbinreyhan prekär @gew_bund #dauerstell deutschland befristet @suhrkamp jahr stellen hochschule

------ Topic 3 ------
#ichbinreyhan jahr uni forschung arbeit stellen system stelle gut wichtig

------ Topic 4 ------
#ichbinreyhan arbeit problem thema uni universität schaffen mensch hochschule prekär


Perplexity:  -8.983359045768525

Coherence Score:  0.1309449156745228


In [10]:
#only english tweets
en_model = perform_LDA(df_en['lemmatized'])

------ Topic 0 ------
work contract year system job research permanent many position phd

------ Topic 1 ------
year career researcher job contract position system know time research

------ Topic 2 ------
phd need contract system working condition change science english career

------ Topic 3 ------
#ichbinreyhan research system get like university want job working think

------ Topic 4 ------
thread people problem one contract @mahaelhissy system work scholar @kinofrau1


Perplexity:  -7.905068907451213

Coherence Score:  0.25655043078366013


In [11]:
#get topics by month
df_october = df.loc[(df['new_date'] > '2021-10-01 00:00:00') & (df['new_date'] <= '2021-10-31 23:59:59')]
df_november = df.loc[(df['new_date'] > '2021-11-01 00:00:00') & (df['new_date'] <= '2021-11-30 23:59:59')]
df_december = df.loc[(df['new_date'] > '2021-12-01 00:00:00') & (df['new_date'] <= '2021-12-31 23:59:59')]
df_january = df.loc[(df['new_date'] > '2022-01-01 00:00:00') & (df['new_date'] <= '2022-01-31 23:59:59')]
df_february = df.loc[(df['new_date'] > '2022-02-01 00:00:00') & (df['new_date'] <= '2022-02-28 23:59:59')]
df_march = df.loc[(df['new_date'] > '2022-03-01 00:00:00') & (df['new_date'] <= '2022-03-31 23:59:59')]
df_april = df.loc[(df['new_date'] > '2022-04-01 00:00:00') & (df['new_date'] <= '2022-04-30 23:59:59')]

In [12]:
def create_dic_corpus(tokens):
    dic = Dictionary(tokens)
    dic.filter_extremes(no_below=2, no_above=.99)
    corpus = [dic.doc2bow(d) for d in tokens]
    return dic, corpus
#get optimal number of topics for each (sub)set
def compute_coherence_values_topics(tokens, limit=10, start=2, step=1):
    dic, corpus = create_dic_corpus(tokens)
    coherence_values_topic = []
    model_list_topic = []
    for num_topics in range(start, limit, step):
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, random_state = 1)
        model_list_topic.append(num_topics)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_topic.append(coherencemodel.get_coherence())

    print(model_list_topic, coherence_values_topic)
    return(model_list_topic[np.argmax(coherence_values_topic)])

    
def compute_coherence_values_passes(tokens,num_topics):
    
    passes = [5,10,15,20]
    dic, corpus = create_dic_corpus(tokens)
    coherence_values_passes = []
    model_list_passes = []
    for num_pass in passes:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = num_pass, random_state = 1)
        model_list_passes.append(num_pass)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_passes.append(coherencemodel.get_coherence())

    print(model_list_passes, coherence_values_passes)
    return(model_list_passes[np.argmax(coherence_values_passes)])
    
def compute_coherence_values_alpha(tokens,num_topics, passes):
    
    alpha = ['symmetric','asymmetric']
    dic, corpus = create_dic_corpus(tokens)
    coherence_values_alpha = []
    model_list_alpha = []
    for a in alpha:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = a, random_state = 1)
        model_list_alpha.append(a)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_alpha.append(coherencemodel.get_coherence())

    print(model_list_alpha, coherence_values_alpha)
    return(model_list_alpha[np.argmax(coherence_values_alpha)])
    
def compute_coherence_values_decay(tokens,num_topics, passes, alpha):
    
    decay = [0.5,0.7,0.9]
    dic, corpus = create_dic_corpus(tokens)
    coherence_values_decay = []
    model_list_decay = []
    for d in decay:
        model = LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dic, passes = passes, alpha = alpha, random_state = 1, decay =d)
        model_list_decay.append(d)
        coherencemodel = CoherenceModel(model=model, texts=tokens, dictionary=dic, coherence='c_v')
        coherence_values_decay.append(coherencemodel.get_coherence())

    print(model_list_decay, coherence_values_decay)
    return(model_list_decay[np.argmax(coherence_values_decay)])

In [13]:
#full data
fullt = compute_coherence_values_topics(df['lemmatized'])
#english data
engt = compute_coherence_values_topics(df_en['lemmatized'])
#german data
gert = compute_coherence_values_topics(df_ger['lemmatized'])
#october data
octt = compute_coherence_values_topics(df_october['lemmatized'])
#november data
novt = compute_coherence_values_topics(df_november['lemmatized'])
#december data
dect = compute_coherence_values_topics(df_december['lemmatized'])
#january data
jant = compute_coherence_values_topics(df_january['lemmatized'])
#february data
febt = compute_coherence_values_topics(df_february['lemmatized'])
#march data
mart = compute_coherence_values_topics(df_march['lemmatized'])
#april data
aprt = compute_coherence_values_topics(df_april['lemmatized'])

[2, 3, 4, 5, 6, 7, 8, 9] [0.20942709036718882, 0.17651272488865366, 0.1655239379031196, 0.16269701716464058, 0.177887186461377, 0.17355040300847424, 0.17759540943306415, 0.17767310316737922]
[2, 3, 4, 5, 6, 7, 8, 9] [0.20150935623536642, 0.21219328479388752, 0.21923247396414478, 0.23322762215007592, 0.2587449125001096, 0.25356037620103095, 0.25229446678093814, 0.2651279236080119]
[2, 3, 4, 5, 6, 7, 8, 9] [0.1538619294560588, 0.1257248811979674, 0.1316015833282439, 0.12465620584424042, 0.12075230258930002, 0.11393765349526275, 0.13004857168308487, 0.12974155717781655]
[2, 3, 4, 5, 6, 7, 8, 9] [0.34813409349800056, 0.33175869755338877, 0.3393999594392588, 0.3618856901959886, 0.3646382056456497, 0.40035160713735074, 0.3742170053754307, 0.3610288628210986]
[2, 3, 4, 5, 6, 7, 8, 9] [0.3527748286823005, 0.37532498520346574, 0.3806727961111532, 0.3832688745153251, 0.38527927412838686, 0.3835998691653932, 0.3949504019654041, 0.38474338748173936]
[2, 3, 4, 5, 6, 7, 8, 9] [0.4897096372254163, 0.

In [14]:
#full data
fullp = compute_coherence_values_passes(df['lemmatized'], fullt)
#english data
engp = compute_coherence_values_passes(df_en['lemmatized'], engt)
#german data
gerp = compute_coherence_values_passes(df_ger['lemmatized'], gert)
#october data
octp = compute_coherence_values_passes(df_october['lemmatized'], octt)
#november data
novp = compute_coherence_values_passes(df_november['lemmatized'], novt)
#december data
decp = compute_coherence_values_passes(df_december['lemmatized'], dect)
#january data
janp = compute_coherence_values_passes(df_january['lemmatized'], jant)
#february data
febp = compute_coherence_values_passes(df_february['lemmatized'], febt)
#march data
marp = compute_coherence_values_passes(df_march['lemmatized'], mart)
#april data
aprp = compute_coherence_values_passes(df_april['lemmatized'], aprt)

[5, 10, 15, 20] [0.24209267698760617, 0.21886209933366757, 0.21446740725109142, 0.24867186465156824]
[5, 10, 15, 20] [0.26665668548613347, 0.2637055885624337, 0.25847943823749653, 0.2625805372072232]
[5, 10, 15, 20] [0.12037027384113425, 0.12100757915656632, 0.1244809149019389, 0.1244809149019389]
[5, 10, 15, 20] [0.43680018377580826, 0.4371549353278415, 0.4216693643727405, 0.4090508381269662]
[5, 10, 15, 20] [0.4102189966438579, 0.41920294048258433, 0.4119814776753317, 0.40524465690989475]
[5, 10, 15, 20] [0.5049158153286192, 0.5082212994658306, 0.5029604552557182, 0.5071165092166703]
[5, 10, 15, 20] [0.47884093153356866, 0.4786545184823492, 0.47081712928189934, 0.4752796477111014]
[5, 10, 15, 20] [0.5459866917983358, 0.5450343921586717, 0.5399708114323836, 0.5404115818073791]
[5, 10, 15, 20] [0.48351872074825897, 0.4791349059072609, 0.4794033133238538, 0.47641226788728597]
[5, 10, 15, 20] [0.5404205658584518, 0.5576526718070036, 0.5575939971765392, 0.553098294509926]


In [15]:
#full data
fulla = compute_coherence_values_alpha(df['lemmatized'], fullt, fullp)
#english data
enga = compute_coherence_values_alpha(df_en['lemmatized'], engt, engp)
#german data
gera = compute_coherence_values_alpha(df_ger['lemmatized'], gert, gerp)
#october data
octa = compute_coherence_values_alpha(df_october['lemmatized'], octt, octp)
#november data
nova = compute_coherence_values_alpha(df_november['lemmatized'], novt, novp)
#december data
deca = compute_coherence_values_alpha(df_december['lemmatized'], dect, decp)
#january data
jana = compute_coherence_values_alpha(df_january['lemmatized'], jant, janp)
#february data
feba = compute_coherence_values_alpha(df_february['lemmatized'], febt, febp)
#march data
mara = compute_coherence_values_alpha(df_march['lemmatized'], mart, marp)
#april data
apra = compute_coherence_values_alpha(df_april['lemmatized'], aprt, aprp)

['symmetric', 'asymmetric'] [0.24867186465156824, 0.4371983661135381]
['symmetric', 'asymmetric'] [0.26665668548613347, 0.2688917729439723]
['symmetric', 'asymmetric'] [0.1244809149019389, 0.13803249796904893]
['symmetric', 'asymmetric'] [0.4371549353278415, 0.4075015145199893]
['symmetric', 'asymmetric'] [0.41920294048258433, 0.396584496213706]
['symmetric', 'asymmetric'] [0.5082212994658306, 0.5182818582139231]
['symmetric', 'asymmetric'] [0.47884093153356866, 0.4747444713513567]
['symmetric', 'asymmetric'] [0.5459866917983358, 0.5213595036325166]
['symmetric', 'asymmetric'] [0.48351872074825897, 0.4364870916584099]
['symmetric', 'asymmetric'] [0.5576526718070036, 0.5548377352048373]


In [16]:
#full data
fulld = compute_coherence_values_decay(df['lemmatized'], fullt, fullp, fulla)
#english data
engd = compute_coherence_values_decay(df_en['lemmatized'], engt, engp, enga)
#german data
gerd = compute_coherence_values_decay(df_ger['lemmatized'], gert, gerp, gera)
#october data
octd = compute_coherence_values_decay(df_october['lemmatized'], octt, octp, octa)
#november data
novd = compute_coherence_values_decay(df_november['lemmatized'], novt, novp, nova)
#december data
decd = compute_coherence_values_decay(df_december['lemmatized'], dect, decp, deca)
#january data
jand = compute_coherence_values_decay(df_january['lemmatized'], jant, janp, jana)
#february data
febd = compute_coherence_values_decay(df_february['lemmatized'], febt, febp, feba)
#march data
mard = compute_coherence_values_decay(df_march['lemmatized'], mart, marp, mara)
#april data
aprd = compute_coherence_values_decay(df_april['lemmatized'], aprt, aprp, apra)

[0.5, 0.7, 0.9] [0.4371983661135381, 0.4425860936909295, 0.4309099557612427]
[0.5, 0.7, 0.9] [0.2688917729439723, 0.26070712240425403, 0.26854282554060455]
[0.5, 0.7, 0.9] [0.15675988826273551, 0.14641196770650064, 0.1478038609933699]
[0.5, 0.7, 0.9] [0.4371549353278415, 0.4267502483912712, 0.4271102330369674]
[0.5, 0.7, 0.9] [0.41920294048258433, 0.40903853881600305, 0.41116676649776485]
[0.5, 0.7, 0.9] [0.5182818582139231, 0.514136916123627, 0.5138354994017582]
[0.5, 0.7, 0.9] [0.47884093153356866, 0.47559833837451243, 0.47988035135457136]
[0.5, 0.7, 0.9] [0.5459866917983358, 0.5459866917983358, 0.5521462850077123]
[0.5, 0.7, 0.9] [0.48351872074825897, 0.48779333521485024, 0.4889898238668444]
[0.5, 0.7, 0.9] [0.5576526718070036, 0.5522268017241497, 0.5513391534273439]


In [17]:
print(fullt,fullp,fulla,fulld)
print(engt,engp,enga,engd)
print(gert,gerp,gera,gerd)
print(octt,octp,octa,octd)
print(novt,novp,nova,novd)
print(dect,decp,deca,decd)
print(jant,janp,jana,jand)
print(febt,febp,feba,febd)
print(mart,marp,mara,mard)
print(aprt,aprp,apra,aprd)
#two topic for the full and the german model do not really make sense here, so they are manually updated to 6 each
fullt = 6
gert = 6

2 20 asymmetric 0.7
9 5 asymmetric 0.5
2 15 asymmetric 0.5
7 10 symmetric 0.5
8 10 symmetric 0.5
7 10 asymmetric 0.5
9 5 symmetric 0.9
7 5 symmetric 0.9
6 5 symmetric 0.9
4 10 symmetric 0.5


In [18]:
full_model = perform_LDA(df['lemmatized'], fullt, fullp, fulla, fulld)

------ Topic 0 ------
#ichbinreyhan uni jahr forschung problem lehre stellen @gew_bund befristet hochschule

------ Topic 1 ------
system #ichbinreyhan contract job work research phd position one working

------ Topic 2 ------
#ichbinreyhan wissen thema prekär arbeit stellen uni diskussion wissenschaftlich #95vswisszeitvg

------ Topic 3 ------
#ichbinreyhan uhr #tvstud @gew_bund #dasgewinnenwir jahr @nga_wiss professur studierend tenure

------ Topic 4 ------
#ichbinreyhan frage #ichbinhannaat #ugnovelle deutschland stellen mensch trend @faznet via

------ Topic 5 ------
jahr arbeit befristet zeit stelle frage stellen vertrag wissen job


Perplexity:  -8.812228789873192

Coherence Score:  0.3353817904326009


In [19]:
eng_model = perform_LDA(df_en['lemmatized'], engt, engp, enga, engd)

------ Topic 0 ------
thread #ichbinreyhan job researcher position career need like many phd

------ Topic 1 ------
contract system #ichbinreyhan year work people phd academic get university

------ Topic 2 ------
research year system contract phd permanent position career one job

------ Topic 3 ------
@mahaelhissy movement condition working white event change @kinofrau1 scholar @akellergew

------ Topic 4 ------
working condition precarious university work good discussion going check better

------ Topic 5 ------
i'm time work research #ichbinreyhan still thanks need mean science

------ Topic 6 ------
work system researcher one need university time see change contract

------ Topic 7 ------
work career research one job working condition contract time #ichbinreyhan

------ Topic 8 ------
contract research science system education debate get people university job


Perplexity:  -8.03977096219536

Coherence Score:  0.2692967402593112


In [20]:
ger_model = perform_LDA(df_ger['lemmatized'], gert, gerp, gera, gerd)

------ Topic 0 ------
#ichbinreyhan uni jahr arbeitsbedingung stelle wissen prekär vertrag hochschule @gew_bund

------ Topic 1 ------
deutschland problem prekär stellen jahr buch forschung befristet wissenschaftlich professur

------ Topic 2 ------
streitschrift #ichbinreyhan richtig jahr wissenschaftlich wichtig uni wissenschaftssystem monat schön

------ Topic 3 ------
frage universität #ichbinreyhan arbeit mensch system forschung publikation stellen leben

------ Topic 4 ------
#ichbinreyhan problem @jenniferhenkehb arbeit uni #frististfrust @nga_wiss #dauerstellenfürdaueraufgaben deutschland befristung

------ Topic 5 ------
#tvstud universität #lauterbachruecktritt uni @gew_bund wissenschaftssystem debatte #ichbinreyhan geld industrie


Perplexity:  -8.945290741361394

Coherence Score:  0.2256829226190252


In [21]:
oct_model = perform_LDA(df_october['lemmatized'], octt, octp, octa, octd)

------ Topic 0 ------
#wisssystemfehler hochschule #dauerstell system sprechen @hrk_aktuell deutschland @gew_bund alt deutsch

------ Topic 1 ------
miller #hannasbegabung anlehnung universität stelle groß hochschule forschung prekär interessant

------ Topic 2 ------
#ichbinreyhan #wisssystemfehler jahr professur system uni lehre wissenschaftssystem deutsch forschung

------ Topic 3 ------
#wisssystemfehler #ichbinreyhan befristet professur problem vertrag arbeit richtig jahr mittelbau

------ Topic 4 ------
arbeit arbeitsbedingung @gew_bund #tvstud zeit uni diskutieren frage wissen #ichbinreyhan

------ Topic 5 ------
#tvstud beschäftigt jahr wissenschaftlich hamburg mitarbeiter uni #dasgewinnenwir #stopthecuts forschung

------ Topic 6 ------
uni system stellen jahr stelle #wisssystemfehler liebe problem richtig #ichbinreyhan


Perplexity:  -8.11839404688689

Coherence Score:  0.40434387403306976


In [22]:
nov_model = perform_LDA(df_november['lemmatized'], novt, novp, nova, novd)

------ Topic 0 ------
arbeitsbedingung @akellergew prekär #ichbinreyhan #koalitionsvertrag @gew_bund postdocs professur akademisch land

------ Topic 1 ------
system #ichbinreyhan jahr reform prekär zeit mittelbau monat postdoc thema

------ Topic 2 ------
#ichbinreyhan #tvstud #frististfrust #wisssystemfehler #wirhabenbedarf jahr beschäftigt @adressel arbeitsbedingung uni

------ Topic 3 ------
@anja_steinbeck #wisssystemfehler diskussion forschung stellen uni system karriere bildung @gew_bund

------ Topic 4 ------
#ichbinreyhan lehre uhr forschung #wisssystemfehler zeigen chance woche system problem

------ Topic 5 ------
@gew_bund #tvstud #dauerstell #dasgewinnenwir #keineausnahme deutschland hochschule #troed21 @ladybitchray1 #wisssystemfehler

------ Topic 6 ------
stellen universität #ichbinreyhan uni via frage #berlhg jahr @faznet berlin

------ Topic 7 ------
uhr #ichbinreyhan #tvstud forschung #dasgewinnenwir hochschule kolleg arbeit problem #warnstreik


Perplexity:  -8.0490

In [23]:
dec_model = perform_LDA(df_december['lemmatized'], dect, decp, deca, decd)

------ Topic 0 ------
#ichbinreyhan @hrk_aktuell jahr @gew_bund #hrkadvent türchen #adventskalender stellen lehre #wisssystemfehler

------ Topic 1 ------
uni forderung double-binds realität folge forschung jahr #ichbinreyhan befristet tag

------ Topic 2 ------
arbeit hochschule schön leute studium thread @_verdi gerne #ichbinreyhan befristet

------ Topic 3 ------
#ichbinreyhan liebe denken wissen research lehre @unibremen kosten arbeit hochschule

------ Topic 4 ------
@humboldtuni @bverfg kaputt #berlhg abschluss mensch @mliebendoerfer verpassen vorgang gehen

------ Topic 5 ------
podiumsdiskussion idee @fellercarsten @christine_blume @jule_specht entwickeln kübler andrea reden situation

------ Topic 6 ------
stellen liebe arbeit nachwuchs uni twitter gute stelle 2021 #ichbinhannaat


Perplexity:  -7.716203525626036

Coherence Score:  0.5277699483215332


In [24]:
jan_model = perform_LDA(df_january['lemmatized'], jant, janp, jana, jand)

------ Topic 0 ------
#ichbinreyhan deutsch lehre system @starkwatzinger forschung problem job studierend wissen

------ Topic 1 ------
sprechen #ichbinreyhan befristet stark system jahr uni problem @jenniferhenkehb krieg

------ Topic 2 ------
#ichbinreyhan @unileipzig @histodigitale @agehrlach @unv_nunftbegabt @mliebendoerfer thema @piczenik1 beitrag #wisssystemfehler

------ Topic 3 ------
#ichbinreyhan #wisssystemfehler akademisch problem thema zeit forschung #wissenschaft ändern warten

------ Topic 4 ------
uni #ichbinreyhan frage wissen zeit befristet system projekt @sainethina hochschule

------ Topic 5 ------
#ichbinreyhan jahr job arbeit stelle system uni vertrag mensch elternzeit

------ Topic 6 ------
stellen #ichbinreyhan problem arbeitsbedingung universität forschung befristet professur phd schuld

------ Topic 7 ------
jahr @starkwatzinger #ichbinreyhan @jenniferhenkehb @mliebendoerfer wissen führen stelle promotion glück

------ Topic 8 ------
jahr uni arbeit zeit #wiss

In [25]:
feb_model = perform_LDA(df_february['lemmatized'], febt, febp, feba, febd)

------ Topic 0 ------
#ichbinreyhan #wisssystemfehler stellen groß wissenschaftlich #hannaorganisiertsich #unigöttingen problem much @starkwatzinger

------ Topic 1 ------
#ichbinreyhan jahr wissen frage #firstgen professur stellen finanziell woche @gew_bund

------ Topic 2 ------
uni forschung #ichbinreyhan lehre gut arbeit jahr denken forschend minen

------ Topic 3 ------
#ichbinreyhan forschung #academicprecarity zeit #oneofusallofus problem system international hochschule akademisch

------ Topic 4 ------
#lenzen dieter #jlugiessen @jlugiessen uni herr arbeit jahr verstehen grund

------ Topic 5 ------
#ichbinreyhan @starkwatzinger befristung wichtig thema job liebe uni hochschule schaffen

------ Topic 6 ------
#wissenschaft #phdlife problem #ichbinreyhan aktuell zeit thema woche system befristet


Perplexity:  -7.883955659696476

Coherence Score:  0.5051161164454822


In [26]:
mar_model = perform_LDA(df_march['lemmatized'], mart, marp, mara, mard)

------ Topic 0 ------
@jenniferhenkehb @starkwatzinger arbeit stellen jahr #ichbinreyhan uni freuen bezahlen system

------ Topic 1 ------
#ichbinreyhan buch system prekär jahr deutschland #wisssystemfehler arbeit liebe forschung

------ Topic 2 ------
prekär #ichbinreyhan uni postdoc befristet frage forschung vertrag arbeit idiotisch

------ Topic 3 ------
buch @suhrkamp #ichbinreyhan vertrag prekär thema stelle arbeit deutschland herzlich

------ Topic 4 ------
forschung #ichbinreyhan #weilwirwissenschaftlieben #thesis_ev #mentalhealth deutschland #berlhg @gew_bund akademisch #dauerstell

------ Topic 5 ------
jahr befristet professur unbefristet promotion karriere stellen uni mensch wissenschaftlich


Perplexity:  -7.7100296859833

Coherence Score:  0.505062252260118


In [27]:
apr_model = perform_LDA(df_april['lemmatized'], aprt, aprp, apra, aprd)

------ Topic 0 ------
liebe @maithi_nk @drlutzboehm buch @michael_gerloff thema prekär @maithinkx deutschland richtig

------ Topic 1 ------
#ichbinreyhan streitschrift stellen forschung leute brotlose arbeit problem deutsch #unigöttingen

------ Topic 2 ------
#ichbinreyhan @maithinkx arbeitsbedingung @maithi_nk #wisssystemfehler jahr thema wissen wichtig problem

------ Topic 3 ------
wissenschaftlich @maithi_nk problem arbeitsbedingung wissenschaftssystem nachwuchs thread prof #lauterbachruecktrittjetzt endlich


Perplexity:  -7.2436653824073245

Coherence Score:  0.5609241104079168


### Descriptive statistic about documents

In [80]:
print("Full model:")
print("Number of documents:" + str(len(df)) +" Minimum word count:" + str(min(df['lemmatized'].map(len))) + " Maximum word count:" + str(max(df['lemmatized'].map(len))) + " Mean word count:" + str(df['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nEnglish model:")
print("Number of documents:" + str(len(df_en)) +" Minimum word count:" + str(min(df_en['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_en['lemmatized'].map(len))) + " Mean word count:" + str(df_en['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_en['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_en['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_en['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nGerman model:")
print("Number of documents:" + str(len(df_ger)) +" Minimum word count:" + str(min(df_ger['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_ger['lemmatized'].map(len))) + " Mean word count:" + str(df_ger['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_ger['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_ger['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_ger['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nOctober model:")
print("Number of documents:" + str(len(df_october)) +" Minimum word count:" + str(min(df_october['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_october['lemmatized'].map(len))) + " Mean word count:" + str(df_october['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_october['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_october['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_october['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nNovember model:")
print("Number of documents:" + str(len(df_november)) +" Minimum word count:" + str(min(df_november['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_november['lemmatized'].map(len))) + " Mean word count:" + str(df_november['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_november['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_november['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_november['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nDecember model:")
print("Number of documents:" + str(len(df_december)) +" Minimum word count:" + str(min(df_december['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_december['lemmatized'].map(len))) + " Mean word count:" + str(df_december['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_december['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_december['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_december['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nJanuary model:")
print("Number of documents:" + str(len(df_january)) +" Minimum word count:" + str(min(df_january['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_january['lemmatized'].map(len))) + " Mean word count:" + str(df_january['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_january['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_january['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_january['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nFebruary model:")
print("Number of documents:" + str(len(df_february)) +" Minimum word count:" + str(min(df_february['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_february['lemmatized'].map(len))) + " Mean word count:" + str(df_february['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_february['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_february['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_february['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nMarch model:")
print("Number of documents:" + str(len(df_march)) +" Minimum word count:" + str(min(df_march['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_march['lemmatized'].map(len))) + " Mean word count:" + str(df_march['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_march['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_march['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_march['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nApril model:")
print("Number of documents:" + str(len(df_april)) +" Minimum word count:" + str(min(df_april['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_april['lemmatized'].map(len))) + " Mean word count:" + str(df_april['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_april['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_april['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_april['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))


Full model:
Number of documents:31110 Minimum word count:0 Maximum word count:47 Mean word count:10.3401800064288
Minimum character count:0 Maximum character count:557 Mean character count:99.59244615879139

English model:
Number of documents:3589 Minimum word count:0 Maximum word count:34 Mean word count:13.5881861242686
Minimum character count:0 Maximum character count:364 Mean character count:109.81303984396767

German model:
Number of documents:24937 Minimum word count:0 Maximum word count:47 Mean word count:10.698119260536552
Minimum character count:0 Maximum character count:557 Mean character count:105.89902554437182

October model:
Number of documents:2144 Minimum word count:0 Maximum word count:28 Mean word count:10.71455223880597
Minimum character count:0 Maximum character count:341 Mean character count:106.37033582089552

November model:
Number of documents:1955 Minimum word count:0 Maximum word count:47 Mean word count:11.011764705882353
Minimum character count:0 Maximum cha

## Pooling tweets

As tweets with 280 are very short documents especially for the approach of LDA. To overcome this problem, the following part explores different pooling strategies. This means that tweets that are somehow related will be concatenated to one document. The assumption here is that all people that tweet under the hashtag talk about very similar topics on the same day. Former analysis showed that conversation is very event-driven (e.g. #HannahImBundestag) and topics change over time, so this assumption is reasonable. For that reason and also to keep the temporal information, tweets are always at least aggregated on days. The first pooling strategy will only pool by day. It is expected that for the monthly models at least this will most likely lead to worse performance, because there will be too few documents.

### Pool tweets by day

In [86]:
#to pool by day the date format is changed to not include daytime
df['date'] =  pd.to_datetime(df['new_date']).dt.strftime('%Y-%m-%d')
df_en['date'] =  pd.to_datetime(df_en['new_date']).dt.strftime('%Y-%m-%d')
df_ger['date'] =  pd.to_datetime(df_ger['new_date']).dt.strftime('%Y-%m-%d')
df_october['date'] =  pd.to_datetime(df_october['new_date']).dt.strftime('%Y-%m-%d')
df_november['date'] =  pd.to_datetime(df_november['new_date']).dt.strftime('%Y-%m-%d')
df_december['date'] =  pd.to_datetime(df_december['new_date']).dt.strftime('%Y-%m-%d')
df_january['date'] =  pd.to_datetime(df_january['new_date']).dt.strftime('%Y-%m-%d')
df_february['date'] =  pd.to_datetime(df_february['new_date']).dt.strftime('%Y-%m-%d')
df_march['date'] =  pd.to_datetime(df_march['new_date']).dt.strftime('%Y-%m-%d')
df_april['date'] =  pd.to_datetime(df_april['new_date']).dt.strftime('%Y-%m-%d')

In [118]:
df_p = df.groupby(['date'], as_index=False)['lemmatized'].sum()
df_en_p = df_en.groupby(['date'], as_index=False)['lemmatized'].sum()
df_ger_p = df_ger.groupby(['date'], as_index=False)['lemmatized'].sum()
df_october_p = df_october.groupby(['date'], as_index=False)['lemmatized'].sum()
df_november_p = df_november.groupby(['date'], as_index=False)['lemmatized'].sum()
df_december_p = df_december.groupby(['date'], as_index=False)['lemmatized'].sum()
df_january_p = df_january.groupby(['date'], as_index=False)['lemmatized'].sum()
df_february_p = df_february.groupby(['date'], as_index=False)['lemmatized'].sum()
df_march_p = df_march.groupby(['date'], as_index=False)['lemmatized'].sum()
df_april_p = df_april.groupby(['date'], as_index=False)['lemmatized'].sum()

### Descriptive statistic about documents pooled by day

In [119]:
print("Full model:")
print("Number of documents:" + str(len(df_p)) +" Minimum word count:" + str(min(df_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_p['lemmatized'].map(len))) + " Mean word count:" + str(df_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nEnglish model:")
print("Number of documents:" + str(len(df_en_p)) +" Minimum word count:" + str(min(df_en_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_en_p['lemmatized'].map(len))) + " Mean word count:" + str(df_en_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_en_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_en_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_en_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nGerman model:")
print("Number of documents:" + str(len(df_ger_p)) +" Minimum word count:" + str(min(df_ger_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_ger_p['lemmatized'].map(len))) + " Mean word count:" + str(df_ger_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_ger_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_ger_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_ger_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nOctober model:")
print("Number of documents:" + str(len(df_october_p)) +" Minimum word count:" + str(min(df_october_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_october_p['lemmatized'].map(len))) + " Mean word count:" + str(df_october_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_october_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_october_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_october_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nNovember model:")
print("Number of documents:" + str(len(df_november_p)) +" Minimum word count:" + str(min(df_november_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_november_p['lemmatized'].map(len))) + " Mean word count:" + str(df_november_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_november_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_november_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_november_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nDecember model:")
print("Number of documents:" + str(len(df_december_p)) +" Minimum word count:" + str(min(df_december_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_december_p['lemmatized'].map(len))) + " Mean word count:" + str(df_december_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_december_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_december_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_december_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nJanuary model:")
print("Number of documents:" + str(len(df_january_p)) +" Minimum word count:" + str(min(df_january_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_january_p['lemmatized'].map(len))) + " Mean word count:" + str(df_january_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_january_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_january_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_january_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nFebruary model:")
print("Number of documents:" + str(len(df_february_p)) +" Minimum word count:" + str(min(df_february_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_february['lemmatized'].map(len))) + " Mean word count:" + str(df_february_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_february_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_february_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_february_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nMarch model:")
print("Number of documents:" + str(len(df_march_p)) +" Minimum word count:" + str(min(df_march_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_march['lemmatized'].map(len))) + " Mean word count:" + str(df_march_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_march_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_march_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_march_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nApril model:")
print("Number of documents:" + str(len(df_april_p)) +" Minimum word count:" + str(min(df_april_p['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_april['lemmatized'].map(len))) + " Mean word count:" + str(df_april_p['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_april_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_april_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_april_p['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

Full model:
Number of documents:308 Minimum word count:37 Maximum word count:21826 Mean word count:1044.4253246753246
Minimum character count:356 Maximum character count:204192 Mean character count:10155.584415584415

English model:
Number of documents:298 Minimum word count:1 Maximum word count:4081 Mean word count:163.6510067114094
Minimum character count:8 Maximum character count:32069 Mean character count:1333.4194630872482

German model:
Number of documents:308 Minimum word count:37 Maximum word count:17926 Mean word count:866.1655844155844
Minimum character count:356 Maximum character count:170394 Mean character count:8653.66883116883

October model:
Number of documents:31 Minimum word count:248 Maximum word count:1667 Mean word count:741.0322580645161
Minimum character count:2483 Maximum character count:16793 Mean character count:7422.967741935484

November model:
Number of documents:30 Minimum word count:135 Maximum word count:2183 Mean word count:717.6
Minimum character count:

In [89]:
#full data
fullt_p = compute_coherence_values_topics(df_p['lemmatized'])
#english data
engt_p = compute_coherence_values_topics(df_en_p['lemmatized'])
#german data
gert_p = compute_coherence_values_topics(df_ger_p['lemmatized'])
#october data
octt_p = compute_coherence_values_topics(df_october_p['lemmatized'])
#november data
novt_p = compute_coherence_values_topics(df_november_p['lemmatized'])
#december data
dect_p = compute_coherence_values_topics(df_december_p['lemmatized'])
#january data
jant_p = compute_coherence_values_topics(df_january_p['lemmatized'])
#february data
febt_p = compute_coherence_values_topics(df_february_p['lemmatized'])
#march data
mart_p = compute_coherence_values_topics(df_march_p['lemmatized'])
#april data
aprt_p = compute_coherence_values_topics(df_april_p['lemmatized'])

#full data
fullp_p = compute_coherence_values_passes(df_p['lemmatized'], fullt_p)
#english data
engp_p = compute_coherence_values_passes(df_en_p['lemmatized'], engt_p)
#german data
gerp_p = compute_coherence_values_passes(df_ger_p['lemmatized'], gert_p)
#october data
octp_p = compute_coherence_values_passes(df_october_p['lemmatized'], octt_p)
#november data
novp_p = compute_coherence_values_passes(df_november_p['lemmatized'], novt_p)
#december data
decp_p = compute_coherence_values_passes(df_december_p['lemmatized'], dect_p)
#january data
janp_p = compute_coherence_values_passes(df_january_p['lemmatized'], jant_p)
#february data
febp_p = compute_coherence_values_passes(df_february_p['lemmatized'], febt_p)
#march data
marp_p = compute_coherence_values_passes(df_march_p['lemmatized'], mart_p)
#april data
aprp_p = compute_coherence_values_passes(df_april_p['lemmatized'], aprt_p)

#full data
fulla_p = compute_coherence_values_alpha(df_p['lemmatized'], fullt_p, fullp_p)
#english data
enga_p = compute_coherence_values_alpha(df_en_p['lemmatized'], engt_p, engp_p)
#german data
gera_p = compute_coherence_values_alpha(df_ger_p['lemmatized'], gert_p, gerp_p)
#october data
octa_p = compute_coherence_values_alpha(df_october_p['lemmatized'], octt_p, octp_p)
#november data
nova_p = compute_coherence_values_alpha(df_november_p['lemmatized'], novt_p, novp_p)
#december data
deca_p = compute_coherence_values_alpha(df_december_p['lemmatized'], dect_p, decp_p)
#january data
jana_p = compute_coherence_values_alpha(df_january_p['lemmatized'], jant_p, janp_p)
#february data
feba_p = compute_coherence_values_alpha(df_february_p['lemmatized'], febt_p, febp_p)
#march data
mara_p = compute_coherence_values_alpha(df_march_p['lemmatized'], mart_p, marp_p)
#april data
apra_p = compute_coherence_values_alpha(df_april_p['lemmatized'], aprt_p, aprp_p)

#full data
fulld_p = compute_coherence_values_decay(df_p['lemmatized'], fullt_p, fullp_p, fulla_p)
#english data
engd_p = compute_coherence_values_decay(df_en_p['lemmatized'], engt_p, engp_p, enga_p)
#german data
gerd_p = compute_coherence_values_decay(df_ger_p['lemmatized'], gert_p, gerp_p, gera_p)
#october data
octd_p = compute_coherence_values_decay(df_october_p['lemmatized'], octt_p, octp_p, octa_p)
#november data
novd_p = compute_coherence_values_decay(df_november_p['lemmatized'], novt_p, novp_p, nova_p)
#december data
decd_p = compute_coherence_values_decay(df_december_p['lemmatized'], dect_p, decp_p, deca_p)
#january data
jand_p = compute_coherence_values_decay(df_january_p['lemmatized'], jant_p, janp_p, jana_p)
#february data
febd_p = compute_coherence_values_decay(df_february_p['lemmatized'], febt_p, febp_p, feba_p)
#march data
mard_p = compute_coherence_values_decay(df_march_p['lemmatized'], mart_p, marp_p, mara_p)
#april data
aprd_p = compute_coherence_values_decay(df_april_p['lemmatized'], aprt_p, aprp_p, apra_p)

[2, 3, 4, 5, 6, 7, 8, 9] [0.26460865667191613, 0.2635516829916517, 0.2630809197235411, 0.2652905169717595, 0.26710975636057843, 0.26533238180236685, 0.2651216603586509, 0.26525735016699037]
[2, 3, 4, 5, 6, 7, 8, 9] [0.3005328299923204, 0.3092932345075032, 0.30012160890821654, 0.2967215288476258, 0.2955122385517411, 0.3011540265377396, 0.2959008585099691, 0.29855344572735143]
[2, 3, 4, 5, 6, 7, 8, 9] [0.2398956848411093, 0.23831281432044324, 0.23977368050371461, 0.23807199219061054, 0.2370149770019786, 0.23744632867756152, 0.23905617096407816, 0.23766316285795813]
[2, 3, 4, 5, 6, 7, 8, 9] [0.2182550014210474, 0.2248031600347641, 0.2274131532445296, 0.22762027236435797, 0.22781382703226363, 0.22584349623214345, 0.22718007721447314, 0.22874017432679888]
[2, 3, 4, 5, 6, 7, 8, 9] [0.2114277965476843, 0.21726763762240822, 0.22206505311354569, 0.21738136680463233, 0.21424393885491047, 0.20860576710910725, 0.21637194421354172, 0.207346226980305]
[2, 3, 4, 5, 6, 7, 8, 9] [0.2560250640824412, 0.

In [122]:
print(fullt_p,fullp_p,fulla_p,fulld_p)
print(engt_p,engp_p,enga_p,engd_p)
print(gert_p,gerp_p,gera_p,gerd_p)
print(octt_p,octp_p,octa_p,octd_p)
print(novt_p,novp_p,nova_p,novd_p)
print(dect_p,decp_p,deca_p,decd_p)
print(jant_p,janp_p,jana_p,jand_p)
print(febt_p,febp_p,feba_p,febd_p)
print(mart_p,marp_p,mara_p,mard_p)
print(aprt_p,aprp_p,apra_p,aprd_p)
#manually increase number of topics
engt_p = 6
gert_p = 6
febt_p = 4

6 15 symmetric 0.5
6 5 asymmetric 0.5
6 20 symmetric 0.5
9 20 asymmetric 0.5
4 20 symmetric 0.5
8 20 symmetric 0.5
4 15 symmetric 0.5
4 10 symmetric 0.7
8 20 symmetric 0.5
7 20 asymmetric 0.9


In [94]:
full_model_p = perform_LDA(df_p['lemmatized'], fullt, fullp, fulla, fulld)

------ Topic 0 ------
#ichbinreyhan jahr uni stellen zeit prekär @gew_bund system arbeit problem

------ Topic 1 ------
#ichbinreyhan uni jahr stellen system @gew_bund problem arbeit forschung #wisssystemfehler

------ Topic 2 ------
#ichbinreyhan uni jahr vertrag @gew_bund stellen arbeit forschung hochschule system

------ Topic 3 ------
#ichbinreyhan system jahr forschung problem karriere uni zeit stelle wissen

------ Topic 4 ------
#hannaimbundestag @anjakarliczek jahr hochschule bundestag uni aktuell problem forschung arbeitsbedingung

------ Topic 5 ------
jahr uni befristet system forschung @anjakarliczek problem arbeit stellen vertrag


Perplexity:  -8.733679674173711

Coherence Score:  0.2946533306225232


In [95]:
eng_model_p = perform_LDA(df_en_p['lemmatized'], engt, engp, enga, engd)

------ Topic 0 ------
system #ichbinreyhan time work one would year working university @diballestero

------ Topic 1 ------
career contract #ichbinreyhan working condition phd one system student research

------ Topic 2 ------
#ichbinreyhan thread contract work scholar research one time teaching working

------ Topic 3 ------
thread contract system #ichbinreyhan condition #hannabeidergew permanent working one academic

------ Topic 4 ------
system contract job research work phd many researcher year career

------ Topic 5 ------
job solidarity contract work read #oneofusallofus year law conference day

------ Topic 6 ------
something year working phd postdoc even one time many permanent

------ Topic 7 ------
#ichbinreyhan want international #academicprecarity working phd great problem scientific weekend

------ Topic 8 ------
#ichbinreyhan contract research system year position postdoc job thread career


Perplexity:  -7.817932976072848

Coherence Score:  0.2668483561514083


In [96]:
ger_model_p = perform_LDA(df_ger_p['lemmatized'], gert, gerp, gera, gerd)

------ Topic 0 ------
jahr @anjakarliczek uni #hannaimbundestag befristet forschung stellen arbeit promotion system

------ Topic 1 ------
#ichbinreyhan jahr vertrag monat uni befristet stellen #hannainzahlen stelle zeit

------ Topic 2 ------
#ichbinreyhan #tvstud arbeitsbedingung #frististfrust jahr prekär #wisssystemfehler uni arbeit hochschule

------ Topic 3 ------
#ichbinreyhan #waspostdocswoll @karolinedoering @christinaholzel @esteinhauer @klios_spiegel @tinido wissen @richterhedwig @c_kenneweg

------ Topic 4 ------
#ichbinreyhan uni jahr stellen hochschule arbeit forschung @gew_bund problem zeit

------ Topic 5 ------
#ichbinreyhan jahr @gew_bund uni forschung stellen befristet arbeit thema prekär


Perplexity:  -8.503111455517988

Coherence Score:  0.28803291130495884


In [97]:
oct_model_p = perform_LDA(df_october_p['lemmatized'], octt, octp, octa, octd)

------ Topic 0 ------
#ichbinreyhan #wisssystemfehler stellen problem jahr wissenschaftssystem system zeit forschung @humboldtuni

------ Topic 1 ------
#ichbinreyhan jahr professur stellen #wisssystemfehler uni befristet system befristung wissenschaftlich

------ Topic 2 ------
#wisssystemfehler #ichbinreyhan #tvstud uni hochschule arbeit jahr beschäftigt forschung lehre

------ Topic 3 ------
#ichbinreyhan jahr #wisssystemfehler arbeit uni hochschule @gew_bund studierend forschung arbeitsbedingung

------ Topic 4 ------
#ichbinreyhan @gew_bund uni perspektive schaffen beschäftigt #tvstud #dauerstell studierend jahr

------ Topic 5 ------
#ichbinreyhan #95vswisszeitvg #wisssystemfehler arbeitsbedingung hochschule zeit problem system lehre befristet

------ Topic 6 ------
problem #dauerstell zeigen berlin #wisssystemfehler system befristung #ichbinreyhan uni frage


Perplexity:  -7.791309302311857

Coherence Score:  0.25592225335715807


In [98]:
nov_model_p = perform_LDA(df_november_p['lemmatized'], novt, novp, nova, novd)

------ Topic 0 ------
@gew_bund #dauerstell jahr #wissmobb uni #wisssystemfehler #aktionskonferenz #werdarfhannasein frage system

------ Topic 1 ------
zeit @jule_specht uni #dauerstell stelle befristet #wisssystemfehler problem jahr tag

------ Topic 2 ------
#wirhabenbedarf @adressel #tvstud #frististfrust #tvstudjetzt arbeitsbedingung arbeit forschung beschäftigt studentisch

------ Topic 3 ------
#tvstud uni @gew_bund #wisssystemfehler #dasgewinnenwir arbeit hochschule #frististfrust arbeitsbedingung stellen

------ Topic 4 ------
karriere lukman christopher @lisajanotta system trotz dauermobilität wissenschaftlich leisten unsicherheit

------ Topic 5 ------
#koalitionsvertrag koalitionsvertrag fdp #ampel bildung forschung @gew_bund arbeitsbedingung wichtig schaffen

------ Topic 6 ------
#wisssystemfehler hochschule #tvstud system @anja_steinbeck @gew_bund problem arbeit befristet stellen

------ Topic 7 ------
system stellen @nga_wiss @akellergew #tvstud problem zeigen woche #wi

In [121]:
dec_model_p = perform_LDA(df_december_p['lemmatized'], dect, decp, deca, decd)

------ Topic 0 ------
#ichbinreyhan arbeit sprechen lehre freuen #wisssystemfehler thema forschung stelle befristet

------ Topic 1 ------
#ichbinreyhan 2022 2021 stellen @humboldtuni @bverfg juni initiative erfolg einsatz

------ Topic 2 ------
#ichbinreyhan uni arbeit unbefristet befristet schön system tag forschung leute

------ Topic 3 ------
#ichbinreyhan toxisch leistungsfähigkeit stellen verein system deadlines legen uni langfristig

------ Topic 4 ------
beruflich #ichbinreyhan uni promotion jahr stellen toll lassen @realsci_de gute

------ Topic 5 ------
#ichbinreyhan @gew_bund uni @humboldtuni #wissenschaft hochschule @hrk_aktuell #dauerstell arbeit arbeitsbedingung

------ Topic 6 ------
jahr forschung #ichbinreyhan befristet vertrag stellen professur chance laufen heißen


Perplexity:  -7.430627014442017

Coherence Score:  0.313390458076842


In [100]:
jan_model_p = perform_LDA(df_january_p['lemmatized'], jant, janp, jana, jand)

------ Topic 0 ------
#ichbinreyhan @w_jahr vertrag schön arbeitsbedingung #wisssystemfehler jahr wissen @starkwatzinger forschung

------ Topic 1 ------
#ichbinreyhan @mliebendoerfer @hrk_aktuell arbeit problem stellen @agehrlach @unileipzig @histodigitale job

------ Topic 2 ------
#ichbinreyhan #wisssystemfehler uni befristet frage zeit stellen thema jahr system

------ Topic 3 ------
#ichbinreyhan lehre stellen uni forschung jahr warten zeit hochschule sprechen

------ Topic 4 ------
#ichbinreyhan uni wissen problem frage lehre #wisssystemfehler jahr thema forschung

------ Topic 5 ------
#ichbinreyhan system jahr uni @jenniferhenkehb forschung frage vertrag @sainethina groß

------ Topic 6 ------
stellen schuld problem mensch @sainethina contract krank glück bedingung generation

------ Topic 7 ------
#ichbinreyhan system promotion arbeit wisszeitvg semester @mliebendoerfer schaffen @sainethina problem

------ Topic 8 ------
#ichbinreyhan problem jahr system zeit job @unileipzig @

In [101]:
feb_model_p = perform_LDA(df_february_p['lemmatized'], febt, febp, feba, febd)

------ Topic 0 ------
#ichbinreyhan @jlugiessen @starkwatzinger #jlugiessen daueraufgabe #wissenschaft #wisssystemfehler #hannaorganisiertsich #dauerstellenfürdaueraufgaben forschung

------ Topic 1 ------
#ichbinreyhan uni system jahr #oneofusallofus @starkwatzinger #sinderechostampocohayciencia #wisssystemfehler change @gew_bund

------ Topic 2 ------
#ichbinreyhan uni zeit problem stellen jahr system groß @dievilla4 #wisssystemfehler

------ Topic 3 ------
problem @nga_wiss schlecht betrieb @_verdi uni job schön befristung lassen

------ Topic 4 ------
#ichbinreyhan jahr problem frage uni aktuell forschung #wisssystemfehler zeit kurz

------ Topic 5 ------
#ichbinreyhan thread uni karriere wissenschaftlich #wisssystemfehler stellen #oneofusallofus schön @jenniferhenkehb

------ Topic 6 ------
#ichbinreyhan #lenzen jahr forschung #wisssystemfehler thema arbeit wissen uni dieter


Perplexity:  -7.525286288716111

Coherence Score:  0.30252489562462115


In [102]:
mar_model_p = perform_LDA(df_march_p['lemmatized'], mart, marp, mara, mard)

------ Topic 0 ------
#mentalhealth #weilwirwissenschaftlieben #promotion #hochschulpolitik #thesis_ev #ichbinreyhan postdoc jahr forschung health

------ Topic 1 ------
#ichbinreyhan uni professor jahr hochschule @jenniferhenkehb verwaltung gehalt application problem

------ Topic 2 ------
#ichbinreyhan frage jahr befristet job thema woche vertrag zeit lehre

------ Topic 3 ------
#ichbinreyhan buch @suhrkamp arbeit befristet jahr @jenniferhenkehb #wisssystemfehler tweet publikation

------ Topic 4 ------
#ichbinreyhan buch thema science arbeit jahr deutschland uni freuen system

------ Topic 5 ------
#ichbinreyhan prekär buch deutschland jahr forschung @suhrkamp uni system professur


Perplexity:  -7.381128901208511

Coherence Score:  0.3089138661857515


In [103]:
apr_model_p = perform_LDA(df_april_p['lemmatized'], aprt, aprp, apra, aprd)

------ Topic 0 ------
problem @maithinkx arbeitsbedingung prekär @drlutzboehm @diballestero zeit job streitschrift system

------ Topic 1 ------
@maithi_nk arbeitsbedingung @maithinkx thema #wisssystemfehler #maithinkx @gew_bund problem folge sendung

------ Topic 2 ------
@maithi_nk problem jahr uni @maithinkx kunst argument geisteswissenschaft @michael_gerloff betreffen

------ Topic 3 ------
@maithinkx arbeitsbedingung @maithi_nk problem @drlutzboehm @diballestero system prekär uni jahr


Perplexity:  -6.8891145486229455

Coherence Score:  0.28197061079472246


### Pool tweets by day and user

Pool tweets by day and user to get a greater amount of topics

In [123]:
df_pu = df.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_en_pu = df_en.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_ger_pu = df_ger.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_october_pu = df_october.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_november_pu = df_november.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_december_pu = df_december.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_january_pu = df_january.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_february_pu = df_february.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_march_pu = df_march.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()
df_april_pu = df_april.groupby(['date','author.username'], as_index=False)['lemmatized'].sum()

### Descriptive statistic about documents pooled by day and user

In [124]:
print("Full model:")
print("Number of documents:" + str(len(df_pu)) +" Minimum word count:" + str(min(df_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nEnglish model:")
print("Number of documents:" + str(len(df_en_pu)) +" Minimum word count:" + str(min(df_en_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_en_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_en_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_en_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_en_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_en_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nGerman model:")
print("Number of documents:" + str(len(df_ger_pu)) +" Minimum word count:" + str(min(df_ger_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_ger_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_ger_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_ger_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_ger_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_ger_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nOctober model:")
print("Number of documents:" + str(len(df_october_pu)) +" Minimum word count:" + str(min(df_october_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_october_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_october_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_october_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_october_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_october_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nNovember model:")
print("Number of documents:" + str(len(df_november_pu)) +" Minimum word count:" + str(min(df_november_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_november_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_november_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_november_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_november_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_november_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nDecember model:")
print("Number of documents:" + str(len(df_december_pu)) +" Minimum word count:" + str(min(df_december_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_december_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_december_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_december_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_december_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_december_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nJanuary model:")
print("Number of documents:" + str(len(df_january_pu)) +" Minimum word count:" + str(min(df_january_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_january_pu['lemmatized'].map(len))) + " Mean word count:" + str(df_january_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_january_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_january_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_january_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nFebruary model:")
print("Number of documents:" + str(len(df_february_pu)) +" Minimum word count:" + str(min(df_february_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_february['lemmatized'].map(len))) + " Mean word count:" + str(df_february_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_february_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_february_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_february_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nMarch model:")
print("Number of documents:" + str(len(df_march_pu)) +" Minimum word count:" + str(min(df_march_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_march['lemmatized'].map(len))) + " Mean word count:" + str(df_march_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_march_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_march_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_march_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

print("\nApril model:")
print("Number of documents:" + str(len(df_april_pu)) +" Minimum word count:" + str(min(df_april_pu['lemmatized'].map(len))) + " Maximum word count:" + str(max(df_april['lemmatized'].map(len))) + " Mean word count:" + str(df_april_pu['lemmatized'].map(len).mean()))
print("Minimum character count:" + str(min(df_april_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Maximum character count:" + str(max(df_april_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len))) + " Mean character count:" +str(df_april_pu['lemmatized'].apply(lambda x: ' '.join(x)).map(len).mean()))

Full model:
Number of documents:20289 Minimum word count:0 Maximum word count:716 Mean word count:15.85504460545123
Minimum character count:0 Maximum character count:6455 Mean character count:153.2040021686628

English model:
Number of documents:2809 Minimum word count:0 Maximum word count:710 Mean word count:17.36133855464578
Minimum character count:0 Maximum character count:6399 Mean character count:140.58027767888927

German model:
Number of documents:16987 Minimum word count:0 Maximum word count:480 Mean word count:15.704891976217107
Minimum character count:0 Maximum character count:4819 Mean character count:155.92547242008595

October model:
Number of documents:1476 Minimum word count:0 Maximum word count:194 Mean word count:15.563685636856368
Minimum character count:0 Maximum character count:1885 Mean character count:154.94850948509486

November model:
Number of documents:1366 Minimum word count:0 Maximum word count:282 Mean word count:15.759882869692532
Minimum character count:0

In [125]:
dect_pu = compute_coherence_values_topics(df_december_pu['lemmatized'])
decp_pu = compute_coherence_values_passes(df_december_pu['lemmatized'], dect_pu)
deca_pu = compute_coherence_values_alpha(df_december_pu['lemmatized'], dect_pu, decp_pu)
decd_pu = compute_coherence_values_decay(df_december_pu['lemmatized'], dect_pu, decp_pu, deca_pu)


[2, 3, 4, 5, 6, 7, 8, 9] [0.3398847006943793, 0.34529539840731954, 0.3923369930674224, 0.3957666356280699, 0.4041385621712125, 0.3974320370078908, 0.4318614592145518, 0.42701985072551096]
[5, 10, 15, 20] [0.4243476061331424, 0.4311417156296843, 0.42864149264307316, 0.43026328855690865]
['symmetric', 'asymmetric'] [0.4311417156296843, 0.4042443234256079]
[0.5, 0.7, 0.9] [0.4311417156296843, 0.4298871586743064, 0.4243476061331423]


In [106]:
#full data
fullt_pu = compute_coherence_values_topics(df_pu['lemmatized'])
#english data
engt_pu = compute_coherence_values_topics(df_en_pu['lemmatized'])
#german data
gert_pu = compute_coherence_values_topics(df_ger_pu['lemmatized'])
#october data
octt_pu = compute_coherence_values_topics(df_october_pu['lemmatized'])
#november data
novt_pu = compute_coherence_values_topics(df_november_pu['lemmatized'])
#december data
dect_pu = compute_coherence_values_topics(df_december_pu['lemmatized'])
#january data
jant_pu = compute_coherence_values_topics(df_january_pu['lemmatized'])
#february data
febt_pu = compute_coherence_values_topics(df_february_pu['lemmatized'])
#march data
mart_pu = compute_coherence_values_topics(df_march_pu['lemmatized'])
#april data
aprt_pu = compute_coherence_values_topics(df_april_pu['lemmatized'])

#full data
fullp_pu = compute_coherence_values_passes(df_pu['lemmatized'], fullt_pu)
#english data
engp_pu = compute_coherence_values_passes(df_en_pu['lemmatized'], engt_pu)
#german data
gerp_pu = compute_coherence_values_passes(df_ger_pu['lemmatized'], gert_pu)
#october data
octp_pu = compute_coherence_values_passes(df_october_pu['lemmatized'], octt_pu)
#november data
novp_pu = compute_coherence_values_passes(df_november_pu['lemmatized'], novt_pu)
#december data
decp_pu = compute_coherence_values_passes(df_december_pu['lemmatized'], dect_pu)
#january data
janp_pu = compute_coherence_values_passes(df_january_pu['lemmatized'], jant_pu)
#february data
febp_pu = compute_coherence_values_passes(df_february_pu['lemmatized'], febt_pu)
#march data
marp_pu = compute_coherence_values_passes(df_march_pu['lemmatized'], mart_pu)
#april data
aprp_pu = compute_coherence_values_passes(df_april_pu['lemmatized'], aprt_pu)

#full data
fulla_pu = compute_coherence_values_alpha(df_pu['lemmatized'], fullt_pu, fullp_pu)
#english data
enga_pu = compute_coherence_values_alpha(df_en_pu['lemmatized'], engt_pu, engp_pu)
#german data
gera_pu = compute_coherence_values_alpha(df_ger_pu['lemmatized'], gert_pu, gerp_pu)
#october data
octa_pu = compute_coherence_values_alpha(df_october_pu['lemmatized'], octt_pu, octp_pu)
#november data
nova_pu = compute_coherence_values_alpha(df_november_pu['lemmatized'], novt_pu, novp_pu)
#december data
deca_pu = compute_coherence_values_alpha(df_december_pu['lemmatized'], dect_pu, decp_pu)
#january data
jana_pu = compute_coherence_values_alpha(df_january_pu['lemmatized'], jant_pu, janp_pu)
#february data
feba_pu = compute_coherence_values_alpha(df_february_pu['lemmatized'], febt_pu, febp_pu)
#march data
mara_pu = compute_coherence_values_alpha(df_march_pu['lemmatized'], mart_pu, marp_pu)
#april data
apra_pu = compute_coherence_values_alpha(df_april_pu['lemmatized'], aprt_pu, aprp_pu)

#full data
fulld_pu = compute_coherence_values_decay(df_pu['lemmatized'], fullt_pu, fullp_pu, fulla_pu)
#english data
engd_pu = compute_coherence_values_decay(df_en_pu['lemmatized'], engt_pu, engp_pu, enga_pu)
#german data
gerd_pu = compute_coherence_values_decay(df_ger_pu['lemmatized'], gert_pu, gerp_pu, gera_pu)
#october data
octd_pu = compute_coherence_values_decay(df_october_pu['lemmatized'], octt_pu, octp_pu, octa_pu)
#november data
novd_pu = compute_coherence_values_decay(df_november_pu['lemmatized'], novt_pu, novp_pu, nova_pu)
#december data
decd_pu = compute_coherence_values_decay(df_december_pu['lemmatized'], dect_pu, decp_pu, deca_pu)
#january data
jand_pu = compute_coherence_values_decay(df_january_pu['lemmatized'], jant_pu, janp_pu, jana_pu)
#february data
febd_pu = compute_coherence_values_decay(df_february_pu['lemmatized'], febt_pu, febp_pu, feba_pu)
#march data
mard_pu = compute_coherence_values_decay(df_march_pu['lemmatized'], mart_pu, marp_pu, mara_pu)
#april data
aprd_pu = compute_coherence_values_decay(df_april_pu['lemmatized'], aprt_pu, aprp_pu, apra_pu)

[2, 3, 4, 5, 6, 7, 8, 9] [0.4266355474197704, 0.4506344321145607, 0.44813446084570474, 0.4409750951660386, 0.4506052427970581, 0.44987094999799676, 0.4658268694118159, 0.48502728014214735]
[2, 3, 4, 5, 6, 7, 8, 9] [0.3161630734372046, 0.31257998946885185, 0.3216386458905185, 0.3185813908284758, 0.31758053545511566, 0.3242787668691878, 0.31929463423126847, 0.32485794287718694]
[2, 3, 4, 5, 6, 7, 8, 9] [0.38927406728988956, 0.42725807049291253, 0.42009845222859066, 0.44423351695564034, 0.45041192468663166, 0.4744489534083417, 0.46791299161086586, 0.4618251342821497]
[2, 3, 4, 5, 6, 7, 8, 9] [0.215059282883793, 0.19949391608984354, 0.2341504110109347, 0.2518185857876336, 0.2848952096323821, 0.29400656303802924, 0.30124721752237815, 0.2923999705756573]
[2, 3, 4, 5, 6, 7, 8, 9] [0.19032827879353026, 0.20383356144268983, 0.2054786070237673, 0.22630459588950388, 0.2351773985212662, 0.2543386263853612, 0.2606413340628617, 0.2764395180519623]
[2, 3, 4, 5, 6, 7, 8, 9] [0.3675324445974824, 0.3844

In [126]:
print(fullt_pu,fullp_pu,fulla_pu,fulld_pu)
print(engt_pu,engp_pu,enga_pu,engd_pu)
print(gert_pu,gerp_pu,gera_pu,gerd_pu)
print(octt_pu,octp_pu,octa_pu,octd_pu)
print(novt_pu,novp_pu,nova_pu,novd_pu)
print(dect_pu,decp_pu,deca_pu,decd_pu)
print(jant_pu,janp_pu,jana_pu,jand_pu)
print(febt_pu,febp_pu,feba_pu,febd_pu)
print(mart_pu,marp_pu,mara_pu,mard_pu)
print(aprt_pu,aprp_pu,apra_pu,aprd_pu)

9 15 symmetric 0.5
9 20 symmetric 0.5
7 20 symmetric 0.5
8 20 asymmetric 0.7
9 10 symmetric 0.5
8 10 symmetric 0.5
9 20 symmetric 0.5
9 5 asymmetric 0.5
7 20 symmetric 0.5
8 10 asymmetric 0.5


In [108]:
full_model_pu = perform_LDA(df_pu['lemmatized'], fullt_pu, fullp_pu, fulla_pu, fulld_pu)

------ Topic 0 ------
#hannaimbundestag uni wissen jahr befristet frage #ichbinreyhan stellen denken thema

------ Topic 1 ------
vertrag jahr befristet uni arbeit hochschule stellen zeit monat deutschland

------ Topic 2 ------
#ichbinreyhan uni #tvstud hochschule wissenschaftlich beschäftigt sprechen #wisssystemfehler uhr arbeitsbedingung

------ Topic 3 ------
@gew_bund #dauerstell jahr befristet stelle arbeit stellen system monat unbefristet

------ Topic 4 ------
#ichbinreyhan contract system job phd research one work year career

------ Topic 5 ------
#wissenschaft thread #ichbinreyhan system studieren #thesis_ev @gew_bund problem hochschule wichtig

------ Topic 6 ------
jahr stelle promotion stellen befristet uni geld job fest zeit

------ Topic 7 ------
@anjakarliczek jahr forschung problem uni prekär #hannaimbundestag befristet mittelbau leute

------ Topic 8 ------
forschung lehre stellen arbeitsbedingung arbeit aktuell debatte problem deutsch #ichbinreyhan


Perplexity:  -8

In [109]:
eng_model_pu = perform_LDA(df_en_pu['lemmatized'], engt_pu, engp_pu, enga_pu, engd_pu)

------ Topic 0 ------
research system work phd career get position contract need scientist

------ Topic 1 ------
thread job one many academic time #ichbinreyhan university permanent work

------ Topic 2 ------
story great share english join thanks check system want summary

------ Topic 3 ------
want research system #ichbinreyhan job researcher need get career know

------ Topic 4 ------
postdoc phd contract permanent job condition working #hannaimbundestag discussion fight

------ Topic 5 ------
contract year system phd one university position many law researcher

------ Topic 6 ------
#ichbinreyhan work student problem much like system research scholar teaching

------ Topic 7 ------
scholar @mahaelhissy without passport people white movement @diballestero like need

------ Topic 8 ------
problem condition working precarious #ichbinreyhan researcher colleague well solidarity position


Perplexity:  -7.90127733766446

Coherence Score:  0.3276307286866858


In [110]:
ger_model_pu = perform_LDA(df_ger_pu['lemmatized'], gert_pu, gerp_pu, gera_pu, gerd_pu)

------ Topic 0 ------
jahr uni frage system kind leute zeit stellen stelle forschung

------ Topic 1 ------
jahr uni befristet stelle stellen studieren unbefristet vertrag job wissen

------ Topic 2 ------
#ichbinreyhan system stellen problem groß befristung arbeit zeigen hochschule deutschland

------ Topic 3 ------
problem @anjakarliczek stellen forschung zeit arbeit hochschule system aktuell #wissenschaft

------ Topic 4 ------
jahr befristet forschung arbeit wissen @anjakarliczek lehre wissenschaftlich prekär arbeitsbedingung

------ Topic 5 ------
@gew_bund #ichbinreyhan #dauerstell #frististfrust forschung vertrag #tvstud daueraufgabe thread @akellergew

------ Topic 6 ------
#hannaimbundestag uni @anjakarliczek jahr hochschule prekär arbeitsbedingung beschäftigt arbeit #ichbinreyhan


Perplexity:  -8.712104676928105

Coherence Score:  0.47703880676723903


In [111]:
oct_model_pu = perform_LDA(df_october_pu['lemmatized'], octt_pu, octp_pu, octa_pu, octd_pu)

------ Topic 0 ------
#wisssystemfehler #ichbinreyhan jahr hochschule uni system forschung frage wissenschaftlich arbeitsbedingung

------ Topic 1 ------
deutsch contract #wisssystemfehler wissenschaftssystem monat antrag uni schaffen stelle beitrag

------ Topic 2 ------
#wisssystemfehler zeit jahr uni stelle forschung debatte studierend entfristen berliner

------ Topic 3 ------
#ichbinreyhan thread #wisssystemfehler projekt buch arbeitsbedingung #dauerstell person @humboldtuni schön

------ Topic 4 ------
#tvstud @gew_bund #dasgewinnenwir beschäftigt #frististfrust uhr #unverzichtbar studentisch #hannastreikt #keineausnahme

------ Topic 5 ------
#tvstud uni #stopthecuts #streiksemester arbeit hochschule studierend @hrk_aktuell politisch besetzen

------ Topic 6 ------
#vhdresolution @thstockinger @emmaquardt @vhdtweets @mpoessel problem @docbio1509 @jancloppenburg jahr @tobias_schulze

------ Topic 7 ------
#ichbinreyhan jahr nachwuchs #wisssystemfehler forschung stelle stellen ric

In [112]:
nov_model_pu = perform_LDA(df_november_pu['lemmatized'], novt_pu, novp_pu, nova_pu, novd_pu)

------ Topic 0 ------
#tvstud #wirhabenbedarf #frististfrust #dasgewinnenwir beschäftigt @gew_bund @adressel hochschule arbeitsbedingung #tvstudjetzt

------ Topic 1 ------
#ichbinreyhan vertrag uni befristet @anja_steinbeck #wisssystemfehler interessant schaffen entscheidung chance

------ Topic 2 ------
@gew_bund #dauerstell daueraufgabe #ichbinreyhan #aktionskonferenz frage #berlhg @faznet zeit debatte

------ Topic 3 ------
#ichbinreyhan #wisssystemfehler uni leute system koalitionsvertrag berliner möglichkeit verbesserung bezahlen

------ Topic 4 ------
#ichbinreyhan problem #wisssystemfehler wichtig berlin thema forschung stellen befristet @gew_bund

------ Topic 5 ------
#wissmobb #werdarfhannasein trend postdoc uni geld stellen ändern universität zahlen

------ Topic 6 ------
system forschung uni jahr hochschule karriere deutschland wissenschaftlich zeit #wisssystemfehler

------ Topic 7 ------
#ichbinreyhan @nga_wiss @akellergew jahr #koalitionsvertrag @jensjot @grundmar uhr @

In [127]:
dec_model_pu = perform_LDA(df_december_pu['lemmatized'], dect_pu, decp_pu, deca_pu, decd_pu)

------ Topic 0 ------
@gew_bund uni stelle weihnachten stellen #ichbinreyhan arbeit unbefristet frage leute

------ Topic 1 ------
#ichbinreyhan #dauerstell universität hochschule uni situation @starkwatzinger @gew_bund beschäftigt jahr

------ Topic 2 ------
#ichbinreyhan @hrk_aktuell #hrkadvent türchen #adventskalender lehre liebe uni system urlaub

------ Topic 3 ------
#berlhg richtig prekär @gew_bund gerne jahr zeit problem unterfinanzierung freuen

------ Topic 4 ------
#wissenschaft @humboldtuni #ichbinreyhan @gew_bund lassen @bverfg #oneofusallofus solidarity university glückwunsch

------ Topic 5 ------
#ichbinreyhan prekär forschung befristet arbeit #ichbinhannaat #frististfrust sprechen stellen #ugnovelle

------ Topic 6 ------
#ichbinreyhan jahr zeit #wisssystemfehler befristet forschung uhr hochschule gute forderung

------ Topic 7 ------
#ichbinreyhan 2021 uni promotion folge lehre schaffen double-binds stellen arbeitsbedingung


Perplexity:  -7.723109040144352

Coherence

In [114]:
jan_model_pu = perform_LDA(df_january_pu['lemmatized'], jant_pu, janp_pu, jana_pu, jand_pu)

------ Topic 0 ------
@diballestero system 2021 problem white job jahr vertrag deutsch @jenniferhenkehb

------ Topic 1 ------
frage perspektive arbeitsbedingung thema sprechen @hrk_aktuell @karolinedoering universität @gew_bund system

------ Topic 2 ------
#ichbinreyhan #wisssystemfehler system zeit thread forschung problem #thesis_ev stellen uni

------ Topic 3 ------
wissen arbeit zeit @starkwatzinger stellen forschung jahr hochschule professur pandemie

------ Topic 4 ------
#ichbinreyhan jahr frage #wisssystemfehler stellen teil #frististfrust woche promotion job

------ Topic 5 ------
@unileipzig @agehrlach @histodigitale @mliebendoerfer @unv_nunftbegabt @piczenik1 stellen liebe woche befristet

------ Topic 6 ------
#ichbinreyhan thema #wisssystemfehler #thesis_ev professur system studierend stelle #wissenschaft #dauerstell

------ Topic 7 ------
uni lehre job problem forschung befristet jahr mittelbau denken @jenniferhenkehb

------ Topic 8 ------
uni @hrk_aktuell reden prekär

In [115]:
feb_model_pu = perform_LDA(df_february_pu['lemmatized'], febt_pu, febp_pu, feba_pu, febd_pu)

------ Topic 0 ------
uni #ichbinreyhan zeit forschung jahr woche #wisssystemfehler @dievilla4 thema lassen

------ Topic 1 ------
#ichbinreyhan wissen #academicprecarity unbefristet thread international #wisssystemfehler lehre forschung jahr

------ Topic 2 ------
#ichbinreyhan jahr #wissenschaft thema #phdlife prekär gewerkschaft finanziell #firstgen #thesis_ev

------ Topic 3 ------
#ichbinreyhan problem @starkwatzinger @jenniferhenkehb kreativität frist druck sicherheit fest @rudolfkipp

------ Topic 4 ------
#ichbinreyhan #hannaorganisiertsich #oneofusallofus #fb03 #studiereningiessen #ichbinspielbein #jlugiessen #giessen #unigöttingen #dauerstellenfürdaueraufgaben

------ Topic 5 ------
#lenzen dieter @jlugiessen #jlugiessen #ichbinreyhan #wisssystemfehler uni zukunft kundgebung studierend

------ Topic 6 ------
@gew_bund wissen @starkwatzinger @sebblki #ichbinreyhan daueraufgabe ziel #dauerstell mitarb 2/2

------ Topic 7 ------
saliva leben befristung marginalisiert schaffen ig

In [116]:
mar_model_pu = perform_LDA(df_march_pu['lemmatized'], mart_pu, marp_pu, mara_pu, mard_pu)

------ Topic 0 ------
frage professur jahr befristet professor application position uni #mint vortrag

------ Topic 1 ------
#weilwirwissenschaftlieben #mentalhealth system #thesis_ev #ichbinreyhan #promotion #hochschulpolitik freuen forschung #ilovescience

------ Topic 2 ------
postdoc @jenniferhenkehb stelle system job diskussion buch jahr #eu reform

------ Topic 3 ------
forschung @gew_bund problem befristet lehre kind #ichbinreyhan frage freuen jahr

------ Topic 4 ------
deutschland arbeit @jenniferhenkehb wissenschaftlich @starkwatzinger #berlhg #wissenschaft buch problem woche

------ Topic 5 ------
prekär uni deutschland #ichbinreyhan jahr idiotisch sachbuch wissenschaftssystem befristet arbeitsbedingung

------ Topic 6 ------
#ichbinreyhan buch @suhrkamp jahr tag #wisssystemfehler vertrag hochschule arbeit forschung


Perplexity:  -7.6016072439228015

Coherence Score:  0.4482491623653565


In [117]:
apr_model_pu = perform_LDA(df_april_pu['lemmatized'], aprt_pu, aprp_pu, apra_pu, aprd_pu)

------ Topic 0 ------
@maithi_nk #ichbinreyhan problem @maithinkx thema arbeitsbedingung prekär leute system wissenschaftlich

------ Topic 1 ------
#ichbinreyhan liebe mensch @maithi_nk umfrage folge arbeitsbedingung @maithinkx @gew_bund #respectscience

------ Topic 2 ------
#ichbinreyhan kunst #wisssystemfehler arbeitsbedingung stellen prekär problem @czyina @maithi_nk hochschule

------ Topic 3 ------
streitschrift brotlose befristet jahr #ichbinreyhan @maithinkx @maithi_nk @dlf verstehen vertrag

------ Topic 4 ------
@gew_bund #dauerstell #wissenschaft @maithi_nk daueraufgabe #ichbinreyhan verstopfung stellen jahr lehre

------ Topic 5 ------
@vpod_schweiz zeit mittelbau @akellergew akademisch glückwunsch universitär @unibasel @eduint solidarity

------ Topic 6 ------
endlich jahr covid hören @drlutzboehm @maithinkx kümmern @diballestero @michael_gerloff handeln

------ Topic 7 ------
@drlutzboehm @maithinkx @diballestero #ichbinreyhan arbeitsbedingung thread argument movement kn

### Pool tweets by day and profile