In [70]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from HanTa import HanoverTagger as ht
%matplotlib inline

nltk.download('wordnet')
nltk.download("stopwords")
nltk.download('omw-1.4')
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Load the new data additionally containing tweets from October 2021 until April 2022

In [71]:
#remove hashtags and only keep tweets with unique texts (keep oldest tweets)
df = pd.read_csv ('data/tweets/IchBinHanna_updated.csv')
df['new_date'] = pd.to_datetime(df['created_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
print(len(df))
#sort by date to ensure duplicate removal keeps oldest tweet
df = df.sort_values(by='new_date')
df = df.drop_duplicates(subset=['text'], keep='first')
print(len(df))
df = df.loc[df['reference_type'] != 'retweeted']
print(len(df))

97899
39668
31110


In [72]:
#clean the data (remove URLs, emojis and line breaks)
df['processed'] = df['text'].astype(str)
df['processed'] = df['processed'].replace(r'\\n',  ' ', regex=True)
pat1 = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
pat2 = r'www.[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
split_pattern = re.compile(r'\b('  + r')\b')
def tweet_cleaner(demo):
    soup = BeautifulSoup(demo, 'lxml') # HTML
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    return stripped
df['processed'] = [tweet_cleaner(t) for t in df['processed']]
def rem_emojis(text):
    emojis = [x for x in text if x in emoji.UNICODE_EMOJI]
    cleaned = ' '.join([str for str in text.split() if not any(i in str for i in emojis)])
    return cleaned
df['processed'] = df['processed'].apply(lambda x: rem_emojis(x))
df['processed'] = df['processed'].astype(str)

## Inspect different stop word lists

In [73]:
#preprocessing (tokenization, stop word removal, lemmatizing)
german_stop = set(stopwords.words('german'))
english_stop = set(stopwords.words('english'))
add_stop_all = ["ichbinhanna","#ichbinhanna", "hanna", "mehr", "innen", "#wisszeitvg", "#ichbinhannah", "@amreibahr", "amreibahr", "@bmf_bund","bmf_bund", "@drkeichhorn", "drkeichhorn", "@sebastiankubon", "sebastiankubon", "@bmbf_bund", "mehr", "innen", "schon", "gehen", "jahr","wissenschaft", "wissenschaftler", "kommen","academia", "academic", "year", "machen", "sagen", "sein","geben", "also", "werden", "german", "germany","gut", "haben", "geht", "gibt", "viele", "seit", "wäre", "sehen", "ganz","bekommen","!!!","???","..."]
german_stop.update(set(add_stop_all))
english_stop.update(set(add_stop_all))
tweet_tokenizer = TweetTokenizer()
df['tokenized'] = df['processed'].apply(lambda x: tweet_tokenizer.tokenize(x.lower()))
dic = Dictionary(df['tokenized'])
print('Unique tokens without stopword removal:' ,len(dic))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in english_stop]).split() if x['lang'] == 'en' else ' '.join([word for word in x['tokenized'] if word not in german_stop]).split(),axis=1)
dic_stop = Dictionary(df['tokenized'])
print('Unique tokens with initial stopword removal:' ,len(dic_stop))
#add the german_stopwords list
with open('data/stopwords/german_stopwords.txt', 'r',encoding='utf8') as file:
    german_stopwords=[file.read().replace('\n', ',')]
    german_stopwords=german_stopwords[0].split(",")
print(len(german_stop))
add_german_stop = german_stop.copy()
add_german_stop.update(set(german_stopwords))
print(len(german_stopwords))
print(len(add_german_stop))
df['tokenized_ger'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_german_stop]).split(),axis=1)
dic_ger = Dictionary(df['tokenized_ger'])
print('Unique tokens with initial stopword removal + german_stopwords:' ,len(dic_ger))
#add the snowball stopword list
with open('data/stopwords/snowball.txt', 'r',encoding='utf8') as file:
    snowball_stopwords=[file.read().replace('\n', ',')]
    snowball_stopwords=snowball_stopwords[0].split(",")
add_snowball_stop = german_stop.copy()
add_snowball_stop.update(set(snowball_stopwords))
print(len(snowball_stopwords))
print(len(add_snowball_stop))
df['tokenized_snow'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_snowball_stop]).split(),axis=1)
dic_snow = Dictionary(df['tokenized_snow'])
print('Unique tokens with initial stopword removal + snowball stopwords:' ,len(dic_snow))
#remove both additional stopword lists
add_german_stop.update(set(snowball_stopwords))
df['tokenized'] = df[['tokenized','lang']].apply(lambda x: ' '.join([word for word in x['tokenized'] if word not in add_german_stop]).split(),axis=1)
dic = Dictionary(df['tokenized'])
print('Unique tokens with initial stopword removal + both additional lists:' ,len(dic))

Unique tokens without stopword removal: 64038
Unique tokens with initial stopword removal: 63806
273
1853
1879
Unique tokens with initial stopword removal + german_stopwords: 62744
275
414
Unique tokens with initial stopword removal + snowball stopwords: 63728
Unique tokens with initial stopword removal + both additional lists: 62744


In [74]:
df['tokenized'] = df['tokenized'].apply(lambda x: [word for word in x if len(word) > 2])
lemmatizer = WordNetLemmatizer()
hannover = ht.HanoverTagger('morphmodel_ger.pgz')
df['lemmatized'] = df[['tokenized','lang']].apply(lambda x: [lemmatizer.lemmatize(word).lower() for word in x['tokenized']] if x['lang'] == 'en' else [hannover.analyze(word)[0].lower() for word in x['tokenized']] ,axis=1)

In [75]:
def perform_LDA(tokens, topics=5, passes =5, alpha = 'symmetric', decay = 0.5):
    #create the dictionary of lemmatized tokens
    dic = Dictionary(tokens)
    #print(len(dic))
    #remove low and high frequent terms
    dic.filter_extremes(no_below=2, no_above=.99)
    #print(len(dic))
    #create the bag of words 
    corpus = [dic.doc2bow(d) for d in tokens]
    #build LDA model 
    LDA = LdaMulticore(corpus= corpus, num_topics=topics, id2word= dic, workers=12, passes=passes, alpha = alpha, decay = decay)
    words = [re.findall(r'"([^"]*)"',t[1]) for t in LDA.print_topics()]
    #create topics
    topics = [' '.join(t[0:10]) for t in words]

    for id, t in enumerate(topics): 
        print(f"------ Topic {id} ------")
        print(t, end="\n\n")
    # Compute Perplexity
    perplexity = LDA.log_perplexity(corpus)
    print('\nPerplexity: ', perplexity) 
    # Compute Coherence Score
    coherence_model = CoherenceModel(model=LDA, texts=tokens, 
                                   dictionary=dic, coherence='c_v')
    coherence_lda_model = coherence_model.get_coherence()
    print('\nCoherence Score: ', coherence_lda_model)
    return LDA

In [76]:
full_model = perform_LDA(df['lemmatized'])

------ Topic 0 ------
#ichbinreyhan system arbeitsbedingung uni contract phd stellen #wisssystemfehler thread year

------ Topic 1 ------
#ichbinreyhan arbeit wissen prekär wissenschaftlich karriere thema befristet forschung groß

------ Topic 2 ------
#ichbinreyhan uni jahr problem @anjakarliczek forschung zeit mensch stellen zeigen

------ Topic 3 ------
jahr befristet stellen vertrag @gew_bund uni stelle forschung hochschule deutschland

------ Topic 4 ------
#ichbinreyhan #wisssystemfehler system jahr uni arbeit frage wissen @gew_bund studium


Perplexity:  -8.981906687382141

Coherence Score:  0.17385792042337486
