In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


import pymysql as mariadb
import pickle
from sklearn.decomposition import PCA

# Plotting tools
import pyLDAvis 
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline
import bokeh
from bokeh.models import Circle, ColumnDataSource, Line, LinearAxis, Range1d, LabelSet, HoverTool
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.plotting import figure, output_file, show
from bokeh.core.properties import value
from bokeh.palettes import viridis
from bokeh.palettes import inferno



In [2]:
def sent_to_words(sentences):
    """Lazily tokenize every entry of ``sentences``.

    Each entry is coerced to ``str`` and passed to gensim's
    ``simple_preprocess``; one list of tokens is yielded per entry, in
    input order.
    """
    for raw_text in sentences:
        # deacc=False leaves accent marks on the tokens untouched
        # (deacc=True would strip accent marks, not punctuation).
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=False)
        yield tokens
        
def make_bigrams(texts):
    """Run every document in ``texts`` through the global ``bigram_mod`` model.

    ``bigram_mod`` is looked up as a module-level name at call time, so it
    must have been assigned before this function is called.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def remove_stopwords(texts):
    """Re-tokenize each document and drop tokens listed in the global ``stop_words``.

    Every document is converted with ``str()`` and tokenized again by
    ``simple_preprocess`` before filtering, so both raw strings and token
    lists are accepted (a token list is tokenized from its string
    representation).

    Returns a list of token lists, one per input document.
    """
    # Build the lookup once per call: set membership is constant-time, whereas
    # testing against the stop-word list re-scans the list for every token.
    blocked = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in blocked] for doc in texts]

# Run configuration. The short names a / b / c are read by later cells.
# NOTE(review): "2hj" reads like a "second half-year" tag, yet the date filter
# in this notebook keeps months 1-6 -- confirm the tag matches the data.
a = "-2hj-"     # tag for the time period under consideration (used in output file names)
b = 13          # number of k-means clusters
c = [8,3,5,5,4,5,3,3,6,5,3,5,4]   # number of LDA topics per cluster, one entry per cluster

# Fail fast here instead of with an IndexError halfway through the LDA loop.
if len(c) != b:
    raise ValueError(
        f"c must list one topic count per cluster: expected {b} entries, got {len(c)}"
    )

In [3]:

# Start from NLTK's German stop-word list, then append corpus-specific noise
# tokens (run-together words and page-chrome fragments) that should be ignored.
stop_words = stopwords.words('german')
stop_words += [
    "wieultimativwir",
    "vernetztsind",
    "dassinder",
    "buchauszug",
    "wirbeispielhaft",
    "diewie",
    "bild_watson",
    "dialogfensters",
    "dialogfensters",
    "this_is",
    "modal_window",
    "esc_brechen",
]


In [4]:
# Load the `comment` table from the local MariaDB instance into a DataFrame.
# NOTE(review): the credentials below are hard-coded placeholders; real ones
# should come from the environment or a secrets store, never from the notebook.
connection = mariadb.connect(host="localhost", database="masterarbeit",
                            user="USER", password="PASSWORD")

# Query that fetches the data
sql = ("SELECT * "
       "FROM comment ")

try:
    df = pd.read_sql_query(sql, connection)
finally:
    # Release the database connection even if the query fails.
    connection.close()

# SECURITY: pickle.load can execute arbitrary code -- only load files that
# were produced by a trusted step of this project.
with open("lemmatized_kom.pkl", "rb") as pkl_file:
    liste = pickle.load(pkl_file)

# Attach the pre-computed lemmatized texts as a new column.
# NOTE(review): assumes the pickle holds one entry per row in the same order
# as the query result (the query has no ORDER BY) -- confirm.
df["lemmatized"] = liste

In [5]:
# Restrict the data to rows whose `art_date` falls in months 1-6 (any year).
df['art_date'] = pd.to_datetime(df['art_date'])

# Index by date (the column itself is kept) so the month filter can use the index.
df2 = df.set_index(df["art_date"])

# .copy() detaches the filtered rows from the unfiltered frame; without it the
# later column assignment df["labels"] = ... raises SettingWithCopyWarning.
df2 = df2[df2.index.month.isin([1,2,3,4,5,6])].copy()
df = df2


In [6]:
# TF-IDF features of the lemmatized texts (vocabulary capped at 3000 terms)
vectorizer = TfidfVectorizer(max_features=3000)
vecs = vectorizer.fit_transform(df["lemmatized"])

# Cluster the TF-IDF vectors into `b` groups and store the assignment per row
kmeansmodel = KMeans(n_clusters=b, random_state=42)
kmeansmodel.fit(vecs)
df["labels"] = kmeansmodel.labels_

# Row counts per (cluster, cat_res1) pair, exported as a semicolon-separated file
df3 = df.groupby(['labels', 'cat_res1'])['labels'].count()
df3.to_csv(f"komm{b}_clustered.csv", sep=";")

In [7]:
# Project the TF-IDF vectors onto two principal components for plotting.
# random_state pins the result when PCA picks a randomized solver.
pca = PCA(n_components=2, random_state=42)

# .toarray() yields a plain ndarray; .todense() returns np.matrix, which
# recent scikit-learn versions reject with a TypeError.
pca_2d = np.asarray(pca.fit_transform(vecs.toarray()))

In [8]:
# Scatter plot of the 2-D PCA projection, coloured by cluster label.
# The labels are cast to strings here; the LDA loop further down filters on
# these string labels.
df["labels"] = df["labels"].astype(str)
cluster_keys = df['labels'].unique()

source = ColumnDataSource(data={
    'x': pca_2d[:, 0],
    'y': pca_2d[:, 1],
    'label_field1': df['cat_res1'],
    'label_field2': df['labels'],
    'keys': df['labels'],
})

# Tooltip shows the cat_res1 value ("ressort") and the cluster label of a point
hover = HoverTool(tooltips=[("ressort", "@label_field1"), ('label', '@label_field2')])

# One palette colour per distinct cluster label
palette = inferno(len(cluster_keys))
color_map = bmo.CategoricalColorMapper(factors=cluster_keys, palette=palette)

TOOLS = ['pan', 'wheel_zoom', 'reset', hover]
p_pca = figure(
    title='Graphic representation of the clustering',
    tools=TOOLS,
    toolbar_location="left",
)
p_pca.scatter(
    x='x',
    y='y',
    size=2,
    source=source,
    color={'field': 'keys', 'transform': color_map},
    legend_field='keys',
)

# Write the plot to a standalone HTML file and display it
output_file(f"clustered-kom{a}.html")
show(p_pca)

In [9]:
# Cluster ids 0 .. b-1 and the matching per-cluster topic counts
clusters = list(range(b))
num_subtopics = c

In [10]:
# Fit one LDA topic model per cluster, export a pyLDAvis page for it and print
# its topics.
stop_set = set(stop_words)  # built once; used for the post-bigram filter below

for position, x in enumerate(clusters):
    # Topic count configured for this cluster (positional match with `clusters`)
    n_topics = num_subtopics[position]

    # Compare as strings so the filter works whether or not the plotting cell
    # has already cast the labels to str.
    df4 = df.loc[df['labels'].astype(str) == str(x)]

    data_words = list(sent_to_words(df4["lemmatized"]))

    # Learn frequent word pairs on the unfiltered tokens.
    # make_bigrams() reads `bigram_mod` as a global, so it must be set here.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops)

    # stop_words also lists joined tokens such as "this_is" or "modal_window"
    # that only come into existence through the phrase merge above, so the
    # stop-word filter has to run once more on the merged tokens.
    data_words_bigrams = [
        [token for token in doc if token not in stop_set]
        for doc in data_words_bigrams
    ]

    # Create Dictionary
    id2word = corpora.Dictionary(data_words_bigrams)

    # Create Corpus
    texts = data_words_bigrams

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Build the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=n_topics,
                                               random_state=42,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True
                                               )

    # Interactive topic visualisation, one HTML file per cluster
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, R = 15, sort_topics = False)
    pyLDAvis.save_html(vis, str(x)+"-comment-cluster" + str(a) + "lda.html")

    print("\n")
    # NOTE: log_perplexity() returns the per-word log-likelihood bound (hence
    # the negative values), not the perplexity itself (2 ** -bound).
    print("LDA for Cluster No. ", x, " likely containing ", n_topics, " subtopics with a perplexity of ", lda_model.log_perplexity(corpus))
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]



LDA for Cluster No.  0  likely containing  8  subtopics with a perplexity of  -9.918825644638748
[(0,
  '0.044*"einfach" + 0.027*"menschen" + 0.017*"denken" + 0.016*"natürlich" + '
  '0.016*"dafür" + 0.016*"lassen" + 0.016*"fälle" + 0.014*"letzt" + '
  '0.013*"tage" + 0.009*"ca"'),
 (1,
  '0.030*"mal" + 0.023*"kommen" + 0.019*"immer" + 0.018*"zahlen" + '
  '0.017*"wenig" + 0.016*"massnahmen" + 0.016*"ganz" + 0.016*"genau" + '
  '0.015*"leute" + 0.011*"sollen"'),
 (2,
  '0.040*"schon" + 0.035*"gehen" + 0.028*"corona" + 0.021*"sehen" + '
  '0.017*"sagen" + 0.017*"wohl" + 0.015*"finden" + 0.012*"eigentlich" + '
  '0.012*"lang" + 0.011*"warum"'),
 (3,
  '0.037*"covid" + 0.028*"nehmen" + 0.028*"nein" + 0.021*"tot" + '
  '0.019*"liegen" + 0.018*"tag" + 0.018*"brauchen" + 0.013*"wissen" + '
  '0.013*"land" + 0.011*"je"'),
 (4,
  '0.035*"mehr" + 0.014*"müssen" + 0.013*"ja" + 0.013*"dürfen" + 0.012*"virus" '
  '+ 0.010*"danke" + 0.010*"beim" + 0.010*"leider" + 0.010*"gut" + '
  '0.009*"gar"')



LDA for Cluster No.  9  likely containing  5  subtopics with a perplexity of  -9.271546848435953
[(0,
  '0.018*"ja" + 0.016*"mehr" + 0.015*"schon" + 0.014*"gehen" + 0.013*"gut" + '
  '0.012*"immer" + 0.011*"mal" + 0.010*"einfach" + 0.008*"kommen" + '
  '0.008*"schweiz"'),
 (1,
  '0.013*"halt" + 0.012*"dafür" + 0.010*"recht" + 0.009*"jahr" + 0.009*"lieb" '
  '+ 0.009*"arbeiten" + 0.009*"gleichen" + 0.008*"schlecht" + 0.007*"kosten" + '
  '0.007*"davon"'),
 (2,
  '0.020*"sollen" + 0.014*"eben" + 0.013*"warum" + 0.012*"zeit" + '
  '0.012*"finden" + 0.010*"alt" + 0.009*"ab" + 0.009*"masken" + '
  '0.008*"anstecken" + 0.008*"eigentlich"'),
 (3,
  '0.018*"klaren" + 0.014*"niemand" + 0.013*"hören" + 0.010*"überhaupt" + '
  '0.009*"stimmen" + 0.009*"quarantäne" + 0.008*"de" + 0.008*"wieso" + '
  '0.007*"schliessen" + 0.006*"fall"'),
 (4,
  '0.025*"massnahmen" + 0.021*"wenig" + 0.010*"wissen" + 0.009*"lassen" + '
  '0.009*"trotzdem" + 0.009*"paar" + 0.008*"brauchen" + 0.008*"schweizer" + '
  