In [None]:
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


# Plotting tools
import pyLDAvis 
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline


import pymysql as mariadb
import pickle
from sklearn.decomposition import PCA

import bokeh
from bokeh.models import Circle, ColumnDataSource, Line, LinearAxis, Range1d, LabelSet, HoverTool
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.plotting import figure, output_file, show
from bokeh.core.properties import value
from bokeh.palettes import viridis
from bokeh.palettes import inferno
# `output_notebook` is not among the names imported from bokeh above, so the
# bare call raised NameError on a fresh kernel. Import it right here so the
# cell is self-sufficient, then route bokeh output to the notebook.
from bokeh.plotting import output_notebook
output_notebook()

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``. ``deacc=False`` is passed explicitly,
    i.e. accent removal is not requested (per gensim's documentation ``deacc``
    controls accent stripping, not punctuation handling).

    Yields:
        One token list per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=False)
        for text in sentences
    )
        
def make_bigrams(texts):
    """Run every document in ``texts`` through the phrase model ``bigram_mod``.

    ``bigram_mod`` is not a parameter: it is looked up as a module-level name
    when the function is called, so it has to be defined beforehand.

    Returns:
        A list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    phrased_docs = []
    for tokens in texts:
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    collection are removed.

    Args:
        texts: iterable of documents (token lists or raw strings).

    Returns:
        A list with one filtered token list per document, in input order.
    """
    # `stop_words` is a list, so testing membership against it scans the whole
    # list for every token. Build a set once per call for constant-time lookups.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]


In [None]:
# NLTK's German stop-word list plus a handful of extra tokens to filter out
# (presumably text-extraction artefacts of this corpus - TODO confirm).
stop_words = stopwords.words('german') + [
    "wieultimativwir",
    "vernetztsind",
    "dassinder",
    "buchauszug",
    "wirbeispielhaft",
    "diewie",
    "bild_watson",
]

In [None]:
# Load the complete `pasted` table from the local MariaDB database into `df`.
# NOTE(review): user and password are hardcoded here; if these are ever
# replaced by real values, read them from environment variables or a prompt
# instead of committing them with the notebook.
connection = mariadb.connect(host="localhost", database="masterarbeit",
                             user="USER", password="PASSWORD")

# Query text (adjacent literals are concatenated into one statement).
sql = ("SELECT * "
       "FROM pasted ")

try:
    df = pd.read_sql_query(sql, connection)
finally:
    # The connection was previously left open for the lifetime of the kernel;
    # release it as soon as the data is in memory, even if the query fails.
    connection.close()

In [None]:
# Run configuration used by the cells below.
a = "fj"  # short code for the time period under study; embedded in output file names
b = 8     # number of k-means clusters
c = [2, 2, 5, 2, 2, 2, 5, 2]    # number of LDA subtopics, one entry per cluster

# The LDA loop looks up one entry of `c` per cluster. Fail fast here instead of
# with an IndexError after the expensive clustering steps have already run.
if len(c) != b:
    raise ValueError(
        "c must contain exactly one subtopic count per cluster: "
        "expected {} entries, got {}".format(b, len(c))
    )

In [None]:
# Quick look at the first five rows of the loaded table.
df.head(n=5)

In [None]:
# Restrict the data to articles dated January through June.
df["art_date"] = pd.to_datetime(df["art_date"])

# Index by date (the `art_date` column itself is kept) and filter on the month.
df2 = df.set_index(df["art_date"])
# `.copy()` makes the filtered frame independent of its parent: later cells add
# and overwrite columns on it, which otherwise triggers SettingWithCopyWarning.
df2 = df2[df2.index.month.isin([1, 2, 3, 4, 5, 6])].copy()

# Both names refer to the same filtered frame, as before.
df = df2

In [None]:
# TF-IDF representation of the article texts (vocabulary capped at 100000 terms).
vectorizer = TfidfVectorizer(max_features=100000)
vecs = vectorizer.fit_transform(df["text"])

# Partition the documents into `b` clusters; the seed is fixed for repeatable runs.
kmeansmodel = KMeans(n_clusters=b, random_state=42)
kmeansmodel.fit(vecs)
df["labels"] = kmeansmodel.labels_

# Count documents per (cluster label, cat_res1) pair and export the table.
df3 = df.groupby(["labels", "cat_res1"])["labels"].count()
df3.to_csv(f"comment_fully_{a!s}_clustered.csv", sep=";")

In [None]:
# Project the TF-IDF vectors onto their first two principal components for plotting.
# The seed keeps the projection repeatable when scikit-learn selects a
# randomized SVD solver, in line with the seeded models elsewhere in the notebook.
pca = PCA(n_components=2, random_state=42)

# `.toarray()` returns a plain ndarray. The former `.todense()` produced an
# `np.matrix`, which current scikit-learn releases reject with a TypeError.
# NOTE(review): this materialises the whole sparse matrix in memory.
pca_2d = pca.fit_transform(vecs.toarray())

In [None]:
# Interactive scatter plot of the documents in PCA space, coloured by cluster.

# The categorical colour mapper works on string factors, so labels become strings.
df["labels"] = df["labels"].astype(str)
# Unique labels computed once, as a plain list, for both palette size and factors.
cluster_ids = list(df["labels"].unique())

source = ColumnDataSource(data=dict(x=pca_2d[:, 0],
                                    y=pca_2d[:, 1],
                                    # Restored: the hover tool below references
                                    # @label_field1, which could not resolve
                                    # while this column was commented out.
                                    label_field1=df['cat_res1'],
                                    label_field2=df['labels'],
                                    keys=df['labels']))

hover = HoverTool(tooltips=[("ressort", "@label_field1"),
                            ('label', '@label_field2')])
palette = inferno(len(cluster_ids))
color_map = bmo.CategoricalColorMapper(factors=cluster_ids,
                                       palette=palette)
TOOLS = ['pan', 'wheel_zoom', 'reset', hover]
p_pca = figure(title='Graphic representation of the clustering', tools=TOOLS, toolbar_location="left")
p_pca.scatter(x='x',
              y='y',
              size=2,
              source=source,
              color={'field': 'keys', 'transform': color_map},
              legend_field='keys')

# Also write the plot to a standalone HTML file named after the period code.
output_file("clustered-kom"+str(a)+".html")
show(p_pca)

In [None]:
# Subtopic counts (one per cluster) under a descriptive name.
num_subtopics = c
# Cluster ids 0 .. b-1.
clusters = list(range(b))

In [None]:
# Sanity check: print the configured subtopic count of every cluster.
for x in clusters:
    subtopic_count = num_subtopics[clusters.index(x)]
    print(subtopic_count)

In [None]:
# Fit one LDA topic model per k-means cluster, export its pyLDAvis view as HTML
# and print a short summary of the topics found.
for position, x in enumerate(clusters):
    # Subtopic count configured for this cluster (same position in both lists).
    n_topics = num_subtopics[position]

    # Compare as strings on both sides so the selection works whether or not
    # the plotting cell has already converted the label column to str.
    df4 = df.loc[df['labels'].astype(str) == str(x)]

    data_words = list(sent_to_words(df4["text"]))

    # Phrase model for this cluster. The name `bigram_mod` must stay at module
    # level because make_bigrams() reads it as a global.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)

    data_words_nostops = remove_stopwords(data_words)
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Create Dictionary
    id2word = corpora.Dictionary(data_words_bigrams)

    # Create Corpus
    texts = data_words_bigrams

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # Train the topic model with a fixed seed
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=n_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True
                                                )

    # Interactive topic visualisation, one HTML file per cluster
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, R = 15, sort_topics = False)
    pyLDAvis.save_html(vis, str(x)+"-artikel-cluster" + str(a) + "lda.html")

    # log_perplexity() returns the per-word likelihood bound, not the perplexity
    # itself; gensim defines the perplexity as 2 ** (-bound). Report the real
    # perplexity and keep the raw bound visible next to it.
    bound = lda_model.log_perplexity(corpus)
    perplexity = 2 ** (-bound)

    print("\n")
    print("LDA for Cluster No. ", x, " likely containing ", n_topics, " subtopics with a perplexity of ", perplexity, " (per-word likelihood bound: ", bound, ")")
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]