In [None]:
# Run "jupyter labextension install jupyterlab-plotly" in console if charts do not show up

# Install
!pip install bertopic
!pip install nltk

# !pip install flair

In [None]:
# Import
import pandas as pd 
import numpy as np
from bertopic import BERTopic # https://github.com/MaartenGr/BERTopic
# import flair # https://github.com/flairNLP/flair

In [None]:
import nltk
# Download the NLTK resources used by this notebook.
nltk.download('punkt')
nltk.download('stopwords')
# The corpus reader is imported under an alias so that the name `stopwords`
# is only ever bound to the set built below, never to the reader itself.
from nltk.corpus import stopwords as nltk_stopwords
from nltk.tokenize import word_tokenize

# German stop words as a set; remove_stopwords() reads this global.
stopwords = set(nltk_stopwords.words('german'))

In [None]:
# Load bert-base-german-cased model
# https://huggingface.co/bert-base-german-cased

# from flair.embeddings import TransformerDocumentEmbeddings
# bertGerman = TransformerDocumentEmbeddings('bert-base-german-cased')

In [None]:
# Remove stopwords from an array of strings
def remove_stopwords(data, stop_words=None):
    """Strip stop words from every entry of ``data``.

    Each entry is converted with ``str()`` and split on whitespace. Tokens
    whose lowercase form is in the stop-word collection are dropped; the
    remaining tokens keep their original casing and are re-joined with a
    single space, so runs of whitespace collapse.

    Args:
        data: Iterable of documents. Non-string entries are stringified,
            e.g. ``None`` becomes the token ``"None"``.
        stop_words: Optional collection of lowercase stop words. When
            omitted, the notebook-level ``stopwords`` set is used.

    Returns:
        A list with one cleaned string per input entry, in input order.
        An entry consisting only of stop words yields ``""``.
    """
    if stop_words is None:
        # Looked up at call time, so the notebook-level set only has to
        # exist when the default is actually used.
        stop_words = stopwords
    return [
        ' '.join(
            word for word in str(sentence).split()
            if word.lower() not in stop_words
        )
        for sentence in data
    ]

In [None]:
# Read the reviews CSV into a DataFrame using pandas' Python parser engine
df = pd.read_csv(
    "./reviews-B003CMKQTS-de.csv",
    engine="python",
)

In [None]:
# Display the DataFrame read from the reviews CSV for a quick visual check.
df

In [None]:
# Critical reviews: rating below 4
dfNegative = df.loc[df["rating"] < 4]
# Positive reviews: rating of 4 or higher
dfPositive = df.loc[df["rating"] >= 4]

In [None]:
# Pull the "text" column of each subset out as a plain Python list
reviewsNegativeText = dfNegative["text"].tolist()
reviewsPositiveText = dfPositive["text"].tolist()

In [None]:
# Run both review lists through remove_stopwords(), rebinding the same names
reviewsNegativeText, reviewsPositiveText = map(
    remove_stopwords,
    (reviewsNegativeText, reviewsPositiveText),
)

## Negative Rezensionen

In [None]:
# Create the topic model for the negative reviews.
# NOTE(review): the explicit German-BERT embedding (bertGerman, see the line
# below) is commented out, so BERTopic chooses its own embedding model for
# language="german" — presumably a multilingual one; confirm in the BERTopic docs.
# NOTE(review): nr_topics="auto" presumably enables automatic topic reduction — confirm.
# model = BERTopic(embedding_model=bertGerman, language="german")
modelNegative = BERTopic(language="german", nr_topics="auto")

In [None]:
# Fit the model on the stop-word-filtered negative reviews; the two return
# values of fit_transform() are kept as `topics` and `probabilities`.
topics, probabilities = modelNegative.fit_transform(reviewsNegativeText)

In [None]:
# Recompute the topic representations using unigrams and bigrams.
# `topics` is passed by keyword because the second positional parameter of
# update_topics() is not `topics` in every BERTopic release.
modelNegative.update_topics(reviewsNegativeText, topics=topics, n_gram_range=(1, 2))

In [None]:
# Show the first 11 rows of the topic-frequency table.
# NOTE(review): 11 rows rather than 10 presumably because one row is
# BERTopic's outlier topic (-1) — confirm.
modelNegative.get_topic_freq().head(11)

In [None]:
# Inspect what the negative-review model returns for topic 0.
modelNegative.get_topic(0)

In [None]:
# Inspect what the negative-review model returns for topic 1.
modelNegative.get_topic(1)

In [None]:
# Inspect what the negative-review model returns for topic 3.
# NOTE(review): topic 2 is not inspected — TODO confirm that 3 is intended.
modelNegative.get_topic(3)

In [None]:
# Print only the first element of every entry of topics 0-2 (presumably the
# topic word; the second element is dropped), followed by a blank separator.
# The comprehension keeps its loop variable out of the notebook namespace, so
# the builtin `tuple` is no longer rebound by running this cell.
for topic_id in range(0, 3):
    topic_words = [entry[0] for entry in modelNegative.get_topic(topic_id)]
    print(topic_words)
    print("\n")

In [None]:
# Dump the raw get_topic() result for topics 1 through 9, each followed by a blank line
for topic_id in range(1, 10):
    print(f"{modelNegative.get_topic(topic_id)}\n")

In [None]:
# Render BERTopic's bar-chart visualisation for the negative-review model.
modelNegative.visualize_barchart()

In [None]:
# Render BERTopic's topic visualisation for the negative-review model.
modelNegative.visualize_topics()

In [None]:
# Render BERTopic's heatmap visualisation for the negative-review model.
modelNegative.visualize_heatmap()

In [None]:
# Render BERTopic's hierarchy visualisation for the negative-review model,
# with top_n_topics set to 20.
modelNegative.visualize_hierarchy(top_n_topics=20)

## Positive Rezensionen

In [None]:
# Create a separate topic model for the positive reviews, configured with the
# same arguments as the negative-review model.
modelPositive = BERTopic(language="german", nr_topics="auto")

In [None]:
# Fit the positive-review model.
# NOTE: this rebinds `topics` and `probabilities`, replacing the values that
# were produced when fitting the negative-review model.
topics, probabilities = modelPositive.fit_transform(reviewsPositiveText)

In [None]:
# Recompute the topic representations using unigrams and bigrams.
# `topics` is passed by keyword because the second positional parameter of
# update_topics() is not `topics` in every BERTopic release.
modelPositive.update_topics(reviewsPositiveText, topics=topics, n_gram_range=(1, 2))

In [None]:
# Render BERTopic's bar-chart visualisation for the positive-review model.
modelPositive.visualize_barchart()

In [None]:
# Render BERTopic's hierarchy visualisation for the positive-review model,
# with top_n_topics set to 20.
modelPositive.visualize_hierarchy(top_n_topics=20)