In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [23]:
file_path = "anonymized_challenge_dataset.csv"
dataset = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
dataset.head()

Unnamed: 0,ID,interview_date,country,NPS,comment,translated_comment
0,1,2022-05-11,Germany,0,Die Vertragsänderung meiner Mutter wurde nicht...,My mother&#39;s contract change did not go thr...
1,2,2022-07-19,Poland,0,ebok nie dziala - brak wgladu do Faktur,ebok does not work - no access to invoices
2,3,2022-12-14,Italy,8,NON SAPREI. PERCHE' MI SONO TROVATO BENE SEMPR...,I WOULD NOT KNOW. BECAUSE I HAVE ALWAYS FOUND ...
3,4,2022-08-04,Germany,10,Service auch telefonisch immer erreichbar und ...,Service is always available by phone and you g...
4,5,2022-08-15,Poland,9,"Fachowosc,kompetencje i kultura pracownika","Professionalism, competence and employee culture"


In [24]:
# Data Cleaning
dataset = dataset.drop("comment", axis = 1)
print(dataset.info())

# Check for missing values
missing_values = dataset.isnull().sum()

# Initial analysis of NPS values (checking for outliers or inappropriate values)
dataset['NPS'] = pd.to_numeric(dataset['NPS'], errors='coerce')
nps_summary = dataset['NPS'].describe()

# Convert 'interview_date' to datetime
dataset['interview_date'] = pd.to_datetime(dataset['interview_date'], errors='coerce')

# Check data types
data_types = dataset.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503952 entries, 0 to 503951
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   ID                  503952 non-null  int64 
 1   interview_date      502936 non-null  object
 2   country             502566 non-null  object
 3   NPS                 500699 non-null  object
 4   translated_comment  497321 non-null  object
dtypes: int64(1), object(4)
memory usage: 19.2+ MB
None


In [25]:
# Investigating the outlier in NPS
dataset = dataset[dataset['NPS'] < 11]

# Recheck NPS summary after handling outliers
nps_summary_cleaned = dataset['NPS'].describe()

dataset = dataset[dataset['translated_comment'].notnull()]
dataset.dropna(subset=['interview_date'], inplace=True)


# Convert results to a dictionary for easier interpretation
cleaning_results = {
    "missing_values": dataset.isnull().sum().to_dict(),
    "nps_summary_cleaned": nps_summary_cleaned.to_dict()
}

In [26]:
keep_countries = ['Germany', 'Poland', 'Italy', 'Romania', 'Sweden', 'Netherlands', 'Czech',
                          'Czech Republic', 'United Kingdom', 'Hungary']
df = dataset[dataset['country'].isin(keep_countries)]
df['country'].value_counts()

country
Netherlands       153148
Czech             130030
Germany            48484
Romania            41674
Poland             34981
Italy              32409
Sweden             30512
Hungary            24994
Czech Republic       667
United Kingdom       397
Name: count, dtype: int64

In [27]:
country_mapping = {'Czech Republic': 'Czech'}
df['country'] = df['country'].replace(country_mapping)

In [28]:
def customerType(dataset):
    if dataset["NPS"] >= 9:
        return "promoters"
    elif (dataset["NPS"] == 7 or dataset["NPS"] == 8):
        return "passives"
    elif dataset["NPS"] <= 6:
        return "detractors"

df["customer_type"] = dataset.apply(customerType, axis = 1)

In [31]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Downloading necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

sw_nltk = stopwords.words('english')
#sw_nltk.remove('not')
#sw_nltk.remove('no')
# #sw_nltk.remove('nor')


# sw_nltk.remove("aren't")
# sw_nltk.remove("couldn't")
# sw_nltk.remove("didn't")
stop_words = [word for word in sw_nltk if word not in sw_nltk[-36:]]
# sw_nltk.remove([sw_nltk[-36:]])

# Initializing the lemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /Users/pati/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/pati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pati/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
import string
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    words = nltk.word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Joining back into a string
    text = ' '.join(words)
    return text

# df['comment_preprocessed'] = df['comment'].apply(preprocess_text)
df["translated_comment"] = df["translated_comment"].str.replace("&#39;","\'")
df['translated_comment_preprocessed'] = df['translated_comment'].apply(preprocess_text)

df["translated_comment_preprocessed"] = df["translated_comment_preprocessed"].str.replace("quot", "", regex = False)

In [33]:
def is_numeric(text):
    """
    Check if the given text is entirely numeric.
    """
    return text.isdigit() if isinstance(text, str) else False

# Filter out rows where the comments are entirely numeric
df = df[~df['translated_comment_preprocessed'].apply(is_numeric)]

In [34]:
import re

def remove_emojis(text):
    """
    Remove emojis from the given text.
    """
    # Emoji patterns
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Apply the function to the relevant columns
df['translated_comment_preprocessed'] = df['translated_comment_preprocessed'].apply(remove_emojis)

In [35]:
df['comment_length'] = df['translated_comment_preprocessed'].apply(lambda x : len(x))
dataset = df[df['comment_length'] > 1]

dataset['comment_length_2'] = dataset['translated_comment_preprocessed'].str.len()
dataset = dataset[dataset['comment_length_2'] > 0]

dataset = dataset.dropna(subset=["translated_comment_preprocessed"])
dataset.drop('comment_length_2', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['comment_length_2'] = dataset['translated_comment_preprocessed'].str.len()


In [36]:
dataset.to_csv('cleaned_dataset_100.csv', index=False)

In [37]:
df_percent = dataset.sample(frac=0.1)
df_percent.to_csv('cleaned_dataset_100_10percent.csv', index=False)

In [88]:
import os
from gensim.models import LdaModel
from gensim import corpora  # Tokenization
dataset = dataset[(dataset['NPS'] <= 6)]
tokenized_docs = [word_tokenize(doc.lower()) for doc in dataset['translated_comment_preprocessed']]
print("done with tokenization")

from gensim.models import Phrases
from gensim.models.phrases import Phraser
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel


# Build the bigram models
#bigram = Phrases(tokenized_docs, min_count=2, threshold=5)  # higher threshold fewer phrases
#bigram_mod = Phraser(bigram)

# Apply the models to the tokenized docs
#bigram_docs = [bigram_mod[doc] for doc in tokenized_docs]


# Dictionary and Corpus
dictionary = corpora.Dictionary(tokenized_docs)
dictionary.filter_extremes()
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

# LDA Model
lda_model = LdaModel(corpus, num_topics=9, id2word=dictionary, passes=30, alpha='auto', eta='auto')

done with tokenization


In [87]:
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        print("one down " + str(coherencemodel.get_coherence()))

    return model_list, coherence_values

limit = 100
start = 6
step = 1
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=tokenized_docs, start=start, limit=limit, step=step)

""" one down 0.642589970020583
one down 0.6502093985636616
one down 0.6510404178804576
one down 0.6660971370188418
one down 0.6458128396803969
one down 0.6132130304007789
one down 0.6628177038804354
one down 0.6113108646887729
one down 0.6296642810987029
one down 0.63816981544017
one down 0.6087262026271116
one down 0.6076305554180648
one down 0.630488733617124
one down 0.6015620371461161
one down 0.5868751652017856
one down 0.6096635838127193
one down 0.5909888093667274
one down 0.5791382156202213
one down 0.5840097174412971
one down 0.5773682550453793
one down 0.563563410486932
one down 0.571377349271367
one down 0.5864471166759112
one down 0.5621831752863081
one down 0.5583385557577716
one down 0.5496057495516775
one down 0.5292327572478748
one down 0.562755471154976
"""

one down 0.642589970020583
one down 0.6502093985636616
one down 0.6510404178804576
one down 0.6660971370188418
one down 0.6458128396803969
one down 0.6132130304007789
one down 0.6628177038804354
one down 0.6113108646887729
one down 0.6296642810987029
one down 0.63816981544017
one down 0.6087262026271116
one down 0.6076305554180648
one down 0.630488733617124
one down 0.6015620371461161
one down 0.5868751652017856
one down 0.6096635838127193
one down 0.5909888093667274
one down 0.5791382156202213
one down 0.5840097174412971
one down 0.5773682550453793
one down 0.563563410486932
one down 0.571377349271367
one down 0.5864471166759112
one down 0.5621831752863081
one down 0.5583385557577716
one down 0.5496057495516775
one down 0.5292327572478748
one down 0.562755471154976
one down 0.5508394870554895
one down 0.5391587077881534
one down 0.5414784564631206
one down 0.5241070450818176
one down 0.5246567103751271
one down 0.5077756227494503
one down 0.5032786127082203
one down 0.5074577460413144

KeyboardInterrupt: 

In [89]:
print(coherence_values)

NameError: name 'coherence_values' is not defined

In [None]:
max(coherence_values)

In [90]:
lda_model.print_topics()

[(0,
  '0.039*"work" + 0.029*"possible" + 0.027*"change" + 0.027*"website" + 0.026*"account" + 0.025*"online" + 0.024*"problems" + 0.023*"portal" + 0.020*"data" + 0.020*"doesnt"'),
 (1,
  '0.071*"meter" + 0.051*"information" + 0.046*"invoice" + 0.033*"receive" + 0.032*"reading" + 0.030*"payment" + 0.028*"invoices" + 0.019*"innogy" + 0.017*"date" + 0.017*"despite"'),
 (2,
  '0.068*"electricity" + 0.053*"price" + 0.039*"pay" + 0.031*"bill" + 0.031*"prices" + 0.028*"gas" + 0.026*"consumption" + 0.024*"high" + 0.023*"expensive" + 0.023*"month"'),
 (3,
  '0.104*"time" + 0.082*"service" + 0.064*"long" + 0.042*"bad" + 0.034*"waiting" + 0.027*"took" + 0.027*"better" + 0.021*"communication" + 0.019*"lot" + 0.018*"connection"'),
 (4,
  '0.081*"energy" + 0.066*"recommend" + 0.058*"company" + 0.050*"never" + 0.041*"supplier" + 0.031*"anyone" + 0.025*"cheaper" + 0.022*"i" + 0.020*"reason" + 0.019*"talk"'),
 (5,
  '0.058*"get" + 0.043*"phone" + 0.041*"cant" + 0.040*"contact" + 0.032*"call" + 0.027*"

In [51]:
len(dictionary)

7066

In [83]:
print(lda_model.alpha)
print(lda_model.eta)

[1.0266428  0.78861666 0.55877835 0.90464956 0.5539185  1.2918285
 0.5030147  1.2567945  0.8071133  0.511982   0.9265341  0.5816102
 0.5410908  0.48195133 0.6177211  0.5481538  1.2797472  0.516306
 0.54690194 0.7790932  1.1622723  0.9158395  1.0866107  0.45116982
 0.50072736 0.6487464  0.8637664  0.5093255  0.7529178  0.8991492
 0.9021323  0.4988445  0.6242504  0.5288439  0.56202847 0.45858353
 0.78469324 0.5288013  0.9339758  1.6888546 ]
[0.02285878 0.02225907 0.02284269 ... 0.02225907 0.02225907 0.02225907]


In [91]:
coherence_model = CoherenceModel(model=lda_model, texts=tokenized_docs, dictionary=dictionary)
coherence_lda = coherence_model.get_coherence()
print(coherence_lda)

0.6555456036113928


In [92]:
# Visualization
# Save the LDA visualization as an HTML file
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_visualization.html')

# Optionally, automatically open the HTML file in the default web browser
os.system('open lda_visualization.html')  # For MacOS

0

In [74]:
import bitermplus as btm

# PREPROCESSING
docs = dataset['translated_comment_preprocessed'].str.strip().tolist()
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(docs)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(docs, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

In [78]:
# INITIALIZING AND RUNNING MODEL
model = btm.BTM(X, vocabulary, seed=12321, T=20, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

# METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)

model.labels_

100%|██████████| 20/20 [00:24<00:00,  1.24s/it]
100%|██████████| 109961/109961 [00:00<00:00, 150667.82it/s]


array([ 4, 11,  8, ..., 19, 17,  8])

In [79]:
import tmplot as tmp
tmp.report(model=model, docs=docs)

Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(
Run `pip install tomotopy` in the console.
  warn(


VBox(children=(VBox(children=(HBox(children=(HTML(value='<b>Select a topic</b>:'), Dropdown(options=((0, 0), (…

In [80]:
print(dataset['comment_length'].mean())

65.3240512545357


In [95]:
import tomotopy

corpus = tokenized_docs
model_tom = tomotopy.LDAModel(k=9, min_df=1)
for token_list in corpus: model_tom.add_doc(token_list)
for i in range (0, 1000, 100): model_tom.train(100)
model_tom.summary()

<Basic Info>
| LDAModel (current version: 0.12.7)
| 109961 docs, 1018575 words
| Total Vocabs: 25936, Used Vocabs: 25936
| Entropy of words: 7.35992
| Entropy of term-weighted words: 7.35992
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -7.75408
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 1 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 9 (the number of topics between 1 ~ 32767)
| alpha: [0.1] (hyperparameter of Dirichlet distribution for document-topic, given as a single `float` in case of symmetric prior and as a list with length `k` of `float` in case of asymmetric prior.)
| eta: 0.01 (hyperparameter of Dirichlet distribution for topic-word)
| seed: 4062820885 (random seed)
| trained in version 0.12.7
|
<Parameters>
| alpha (Dirichlet prior on the per-document topic distributions)
|  [0.15

In [98]:
import numpy as np

topic_term_dists = np.stack([model_tom.get_topic_word_dist(k) for k in range(model_tom.k)])
doc_topic_dists = np.stack([doc.get_topic_dist() for doc in model_tom.docs])
doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
doc_lengths = np.array([len(doc.words) for doc in model_tom.docs])
vocab = list(model_tom.used_vocabs)
term_frequency = model_tom.used_vocab_freq

prepared_data = pyLDAvis.prepare(
    topic_term_dists,
    doc_topic_dists,
    doc_lengths,
    vocab,
    term_frequency,
    start_index=0, # tomotopy starts topic ids with 0, pyLDAvis with 1
    sort_topics=False # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching!
)

pyLDAvis.save_html(prepared_data, 'tomo_lda_visualization.html')

# Optionally, automatically open the HTML file in the default web browser
os.system('open tomo_lda_visualization.html')

0

In [96]:
# If we're interested in time slices https://github.com/bab2min/tomotopy/blob/main/examples/dtm.py
model_dtm = tomotopy.DTModel(k=9)
for token_list in corpus: model_dtm.add_doc(token_list)
for i in range (0, 1000, 100): model_dtm.train(100)
model_dtm.summary()


<Basic Info>
| DTModel (current version: 0.12.7)
| 109961 docs, 1018575 words
| Total Vocabs: 25936, Used Vocabs: 25936
| Entropy of words: 7.35992
| Entropy of term-weighted words: 7.35992
| Removed Vocabs: <NA>
|
<Training Info>
| Iterations: 1000, Burn-in steps: 0
| Optimization Interval: 10
| Log-likelihood per word: -8.38749
|
<Initial Parameters>
| tw: TermWeight.ONE
| min_cf: 0 (minimum collection frequency of words)
| min_df: 0 (minimum document frequency of words)
| rm_top: 0 (the number of top words to be removed)
| k: 9 (the number of topics between 1 ~ 32767)
| t: 1 (the number of timpoints)
| alpha_var: 0.1 (transition variance of alpha (per-document topic distribution))
| eta_var: 0.1 (variance of eta (topic distribution of each document) from its alpha )
| phi_var: 0.1 (transition variance of phi (word distribution of each topic))
| lr_a: 0.01 (shape parameter `a` greater than zero, for SGLD step size calculated as `e_i = a * (b + i) ^ (-c)`)
| lr_b: 0.1 (shape parameter

In [276]:
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN

dataset_bert = pd.read_csv('cleaned_dataset (1).csv')
dataset_bert = dataset_bert[dataset_bert["NPS"] <= 6]
#sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
sentence_model = SentenceTransformer('all-mpnet-base-v2')
# Create a BERTopic instance
hdbscan_model = HDBSCAN(min_cluster_size=200)
umap_model = UMAP(random_state=1)
model_bert = BERTopic(language="english", n_gram_range=(1, 3), embedding_model=sentence_model, hdbscan_model=hdbscan_model, umap_model=umap_model, vectorizer_model=CountVectorizer(stop_words='english'))


# Fit the model on your documents
topics, probabilities = model_bert.fit_transform(dataset_bert["translated_comment"])

# Explore the topics
for topic in model_bert.get_topic_info().head(5).itertuples():
    print(f"Topic {topic.Topic}: {topic.Name}")

# To visualize topics
model_bert.visualize_topics()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Topic -1: -1_customer_answer_price_don
Topic 0: 0_eon_contract_customer_electricity
Topic 1: 1_essent_years_customer_customers
Topic 2: 2_invoice_invoices_receive_payment
Topic 3: 3_contract_agreement_new_contracts


In [277]:
model_bert.get_topic_info()
df_BERTopic = pd.DataFrame(model_bert.get_topic_info())
df_BERTopic.to_csv('df_BERTopic_mpnet.csv')

In [278]:
model_bert.generate_topic_labels()

['-1_customer_answer_price',
 '0_eon_contract_customer',
 '1_essent_years_customer',
 '2_invoice_invoices_receive',
 '3_contract_agreement_new',
 '4_phone_minutes_service',
 '5_price_prices_suppliers',
 '6_long_process_took',
 '7_meter_reading_readings',
 '8_payment_installment_euros',
 '9_bills_billing_high',
 '10_recommend_recommendations_recommending',
 '11_energy_supplier_recommend',
 '12_chat_chatbot_robot',
 '13_power_outage_announced',
 '14_website_site_page',
 '15_customers_customer_regular',
 '16_accessible_reduce_poorly',
 '17_communication_conversation_bad',
 '18_connection_email_notification',
 '19_gas_electricity_contract',
 '20_choice_decide_choose',
 '21_password_log_login',
 '22_electricity_energy_price',
 '23_question_answer_questions',
 '24_service_customer_bad',
 '25_request_response_requests',
 '26_information_confusing_unclear',
 '27_contact_phone_touch',
 '28_look_amateurish_',
 '29_rating_influenced_score',
 '30_consumption_manager_app',
 '31_electricity_lower_gr

In [279]:
model_bert.get_representative_docs(16)

['Poorly accessible', 'poorly accessible.', 'poorly accessible']

In [280]:
barchart_bert = model_bert.visualize_barchart(width=280, height=330, top_n_topics=20, n_words=5)
barchart_bert

In [281]:
barchart_bert.write_html('barchart_bertopic.html')

In [289]:
barchart_bert

In [282]:
heatmap_bert = model_bert.visualize_heatmap()
heatmap_bert.write_html('heatmap_bert.html')
heatmap_bert

In [283]:
model_bert.visualize_hierarchy()

In [284]:
print(model_bert.get_topic(0))
print(model_bert.get_representative_docs(0))
print(model_bert.get_topic_info(0))

[('eon', 0.06821379794480599), ('contract', 0.012875217179232635), ('customer', 0.01138633464690117), ('electricity', 0.009766549370207458), ('new', 0.009074282696778164), ('service', 0.00852755581386713), ('contracts', 0.008322318376377452), ('phone', 0.007837979709745996), ('don', 0.007797988448291199), ('months', 0.007645267608933185)]
["don't know why he has eon", "I'm new to eon", 'Eon']
   Topic  Count                                 Name  \
0      0   5051  0_eon_contract_customer_electricity   

                                      Representation  \
0  [eon, contract, customer, electricity, new, se...   

                                Representative_Docs  
0  [don't know why he has eon, I'm new to eon, Eon]  


In [285]:
dataset_bert["topic"] = topics

In [286]:
dataset_bert

Unnamed: 0,ID,interview_date,country,NPS,translated_comment,customer_type,comment_length,translated_comment_preprocessed,topic
0,1,2022-05-11,Germany,0.0,My mother's contract change did not go through...,detractors,618,mother contract change not go expected bank de...,0
1,2,2022-07-19,Poland,0.0,ebok does not work - no access to invoices,detractors,42,ebok not work no access invoice,2
8,9,2022-07-15,Poland,1.0,The contract I got,detractors,18,contract got,3
12,13,2022-12-15,Romania,5.0,it took too long,detractors,16,took long,-1
14,15,2022-07-12,Poland,0.0,"No invoices sent, total inability to take over...",detractors,98,no invoice sent total inability take new clien...,-1
...,...,...,...,...,...,...,...,...,...
491398,503943,2022-10-23,Netherlands,2.0,Waiting time when calling: / hours. Finally c...,detractors,155,waiting time calling hour finally contacted di...,4
491399,503944,2022-10-04,Netherlands,4.0,I never recommend companies.,detractors,28,never recommend company,-1
491400,503945,2022-10-02,Netherlands,0.0,no,detractors,2,no,56
491403,503948,2022-10-15,Netherlands,6.0,Same as above and unable to log in immediately...,detractors,65,unable log immediately due busy line,21


In [287]:
dataset_bert.to_csv('dataset_plus_BERTopic_mpnet.csv')

In [288]:
model_bert.hierarchical_topics(dataset_bert["translated_comment"])

100%|██████████| 67/67 [00:00<00:00, 212.83it/s]


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
66,134,essent_eon_contract_time_customer,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",132,essent_eon_contract_time_customer,133,recommend_price_energy_electricity_prices,1.805246
65,133,recommend_price_energy_electricity_prices,"[5, 10, 11, 16, 20, 22, 28, 31, 40, 41, 43, 44...",130,recommend_energy_cena_know_choice,126,price_prices_electricity_high_expensive,1.484294
64,132,essent_eon_contract_time_customer,"[0, 1, 2, 3, 4, 6, 7, 8, 9, 12, 13, 14, 15, 17...",131,essent_eon_contract_meter_invoice,129,chat_website_phone_question_answer,1.417962
63,131,essent_eon_contract_meter_invoice,"[0, 1, 2, 3, 6, 7, 8, 9, 13, 15, 18, 19, 25, 2...",124,power_outage_long_time_took,127,essent_eon_contract_meter_invoice,1.327671
62,130,recommend_energy_cena_know_choice,"[10, 11, 20, 28, 40, 41, 43, 44, 46, 49, 51, 5...",128,cena_nee_accessibility_annual_know,103,recommend_energy_choice_decide_recommendations,1.295579
...,...,...,...,...,...,...,...,...
4,72,connection_email_connect_notification_connected,"[18, 64]",64,electricity_connection_electrician_connect_took,18,connection_email_notification_connect_mail,0.518839
3,71,contract_contracts_new_agreement_year,"[3, 33]",3,contract_agreement_new_contracts_extend,33,contracts_contract_add_account_new,0.461000
2,70,eon_gas_contract_electricity_customer,"[0, 19]",0,eon_contract_customer_electricity_new,19,gas_electricity_contract_energiedirect_energy,0.407674
1,69,electricity_energy_price_prices_lower,"[22, 31]",31,electricity_lower_grid_prices_cheaper,22,electricity_energy_price_high_prices,0.387053
