In [1]:
import numpy as np
import pandas as pd

import gensim

from datetime import datetime

from IPython.core.display import HTML
import tba3102



# Inject a CSS rule so <pre> output keeps its whitespace as-is (no soft wrapping).
NO_WRAP_PRE_CSS = "<style>pre { white-space: pre !important; }</style>"
display(HTML(NO_WRAP_PRE_CSS))

# Project helper; presumably forwards these values to pandas display options
# (verify against tba3102).
tba3102.set_default_pandas_options(
    max_columns=16,
    width=1500,
    max_colwidth=300,
)

# Record the wall-clock start so it can be compared with the end timestamp printed last.
print(f'Text processing started at {datetime.now()}')

Text processing started at 2023-03-19 10:46:50.931787


In [2]:
# Load the cleaned dataset (first CSV column is the index) and tokenize every
# cleaned description with the project helper.
# NOTE(review): pandas reads empty CSV cells back as NaN; confirm that
# tokenize_sentence_to_words tolerates non-string input.
df = pd.read_csv('../data/voted-kaggle-dataset-cleaned.csv', index_col=0)
documents = [
    tba3102.tokenize_sentence_to_words(description)
    for description in df['Cleaned_Description']
]

# Learn frequent two-word collocations and freeze them into a lightweight
# transformer that joins detected pairs with '_'.
bigram = gensim.models.Phrases(
    documents,
    min_count=20,
    threshold=20,  # higher threshold fewer phrases.
    delimiter='_',
)
bigram_model = gensim.models.phrases.Phraser(bigram)

# Re-tokenize each document with the detected phrases merged in.
corpus_bigrams = [bigram_model[tokens] for tokens in documents]

# Build the vocabulary, drop tokens by document frequency (no_below / no_above),
# then convert every document into its bag-of-words representation.
dictionary = gensim.corpora.Dictionary(corpus_bigrams)
dictionary.filter_extremes(no_below=20, no_above=0.6)
bow_corpus = [dictionary.doc2bow(tokens) for tokens in corpus_bigrams]

In [3]:
# Topic-model configuration, passed to tba3102.topic_model_coherence_generator
# in the training cell.
MODEL_NAME: str = 'LDA'        # forwarded as model_name

# Topic-count range to train over (forwarded as start_topic_count /
# end_topic_count). Both are 3, so a single model is trained.
START_TOPIC_COUNT: int = 3
END_TOPIC_COUNT: int = 3

In [4]:
# Train one model per topic count in [START_TOPIC_COUNT, END_TOPIC_COUNT] via
# the project helper. coherence_scores is returned alongside the models but is
# not used further in this cell.
models, coherence_scores = tba3102.topic_model_coherence_generator(
    model_name=MODEL_NAME,
    corpus=bow_corpus,
    texts=corpus_bigrams,
    dictionary=dictionary,
    start_topic_count=START_TOPIC_COUNT,
    end_topic_count=END_TOPIC_COUNT,
    step=1,
    cpus=-1,
    print_topics=False)

# Use the first returned model.
# NOTE(review): if START_TOPIC_COUNT != END_TOPIC_COUNT several models are
# returned; confirm whether the one with the best coherence should be used.
model = models[0]

# Number of highest-weighted terms used to describe each topic.
TOP_TERMS_PER_TOPIC = 7

# For each topic: its top terms with weights rounded to 3 decimals.
topics = [[(term, round(wt, 3))
           for term, wt in model.show_topic(topic_no, topn=TOP_TERMS_PER_TOPIC)]
          for topic_no in range(model.num_topics)]

# One row per topic holding its top terms as a comma-separated description.
# (The earlier transposed Term1..Term7 frame was overwritten before any use
# and has been removed.)
topics_df = pd.DataFrame([', '.join(term for term, _ in topic) for topic in topics],
                         columns=['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, model.num_topics+1)])

# Per-document topic distribution: for each document, (topic_id, weight) pairs.
tm_results = model[bow_corpus]

# Dominant topic of each document = the pair with the highest weight.
# max() returns the first maximal pair, matching the previous stable
# descending sort followed by [0].
corpus_topics = [max(doc_topics, key=lambda record: record[1])
                 for doc_topics in tm_results]

# Build the result frame in one step. 'Document' holds the tokenized document
# and stays the first column, exactly as before (the placeholder row numbers
# that used to be written to it were always overwritten). The default
# RangeIndex still provides the document number in the CSV.
corpus_topic_df = pd.DataFrame({
    'Document': documents,
    'Dominant Topic': [topic_id + 1 for topic_id, _ in corpus_topics],   # 1-based label
    'Contribution %': [round(weight * 100, 2) for _, weight in corpus_topics],
    'Topic Desc': [topics_df.iloc[topic_id]['Terms per Topic']
                   for topic_id, _ in corpus_topics],
})

corpus_topic_df.to_csv('../data/corpus_topic_best.csv')

  0%|                                                                                                                                                        | 0/1 [00:00<?, ?it/s]


MODEL: LDA - NUMBER OF TOPICS: 3


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:07<00:00,  7.56s/it]


In [5]:
# Record the wall-clock end of the run (pairs with the start timestamp printed in the first cell).
print(f'Text processing ended at {datetime.now()}')

Text processing ended at 2023-03-19 10:47:02.458697
