In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from gensim.models.coherencemodel import CoherenceModel

# Read the corpus: each line of the text file is treated as one document
with open('mytextfile.txt', mode='r', encoding='ISO-8859-1') as corpus_file:
    data = list(corpus_file)

# Preprocess the text data
def preprocess(text):
    """Tokenize ``text`` and keep only the informative tokens.

    Tokens are produced by gensim's ``simple_preprocess``. A token is kept
    when it is longer than three characters and is not a member of gensim's
    ``STOPWORDS`` set.

    Returns:
        list: the surviving tokens, in the order they were produced.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

# Tokenize and filter every document (one token list per input line)
processed_data = list(map(preprocess, data))

# Build the token <-> integer-id vocabulary from the processed documents
dictionary = corpora.Dictionary(processed_data)

# Bag-of-words corpus: one doc2bow() result per processed document
doc_term_matrix = list(map(dictionary.doc2bow, processed_data))

# Train the LDA model
num_topics = 10
# LDA training is stochastic: without a fixed seed the topics (and the
# coherence score computed from them) change on every Restart & Run All.
random_seed = 42
lda_model = LdaModel(
    doc_term_matrix,
    num_topics=num_topics,
    id2word=dictionary,
    passes=10,
    random_state=random_seed,
)

# Interpret the topics (-1 asks gensim for every topic, not just a sample)
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

# Score the trained model with the 'c_v' coherence measure, computed from
# the tokenized documents and the shared dictionary
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_data,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


FileNotFoundError: ignored

In [None]:
# Visualize the topics
import pyLDAvis

# pyLDAvis 3.x renamed the gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell works on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Only the last expression of a cell is rendered, so a single prepare/display
# pair is kept. sort_topics=False preserves the model's own topic order
# instead of re-sorting topics by prevalence, which keeps the panel in the
# same order as the topics printed from `lda_model`.
lda_display = gensimvis.prepare(lda_model, doc_term_matrix, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)


ModuleNotFoundError: ignored

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# import scipy.cluster.hierarchy as shc

# # Create a list of high-frequency words
# freq_words = [token for doc in processed_data for token in doc if dictionary.doc2bow([token])[0][1] > 50]

# # Sort the dictionary by frequency of occurrence
# dict_sorted = sorted(dictionary.items(), key=lambda x: x[1], reverse=True)

# # Get the top 10 high-frequency words from the sorted dictionary
# top_words = [word for word, freq in dict_sorted[:10]]

# # Create a document-term matrix for high-frequency words
# doc_term_matrix_freq = [[dictionary.doc2bow([token])[0][0] for token in doc if token in freq_words] for doc in processed_data]

# # Convert the document-term matrix to a numpy array
# doc_term_matrix_freq_np = np.zeros((len(doc_term_matrix_freq), len(freq_words)))
# for i, doc in enumerate(doc_term_matrix_freq):
#     for j, freq_word in enumerate(freq_words):
#         if freq_word in doc:
#             doc_term_matrix_freq_np[i, j] = 1

# # Compute the linkage matrix
# Z = shc.linkage(doc_term_matrix_freq_np, method='ward', optimal_ordering=True)

# # Create a dendrogram of linkages
# fig, ax = plt.subplots(figsize=(8, 6))
# plt.title("Dendrogram of Linkages for High-Frequency Words")
# dend = shc.dendrogram(Z, labels=[f"Doc {i}" for i in range(len(doc_term_matrix_freq))], orientation='right', leaf_font_size=10)
# plt.xlabel("Distance")
# plt.ylabel("Document")
# plt.savefig("dendrogram.png", bbox_inches='tight')
# plt.show()

In [None]:
from collections import Counter

import seaborn as sns
import pandas as pd

# Number of most frequent words shown as heatmap columns
TOP_N_WORDS = 50

# Get the word frequencies over the whole corpus
word_frequencies = Counter(word for doc in processed_data for word in doc)

# Get the top 50 most frequent words (ties keep first-seen order, exactly as a
# stable descending sort of the frequency table would)
top_words = [word for word, _ in word_frequencies.most_common(TOP_N_WORDS)]

# Create a document-term matrix for the top words.
# Each document is counted once up front instead of being rescanned with
# doc.count() for every one of the top words.
top_doc_term_matrix = [
    [doc_counts[word] for word in top_words]
    for doc_counts in map(Counter, processed_data)
]
df = pd.DataFrame(top_doc_term_matrix, columns=top_words)

# Create the heatmap: rows are documents, columns are the top words
ax = sns.heatmap(df)
ax.set(
    title=f"Per-document counts of the {TOP_N_WORDS} most frequent words",
    xlabel="Word",
    ylabel="Document index",
);

NameError: ignored