In [9]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Load the CSV file with tokenized content
df = pd.read_csv("out.csv")

# Create a list of tokenized documents
tokenized_docs = [doc.split() for doc in df["processed_content"].values]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_docs)

# Filter out tokens that appear in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Set number of topics
num_topics = 10

# Build the LDA model
lda_model = LdaMulticore(corpus=corpus,
                         id2word=dictionary,
                         num_topics=num_topics,
                         workers=3,  # Adjust based on your system
                         passes=100)  # Number of passes through the corpus

# Print the top 5 terms for each topic
for idx, topic in lda_model.print_topics(-1):
    terms = topic.split("+")
    terms = [term.split("*")[1].strip().replace('"', '') for term in terms][:5]
    print("Topic {}: {}".format(idx, ", ".join(terms)))

# Save the model if needed
# lda_model.save("lda_model")


Topic 0: iunie, ora, 'cale',, 'zona',, ',
Topic 1: 'Ucraina',, 'Rusia',, 'rus',, 'atac',, 'razboi',
Topic 2: miliard, leu',, 'ANAF',, leu, 'buget',
Topic 3: 'greva',, 'guvern',, leu',, 'salarizare',, 'profesor',
Topic 4: 'roman',, 'persoana',, an',, 'frontiera',, 'suma',
Topic 5: 'scoala',, 'veni',, 'loc',, 'pleca',, 'face',
Topic 6: 'minister',, 'numar',, 'clasa',, 'vizita',, 'data',
Topic 7: 'femeie',, an',, 'barbat',, 'politie',, 'el',
Topic 8: 'cookie',, '-urile',, 'european',, 'direct',, 'el',
Topic 9: 'spune',, 'el',, 'medical',, 'lege',, 'special',


Document 1:
Topic 3: 0.9931

Document 2:
Topic 4: 0.9873

Document 3:
Topic 9: 0.9727

Document 4:
Topic 1: 0.3132
Topic 5: 0.1982
Topic 9: 0.4763

Document 5:
Topic 3: 0.9942

Document 6:
Topic 3: 0.9977

Document 7:
Topic 6: 0.9898

Document 8:
Topic 1: 0.1098
Topic 3: 0.0187
Topic 4: 0.7498
Topic 7: 0.1159

Document 9:
Topic 8: 0.9852

Document 10:
Topic 6: 0.5791
Topic 9: 0.4123

Document 11:
Topic 3: 0.9945

Document 12:
Topic 4: 0.4094
Topic 6: 0.3078
Topic 7: 0.0926
Topic 9: 0.1843

Document 13:
Topic 9: 0.9928

Document 14:
Topic 6: 0.3048
Topic 9: 0.6884

Document 15:
Topic 3: 0.9870

Document 16:
Topic 1: 0.3121
Topic 2: 0.5346
Topic 3: 0.1463

Document 17:
Topic 4: 0.9857

Document 18:
Topic 3: 0.9931

Document 19:
Topic 7: 0.1139
Topic 8: 0.8741

Document 20:
Topic 6: 0.9816

Document 21:
Topic 2: 0.0707
Topic 3: 0.5921
Topic 6: 0.0654
Topic 9: 0.2682

Document 22:
Topic 1: 0.1879
Topic 5: 0.4247
Topic 8: 0.3800

Document 23:
Topic 1: 0.7567
Topic 8: 0.2408

Document 24:
To