In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models import LdaModel

# Fetch the NLTK data packages this notebook relies on; a package that is
# already installed is reported as up-to-date rather than downloaded again.
# 'stopwords' backs stopwords.words() and 'wordnet' backs WordNetLemmatizer;
# 'punkt' is presumably the tokenizer model behind word_tokenize — confirm
# against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Toy corpus: five news headlines that serve as the raw input documents.
documents = [
 "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
 "Rafael Nadal Is Out of the Australian Open",
 "Biden Announces Virus Measures",
 "Biden's Virus Plans Meet Reality",
 "Where Biden's Virus Plan Stands"
]

In [5]:
# English stop words held in a set, used for membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every call to preprocessed_text.
lemmatizer = WordNetLemmatizer()

def preprocessed_text(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized. Tokens that are not purely
    alphanumeric, or that appear in ``stop_words``, are discarded; every
    remaining token is passed through ``lemmatizer``.

    Args:
        text: The raw document string.

    Returns:
        A list of lemmatized tokens in their original order.
    """
    cleaned = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation-like tokens and stop words in one guard.
        if not word.isalnum() or word in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

# Run the cleaning pipeline over every headline, then display the token lists.
preprocessed_documents = list(map(preprocessed_text, documents))
preprocessed_documents

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

In [9]:
# Build a gensim Dictionary (the vocabulary) from the preprocessed documents.
dictionary = corpora.Dictionary(preprocessed_documents)

# Encode each preprocessed document as a bag-of-words vector using that dictionary.
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

In [12]:
# Train an LDA topic model on the corpus with gensim's LdaModel class.
#   corpus       : bag-of-words representation of the documents
#   num_topics   : number of topics the model extracts (2 here)
#   id2word      : dictionary mapping word ids back to words
#   passes       : number of passes through the corpus during training
#   random_state : fixed seed, so that re-running the notebook reproduces the
#                  same topic ids and weights (LDA training is stochastic)
lda_model = LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [16]:
# Dominant (highest-probability) topic id for each document, in document order.
#
# `corpus` already holds the bag-of-words vector of every preprocessed document,
# so it is reused here rather than calling doc2bow a second time.
#
# minimum_probability=0.0 asks gensim not to drop low-probability topics from
# the returned list; with the default cutoff the list could come back empty for
# models with many topics, and max() would then raise ValueError.
article_labels = [
    max(
        lda_model.get_document_topics(bow, minimum_probability=0.0),
        key=lambda topic_prob: topic_prob[1],  # compare by probability
    )[0]  # keep only the topic id
    for bow in corpus
]

In [18]:
import pandas as pd
# Pair every original headline with the id of its dominant topic.
df = pd.DataFrame(dict(Article=documents, Topic=article_labels))

# Print a heading, the table, and a trailing blank line.
print("Table with Articles and Topic:", df, "", sep="\n")

Table with Articles and Topic:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      0
1         Rafael Nadal Is Out of the Australian Open      0
2                     Biden Announces Virus Measures      1
3                   Biden's Virus Plans Meet Reality      1
4                    Where Biden's Virus Plan Stands      1



In [19]:
# Print the top terms for each topic.
#
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs,
# so words and weights are read directly instead of being parsed back out of
# the human-readable '0.127*"word" + ...' string that print_topics() builds
# (splitting that string on "+" / "*" breaks if a token contains either).
# num_topics=-1 selects every topic; num_words=10 matches print_topics()'s default.
print("Top terns for each Topic:")
for idx, word_weights in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False):
    print(f"Topic {idx}:")
    for word, weight in word_weights:
        # Quoted word and 3-decimal weight mirror the print_topics() layout.
        print(f'- "{word}" (weight: {weight:.3f})')
    print()

Top terns for each Topic:
Topic 0:
- "rafael" (weight: 0.127)
- "open" (weight: 0.127)
- "nadal" (weight: 0.127)
- "missing" (weight: 0.076)
- "federer" (weight: 0.076)
- "join" (weight: 0.076)
- "roger" (weight: 0.076)
- "australian" (weight: 0.076)
- "measure" (weight: 0.036)
- "announces" (weight: 0.036)

Topic 1:
- "biden" (weight: 0.167)
- "virus" (weight: 0.167)
- "plan" (weight: 0.122)
- "meet" (weight: 0.073)
- "reality" (weight: 0.073)
- "stand" (weight: 0.073)
- "announces" (weight: 0.063)
- "measure" (weight: 0.063)
- "australian" (weight: 0.025)
- "nadal" (weight: 0.025)

