# Lab 9.1

Name: Muhammad Bazly Bin Burhan
Student ID: SW01081224

## Example 1

### 1. Import Required Libraries

In [1]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# For topic modeling
from gensim import corpora
from gensim.models import LdaModel

import pandas as pd

# Download the NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kucingbunting/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kucingbunting/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kucingbunting/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2. Load the Data

In [2]:
# Load the data
documents = [
 "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
 "Rafael Nadal Is Out of the Australian Open",
 "Biden Announces Virus Measures",
 "Biden's Virus Plans Meet Reality",
 "Where Biden's Virus Plan Stands"
]

### 3. Do Data Preprocessing

In [3]:
# Do data preprocessing on the documents
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower()) # Tokenize the text into words and convert to lowercase
    tokens = [token for token in tokens if token.isalnum()] # Filter out non-alphanumeric tokens
    tokens = [token for token in tokens if token not in stop_words] # Filter out stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Lemmatize the tokens
    return tokens

preprocessed_documents = [preprocess_text(doc) for doc in documents]
preprocessed_documents


[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

### 4. Crate a Document-Term Matrix

In [4]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)

# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

### 5. Run LDA

In [5]:
# corpus: bag of words representation of the documents
# num_topics: number of topics to be extracted by the model
# id2word=dictionary: dictionary mapping form word IDs to words
# passes: number of passes through the corpus during training

# Train the LDA model on the corpus with 2 topics using Gensim's LdaModel class
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15)

### 6. Interpret the Results

In [6]:
# Empty list to store dominant topic labels for each document
article_labels = []

#  Iterate over the preprocessed documents
for i, doc in enumerate(preprocessed_documents):
    # Fore each document, convert to box representation
    bow = dictionary.doc2bow(doc)
    # Get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # Determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # Append to the list
    article_labels.append(dominant_topic)

In [7]:
# Create DataFrame
df = pd.DataFrame({'Document': documents, 'Topic': article_labels})

# Display the DataFrame
print("Table with Articles and Topic:")
print(df)

Table with Articles and Topic:
                                            Document  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      1
1         Rafael Nadal Is Out of the Australian Open      1
2                     Biden Announces Virus Measures      0
3                   Biden's Virus Plans Meet Reality      0
4                    Where Biden's Virus Plan Stands      0


In [8]:
# Print the top terms for each topic
print("Top terms for each topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top terms for each topic:
Topic 0:
- "biden" (weight: 0.166)
- "virus" (weight: 0.166)
- "plan" (weight: 0.119)
- "reality" (weight: 0.071)
- "stand" (weight: 0.071)
- "measure" (weight: 0.071)
- "announces" (weight: 0.071)
- "meet" (weight: 0.071)
- "australian" (weight: 0.024)
- "open" (weight: 0.024)

Topic 1:
- "nadal" (weight: 0.131)
- "rafael" (weight: 0.131)
- "open" (weight: 0.131)
- "federer" (weight: 0.079)
- "join" (weight: 0.079)
- "missing" (weight: 0.079)
- "roger" (weight: 0.079)
- "australian" (weight: 0.079)
- "virus" (weight: 0.027)
- "biden" (weight: 0.027)



### 7. Calculate Coherence Score

In [12]:
# import library for Coherence Score
from gensim.models.coherencemodel import CoherenceModel

# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# Display the score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.3801
