In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import CoherenceModel
import string

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\norar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\norar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\norar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('news_dataset.csv')
print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")

Dataset loaded with 11314 rows and 5 columns.


In [5]:
texts = df['text'].dropna().tolist()
print(f"Number of non-null texts: {len(texts)}")

Number of non-null texts: 11096


In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [9]:
def preprocess(text):
    # Tokenize text
    tokens = nltk.word_tokenize(text.lower())
    # Remove punctuation and stop words, and lemmatize the words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

In [11]:
processed_texts = [preprocess(text) for text in texts]
print(f"Sample preprocessed text: {processed_texts[0][:10]}")

Sample preprocessed text: ['wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'looked']


In [13]:
dictionary = corpora.Dictionary(processed_texts)
print(f"Number of unique tokens before filtering: {len(dictionary)}")

Number of unique tokens before filtering: 55771


In [15]:
dictionary.filter_extremes(no_below=15, no_above=0.5)
print(f"Number of unique tokens after filtering: {len(dictionary)}")

Number of unique tokens after filtering: 6229


In [17]:
corpus = [dictionary.doc2bow(text) for text in processed_texts]
print(f"Sample BOW for first document: {corpus[0][:10]}")

Sample BOW for first document: [(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 4), (6, 1), (7, 1), (8, 1), (9, 1)]


In [19]:
lda_model = models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)
print("LDA model training complete.")

LDA model training complete.


In [21]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.6057639447044583


In [23]:
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.071*"x" + 0.060*"q" + 0.052*"max" + 0.034*"g" + 0.034*"r" + 0.029*"p" + 0.027*"n" + 0.019*"k" + 0.019*"w" + 0.018*"v"
Topic 1: 0.011*"would" + 0.010*"one" + 0.009*"get" + 0.008*"like" + 0.007*"know" + 0.007*"year" + 0.006*"time" + 0.006*"good" + 0.006*"game" + 0.005*"think"
Topic 2: 0.011*"people" + 0.010*"would" + 0.008*"one" + 0.006*"think" + 0.006*"say" + 0.005*"god" + 0.005*"government" + 0.005*"u" + 0.005*"right" + 0.005*"know"
Topic 3: 0.013*"key" + 0.009*"use" + 0.009*"system" + 0.009*"file" + 0.007*"program" + 0.007*"chip" + 0.007*"information" + 0.006*"db" + 0.006*"encryption" + 0.005*"available"


In [None]:
# Nor Ardini Arwan - IS01082907
# Alisa Nadia bt Ahmad Nizrasham - SW01081353

# The coherence score of 0.606 for the LDA model indicates a moderately good level of topic coherence. 
# This suggests that the topics are fairly well-defined and understandable, though there is still potential for improvement. 
# For example, Topic 0 includes less meaningful words ("x," "q," "max"), whereas Topics 1, 2, and 3 contain more interpretable words related to themes like time, people, and systems. 
# Further fine-tuning of the model or preprocessing steps could enhance the coherence and interpretability of the topics.
