GROUP NAME AND ID

NAME: Zulhakim Bin Zulkefli
ID: SW01080887
Section: 02

In [1]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# For topic modeling
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
from gensim.models import CoherenceModel  

# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import re
import string 



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Z4Teen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('data\\news_dataset.csv')
documents = data['text'].dropna().tolist()


In [3]:
stop_words = set(stopwords.words('english')) # Create a set of English stopwords
lemmatizer = WordNetLemmatizer() # Initialize a WordNet lemmatizer


In [4]:
# Function to preprocess text
def preprocess_text(text):
    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove words that have numbers
    text = re.sub(r'\b\w*\d\w*\b', '', text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text) 
    # Tokenize the text into words and convert to lowercase
    tokens = word_tokenize(text.lower())
    # Filter out non-alphanumeric tokens
    tokens = [token for token in tokens if token.isalnum()]
    # Remove stopwords from the tokens
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize each token
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove single-character tokens
    tokens = [token for token in tokens if len(token) > 1]
    return tokens


In [5]:
# Function to remove numbers and hyphens
def remove_numbers(text):
    # Check if the input is a string
    if isinstance(text, str):
        # If it's a string, remove numbers and hyphens
        return re.sub("[\d-]",'', str(text))
    else:
        # If it's not a string (e.g., NaN), return an empty string
        return ''


# Function to remove punctuation
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])

In [6]:
# Clean the text data
data['clean_lower'] = data['text'].str.lower()
data['clean_number'] = data['clean_lower'].apply(remove_numbers)
data['clean_punctuation'] = data['clean_number'].apply(remove_punctuation)

# Apply preprocessing to the clean text
data['preprocessed_text'] = data['clean_punctuation'].apply(preprocess_text)

In [7]:
preprocessed_documents = [preprocess_text(doc) for doc in documents] 
# Preprocess each document in the list

print(preprocessed_documents[0])

['wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'looked', 'late', 'early', 'called', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please']


In [8]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)
# Filter out tokens that appear in less than 15 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=15, no_above=0.5)
# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [9]:
# Run LDA
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15) 
# Train an LDA model on the corpus with 2 topics using Gensim's LdaModel class
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5361455647850619


In [10]:
# empty list to store dominant topic labels for each document
article_labels = []
# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
 # for each document, convert to bag-of-words representation
 bow = dictionary.doc2bow(doc)
 # get list of topic probabilities
 topics = lda_model.get_document_topics(bow)
 # determine topic with highest probability
 dominant_topic = max(topics, key=lambda x: x[1])[0]
 # append to the list
 article_labels.append(dominant_topic)
    
# Create DataFrame
data_result = pd.DataFrame({"Article": documents, "Topic": article_labels})
# Print the DataFrame
print("Table with Articles and Topic:")
print(data_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      1
1      I recently posted an article asking what kind ...      1
2      \nIt depends on your priorities.  A lot of peo...      1
3      an excellent automatic can be found in the sub...      1
4      : Ford and his automobile.  I need information...      1
...                                                  ...    ...
11091  Secrecy in Clipper Chip\n\nThe serial number o...      0
11092  Hi !\n\nI am interested in the source of FEAL ...      0
11093  The actual algorithm is classified, however, t...      0
11094  \n\tThis appears to be generic calling upon th...      1
11095  \nProbably keep quiet and take it, lest they g...      1

[11096 rows x 2 columns]



In [11]:
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "key" (weight: 0.013)
- "use" (weight: 0.010)
- "file" (weight: 0.009)
- "system" (weight: 0.009)
- "chip" (weight: 0.007)
- "program" (weight: 0.006)
- "db" (weight: 0.006)
- "encryption" (weight: 0.006)
- "information" (weight: 0.006)
- "window" (weight: 0.006)

Topic 1:
- "would" (weight: 0.015)
- "one" (weight: 0.014)
- "know" (weight: 0.009)
- "think" (weight: 0.009)
- "like" (weight: 0.009)
- "get" (weight: 0.007)
- "say" (weight: 0.007)
- "people" (weight: 0.007)
- "thing" (weight: 0.006)
- "god" (weight: 0.006)

Topic 2:
- "max" (weight: 0.011)
- "people" (weight: 0.011)
- "government" (weight: 0.009)
- "law" (weight: 0.007)
- "state" (weight: 0.006)
- "would" (weight: 0.006)
- "right" (weight: 0.006)
- "president" (weight: 0.006)
- "armenian" (weight: 0.005)
- "said" (weight: 0.005)

Topic 3:
- "game" (weight: 0.016)
- "team" (weight: 0.013)
- "year" (weight: 0.013)
- "player" (weight: 0.008)
- "play" (weight: 0.007)
- "new" (weight: 0.007)

In [12]:
# Print top terms for each topic
for topic_id in range(lda_model.num_topics):
 print(f"Top terms for Topic #{topic_id}:")
 top_terms = lda_model.show_topic(topic_id, topn=10)
 print([term[0] for term in top_terms])
 print()

Top terms for Topic #0:
['key', 'use', 'file', 'system', 'chip', 'program', 'db', 'encryption', 'information', 'window']

Top terms for Topic #1:
['would', 'one', 'know', 'think', 'like', 'get', 'say', 'people', 'thing', 'god']

Top terms for Topic #2:
['max', 'people', 'government', 'law', 'state', 'would', 'right', 'president', 'armenian', 'said']

Top terms for Topic #3:
['game', 'team', 'year', 'player', 'play', 'new', 'last', 'first', 'season', 'league']



Coherence score assess the interpretability of the topics generated by the topic modelling algorithm. It does this by measuring
how semantically coherent the top words with each topic. In this case coherence score is around 0.57 so that means on average
around 57% of the top words are semantically related.

As for the topic results:

Topic 1: Technology and Software. 
We can see words like 'system', 'encryption', and 'chip' are grouped together.

Topic 2: Social and Opinion.
This is shown from the words like 'people' and 'think' 

Topic 3: Politic and Government. 
Words like 'law', 'state', and 'president' are together.

Topic 4: Sports. 
This can be seen in the words like 'season', 'player', and 'league'.