In [25]:
#For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

#For topic modelling
from gensim import corpora
from gensim.models import LdaModel

#Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
import pandas as pd
import re

df = pd.read_csv('news_dataset.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,text,target,title,date
0,0,I was wondering if anyone out there could enli...,7,rec.autos,2022-08-02 13:48:37.251043
1,17,I recently posted an article asking what kind ...,7,rec.autos,2022-08-02 13:48:37.251043
2,29,\nIt depends on your priorities. A lot of peo...,7,rec.autos,2022-08-02 13:48:37.251043
3,56,an excellent automatic can be found in the sub...,7,rec.autos,2022-08-02 13:48:37.251043
4,64,: Ford and his automobile. I need information...,7,rec.autos,2022-08-02 13:48:37.251043


In [29]:
text_data = df['text']

text_data.head

<bound method NDFrame.head of 0        I was wondering if anyone out there could enli...
1        I recently posted an article asking what kind ...
2        \nIt depends on your priorities.  A lot of peo...
3        an excellent automatic can be found in the sub...
4        : Ford and his automobile.  I need information...
                               ...                        
11309    Secrecy in Clipper Chip\n\nThe serial number o...
11310    Hi !\n\nI am interested in the source of FEAL ...
11311    The actual algorithm is classified, however, t...
11312    \n\tThis appears to be generic calling upon th...
11313    \nProbably keep quiet and take it, lest they g...
Name: text, Length: 11314, dtype: object>

In [33]:
# 1. Remove null values
text_data = text_data.dropna()

# 2. Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# 3. Define preprocessing function
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [stemmer.stem(word) for word in tokens]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# 4. Apply preprocessing
preprocessed_texts = text_data.apply(preprocess)

# Preview
preprocessed_texts.head()

0    [wonder, anyon, could, enlighten, car, saw, da...
1    [recent, post, articl, ask, kind, rate, singl,...
2    [depend, prioriti, lot, peopl, put, higher, pr...
3    [excel, automat, found, subaru, legaci, switch...
4    [ford, automobil, need, inform, whether, ford,...
Name: text, dtype: object

In [37]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_texts)

# Filter out tokens that appear in less than 15 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_texts] 

In [39]:
# 4. Run LDA
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)  

In [41]:
# 5. Interpret Results: Find the dominant topic for each document
article_labels = []

# Iterate over each preprocessed document
for i, doc in enumerate(preprocessed_texts):
    # Convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    
    # Get the list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    
    # Determine the topic with the highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    
    # Append to the list
    article_labels.append(dominant_topic)

In [57]:
# 6. Create a DataFrame
df_result = pd.DataFrame({"Article": text_data, "Topic": article_labels})

# Print the DataFrame
print("Table with Articles and Topic:")
print(df_result)

# Optionally, evaluate the LDA model using coherence score
from gensim.models import CoherenceModel

# Calculate coherence score using the preprocessed texts
coherence_model = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, texts=preprocessed_texts, coherence='c_v')
coherence_score = coherence_model.get_coherence()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      0
1      I recently posted an article asking what kind ...      3
2      \nIt depends on your priorities.  A lot of peo...      3
3      an excellent automatic can be found in the sub...      3
4      : Ford and his automobile.  I need information...      3
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      2
11310  Hi !\n\nI am interested in the source of FEAL ...      3
11311  The actual algorithm is classified, however, t...      2
11312  \n\tThis appears to be generic calling upon th...      0
11313  \nProbably keep quiet and take it, lest they g...      0

[11096 rows x 2 columns]


In [58]:
print("\n Name: Mysara Qistina binti Mahadzir & Addelina binti Mohd Zulkifli")
print("\nStudent ID: SW01083524 & SW01082366")

print("\nCoherence Score:", coherence_score)


 Name: Mysara Qistina binti Mahadzir & Addelina binti Mohd Zulkifli

Student ID: SW01083524 & SW01082366

Coherence Score: 0.5481828341197291


In [47]:
# Print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])  # Extracting the top terms from the result
    print()

Top terms for Topic #0:
['one', 'would', 'dont', 'think', 'know', 'say', 'like', 'go', 'get', 'peopl']

Top terms for Topic #1:
['peopl', 'govern', 'would', 'state', 'law', 'q', 'right', 'u', 'one', 'mr']

Top terms for Topic #2:
['key', 'encrypt', 'use', 'chip', 'secur', 'system', 'inform', 'anonym', 'post', 'clipper']

Top terms for Topic #3:
['use', 'b', 'db', 'one', 'would', 'get', 'drive', 'work', 'like', 'know']

Top terms for Topic #4:
['x', 'file', 'use', 'program', 'window', 'version', 'includ', 'avail', 'space', 'c']



In [49]:
# Print the top terms for each topic with their weights
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split(" + ")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "one" (weight: 0.010)
- "would" (weight: 0.009)
- "dont" (weight: 0.009)
- "think" (weight: 0.008)
- "know" (weight: 0.007)
- "say" (weight: 0.007)
- "like" (weight: 0.007)
- "go" (weight: 0.007)
- "get" (weight: 0.007)
- "peopl" (weight: 0.006)

Topic 1:
- "peopl" (weight: 0.009)
- "govern" (weight: 0.008)
- "would" (weight: 0.008)
- "state" (weight: 0.006)
- "law" (weight: 0.006)
- "q" (weight: 0.006)
- "right" (weight: 0.005)
- "u" (weight: 0.005)
- "one" (weight: 0.005)
- "mr" (weight: 0.005)

Topic 2:
- "key" (weight: 0.025)
- "encrypt" (weight: 0.016)
- "use" (weight: 0.016)
- "chip" (weight: 0.011)
- "secur" (weight: 0.010)
- "system" (weight: 0.009)
- "inform" (weight: 0.009)
- "anonym" (weight: 0.008)
- "post" (weight: 0.008)
- "clipper" (weight: 0.008)

Topic 3:
- "use" (weight: 0.016)
- "b" (weight: 0.014)
- "db" (weight: 0.013)
- "one" (weight: 0.009)
- "would" (weight: 0.009)
- "get" (weight: 0.009)
- "drive" (weight: 0.008)
- "work" (w

#### Topic 0: General
#### Topic 1: Politics 
#### Topic 2: Security
#### Topic 3: Technology
#### Topic 4: Software