### Group members: Muhammad Ahsan Bin Zulfakar(SW01081423) & Arinn Danish Bin Abdullah (SW01081421)

## Import Libraries


In [1]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# For topic modeling

from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

# Download NLTK Resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load the Data

In [35]:
df = pd.read_csv('news_dataset.csv')
documents = df['text'].tolist()


## Preprocess the Data


In [42]:
stop_words = set(stopwords.words('english'))

# Add custom stopwords: all lowercase and uppercase alphabetic characters
custom_stop_words = {'could', 'would', 'like', 'know', 'get', 'one', 'use', 'using', 'door', 
                     'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 
                     'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 
                     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 
                     'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}

stop_words.update(custom_stop_words)

# Initialize a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Ensure the input is a string
    text = str(text)
    # Tokenize the text into words
    tokens = word_tokenize(text)
    # Filter out non-alphanumeric tokens and numbers
    tokens = [token for token in tokens if token.isalnum() and not token.isnumeric()]
    # Convert tokens to lowercase and remove stopwords
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    # Lemmatize each token
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# Preprocess each document in the list
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Print the first preprocessed document
print(preprocessed_documents[0])

['wondering', 'anyone', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'looked', 'late', 'early', '70', 'called', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please']


## Create document-term matrix

In [43]:
# Create a Gensim Dictionary object from the preprocessed documents
dictionary = corpora.Dictionary(preprocessed_documents)
# Filter out tokens that appear in less than 15 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=15, no_above=0.5)
# Convert each preprocessed document into a bag-of-words representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents] 

## Run LDA


In [44]:
# Run LDA
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15) # Train an LDA model
#on the corpus with 2 topics using Gensim's LdaModel class

## Interpret Results

In [45]:
# empty list to store dominant topic labels for each document
text_labels = []

# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
 # for each document, convert to bag-of-words representation
 bow = dictionary.doc2bow(doc)
 # get list of topic probabilities
 topics = lda_model.get_document_topics(bow)
 # determine topic with highest probability
 dominant_topic = max(topics, key=lambda x: x[1])[0]
 # append to the list
 text_labels.append(dominant_topic)
    
# Create DataFrame
df_result = pd.DataFrame({"Text": documents, "Topic": text_labels})

# Print the DataFrame
print("Table with Text and Topic:")
print(df_result)
print()

Table with Text and Topic:
                                                    Text  Topic
0      I was wondering if anyone out there could enli...      1
1      I recently posted an article asking what kind ...      1
2      \nIt depends on your priorities.  A lot of peo...      1
3      an excellent automatic can be found in the sub...      1
4      : Ford and his automobile.  I need information...      1
...                                                  ...    ...
11309  Secrecy in Clipper Chip\n\nThe serial number o...      0
11310  Hi !\n\nI am interested in the source of FEAL ...      0
11311  The actual algorithm is classified, however, t...      3
11312  \n\tThis appears to be generic calling upon th...      1
11313  \nProbably keep quiet and take it, lest they g...      1

[11314 rows x 2 columns]



## Print top terms for each topic

In [48]:

for topic_id in range(lda_model.num_topics):
 print(f"Top terms for Topic #{topic_id}:")
 top_terms = lda_model.show_topic(topic_id, topn=10)
 print([term[0] for term in top_terms])
 print()

Top terms for Topic #0:
['max', 'file', 'system', 'window', 'program', 'also', 'available', 'problem', 'version', 'drive']

Top terms for Topic #1:
['people', 'think', 'say', 'time', 'thing', 'god', 'even', 'see', 'make', 'well']

Top terms for Topic #2:
['game', 'db', 'team', 'year', 'player', 'play', 'new', 'season', 'first', 'league']

Top terms for Topic #3:
['government', 'key', 'people', 'armenian', 'state', 'president', 'encryption', 'law', 'right', 'public']



## Print the top terms for each topic with weight

In [49]:

print("Top Terms for Each Topic:")

for idx, topic in lda_model.print_topics():
 print(f"Topic {idx}:")
 terms = [term.strip() for term in topic.split("+")]
 for term in terms:
     weight, word = term.split("*")
     print(f"- {word.strip()} (weight: {weight.strip()})")
 print()


Top Terms for Each Topic:
Topic 0:
- "max" (weight: 0.018)
- "file" (weight: 0.011)
- "system" (weight: 0.008)
- "window" (weight: 0.007)
- "program" (weight: 0.006)
- "also" (weight: 0.005)
- "available" (weight: 0.005)
- "problem" (weight: 0.005)
- "version" (weight: 0.005)
- "drive" (weight: 0.005)

Topic 1:
- "people" (weight: 0.010)
- "think" (weight: 0.009)
- "say" (weight: 0.008)
- "time" (weight: 0.006)
- "thing" (weight: 0.006)
- "god" (weight: 0.006)
- "even" (weight: 0.006)
- "see" (weight: 0.005)
- "make" (weight: 0.005)
- "well" (weight: 0.005)

Topic 2:
- "game" (weight: 0.016)
- "db" (weight: 0.014)
- "team" (weight: 0.013)
- "year" (weight: 0.011)
- "player" (weight: 0.008)
- "play" (weight: 0.007)
- "new" (weight: 0.006)
- "season" (weight: 0.006)
- "first" (weight: 0.005)
- "league" (weight: 0.005)

Topic 3:
- "government" (weight: 0.009)
- "key" (weight: 0.008)
- "people" (weight: 0.007)
- "armenian" (weight: 0.006)
- "state" (weight: 0.006)
- "president" (weight: 0.

## Possible topic name

### Topic 0: Computer System
### Topic 1: Personal and general opinion
### Topic 2: Sports and games
### Topic 3: Politics and law


### Evaluate the LDA Model using Coherence Score

In [50]:
from gensim.models.coherencemodel import CoherenceModel

In [51]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')

coherence_lda = coherence_model_lda.get_coherence()

In [52]:
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.5577


##### The Topic Coherence Score (C_V) of 0.5577 indicates a moderate level of coherence among the topics identified in the model. This score suggests that the terms within each topic are reasonably well-related, making the topics meaningful and interpretable. However, it also implies that there is room for improvement in the model's ability to group words into highly cohesive topics.