In [1]:
# CISB5123 - TEXT ANALYTICS (SECTION 03)
# AHMAD AMIRUL AIZAD BIN ROSMADI [IS01082507]
# MUHAMMMAD NABIL BIN MUHAMMAD [IS01082117]

In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from gensim import corpora
from gensim.models import LdaModel

In [3]:
df = pd.read_csv("news_dataset.csv")

In [4]:
# Download necessary NLTK datasets
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Initialize the lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [6]:
# Filter out rows where the 'text' column has no value (empty strings or NaN)
df_cleaned = df[df['text'].notna() & (df['text'] != '')]

In [7]:
# Define a function for text pre-processing
def preprocess_text_simple(text):
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # Tokenize text by splitting into words
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    # Return cleaned text
    return ' '.join(words)

In [8]:
# Apply the pre-processing function to the filtered 'text' column
df_cleaned['cleaned_text'] = df_cleaned['text'].apply(preprocess_text_simple)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['cleaned_text'] = df_cleaned['text'].apply(preprocess_text_simple)


In [9]:
# Tokenize the cleaned text into words
tokenized_text = [text.split() for text in df_cleaned['cleaned_text']]

In [10]:
# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_text)

# Filter out words that occur in less than 5 documents or more than 50% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.5)

# Create a bag of words corpus
corpus = [dictionary.doc2bow(text) for text in tokenized_text]

# Build the LDA model
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=5, iterations=50)

In [11]:
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f" - {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
 - "b" (weight: 0.011)
 - "db" (weight: 0.011)
 - "chip" (weight: 0.009)
 - "one" (weight: 0.008)
 - "would" (weight: 0.008)
 - "clipper" (weight: 0.007)
 - "use" (weight: 0.006)
 - "encryption" (weight: 0.006)
 - "get" (weight: 0.006)
 - "like" (weight: 0.005)

Topic 1:
 - "people" (weight: 0.011)
 - "would" (weight: 0.010)
 - "one" (weight: 0.009)
 - "dont" (weight: 0.007)
 - "think" (weight: 0.006)
 - "know" (weight: 0.006)
 - "like" (weight: 0.005)
 - "us" (weight: 0.005)
 - "say" (weight: 0.005)
 - "even" (weight: 0.004)

Topic 2:
 - "x" (weight: 0.024)
 - "key" (weight: 0.009)
 - "use" (weight: 0.008)
 - "file" (weight: 0.007)
 - "information" (weight: 0.006)
 - "available" (weight: 0.005)
 - "program" (weight: 0.005)
 - "anonymous" (weight: 0.005)
 - "system" (weight: 0.004)
 - "email" (weight: 0.004)

Topic 3:
 - "q" (weight: 0.007)
 - "president" (weight: 0.006)
 - "new" (weight: 0.005)
 - "mr" (weight: 0.005)
 - "government" (weight: 0.004)


In [None]:
# Interpretion
'''
Topic 0 
- Focus on Technology and Encryption.
- This topic seems to be related to computing systems, encryption technologies, and data management. The terms suggest a focus on 
databases, encryption keys, and hardware/software components, with a possible focus on security and system protection.

Topic 1
- Focus on General opnions.
- This topic appears to focus on opinions or discussions. The frequent use of the list word suggests personal opinions or statements in conversations. 

Topic 2
- Focus on Information Systems security.
- This topic seems to be focused on data security and information systems. The key terms suggest that the topic revolves around encryption, 
file security, programming, and anonymous systems.

Topic 3
- Focus on Political
- This topic is most likely centered around politics or governmental matters, with terms like "president", "government".
'''