In [1]:

# AMJAD ALMASSRI - IS01081643


import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel


nltk.download('stopwords')
nltk.download('wordnet')


data = pd.read_csv('news_dataset.csv', usecols=['text'])

data.dropna(subset=['text'], inplace=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    tokens = [word for word in text.lower().split() if word.isalpha()]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [stemmer.stem(word) for word in tokens]
    return tokens

data['processed_text'] = data['text'].apply(preprocess)

dictionary = corpora.Dictionary(data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]


lda_model = models.LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)


coherence_model_lda = CoherenceModel(model=lda_model, texts=data['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()


print(f'Coherence Score: {coherence_lda}')


for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amgad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Amgad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Coherence Score: 0.47229334689892727
Topic: 0 
Words: 0.008*"q" + 0.005*"presid" + 0.005*"armenian" + 0.005*"new" + 0.004*"state" + 0.004*"team" + 0.004*"year" + 0.004*"nation" + 0.004*"game" + 0.004*"first"
Topic: 1 
Words: 0.023*"db" + 0.010*"use" + 0.009*"drive" + 0.007*"card" + 0.006*"one" + 0.006*"run" + 0.006*"mov" + 0.005*"disk" + 0.005*"power" + 0.005*"get"
Topic: 2 
Words: 0.014*"would" + 0.011*"one" + 0.009*"peopl" + 0.009*"think" + 0.008*"like" + 0.008*"go" + 0.008*"get" + 0.007*"know" + 0.007*"make" + 0.006*"say"
Topic: 3 
Words: 0.022*"x" + 0.018*"use" + 0.012*"key" + 0.009*"encrypt" + 0.008*"file" + 0.007*"program" + 0.006*"system" + 0.006*"inform" + 0.005*"avail" + 0.005*"chip"
