## Pulling dataset

In [20]:
import pandas as pd
data = pd.read_csv('./news.csv')

## Selecting articles

In [21]:
# Unused features
data=data.drop(columns=['ArticleId','Category'])

# Preprocessing
#### [1] Removing punctuation and case folding

In [22]:
# removing punctuation
data['content_processed'] = data['content'].str.replace(r'[^\w\s]+', '', regex=True)

#case folding
data['content_processed']=data['content_processed'].map(lambda x: x.lower())

data.head()

Unnamed: 0,content,content_processed
0,worldcom ex-boss launches defence lawyers defe...,worldcom exboss launches defence lawyers defen...
1,german business confidence slides german busin...,german business confidence slides german busin...
2,bbc poll indicates economic gloom citizens in ...,bbc poll indicates economic gloom citizens in ...
3,lifestyle governs mobile choice faster bett...,lifestyle governs mobile choice faster bett...
4,enron bosses in $168m payout eighteen former e...,enron bosses in 168m payout eighteen former en...


### [2] Tokenizing, removing stopwords and lemmatizing

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

stop_words = stopwords.words("english")

# tokenizing, lemmatizing & stop words removal
lemmatizer = WordNetLemmatizer()

def tokenize_lemmatize(article):
    return [lemmatizer.lemmatize(word) for word in word_tokenize(article)]

def tokenize(article):
    return word_tokenize(article)


def rm_stopwords(tokens):
    return [token for token in tokens if token not in stop_words]

def lemmatize(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

data['content_processed'] = data['content_processed'].apply(tokenize)
data['content_processed'] = data['content_processed'].apply(rm_stopwords)
data['content_processed'] = data['content_processed'].apply(lemmatize)

data.head()

Unnamed: 0,content,content_processed
0,worldcom ex-boss launches defence lawyers defe...,"[worldcom, exboss, launch, defence, lawyer, de..."
1,german business confidence slides german busin...,"[german, business, confidence, slide, german, ..."
2,bbc poll indicates economic gloom citizens in ...,"[bbc, poll, indicates, economic, gloom, citize..."
3,lifestyle governs mobile choice faster bett...,"[lifestyle, governs, mobile, choice, faster, b..."
4,enron bosses in $168m payout eighteen former e...,"[enron, boss, 168m, payout, eighteen, former, ..."


## Prepating data for LDA

In [24]:
# Prepare data for LDA

import gensim.corpora as corpora

dictionary = corpora.Dictionary(data['content_processed'])

bow_corpus = [dictionary.doc2bow(doc) for doc in data['content_processed']]

## LDA model training

In [25]:
# import warnings
# warnings.filterwarnings("ignore",category=DeprecationWarning)

from gensim.models import LdaMulticore
from gensim.models import CoherenceModel

ntopics = 5

lda_model =  LdaMulticore(bow_corpus, 
                          num_topics = ntopics, 
                          id2word = dictionary,                   
                          passes = 30)

lda_model.print_topics(ntopics)

[(0,
  '0.012*"said" + 0.006*"game" + 0.004*"would" + 0.004*"club" + 0.004*"player" + 0.004*"year" + 0.004*"one" + 0.004*"last" + 0.004*"time" + 0.004*"two"'),
 (1,
  '0.007*"england" + 0.006*"said" + 0.005*"game" + 0.004*"world" + 0.004*"first" + 0.004*"ireland" + 0.004*"year" + 0.004*"win" + 0.004*"wale" + 0.004*"two"'),
 (2,
  '0.021*"said" + 0.015*"mr" + 0.009*"would" + 0.007*"government" + 0.006*"people" + 0.005*"say" + 0.005*"labour" + 0.005*"election" + 0.005*"party" + 0.005*"minister"'),
 (3,
  '0.011*"said" + 0.011*"year" + 0.008*"film" + 0.007*"u" + 0.006*"best" + 0.005*"sale" + 0.005*"award" + 0.004*"also" + 0.004*"market" + 0.004*"new"'),
 (4,
  '0.013*"said" + 0.008*"people" + 0.006*"mobile" + 0.006*"phone" + 0.005*"technology" + 0.005*"game" + 0.005*"new" + 0.005*"service" + 0.005*"music" + 0.005*"also"')]

## Perplexity and Coherence

In [26]:
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))

coherence_model_lda = CoherenceModel(
    model=lda_model, texts=data['content_processed'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.361207093437164

Coherence Score:  0.4207031751992371


## Visualizing articles clustring via html file

In [None]:
import pyLDAvis.gensim_models
import pyLDAvis
import os

pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_' + str(ntopics))
p=pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(p,'lda.html')