### LDA Herald Tutorial

Here we will run through the same process as in lda_tutorial.ipynb using the Herald data that we scraped earlier.

In [73]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
from collections import Counter
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
from gensim.models.callbacks import PerplexityMetric
from sklearn.utils import shuffle
import subprocess
import os

import matplotlib
matplotlib.use('QT5Agg')

In [None]:
df = pd.read_pickle('data/herald_business_tutorial.pkl')

In [None]:
df.head()

In [None]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [None]:
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

In [None]:
doc_clean = [clean(doc).split() for doc in df['Article Content']]

In [None]:
dictionary = corpora.Dictionary(doc_clean)

In [None]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [None]:
Lda = gensim.models.ldamodel.LdaModel

In [None]:
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=150)

In [None]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

#### Improvements

#### word frequency filter

In [None]:
doc_clean_flat = [val for sublist in doc_clean for val in sublist]

In [None]:
doc_clean_flat_df = pd.DataFrame({'words': doc_clean_flat})
doc_clean_flat_df = doc_clean_flat_df.words.value_counts()

In [None]:
doc_clean_flat_df = pd.DataFrame({'word': doc_clean_flat_df.index, 'count':doc_clean_flat_df}).reset_index(drop=True)

In [None]:
doc_clean_flat_df['count_norm'] = doc_clean_flat_df['count']/doc_clean_flat_df['count'].max()

In [None]:
doc_clean_flat_df.head()

In [None]:
cutoff = 0.1
fig = plt.figure(figsize=(12,8))
plt.bar(x=doc_clean_flat_df[doc_clean_flat_df.count_norm>cutoff]['word'], height=doc_clean_flat_df[doc_clean_flat_df.count_norm>cutoff]['count'])
plt.xticks(rotation=90)
plt.show()

In [None]:
word_filter = doc_clean_flat_df[doc_clean_flat_df.count_norm>cutoff]['word']

#### POS filter

In [None]:
POS_list = list()
for item in doc_clean_flat:
    tokenized = nltk.word_tokenize(item)
    tagged = nltk.pos_tag(tokenized)

    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>}"""
    chunkParser = nltk.RegexpParser(chunkGram)

    chunked = chunkParser.parse(tagged)
    POS_list.append(chunked)

In [None]:
POS_list = [i[0] for i in POS_list]
POS_cat = [i[1] for i in POS_list]

In [82]:
fig = plt.figure(figsize=(12,8))
plt.bar(x=Counter(POS_cat).keys(), height=Counter(POS_cat).values())
plt.xticks(rotation=90)
plt.show()

In [None]:
new_POS_list = []
for word in POS_list:
    if word[1] not in ['IN', 'MD', 'CD']:
        new_POS_list.append(word[0])

In [None]:
POS_filter = list(dict.fromkeys(new_POS_list))

['comment',
 'recently',
 'involved',
 'board',
 'appointment',
 'panel',
 'voluntary',
 'organisation',
 'lead',
 'interview']

#### batching

In [98]:
word_filter = set(word_filter)
doc_clean_2 = []
for doc in doc_clean:
    doc_2 = [x for x in doc if (x in word_filter) and (x in POS_filter)]
    doc_clean_2.append(doc_2)

In [109]:
dictionary = corpora.Dictionary(doc_clean_2)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean_2]
Lda = gensim.models.ldamodel.LdaModel

In [110]:
perplexity_logger = PerplexityMetric(corpus=doc_term_matrix, logger='shell')
ldamodel = Lda(doc_term_matrix, num_topics=10, id2word = dictionary, passes=500, chunksize=5, update_every=0, eta='auto', iterations=5, random_state=12345, callbacks=[perplexity_logger])

In [111]:
print(ldamodel.print_topics(num_topics=10, num_words=8))

[(0, '0.145*"say" + 0.092*"business" + 0.040*"people" + 0.037*"firm" + 0.030*"big" + 0.027*"many" + 0.026*"year" + 0.026*"–"'), (1, '0.263*"said" + 0.039*"office" + 0.039*"week" + 0.037*"•" + 0.037*"service" + 0.032*"cost" + 0.031*"also" + 0.030*"problem"'), (2, '0.105*"investment" + 0.072*"investor" + 0.067*"bank" + 0.066*"fund" + 0.055*"financial" + 0.045*"interest" + 0.043*"asset" + 0.041*"rate"'), (3, '0.109*"company" + 0.080*"price" + 0.067*"share" + 0.063*"saudi" + 0.061*"port" + 0.061*"cent" + 0.051*"year" + 0.039*"u"'), (4, '0.058*"change" + 0.057*"sustainable" + 0.055*"risk" + 0.051*"say" + 0.049*"—" + 0.046*"need" + 0.043*"system" + 0.042*"climate"'), (5, '0.122*"auckland" + 0.102*"property" + 0.053*"new" + 0.053*"business" + 0.053*"centre" + 0.041*"building" + 0.038*"development" + 0.036*"site"'), (6, '0.182*"cent" + 0.056*"milk" + 0.054*"market" + 0.052*"china" + 0.044*"food" + 0.042*"year" + 0.038*"•" + 0.037*"company"'), (7, '0.216*"new" + 0.167*"zealand" + 0.044*"technol