# TUTORIAL 2 - Corpus Analysis with LDA
* Text Retrieval and Mining, MSc Minor DS&AI, 2023-2024
* Author: [Julien Rossi](mailto:j.rossi@uva.nl)

# Data Preparation

We will use the News Articles dataset.

See [Link](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/GMFCTR).

This dataset was made for studying political bias in articles, and is made of articles from different sources, reporting on political topics.

With LDA we can identify topics, and use the resulting per-document topic distributions to cluster the articles and retrieve similar ones.

In [None]:
!wget https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/GMFCTR/IZQODZ -O NewsArticles.csv

In [None]:
import pandas as pd

In [None]:
# Load the downloaded articles; the file is decoded as Latin-1.
df = pd.read_csv(
    'NewsArticles.csv',
    encoding='latin1',
)

In [None]:
# Preview the first rows of the table as loaded from the CSV.
df.head()

In [None]:
# Keep only the identifier, title and body columns, drop every row that has a
# missing value in any of them, and renumber the remaining rows from 0.
kept_columns = ['article_id', 'title', 'text']
df = (
    df[kept_columns]
    .copy()
    .dropna()
    .reset_index(drop=True)
)
print(df.shape)

In [None]:
# Preview the table again after column selection and removal of incomplete rows.
df.head()

In [None]:
# Length of every article body, counted in whitespace-separated tokens.
df['nb_words'] = [len(body.split()) for body in df['text']]

In [None]:

# Distribution of article lengths. The axes are labelled so the figure can be
# read on its own when the notebook is skimmed.
ax = df['nb_words'].hist(bins=100, figsize=(9, 9))
ax.set_title('Distribution of article length')
ax.set_xlabel('Number of words per article')
_ = ax.set_ylabel('Number of articles')

# Clustering with Bag of Words

In [None]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings, gathered in one place before building the vectorizer.
vectorizer_options = dict(
    token_pattern=r'[a-z]{2,}',   # a token is a run of at least two lowercase letters
    stop_words='english',         # use the built-in English stop-word list
    min_df=2,                     # ignore terms found in fewer than 2 documents
    max_df=0.8,                   # ignore terms found in more than 80% of the documents
    max_features=50000,           # upper bound on the vocabulary size
)
count = CountVectorizer(**vectorizer_options)

In [None]:
# Learn the vocabulary from the article bodies and build the document-term
# count matrix in one step.
# NOTE: `corpus` holds raw strings here; the LDA section later rebinds the same
# name to a gensim bag-of-words list.
corpus = df['text']
term_doc = count.fit_transform(raw_documents=corpus)

In [None]:
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import KMeans

class KMedians(KMeans):
    """KMeans subclass declaring a cosine-distance assignment step and a median centroid update.

    NOTE(review): scikit-learn's `KMeans.fit` does not appear to call `_e_step`
    or `_average`. If that holds for the installed version, these two overrides
    are never executed and this class clusters exactly like plain (Euclidean)
    `KMeans` -- confirm before relying on "k-medians" or cosine semantics.
    """
    def _e_step(self, X):
        # Label each row of X with the index of the centre at the smallest cosine distance.
        self.labels_ = cosine_distances(X, self.cluster_centers_).argmin(axis=1)
    def _average(self, X):
        # Feature-wise median over the rows of X.
        return np.median(X, axis=0)

# Rescale every document vector with the Normalizer defaults before clustering.
normalizer = Normalizer()
bow_norm = normalizer.fit_transform(term_doc)

# The seed is fixed so that cluster numbering is reproducible across runs: the
# cells below inspect one cluster by its hard-coded position in `closest`.
km = KMedians(n_clusters=8, init='k-means++', max_iter=100, n_init=10, random_state=42)
km.fit(bow_norm)


In [None]:
from sklearn.metrics import pairwise_distances_argmin_min

# For each cluster centre, the row index of the document at the smallest cosine
# distance; the matching distances are discarded.
closest, _ = pairwise_distances_argmin_min(
    X=km.cluster_centers_,
    Y=bow_norm,
    metric='cosine',
)

In [None]:
from sklearn.metrics import pairwise_distances

# Take the document closest to the centre of cluster 3 and list its 10 nearest
# neighbours (cosine distance in bag-of-words space).
c = closest[3]
d = pairwise_distances(X=bow_norm[c, :], Y=bow_norm, metric='cosine')[0]

# Remove the query document by index rather than by rank: duplicated articles
# (or rounding) can tie with it at distance ~0, so it is not guaranteed to be
# the first entry of the ranking.
ranking = np.argsort(d, kind='stable')
top_10_idx = ranking[ranking != c][:10]

print(df.iloc[c]['title'])
print('*' * 80)
for i, idx in enumerate(top_10_idx):
    print(f'#{i+1:>2} (idx={idx:4}, d={d[idx]:.2f}): {df.iloc[idx]["title"]}')


# Clustering with LDA

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

# Extra corpus-specific stop words, on top of spaCy's own list.
add_stops = ['mr', 'said']

# Only the tokenizer is run over the lower-cased article bodies. A token is
# kept when it is longer than one character, is not flagged as a stop word by
# spaCy, and is not one of the extra stop words above.
stopped_tokenized = [
    [
        token.text
        for token in doc
        if len(token.text) > 1
        and not token.is_stop
        and token.text not in add_stops
    ]
    for doc in nlp.tokenizer.pipe(df['text'].str.lower())
]

In [None]:
import warnings

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Vocabulary built from the tokenized, stop-word-filtered documents.
dictionary = Dictionary(documents=stopped_tokenized)

# Drop words that occur in fewer than 5 documents or in more than 90% of the
# documents. This plays the same role as min_df / max_df in CountVectorizer.
dictionary.filter_extremes(no_below=5, no_above=0.9)
dictionary.compactify()

# Bag-of-words form of every document. NOTE: this rebinds `corpus`, which held
# the raw article texts in the bag-of-words section.
corpus = list(map(dictionary.doc2bow, stopped_tokenized))

In [None]:
# Number of topics; also reused by the plotting and clustering cells below.
K = 8

# Training options, kept apart from the data arguments.
lda_settings = dict(
    num_topics=K,
    alpha='auto',
    eta='auto',
    # NOTE(review): a single iteration is far fewer than the 100 used in the
    # grid search further down -- presumably chosen for speed; confirm.
    iterations=1,
    passes=5,
    eval_every=None,
    random_state=42,
    # With this flag the model output for a document is a tuple; the
    # document-topic list is read from element 0 in a later cell.
    per_word_topics=True,
    minimum_probability=0.0,
)

lda = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

In [None]:
import math
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS

# Grid of word clouds, one per topic, showing each topic's 10 heaviest words.
nb_columns = 4
nb_rows = math.ceil(K / nb_columns)

# One colour per topic; the palette is repeated when K exceeds its size.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'
cols = cols * math.ceil(K / len(cols))

topics = lda.show_topics(num_topics=K, num_words=10, formatted=False)

# squeeze=False always returns a 2-D array of axes, whatever the grid shape.
fig, axes = plt.subplots(ncols=nb_columns, nrows=nb_rows,
                         figsize=(4*nb_columns, 4*nb_rows),
                         sharex=True, sharey=True,
                         squeeze=False)
flat_axes = axes.flatten()

for ax, (topic_id, topic_terms) in zip(flat_axes, topics):
    # The colour is bound as a default argument so the cloud does not depend
    # on a global loop variable still holding the right value at draw time.
    cloud = WordCloud(background_color='white',
                      width=400,
                      height=400,
                      max_words=10,
                      color_func=lambda *args, color=cols[topic_id], **kwargs: color,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(topic_terms), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

# Hide ticks and frames on every cell of the grid, including cells left empty
# when K is not a multiple of nb_columns.
for ax in flat_axes:
    ax.axis('off')
    ax.margins(x=0, y=0)

plt.subplots_adjust(wspace=0, hspace=0)
plt.tight_layout()
plt.show()

In [None]:
# Apply the trained model to the whole corpus (the next cell repeats this assignment).
lda_vecs = lda[corpus]

In [None]:
import numpy as np

lda_vecs = lda[corpus]

# Dense matrix with one row per document and one column per topic, filled with
# the topic weights the model assigns to each document.
doc_topics = np.zeros((len(corpus), K))
for doc_idx in range(len(corpus)):
    # Element 0 of the model output holds the (topic_id, weight) pairs.
    doc_topic_pairs = lda_vecs[doc_idx][0]
    for topic_id, weight in doc_topic_pairs:
        doc_topics[doc_idx, topic_id] = weight

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import KMeans

class KMedians(KMeans):
    """KMeans subclass declaring a cosine-distance assignment step and a median centroid update.

    This re-declares the class already defined in the bag-of-words section.

    NOTE(review): scikit-learn's `KMeans.fit` does not appear to call `_e_step`
    or `_average`. If that holds for the installed version, these two overrides
    are never executed and this class clusters exactly like plain (Euclidean)
    `KMeans` -- confirm before relying on "k-medians" or cosine semantics.
    """
    def _e_step(self, X):
        # Label each row of X with the index of the centre at the smallest cosine distance.
        self.labels_ = cosine_distances(X, self.cluster_centers_).argmin(axis=1)
    def _average(self, X):
        # Feature-wise median over the rows of X.
        return np.median(X, axis=0)

# Rescale every document-topic vector with the Normalizer defaults before clustering.
normalizer = Normalizer()
lda_norm = normalizer.fit_transform(doc_topics)

# The seed is fixed so that cluster numbering is reproducible across runs: the
# cells below inspect one cluster by its hard-coded position in `closest`.
lda_km = KMedians(n_clusters=8, init='k-means++', max_iter=100, n_init=10, random_state=42)
lda_km.fit(lda_norm)


In [None]:
from sklearn.metrics import pairwise_distances_argmin_min

# For each cluster centre, the row index of the document at the smallest cosine
# distance in topic space; the matching distances are discarded.
closest, _ = pairwise_distances_argmin_min(
    X=lda_km.cluster_centers_,
    Y=lda_norm,
    metric='cosine',
)

In [None]:
from sklearn.metrics import pairwise_distances

# Take the document closest to the centre of cluster 2 and list its 10 nearest
# neighbours (cosine distance in topic space).
c = closest[2]
d = pairwise_distances(X=[lda_norm[c]], Y=lda_norm, metric='cosine')[0]

# Remove the query document by index rather than by rank: in a K-dimensional
# topic space several documents can tie with it at distance ~0, so it is not
# guaranteed to be the first entry of the ranking.
ranking = np.argsort(d, kind='stable')
top_10_idx = ranking[ranking != c][:10]

print(df.iloc[c]['title'])
print('*' * 80)
for i, idx in enumerate(top_10_idx):
    print(f'#{i+1:>2} (idx={idx:4}, d={d[idx]:.2f}): {df.iloc[idx]["title"]}')


# GridSearch for Coherence Optimization

In [None]:
# Selects the trainer used by the grid search below: LdaMulticore when True,
# LdaModel when False. The multicore branch also imports `psutil`.
MULTICORE = False   # Switch to True to use on your own multicore laptop

In [None]:
import tqdm
import logging

from gensim.models import CoherenceModel
from gensim.models import LdaMulticore

gensim_logger = logging.getLogger('gensim.models.ldamodel')
gensim_logger.setLevel(logging.DEBUG)

# One record per number of topics tried: the trained model and its coherence scores.
history = []
Ks = [4, 6, 8, 10, 20, 50, 100]

# Training arguments shared by the single-core and multi-core trainers.
lda_grid_params = dict(
    corpus=corpus,
    id2word=dictionary,
    iterations=100,
    passes=20,
    eval_every=None,
    random_state=42,
)

if MULTICORE:
    # psutil is only needed for the multi-core path, so it is imported here, once.
    import psutil

    # cpu_count() may return None when the count cannot be determined; always
    # keep at least one worker.
    NUM_CORES = psutil.cpu_count(logical=False) or 1
    num_workers = max(1, NUM_CORES - 1)   # Number of CPU cores - 1

# The loop variable is deliberately not called `K`: that would overwrite the
# global K = 8 which earlier cells rely on when they are re-run.
for n_topics in tqdm.tqdm(Ks):
    if MULTICORE:
        lda_k = LdaMulticore(num_topics=n_topics, workers=num_workers, **lda_grid_params)
    else:
        lda_k = LdaModel(num_topics=n_topics, **lda_grid_params)

    # c_uci and c_v are computed from the tokenized texts, u_mass from the
    # bag-of-words corpus.
    coherence_uci = CoherenceModel(
        model=lda_k,
        texts=stopped_tokenized,
        dictionary=dictionary,
        coherence='c_uci'
    )
    uci = coherence_uci.get_coherence()

    coherence_umass = CoherenceModel(
        model=lda_k,
        corpus=corpus,
        dictionary=dictionary,
        coherence='u_mass'
    )
    umass = coherence_umass.get_coherence()

    coherence_cv = CoherenceModel(
        model=lda_k,
        texts=stopped_tokenized,
        dictionary=dictionary,
        coherence='c_v'
    )
    c_v = coherence_cv.get_coherence()

    history.append({'K': n_topics, 'model': lda_k, 'c_v': c_v, 'umass': umass, 'uci': uci})

In [None]:
import pandas as pd

# Grid-search results as a table indexed by the number of topics.
# NOTE: this rebinds `df`, which held the articles table up to this point;
# earlier cells that read article titles from `df` cannot be re-run afterwards
# without reloading the data.
df = pd.DataFrame(history).set_index('K')

coherence_columns = ['uci', 'umass', 'c_v']
_ = df[coherence_columns].plot(kind='line', marker='.', figsize=(12, 12))

In [None]:
# Plot every coherence measure against the number of topics and mark, for each
# measure, the K with the highest score.
fig, ax = plt.subplots(figsize=(12, 12))
ax.set_title("GridSearch Coherence", fontsize=20)
ax.set_xlabel("K", fontsize=14)
ax.set_ylabel("Coherence", fontsize=14)

# The numbers of topics that were tried (index of the results table).
X_axis = df.index

# Legend label -> column name in the results table.
scoring = {'UMass': 'umass', 'UCI': 'uci', 'C_V': 'c_v'}

for scorer, color in zip(sorted(scoring), ['tab:blue', 'tab:orange', 'tab:green']):
    scores = df[scoring[scorer]]
    ax.plot(X_axis, scores, linestyle='-', color=color, label=scorer, marker='.')

    best_index = scores.idxmax()
    best_score = scores[best_index]

    # Plot a dotted vertical line at the best score for that scorer marked by x
    ax.plot([best_index, ] * 2, [0, best_score],
            linestyle=':', color=color, marker='x', markeredgewidth=3, ms=8, alpha=0.4)

    # Annotate the best K for that scorer near the horizontal axis
    ax.annotate(f"{best_index:d}",
                (best_index, 0.1), color=color, fontsize=14)

    # Annotate the best score for that scorer next to its marker
    ax.annotate(f"{best_score:0.2f}",
                (best_index+0.5, best_score + 0.1), color=color, fontsize=14)

ax.legend(loc="best", fontsize=16)
ax.grid(False)
plt.show()