<a href="https://colab.research.google.com/github/belom-nlp/micro_topic_modelling/blob/main/notebooks/fetch20_MTM_BERTopic_Top2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Installing MTM

In [None]:
! pip install sentence_transformers
! pip install transformers

In [None]:
#importing necessary libraries
from collections import Counter
import numpy as np
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import nltk
import torch

from sentence_transformers import SentenceTransformer

from sklearn.decomposition import PCA
from sklearn.cluster import HDBSCAN, DBSCAN, KMeans
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import matplotlib.pyplot as plt

In [None]:
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
import random

def set_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

set_random_seed(42)

In [None]:
from model import MicroTopicModeller

#Importing fetch20 Dataset

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
ds = fetch_20newsgroups()

In [None]:
mtm = MicroTopicModeller(use_pca=True, use_cuda=True)

In [None]:
topic_words = mtm.pipeline(ds.data)

In [None]:
for key in topic_words.keys():
    print(key, ', '.join(topic_words[key]))

topic0 god, ax, b8f, use, program, a86, max, gun, space, file, organization, article, people, 145, writes, year, windows
topic1 armenians, armenian, years, scsi, russian, organization, people, reserve, marine, insurance, year, naval, car
topic2 values, ax, center, b8f, information, v2, nasa, dseg, stanford, a86, max, 16, house, space, research, leland, dam9543, david, g9v, ti, gillow, forsythe
topic3 rsaref, mobility, mpce, nec, behanna, johnh, key, world, cross, deep, implementation, astemizole, bmw, ncsc, bolt, nsa, right, speculation, really, tools, said
topic4 il, bike, thinking, huji, atheists, eavesdropper, cfj, hernlem, adam, mia, new, cs, q30tbxn, amos, jason, good, charitable, steel
topic5 accel, key, 960, fleury, reverse, schism, chip, hernlem, space, mia, color, c5rpoj, excommunicated, outlet, 200mb, trsvax, turmeric, charitable, yardley, w165w, group
topic6 bbs, accel, roger, manes, cfj, better, keenan, linknet, hernlem, numbers, mia, magpie, gl, 200mb, pace, charitable, ar

In [None]:
topics = []
for k, j in topic_words.items():
    if k != 'common_key_words':
        topics.append(j)

#Metrics

See MTM_on_train_datasets.ipynb notebook for the explanation.

In [None]:
! git clone https://github.com/christianrfg/tm_metrics.git

Cloning into 'tm_metrics'...
remote: Enumerating objects: 79, done.[K
remote: Total 79 (delta 0), reused 0 (delta 0), pack-reused 79[K
Receiving objects: 100% (79/79), 1.19 MiB | 10.96 MiB/s, done.
Resolving deltas: 100% (22/22), done.


In [None]:
!sed -i 's/cv_model.get_feature_names()/cv_model.get_feature_names_out()/' tm_metrics/tm_metrics/feature_extraction/text.py

In [None]:
!pip install -U /content/tm_metrics

Processing ./tm_metrics
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tm-metrics
  Building wheel for tm-metrics (setup.py) ... [?25l[?25hdone
  Created wheel for tm-metrics: filename=tm_metrics-0.1-py3-none-any.whl size=8201 sha256=d1612d940b7a04804593abb12bdfa2a7978593d46b49716a4d228a5a25ae7f8e
  Stored in directory: /tmp/pip-ephem-wheel-cache-j9awrp2q/wheels/88/73/88/e84eb7e10e9fc6ecb2f0e636d851f394cd8789c1ed40d09cc7
Successfully built tm-metrics
Installing collected packages: tm-metrics
Successfully installed tm-metrics-0.1


In [None]:
def pmi(topic_words, word_frequency, word_frequency_in_documents, n_docs, normalise=False):
    """PMI/NPMI topic quality metric for a topic.

    Calculates the PMI/NPMI topic quality metric for one individual topic based on the topic words.

    Args:
        topic_words: list
            Words that compose one individual topic.
        word_frequency: dict
            Frequency of each word in corpus.
        word_frequency_in_documents: dict
            Frequency of each word for each document in corpus.
        n_docs: int
            Number of documents in the corpus.
        normalise: bool, default=False
            Where to normalise (NPMI) or not (PMI).

    Returns:
        pmi: float
            Resultant PMI metric value for the topic.
        npmi: float
            Resultant NPMI metric value for the topic.
    """
    n_top = len(topic_words)
    pmi = 0.0
    npmi = 0.0

    for j in range(1, n_top):
        for i in range(0, j):
            ti = topic_words[i]
            tj = topic_words[j]

            c_i = word_frequency[ti]
            c_j = word_frequency[tj]
            c_i_and_j = len(word_frequency_in_documents[ti].intersection(word_frequency_in_documents[tj]))

            dividend = (c_i_and_j + 1.0) / float(n_docs)
            divisor = ((c_i * c_j) / float(n_docs) ** 2)
            pmi += max([np.log(dividend / divisor), 0])

            npmi += -1.0 * np.log((c_i_and_j + 0.01) / float(n_docs))

    if npmi != 0:
        npmi = pmi / npmi

    if normalise:
        return npmi
    else:
        return pmi

In [None]:
from tm_metrics.feature_extraction import get_tfidf_matrices, get_vocabulary, get_word_frequencies
from tm_metrics.metrics import coherence, tfidf_coherence, lcp, topic_w2v

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models import Word2Vec

In [None]:
def prepare_data_for_metrics(data):
  n_features = int(len(data)/2)
  tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english', max_features=1000)
  tfidf = tfidf_vectorizer.fit_transform(data)
  tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
  vocabulary = get_vocabulary(data)
  word_frequency, word_frequency_in_documents = get_word_frequencies(data)
  return word_frequency, word_frequency_in_documents

Preparing data for metrics:

In [None]:
wf, wfid = prepare_data_for_metrics(ds.data)

In [None]:
def count_metrics(x, word_frequency, word_frequency_in_documents):

    pmi_results = []
    npmi_results = []
    coherence_results = []


    n_samples=len(x)

    for tw in x:
        pmi_ = pmi(tw, word_frequency, word_frequency_in_documents, n_samples, normalise=False)
        npmi_ = pmi(tw, word_frequency, word_frequency_in_documents, n_samples, normalise=True)

        coherence_ = coherence(tw, word_frequency, word_frequency_in_documents)


        pmi_results.append(pmi_)
        npmi_results.append(npmi_)
        coherence_results.append(coherence_)

    return {'pmi_results': pmi_results,
          'npmi_results': npmi_results,
          'coherence_results': coherence_results}

In [None]:
results = count_metrics(topics, wf, wfid)

In [None]:
def get_result_table(d):
  pmi_results = d['pmi_results']
  npmi_results = d['npmi_results']
  coherence_results = d['coherence_results']


  avg_pmi, std_pmi = np.mean(pmi_results), np.std(pmi_results)
  avg_npmi, std_npmi = np.mean(npmi_results), np.std(npmi_results)
  avg_coherence, std_coherence = np.mean(coherence_results), np.std(coherence_results)



  data = [
    ["PMI", avg_pmi, std_pmi],
    ["NPMI", avg_npmi, std_npmi],
    ["Coherence", avg_coherence, std_coherence],
  ]
  columns = ["Metric", "Avg", "Std"]
  df = pd.DataFrame(data, columns=columns)
  return df

In [None]:
get_result_table(results)

Unnamed: 0,Metric,Avg,Std
0,PMI,40.855594,35.659783
1,NPMI,0.029166,0.020366
2,Coherence,-255.171205,71.341997


#Getting metric results

##BERTOPIC

The 1st state-of-the-art model for comparison

In [None]:
! pip install bertopic

In [None]:
from bertopic import BERTopic
bt = BERTopic(language="multilingual")
from scipy.sparse import bsr_array
bsr_arraytopics, probs = bt.fit_transform(ds.data)

.gitattributes:   0%|          | 0.00/968 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/471M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

unigram.json:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
bertopic_topics = bt.get_topic_info()
bertopic_results = bertopic_topics.Representation.tolist()
for i in range(len(bertopic_results)):
    print('Topic', i, ':', ', '.join(bertopic_results[i]))

Topic 0 : the, is, to, of, and, in, for, it, that, from
Topic 1 : gun, guns, militia, firearms, weapon, weapons, firearm, police, of, you
Topic 2 : card, drivers, diamond, ati, video, driver, speedstar, vesa, 24x, windows
Topic 3 : israel, israeli, arab, jews, arabs, israelis, cpr, war, adam, borders
Topic 4 : 55, hockey, 25, nhl, team, period, 11, 10, pp, play
Topic 5 : bmw, car, moa, engine, cars, honda, miles, ford, com, qazi
Topic 6 : sound, audio, stereo, midi, relays, channel, speaker, irq, noise, soundblaster
Topic 7 : space, nasa, jpl, spacecraft, gov, baalke, kelvin, dealy, ___, gsfc
Topic 8 : entry, output, file, oname, printf, entries, fprintf, char, contest, stream
Topic 9 : stephanopoulos, president, mr, we, jobs, clinton, file, myers, congress, that
Topic 10 : turkish, armenian, armenians, serdar, argic, turks, genocide, soviet, zuma, armenia
Topic 11 : vs, gm, flyers, game, hawks, blues, ahl, chi, adirondack, at
Topic 12 : objective, morality, values, frank, christian, d

In [None]:
results = count_metrics(bertopic_results, wf, wfid)
get_result_table(results)

Unnamed: 0,Metric,Avg,Std
0,PMI,20.675695,19.710966
1,NPMI,0.114489,0.109385
2,Coherence,-37.992408,12.051126


##Top2Vec

The 2nd state-of-the-art model for comparison

In [None]:
from IPython.display import clear_output
!pip install top2vec
!pip install top2vec[sentence_transformers]
clear_output()

In [None]:
!pip install top2vec[sentence_encoders]
clear_output()

In [None]:
from top2vec import Top2Vec

In [None]:
model = Top2Vec(ds.data, embedding_model='universal-sentence-encoder-multilingual')
num_topics = model.get_num_topics()
top2vec_topic_words, top2vec_word_scores, top2vec_topic_nums = model.get_topics(num_topics)
for i in range(len(top2vec_topic_words)):
    print('Topic', i, ':', ', '.join(top2vec_topic_words[i]))
results = count_metrics(top2vec_topic_words, wf, wfid)
get_result_table(results)

2023-11-19 06:17:57,402 - top2vec - INFO - Pre-processing documents for training
INFO:top2vec:Pre-processing documents for training
2023-11-19 06:18:16,865 - top2vec - INFO - Downloading universal-sentence-encoder-multilingual model
INFO:top2vec:Downloading universal-sentence-encoder-multilingual model
2023-11-19 06:18:32,571 - top2vec - INFO - Creating joint document/word embedding
INFO:top2vec:Creating joint document/word embedding
2023-11-19 06:31:18,203 - top2vec - INFO - Creating lower dimension embedding of documents
INFO:top2vec:Creating lower dimension embedding of documents
2023-11-19 06:32:01,741 - top2vec - INFO - Finding dense areas of documents
INFO:top2vec:Finding dense areas of documents
2023-11-19 06:32:02,077 - top2vec - INFO - Finding topics
INFO:top2vec:Finding topics


Topic 0 : phillies, dodgers, yankees, espn, nhl, hockey, pittsburgh, playoffs, playoff, bruins, gretzky, baseball, standings, hawks, pitchers, rangers, rutgers, canucks, teams, cubs, islanders, braves, orioles, goalie, tavares, hopkins, stanford, stanley, brady, team, league, ufl, jersey, phil, cleveland, puck, tim, habs, robinson, coach, eddie, lopez, hitter, lindros, hl, purdue, pitcher, sports, williams, lemieux
Topic 1 : edu, universities, professor, university, prof, educational, academic, ncsu, wustl, article, subject, decnet, wrote, education, cnn, freenet, newsletter, cited, scholars, nichols, zisfein, contrib, sc_, institute, deleted, uni, literature, publications, doctrine, telnet, students, campus, organizations, dsl, stanford, nc, college, ncsl, stl, topics, publication, srl, ioccc, harvard, subjects, enet, ns, uoknor, msstate, sc
Topic 2 : spacecraft, aerospace, astronaut, astronomy, satellites, astronomical, planetary, orbiter, satellite, space, orbit, planet, orbital, te

Unnamed: 0,Metric,Avg,Std
0,PMI,3.607402,9.988974
1,NPMI,0.000565,0.001383
2,Coherence,-1601.522931,148.660833
