# Topic modeling using traditional NLP

In [None]:
# Import Libraries
!pip install gensim spacy pyLDAvis nltk datasets

from datasets import load_dataset
import spacy
from spacy import displacy
import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel
import pyLDAvis
import pyLDAvis.gensim_models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
Using cached scipy-1.13.1-cp31

In [None]:
# Download NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [None]:
# Load dataset
dataset = load_dataset("ag_news", split="train[:200]")
docs = [item['text'] for item in dataset]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
# Data Cleaning
nlp = spacy.load('en_core_web_sm')

# Words that are expected but don't add value to model
my_stop_words = ['say', '\s', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be']

for stopword in my_stop_words:
  lexeme = nlp.vocab[stopword]
  lexeme.is_stop = True



  my_stop_words = ['say', '\s', 'mr', 'Mr', 'said', 'says', 'saying', 'today', 'be']


In [None]:
# Preprocessing text
texts = []
for doc in docs:
    processed_doc = nlp(doc)
    tokens = [
        token.lemma_ for token in processed_doc
        if not token.is_stop and not token.is_punct and not token.like_num
    ]
    texts.append(tokens)

print(texts[0])

['Wall', 'St.', 'Bears', 'Claw', 'Black', 'Reuters', 'Reuters', 'short', 'seller', 'Wall', 'Street', 'dwindling\\band', 'ultra', 'cynic', 'see', 'green']


In [None]:
# Creating Bigrams
bigram = gensim.models.Phrases(texts)
bigram_mod = gensim.models.phrases.Phraser(bigram)
texts = [bigram_mod[text] for text in texts]
print(texts[0])

['Wall', 'St.', 'Bears', 'Claw', 'Black', 'Reuters_Reuters', 'short', 'seller', 'Wall', 'Street', 'dwindling\\band', 'ultra', 'cynic', 'see', 'green']


In [None]:
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus[1])

[(3, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)]


## Topic Modeling

In [None]:
# Latent Semantic Indexing (LSI) method
lsi_model = LsiModel(corpus=corpus, num_topics=10, id2word=dictionary)
lsi_model.show_topics(num_topics=5)

  sparsetools.csc_matvecs(


[(0,
  '0.877*" " + 0.099*"NTP" + 0.088*"   " + 0.080*"blog" + 0.075*"work" + 0.063*"IE" + 0.062*"time" + 0.059*"go" + 0.056*"Microsoft" + 0.054*"know"'),
 (1,
  '-0.437*"Java" + -0.376*"3d" + -0.292*"core" + -0.283*"project" + -0.239*"code" + -0.219*"source" + -0.150*"release" + -0.148*"SUN" + -0.147*"announce" + -0.145*"vecmath"'),
 (2,
  '-0.365*"logger" + -0.360*"   " + -0.299*"static" + -0.299*"=" + -0.292*"Logger" + -0.219*"Logger.getLogger" + -0.219*"Log4J" + -0.146*"amazingly" + -0.146*"to:\\\\" + -0.097*"developer"'),
 (3,
  '0.334*"need" + 0.249*"sea" + 0.247*"air" + 0.244*"strike" + 0.244*"Australia" + 0.241*"gap" + 0.240*"defend" + 0.164*"nation" + 0.160*"Strike" + 0.160*"Fighter"'),
 (4,
  '0.400*"AP_AP" + 0.228*"new" + 0.183*"scientist" + 0.158*"week" + 0.144*"plan" + -0.117*" " + 0.115*"year" + 0.111*"Lynn" + 0.110*"Thursday" + 0.099*"surrender"')]

In [None]:
# Hierarchical Dirichlet Process (HDP) method
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)
hdp_model.show_topics()[:5]



[(0,
  '0.003*tire + 0.002*Right + 0.002*The\\Sopranos + 0.002*advanced + 0.002*event + 0.002*Pinoy + 0.002*Services + 0.002*Wild + 0.002*encryption + 0.002*economy + 0.002*regulator + 0.002*feel + 0.002*Lightning + 0.002*UK + 0.002*generate + 0.002*spot + 0.002*absence + 0.002*available + 0.002*keep + 0.002*business'),
 (1,
  '0.003*Sierra + 0.003*Japanese + 0.002*browser + 0.002*software\\products + 0.002*Judiciary + 0.002*embrace + 0.002*Exports + 0.002*middle + 0.002*convert + 0.002*comet + 0.002*Virginia + 0.002*Breach + 0.002*resident + 0.002*holiday + 0.002*Cloudscape + 0.002*lawsuit + 0.002*lose + 0.002*i\\can + 0.002*unidentified + 0.002*white'),
 (2,
  '0.003*threat + 0.003*politician + 0.002*NFL + 0.002*Ways + 0.002*pen + 0.002*Quality + 0.002*agressive + 0.002*increasingly + 0.002*Great + 0.002*intelligence + 0.002*meteor + 0.002*fashion + 0.002*nearly + 0.002*mere + 0.002*spacecraft + 0.002*leave + 0.002*Birds + 0.002*classname:\\\\so + 0.002*Google + 0.002*powerful'),
 (3

In [None]:
# Latent Dirichlet Allocation (LDA) method
lda_model = LdaModel(corpus=corpus, num_topics=10, id2word=dictionary)
lda_model.show_topics()



[(0,
  '0.019*" " + 0.008*"AP_AP" + 0.006*"week" + 0.005*"market" + 0.005*"economy" + 0.004*"money" + 0.004*"Hubble" + 0.004*"russian" + 0.004*"space" + 0.004*"Space"'),
 (1,
  '0.014*" " + 0.008*"find" + 0.007*"new" + 0.006*"AP_AP" + 0.006*"blog" + 0.005*"good" + 0.005*"customer" + 0.004*"plan" + 0.004*"post" + 0.004*"NTP"'),
 (2,
  '0.007*" " + 0.006*"AP_AP" + 0.006*"Java" + 0.005*"power" + 0.005*"company" + 0.005*"3d" + 0.004*"core" + 0.004*"New" + 0.004*"Americans" + 0.004*"new"'),
 (3,
  '0.010*" " + 0.008*"AP_AP" + 0.006*"logger" + 0.005*"Google" + 0.005*"=" + 0.005*"static" + 0.005*"project" + 0.005*"   " + 0.005*"Logger" + 0.004*"oil"'),
 (4,
  '0.033*" " + 0.006*"   " + 0.006*"project" + 0.005*"IE" + 0.004*"Microsoft" + 0.004*"game" + 0.004*"week" + 0.004*"Autodesk" + 0.004*"OPEC" + 0.003*"application"'),
 (5,
  '0.021*" " + 0.007*"company" + 0.005*"Google" + 0.004*"share" + 0.004*"Wall" + 0.004*"mail" + 0.004*"Inc." + 0.004*"SPACE.com_SPACE.com" + 0.004*"auction" + 0.003*"sec

## Visualizing Topics with pyLDAvis

In [None]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)