<a href="https://colab.research.google.com/github/AliAkbarBadri/topic-modelling/blob/master/topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# setup

In [25]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import time

In [None]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [None]:
newsgroups_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
newsgroups_train['data'][0].replace('\n','')

"From: lerxst@wam.umd.edu (where's my thing)Subject: WHAT car is this!?Nntp-Posting-Host: rac3.wam.umd.eduOrganization: University of Maryland, College ParkLines: 15 I was wondering if anyone out there could enlighten me on this car I sawthe other day. It was a 2-door sports car, looked to be from the late 60s/early 70s. It was called a Bricklin. The doors were really small. In addition,the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, yearsof production, where this car is made, history, or whatever info youhave on this funky looking car, please e-mail.Thanks,- IL   ---- brought to you by your neighborhood Lerxst ----"

# TOPIC MODELLING

## Preprocess

In [14]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [34]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
#stop-words
stop_words=set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
def clean_text(headline):
  le=WordNetLemmatizer()
  headline = headline.replace('\n', '')
  word_tokens = word_tokenize(headline)
  tokens = [le.lemmatize(w) for w in word_tokens if w not in stop_words and len(w)>3]
  cleaned_text = " ".join(tokens)
  return cleaned_text

In [28]:
df = pd.DataFrame(newsgroups_train['data'], columns = ['data'])

In [35]:
df['data'] = df['data'].apply(clean_text)

In [36]:
df['data'][0]

'From lerxst wam.umd.edu thing Subject WHAT Nntp-Posting-Host rac3.wam.umd.eduOrganization University Maryland College ParkLines wondering anyone could enlighten sawthe 2-door sport looked late 60s/early called Bricklin door really small addition front bumper separate rest body This know anyone tellme model name engine spec yearsof production made history whatever info youhave funky looking please e-mail.Thanks brought neighborhood Lerxst'

In [37]:
vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)

In [39]:
vect_text=vect.fit_transform(df['data'])

## Latent Semantic Analysis (LSA)

In [60]:
from sklearn.decomposition import TruncatedSVD
lsa_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=10, random_state=42)

In [61]:
start_time = time.time()

vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)
vect_text=vect.fit_transform(df['data'])

lsa_top=lsa_model.fit_transform(vect_text)

# most important words for each topic
vocab = vect.get_feature_names_out()

for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

print(time.time() - start_time)

Topic 0: 
edu com would writes article subject posting like university host 

Topic 1: 
edu file windows university thanks card host posting window drive 

Topic 2: 
edu cs university game team cc pitt player uiuc year 

Topic 3: 
com article writes edu netcom hp posting host pitt inc 

Topic 4: 
game team ca player year hockey play season toronto league 

Topic 5: 
nasa gov space research access digex center moon orbit station 

Topic 6: 
ac uk cs file co window ca pitt gordon banks 

Topic 7: 
pitt cs gordon banks computer chip pittsburgh univ clipper soon 

Topic 8: 
chip clipper encryption key government game escrow state netcom phone 

Topic 9: 
ac uk drive scsi chip clipper co hard encryption key 

2.1852498054504395


In [62]:
start_time = time.time()

vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)
vect_text=vect.fit_transform(newsgroups_train['data'])

lsa_top=lsa_model.fit_transform(vect_text)

# most important words for each topic
vocab = vect.get_feature_names_out()

for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

print(time.time() - start_time)

Topic 0: 
edu com would writes article one subject lines organization university 

Topic 1: 
god people com would one jesus think us government say 

Topic 2: 
com windows netcom hp access inc uk ibm dos window 

Topic 3: 
com edu article writes netcom posting nntp hp host pitt 

Topic 4: 
ca team game year nasa hockey gov toronto players games 

Topic 5: 
key nasa clipper chip encryption gov government edu space keys 

Topic 6: 
uk nasa ac gov space co cs __ science research 

Topic 7: 
nasa gov god space jesus windows bible research ca center 

Topic 8: 
cs pitt gordon science computer windows pittsburgh ca univ soon 

Topic 9: 
ca key god clipper chip encryption escrow keys canada jesus 

2.531376361846924


## Latent Dirichlet Allocation (LDA)

In [63]:
from sklearn.decomposition import LatentDirichletAllocation
lda_model=LatentDirichletAllocation(n_components=10,learning_method='online',random_state=42,max_iter=1) 

In [64]:
start_time = time.time()

vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)
vect_text=vect.fit_transform(newsgroups_train['data'])

lda_top=lda_model.fit_transform(vect_text)

# most important words for each topic
vocab = vect.get_feature_names()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

print(time.time() - start_time)

Topic 0: 
colorado frank 000 500 vs org rate edu year hell 

Topic 1: 
uiuc israel cso israeli virginia columbia jewish harvard jews policy 

Topic 2: 
turkish armenians armenian turkey armenia greek russian uucp million population 

Topic 3: 
edu com windows lines subject organization university thanks posting host 

Topic 4: 
andrew cmu pittsburgh ax 145 cx engineering 0d ah edu 

Topic 5: 
god jesus bible church christian christians christ rutgers faith christianity 

Topic 6: 
edu team game ca nasa pitt hockey year cs games 

Topic 7: 
com edu would writes one article people like subject organization 

Topic 8: 
de hp com edu thanks uni mail lines subject organization 

Topic 9: 
edu caltech keith cwru cleveland freenet com sgi writes morality 

7.015937805175781


In [65]:
start_time = time.time()

vect =TfidfVectorizer(stop_words=stop_words,max_features=1000)
vect_text=vect.fit_transform(df['data'])

lda_top=lda_model.fit_transform(vect_text)

# most important words for each topic
vocab = vect.get_feature_names()

for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key= lambda x:x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0],end=" ")
    print("\n")

print(time.time() - start_time)

Topic 0: 
utexas font scsi austin max texas cc cs cx 34u 

Topic 1: 
uiuc columbia cso illinois insurance sun edu cc rochester harvard 

Topic 2: 
space chip com clipper nasa netcom encryption key edu gov 

Topic 3: 
edu people com would writes article think jesus subject know 

Topic 4: 
team game player pitt edu hockey cmu gordon season andrew 

Topic 5: 
uk ac cwru cleveland freenet co western edu host case 

Topic 6: 
edu ohio nasa caltech buffalo keith gatech state sgi gov 

Topic 7: 
file window mit windows program server motif image color problem 

Topic 8: 
edu com card subject thanks drive university posting mail host 

Topic 9: 
com edu writes would article like subject think bike time 

6.89482307434082


## BertTopic

In [66]:
! pip install -q bertopic

In [90]:
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups



In [None]:
topic_model = BERTopic()

start_time = time.time()

topics, probs = topic_model.fit_transform(newsgroups_train['data'])

print(time.time() - start_time)

In [None]:
[topic_model.get_topics()[i] for i in range(10)]

In [89]:
topic_model2 = BERTopic()

start_time = time.time()

topics2, probs2 = topic_model2.fit_transform(df['data'])

print(time.time() - start_time)

2534.42577791214


In [None]:
[topic_model2.get_topics()[i] for i in range(10)]