In [None]:
#Installing all required packages
!pip install gensim
!pip install spacy
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Import modules we need
import pandas as pd
import numpy as np
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
import nltk
import spacy
# Fetch NLTK's 'stopwords' corpus; it is read later via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the newsgroups dataset (JSON) directly from GitHub and preview it.
NEWSGROUPS_URL = 'https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json'
news_df = pd.read_json(NEWSGROUPS_URL)
preview = news_df.head(5)
print(preview)

                                             content  target  \
0  From: lerxst@wam.umd.edu (where's my thing)\nS...       7   
1  From: guykuo@carson.u.washington.edu (Guy Kuo)...       4   
2  From: twillis@ec.ecn.purdue.edu (Thomas E Will...       4   
3  From: jgreen@amber (Joe Green)\nSubject: Re: W...       1   
4  From: jcm@head-cfa.harvard.edu (Jonathan McDow...      14   

            target_names  
0              rec.autos  
1  comp.sys.mac.hardware  
2  comp.sys.mac.hardware  
3          comp.graphics  
4              sci.space  


In [None]:
import re
from nltk.corpus import stopwords
# Stop-word list used by textCleaning: NLTK's English stop words plus a few
# extra words specific to this corpus. Note that this rebinds the imported
# `stopwords` corpus reader to a plain list of strings.
stopwords = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
# Cleaning text
def textCleaning(text, stop_words=None):
    """Normalise one raw document into a list of lower-case content tokens.

    Steps, in order: drop e-mail addresses, replace every non-word character
    with a space, drop standalone numbers and single letters, lower-case,
    split on whitespace and filter out stop words.

    Args:
        text: The raw document; coerced with ``str()`` before processing.
        stop_words: Optional iterable of lower-case words to discard. When
            omitted, the module-level ``stopwords`` list is used.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership is O(1) per token instead of scanning a list.
    stop_set = frozenset(stop_words)

    text = str(text)
    # E-mail addresses must go first: once '@' has been replaced by a space
    # (next step) the address can no longer be recognised.
    text = re.sub(r'\S*@\S*\s?', ' ', text)
    # Replace punctuation and other special characters with spaces.
    text = re.sub(r'\W', ' ', text)
    # Drop standalone numbers. Word boundaries (instead of consuming the
    # surrounding whitespace) let consecutive numbers all be matched.
    text = re.sub(r'\b[0-9]+\b', ' ', text)
    # Drop single letters, including runs of them and ones at the text edges.
    text = re.sub(r'\b[a-zA-Z]\b', ' ', text)
    # Lower-case, then tokenise; split() also collapses repeated whitespace.
    tokens = text.lower().split()
    return [word for word in tokens if word not in stop_set]

In [None]:
# Pull the raw posts out of the 'content' column as a plain Python list.
data = news_df['content'].tolist()
# Run every post through textCleaning to obtain one token list per post.
clean_data = list(map(textCleaning, data))
print(clean_data[:1])

[['lerxst', 'wam', 'umd', 'thing', 'car', 'nntp', 'posting', 'host', 'rac3', 'wam', 'umd', 'organization', 'university', 'maryland', 'college', 'park', 'lines', 'wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'door', 'sports', 'car', 'looked', 'late', '60s', 'early', '70s', 'called', 'bricklin', 'doors', 'really', 'small', 'addition', 'front', 'bumper', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'production', 'car', 'made', 'history', 'whatever', 'info', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'neighborhood', 'lerxst']]


In [None]:
# Bigram model: learn frequent word pairs from the cleaned token lists, then
# wrap it in a Phraser for faster application to documents.
bigram = gensim.models.Phrases(clean_data, threshold=100, min_count=5)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram model: same idea, trained on the corpus after bigram merging.
trigram = gensim.models.Phrases(bigram[clean_data], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [None]:
# Define functions for bigrams, trigrams and lemmatization
def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every token list in ``texts``.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every token list.

    Returns a list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each token list is joined with spaces and processed by the module-level
    spaCy pipeline ``nlp``; the lemma of every resulting token whose coarse
    POS tag (``token.pos_``) is in ``allowed_postags`` is kept.

    Args:
        texts: Iterable of documents, each an iterable of string tokens.
        allowed_postags: Iterable of POS tags to keep. The default is an
            immutable tuple rather than a shared mutable list.

    Returns:
        list[list[str]]: One list of lemmas per input document, in order.
    """
    allowed = set(allowed_postags)
    joined_docs = (" ".join(tokens) for tokens in texts)
    # nlp.pipe streams the documents through the pipeline in batches, which
    # avoids the per-call overhead of invoking nlp() once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed]
        for doc in nlp.pipe(joined_docs)
    ]

In [None]:
# Load spaCy's small English pipeline with the parser and NER components
# disabled; lemmatization() below only reads token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Merge detected bigrams in every cleaned document.
bigram_words = make_bigrams(clean_data)

# Lemmatise, keeping only nouns, adjectives, verbs and adverbs.
KEEP_POS = ['NOUN', 'ADJ', 'VERB', 'ADV']
lemmatised_text = lemmatization(bigram_words, allowed_postags=KEEP_POS)

print(lemmatised_text[:1])

[['thing', 'car', 'nntp_poste', 'host', 'park', 'line', 'wondering_anyone', 'enlighten', 'car', 'see', 'day', 'door', 'sport', 'car', 'look', 'call', 'door', 'really', 'small', 'addition', 'separate', 'rest', 'body', 'know', 'tellme', 'model', 'name', 'engine', 'spec', 'year', 'production', 'car', 'make', 'history', 'info', 'funky', 'look', 'car', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst']]


In [None]:
# The lemmatised documents are the texts the model is built from.
texts = lemmatised_text

# Map every distinct token to an integer id.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 5), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)]]


In [None]:
# Human-readable view of the first document: (token, count) instead of (id, count).
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('addition', 1),
  ('body', 1),
  ('bring', 1),
  ('call', 1),
  ('car', 5),
  ('day', 1),
  ('door', 2),
  ('engine', 1),
  ('enlighten', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('park', 1),
  ('production', 1),
  ('really', 1),
  ('rest', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('wondering_anyone', 1),
  ('year', 1)]]

In [None]:
# Train the LDA topic model. All settings are collected in one place so they
# are easy to read and tweak.
lda_params = dict(
    corpus=corpus,            # bag-of-words documents
    id2word=id2word,          # token id -> token mapping
    num_topics=5,
    random_state=100,         # fixed seed for reproducible topics
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

In [None]:
# Show the weighted keywords of each topic.
topic_keywords = lda_model.print_topics()
print(topic_keywords)
# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

[(0, '0.010*"team" + 0.010*"game" + 0.008*"year" + 0.006*"government" + 0.006*"play" + 0.005*"gun" + 0.005*"win" + 0.005*"player" + 0.004*"state" + 0.004*"go"'), (1, '0.013*"say" + 0.013*"write" + 0.012*"people" + 0.010*"think" + 0.008*"know" + 0.008*"make" + 0.008*"line" + 0.007*"see" + 0.007*"article" + 0.006*"go"'), (2, '0.221*"ax" + 0.028*"t" + 0.028*"g" + 0.016*"_" + 0.015*"p" + 0.014*"r" + 0.007*"q" + 0.006*"m" + 0.006*"o" + 0.005*"o__o"'), (3, '0.014*"com" + 0.010*"line" + 0.009*"get" + 0.008*"organization" + 0.008*"write" + 0.006*"key" + 0.005*"article" + 0.005*"go" + 0.005*"nntp_poste" + 0.005*"car"'), (4, '0.017*"line" + 0.009*"organization" + 0.008*"use" + 0.008*"nntp_poste" + 0.008*"com" + 0.007*"host" + 0.007*"program" + 0.007*"drive" + 0.007*"get" + 0.006*"file"')]


In [None]:
# Compute Perplexity
# LdaModel.log_perplexity() returns the per-word likelihood bound, not the
# perplexity itself; gensim derives its perplexity estimate as 2 ** (-bound).
# A higher (less negative) bound, i.e. a lower perplexity, is better.
per_word_bound = lda_model.log_perplexity(corpus)
print('\nPer-word likelihood bound: ', per_word_bound)
print('Perplexity: ', 2 ** (-per_word_bound))

# Compute Coherence Score (c_v) of the topics against the lemmatised texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatised_text, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.31873356748868

Coherence Score:  0.6082085251387609


In [None]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (pyproject.toml) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=92dd3dc88b4827d91ab08ff62d6e1210d83bf589cae9c73a226e5784d16ccde9
  Stored 

In [None]:
# Plotting tools
import pyLDAvis
# Required: the visualisation cell calls pyLDAvis.gensim_models.prepare(), and
# the submodule is presumably not loaded by `import pyLDAvis` alone.
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

  from collections import Iterable


In [None]:
# Render pyLDAvis output inside the notebook, then build the interactive
# topic view from the trained model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis

  default_term_info = default_term_info.sort_values(
