In [115]:
%matplotlib inline

import nltk
import re
import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, Phrases, phrases

import pyLDAvis
import pyLDAvis.gensim  # don't skip this

# Spacy for lemmatization
import spacy
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups

In [34]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Import the Newsgroups Data

In [32]:
newsgroups_train = fetch_20newsgroups(subset='train')

## Convert the newsgroups into Pandas DataFrame

In [30]:
data = zip(newsgroups_train.data, 
           newsgroups_train.target, 
           newsgroups_train.target_names)
df = pd.DataFrame(data=data, columns=['data', 'target', 'target_names'])
df.head()

Unnamed: 0,data,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,alt.atheism
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.graphics
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.os.ms-windows.misc
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.sys.ibm.pc.hardware
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,comp.sys.mac.hardware


In [33]:
df.target_names.unique()

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

## Cleansing of data

In [69]:
# Convert the data to list for preprocessing.
data = df.data.values.tolist()

# Remove emails.
data = [re.sub('\S*@\S*\s', '', sentence)
        for sentence in data]

# Remove new line characters.
data = [re.sub('\s', ' ', sentence)
        for sentence in data]

# Remove distracting single quotes.
data = [re.sub("\'", '', sentence)
        for sentence in data]

data = [sentence.strip()
        for sentence in data]

data[:2]

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15   I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is  all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail.  Thanks, - IL    ---- brought to you by your neighborhood Lerxst ----',
 'From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: carson.u.washington.edu  A fair number of brave souls who upg

## Tokenize sentence and clean up text.

In [74]:
def sentence_to_tokens(sentences):
    for sentence in sentences:
        yield simple_preprocess(str(sentence),
                                deacc=True) # Removes punctuation.

In [75]:
data_words = list(sentence_to_tokens(data))
data_words[:1]

[['from',
  'wheres',
  'my',
  'thing',
  'subject',
  'what',
  'car',
  'is',
  'this',
  'nntp',
  'posting',
  'host',
  'rac',
  'wam',
  'umd',
  'edu',
  'organization',
  'university',
  'of',
  'maryland',
  'college',
  'park',
  'lines',
  'was',
  'wondering',
  'if',
  'anyone',
  'out',
  'there',
  'could',
  'enlighten',
  'me',
  'on',
  'this',
  'car',
  'saw',
  'the',
  'other',
  'day',
  'it',
  'was',
  'door',
  'sports',
  'car',
  'looked',
  'to',
  'be',
  'from',
  'the',
  'late',
  'early',
  'it',
  'was',
  'called',
  'bricklin',
  'the',
  'doors',
  'were',
  'really',
  'small',
  'in',
  'addition',
  'the',
  'front',
  'bumper',
  'was',
  'separate',
  'from',
  'the',
  'rest',
  'of',
  'the',
  'body',
  'this',
  'is',
  'all',
  'know',
  'if',
  'anyone',
  'can',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'years',
  'of',
  'production',
  'where',
  'this',
  'car',
  'is',
  'made',
  'history',
  'or',
  'whatever',
  

## Creating bigram and trigram models.

In [77]:
bigram = Phrases(data_words, min_count=5, threshold=100)
trigram = Phrases(bigram[data_words], threshold=100)

In [85]:
bigram_model = phrases.Phraser(bigram)
trigram_model = phrases.Phraser(trigram)

In [89]:
trigram_model[bigram_model[data_words[0]]]

['from',
 'guy',
 'kuo',
 'subject',
 'si',
 'clock',
 'poll',
 'final',
 'call',
 'summary',
 'final',
 'call',
 'for',
 'si',
 'clock',
 'reports',
 'keywords',
 'si',
 'acceleration',
 'clock',
 'upgrade',
 'article',
 'shelley',
 'qvfo',
 'innc',
 'organization',
 'university',
 'of',
 'washington',
 'lines',
 'nntp_posting_host',
 'carson',
 'washington',
 'edu',
 'fair',
 'number',
 'of',
 'brave',
 'souls',
 'who',
 'upgraded',
 'their',
 'si',
 'clock',
 'oscillator',
 'have',
 'shared',
 'their',
 'experiences',
 'for',
 'this',
 'poll',
 'please',
 'send',
 'brief',
 'message',
 'detailing',
 'your',
 'experiences',
 'with',
 'the',
 'procedure',
 'top',
 'speed',
 'attained',
 'cpu',
 'rated',
 'speed',
 'add',
 'on',
 'cards',
 'and',
 'adapters',
 'heat',
 'sinks',
 'hour',
 'of',
 'usage',
 'per',
 'day',
 'floppy',
 'disk',
 'functionality',
 'with',
 'and',
 'floppies',
 'are',
 'especially',
 'requested',
 'will',
 'be',
 'summarizing',
 'in',
 'the',
 'next',
 'two',


## Remove stopwords, make bigrams/trigrams and lemmatize

In [95]:
def remove_stopwords(docs):
    return [[token for token in tokens
             if token not in stop_words]
            for tokens in docs]

def make_bigrams(docs):
    return [bigram_model[doc] for doc in docs]

def make_trigrams(docs):
    return [trigram_model[doc] for doc in docs]

# Initialize spacy 'en' model, keeping only tagget component
# for efficiency.
# !python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

def lemmatization(docs, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    result = []
    for tokens in docs:
        doc = nlp(' '.join(tokens))
        result.append([token.lemma_ for token in doc 
                       if token.pos_ in allowed_postags])
    return result

In [100]:
# Remove stop words.
data_words_nostops = remove_stopwords(data_words)

# Form bigrams.
data_words_bigrams = make_bigrams(data_words_nostops)

data_lemmatized = lemmatization(data_words_bigrams)

data_lemmatized[:1]

[['where',
  's',
  'thing',
  'car',
  'nntp_poste',
  'host',
  'rac',
  'wam',
  'umd',
  'organization',
  'university',
  'maryland',
  'college',
  'park',
  'line',
  'wonder',
  'anyone',
  'could',
  'enlighten',
  'car',
  'see',
  'day',
  'door',
  'sport',
  'car',
  'look',
  'late',
  'early',
  'call',
  'bricklin',
  'door',
  'really',
  'small',
  'addition',
  'front',
  'bumper',
  'separate',
  'rest',
  'body',
  'know',
  'anyone',
  'tellme',
  'model',
  'name',
  'engine',
  'specs',
  'year',
  'production',
  'car',
  'make',
  'history',
  'info',
  'funky',
  'look',
  'car',
  'mail',
  'thank',
  'bring',
  'neighborhood',
  'lerxst']]

## Create the dictionary and corpus needed for topic modelling

In [103]:
# Create dictionary.
id2word = corpora.Dictionary(data_lemmatized)

# Create corpus.
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

# Gensim creates a unique id for each word in the document. 
# The result is a list of tuples (word_id, word_frequency).
corpus[:1]

[[(0, 1),
  (1, 2),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 5),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 2),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1)]]

In [104]:
# The word for the id = 0.
id2word[0]

'addition'

In [105]:
# Human readable format of the corpus.
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('bring', 1),
  ('bumper', 1),
  ('call', 1),
  ('car', 5),
  ('college', 1),
  ('could', 1),
  ('day', 1),
  ('door', 2),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('know', 1),
  ('late', 1),
  ('lerxst', 1),
  ('line', 1),
  ('look', 2),
  ('mail', 1),
  ('make', 1),
  ('maryland', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('nntp_poste', 1),
  ('organization', 1),
  ('park', 1),
  ('production', 1),
  ('rac', 1),
  ('really', 1),
  ('rest', 1),
  ('s', 1),
  ('see', 1),
  ('separate', 1),
  ('small', 1),
  ('specs', 1),
  ('sport', 1),
  ('tellme', 1),
  ('thank', 1),
  ('thing', 1),
  ('umd', 1),
  ('university', 1),
  ('wam', 1),
  ('where', 1),
  ('wonder', 1),
  ('year', 1)]]

## Building the topic model.

In [107]:
# Build LDA model.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

In [108]:
lda_model.print_topics()

[(0,
  '0.001*"access" + 0.001*"boot" + 0.001*"question" + 0.001*"get" + 0.001*"terminal" + 0.001*"network" + 0.001*"file" + 0.001*"well" + 0.001*"great" + 0.001*"have"'),
 (1,
  '0.001*"year" + 0.001*"car" + 0.001*"insurance" + 0.001*"rate" + 0.001*"go" + 0.001*"be" + 0.001*"high" + 0.001*"pay" + 0.001*"get" + 0.001*"live"'),
 (2,
  '0.014*"record" + 0.013*"old" + 0.013*"jame" + 0.013*"initially" + 0.013*"pretty" + 0.013*"much" + 0.012*"different" + 0.012*"try" + 0.012*"require" + 0.012*"absolute"'),
 (3,
  '0.001*"weapon" + 0.001*"year" + 0.001*"scsi" + 0.001*"not" + 0.001*"mass_destruction" + 0.001*"say" + 0.001*"would" + 0.001*"article" + 0.001*"car" + 0.001*"insurance"'),
 (4,
  '0.052*"board" + 0.016*"be" + 0.016*"file" + 0.016*"stac" + 0.016*"have" + 0.016*"autodoubler" + 0.016*"problem" + 0.011*"double" + 0.011*"illinoi" + 0.011*"write"'),
 (5,
  '0.026*"clock" + 0.021*"si" + 0.016*"cerkoney" + 0.016*"rod" + 0.016*"upgrade" + 0.016*"poll" + 0.016*"co" + 0.016*"experience" + 0.0

In [110]:
doc_lda = lda_model[corpus]

<gensim.interfaces.TransformedCorpus at 0x128351a10>

## Compute model perplexity and coherence score.

In [113]:
# A measure of how good the model is. The lower the better.
lda_model.log_perplexity(corpus)

-7.2952888443039825

In [114]:
# Compute coherence score.
coherence_model_lds = CoherenceModel(model=lda_model,
                                     texts=data_lemmatized,
                                     dictionary=id2word,
                                     coherence='c_v')
coherence_model_lds.get_coherence()

0.487836108430972

## Visualize the topics-keywords

In [116]:
pyLDAvis.enable_notebook()

In [117]:
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# References

- https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/