In [1]:
# pip install spacy

In [2]:
# !python -m spacy download en_core_web_md

In [3]:
##### text topic modeling

In [4]:
import pandas as pd
import numpy as np

# Read Data

In [5]:
# https://scikit-learn.org/stable/datasets/index.html#newsgroups-dataset
# http://qwone.com/~jason/20Newsgroups/
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups

# load full data set; strip posts of headers, footers and quoted replies; pick all categories
data = fetch_20newsgroups(
    subset='all', # subset='train',         
    categories=['sci.space', 'rec.motorcycles','comp.windows.x'],
    remove=('headers', 'footers', 'quotes'))

# convert data to dataframe
newsgroup20df = pd.DataFrame(data.data, columns=['content'])
newsgroup20df['target'] = data.target
newsgroup20df['target_names'] = newsgroup20df['target'].apply(lambda x: data.target_names[x])
# newsgroup20df.head()
# newsgroup20df['target_names'].value_counts()

X, y = newsgroup20df['content'], newsgroup20df['target']
X.head()

0    : VirtualGrabKeys is not an OW resource. It be...
1    \nHell, just save your candle stubs and bring ...
2    \n   ...  So how about this?  Give the winning...
3    I ONLY Just prevented myself from diving  in o...
4    A few days ago there was a posting in this gro...
Name: content, dtype: object

In [6]:
newsgroup20df.head()

Unnamed: 0,content,target,target_names
0,: VirtualGrabKeys is not an OW resource. It be...,0,comp.windows.x
1,"\nHell, just save your candle stubs and bring ...",1,rec.motorcycles
2,\n ... So how about this? Give the winning...,2,sci.space
3,I ONLY Just prevented myself from diving in o...,1,rec.motorcycles
4,A few days ago there was a posting in this gro...,0,comp.windows.x


In [7]:
newsgroup20df['target_names'].value_counts()

rec.motorcycles    996
comp.windows.x     988
sci.space          987
Name: target_names, dtype: int64

# LDA setup

In [8]:
##### LDA #####
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
# Latent Dirichlet Allocation is a generative probabilistic model for collections of discrete dataset such as text corpora. 
# It is also a topic model that is used for discovering abstract topics from a collection of documents.

# LDA is an iterative algorithm. 
# Initially, each word is assigned to a random topic.
# Then, in each iteration, the algorithm reassigns each word to a topic based on 
#    the probability of the word belonging to a topic,
#    and the probability of the document generated by a topic.

In [9]:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

n_docs = X.shape[0] 
n_features = 1000          
n_components = 3 # number of topics T
doc_topic_prior = 50/n_components # alpha
topic_word_prior = 0.01 # beta
n_top_words = 20

# n_top_terms = 2
# arr = np.array([10, 20, 40, 5, 30])
# print (arr.argsort())
# print (arr.argsort()[::-1])
# print (arr.argsort()[:-n_top_terms-1:-1])

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Preprocessing using spacy

In [10]:
def custom_tokenizer(doc):

    # use spacy to filter out noise
    tokens = [token.lemma_.lower() 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are greater than 2 characters long
                                    token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve selected pos
                                    #token.text in nlp.vocab and # check if token in vocab 
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop and # get rid of tokens that are stop words
                                    not token.is_currency # get rid of tokens that denote currencies
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [11]:
%%time

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(X))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
X = pd.Series(clean_corpus)
X.head()

Wall time: 54.5 s


0    virtualgrabkeys ow resource belong man page sa...
1    save candle stub bring light dribble wax kindl...
2    win group company corp year moratorium taxis t...
3                                         prevent dive
4    day ago posting group andrea winkler title sec...
dtype: object

# BoW Term Frequency using sklearn

In [12]:
# lda can only use raw counts 

# init bow model
bow = TfidfVectorizer(
    binary=False, norm=None, # tf - bow
    use_idf=False, smooth_idf=False, # idf - none
    lowercase=True, stop_words='english', 
    min_df=2, max_df=0.95, max_features=n_features, 
    ngram_range=(1, 1))

# fit bow model with data
bow_model = bow.fit(X)
feature_names = bow_model.get_feature_names()

# transform data
bow_trans = bow_model.transform(X)

# let's take a look at the doc-term matrix
DocTermMatrix = pd.DataFrame(data=bow_trans.toarray(), 
             index = ['Doc' + str(i) for i in range(n_docs)],
             columns = feature_names)
DocTermMatrix.head() # n_docs, n_features

Unnamed: 0,ability,able,accept,access,act,action,activity,actually,ad,add,...,wrong,xdm,xlib,xmu,xt,xterm,xv,xview,xvoid,year
Doc0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
Doc3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Doc4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# LDA using sklearn

In [13]:
# init lda model
lda = LatentDirichletAllocation(
    n_components=n_components, 
    doc_topic_prior=doc_topic_prior,
    topic_word_prior=topic_word_prior,
    learning_method='online', 
    learning_offset=50,
    random_state=1) 

# fit lda model with data
# lda_model.components_: 
# Since the complete conditional for topic word distribution is a Dirichlet, 
#    components_[i, j] can be viewed as pseudocount that represents the number of times word j was assigned to topic i. 
# It can also be viewed as distribution over the words for each topic after normalization: 
#    model.components_ / model.components_.sum(axis=1)[:, np.newaxis].
lda_model = lda.fit(bow_trans)

# let's take a look at the topic-term matrix
TopicTermMatrix = pd.DataFrame(data=np.round(lda_model.components_, 4), 
             index = ['Topic' + str(i) for i in range(n_components)],
             columns = feature_names)
TopicTermMatrix.head() # n_topics, n_features

Unnamed: 0,ability,able,accept,access,act,action,activity,actually,ad,add,...,wrong,xdm,xlib,xmu,xt,xterm,xv,xview,xvoid,year
Topic0,19.8498,75.6686,8.2712,18.6551,49.6337,8.1282,97.826,44.892,45.7311,35.8606,...,16.0058,0.0603,0.0446,0.061,0.0264,0.1754,0.7441,0.0225,0.0111,397.293
Topic1,28.3355,76.2608,26.1547,124.2753,12.588,54.6487,3.7558,71.8657,0.0712,211.4134,...,28.4953,95.0492,136.9066,57.3642,185.6525,273.3357,27.0337,170.9821,0.0115,4.9873
Topic2,2.8908,41.3794,21.8202,12.3623,2.2161,22.5933,0.3525,89.1645,0.4091,16.6746,...,71.1026,0.2593,4.0156,2.0672,0.1483,7.7362,51.693,0.2648,48.0701,196.8194


In [14]:
# let's take a look at the top words in each topic
print_top_words(lda_model, feature_names, n_top_words)

Topic #0: space launch time year earth nasa orbit mission satellite shuttle moon new cost think know solar spacecraft planet design large
Topic #1: window use server run widget application include available file support set version display work motif program user look sun software
Topic #2: file bike know program entry need think good use try line ride way want thing time read right people build



In [15]:
# transform data
lda_trans = lda_model.transform(bow_trans)

# let's take a look at the doc-topic matrix.
# note that the each document is now represented in terms of the underlying latent topics.
# the numbers represent how much of  the document was generated by which topic, and add up to 1.
DocTopicMatrix = pd.DataFrame(data=np.round(lda_trans, 4), 
             index = ['Doc' + str(i) for i in range(n_docs)],
             columns = ['Topic' + str(i) for i in range(n_components)])
DocTopicMatrix.head() # n_docs, n_topics

# let's also include the dominant topic in each document 
DocTopicMatrix['dominant_topic'] = np.argmax(DocTopicMatrix.values, axis=1)
DocTopicMatrix.head() # n_docs, n_topics + 1 col for dominant topic

Unnamed: 0,Topic0,Topic1,Topic2,dominant_topic
Doc0,0.3244,0.3521,0.3236,1
Doc1,0.3212,0.2922,0.3866,2
Doc2,0.3509,0.3002,0.3489,0
Doc3,0.3333,0.3333,0.3333,0
Doc4,0.3252,0.3022,0.3726,2


In [16]:
# Perplexity: a measure of how genralizable the model is. lower the score, the better.
lda_model.perplexity(bow_trans)

659.6462105275187

In [17]:
# here's the distribution of topics
DocTopicMatrix['dominant_topic'].value_counts()

2    1106
0     978
1     887
Name: dominant_topic, dtype: int64

In [18]:
# a good topic model will have big, non-overlapping clusters for each topic. 

# pip install pyLDAvis
# import pyLDAvis.sklearn
 
# pyLDAvis.enable_notebook()
# panel = pyLDAvis.sklearn.prepare(lda_model, bow_trans, bow, mds='tsne')
# panel

In [19]:
# predict topic given some text
text = "NASA is an independent agency responsible for the civilian space program."
text_topic_scores = lda_model.transform(bow_model.transform([text]))
dominant_topic_index = text_topic_scores.argmax()
dominant_topic_terms = TopicTermMatrix.iloc[dominant_topic_index, :]
print ("Dominant Topic Index:\n", dominant_topic_index)
print ("Dominant Topic Terms:\n", dominant_topic_terms)

Dominant Topic Index:
 0
Dominant Topic Terms:
 ability     19.8498
able        75.6686
accept       8.2712
access      18.6551
act         49.6337
             ...   
xterm        0.1754
xv           0.7441
xview        0.0225
xvoid        0.0111
year       397.2930
Name: Topic0, Length: 1000, dtype: float64


In [20]:
lda_model.transform(bow_model.transform([text]))

array([[0.36490317, 0.3156024 , 0.31949443]])