# NLP with Python: Topic Modeling

Source: https://sanjayasubedi.com.np/nlp/nlp-with-python-topic-modeling/

In [43]:
import nltk
import spacy
import numpy as np
import pandas as pd

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [12]:
!python -m spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [2]:
# download data
!wget http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip

--2020-07-01 13:16:43--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘bbc-fulltext.zip’


2020-07-01 13:16:44 (3.99 MB/s) - ‘bbc-fulltext.zip’ saved [2874078/2874078]



In [4]:
import zipfile
with zipfile.ZipFile('/content/bbc-fulltext.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/bbc-fulltext')

In [7]:
from sklearn.datasets import load_files
path = '/content/bbc-fulltext/bbc'
files = load_files(path, encoding='utf-8', decode_error='replace')
print(files)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [8]:
df = pd.DataFrame(list(zip(files['data'], files['target'])), columns=['text', 'label'])
df.head()

Unnamed: 0,text,label
0,Tate & Lyle boss bags top award\n\nTate & Lyle...,0
1,Halo 2 sells five million copies\n\nMicrosoft ...,4
2,MSPs hear renewed climate warning\n\nClimate c...,2
3,Pavey focuses on indoor success\n\nJo Pavey wi...,3
4,Tories reject rethink on axed MP\n\nSacked MP ...,2


## Introduction

Topic modeling is an interesting problem in NLP applications where we want to get an idea of what topics we have in our dataset. A topic is nothing more than a collection of words that describe the overall theme. For example, in case of news articles, we might think of topics as politics, sports etc. but topic modeling won’t directly give you names of the topics but rather a set of most probable words that might describe a topic. It is up to us to determine what topic the set of words might refer to.

In [15]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
some_text = 'Topic modeling is an interesting problem in NLP applications where we want to get an idea of what topics we have in our dataset.'
doc = nlp(some_text)
print([(token.lemma_, token.pos_) for token in doc])

[('topic', 'ADJ'), ('modeling', 'NOUN'), ('be', 'AUX'), ('an', 'DET'), ('interesting', 'ADJ'), ('problem', 'NOUN'), ('in', 'ADP'), ('NLP', 'PROPN'), ('application', 'NOUN'), ('where', 'ADV'), ('-PRON-', 'PRON'), ('want', 'VERB'), ('to', 'PART'), ('get', 'AUX'), ('an', 'DET'), ('idea', 'NOUN'), ('of', 'ADP'), ('what', 'PRON'), ('topic', 'NOUN'), ('-PRON-', 'PRON'), ('have', 'AUX'), ('in', 'ADP'), ('-PRON-', 'DET'), ('dataset', 'NOUN'), ('.', 'PUNCT')]


In [16]:
def only_nouns(text):
    doc = nlp(text)
    noun_text = ' '.join(token.lemma_ for token in doc if token.pos_ == 'NOUN')
    return noun_text

In [17]:
df['text'] = df['text'].apply(only_nouns)
df['text'].head()

0    boss bag award executive business magazine tit...
1    copy bumper sale fi shooter game copy sale com...
3    pavey success view week race track bronze inju...
4    tory rethink association candidate election ag...
Name: text, dtype: object

## Model Training

In [18]:
n_topics = 5

from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(stop_words='english')
features = vec.fit_transform(df['text'])

from sklearn.decomposition import NMF
nmf = NMF(n_components=n_topics)
nmf.fit(features)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Results

In [31]:
feature_names = vec.get_feature_names()
for i, topic in enumerate(nmf.components_):
    inds = topic.argsort()[-15:]
    words = np.array(feature_names)[inds]
    print(i, ' '.join(words))

0 quarter month analyst oil profit firm price share company market rate year sale economy growth
1 ceremony category prize festival role comedy year movie nomination director star actress actor award film
2 minute champion title victory goal coach season win time club injury team match player game
3 vote taxis country voter issue plan chancellor campaign people leader minister tax government party election
4 firm site video device network tv computer software broadband user service technology music people phone


In [33]:
new_articles = [
    "Playstation network was down so many people were angry",
    "Germany scored 7 goals against Brazil in worldcup semi-finals"
]

In [34]:
new_features = vec.transform(new_articles)
transformed = nmf.transform(new_features)
transformed.shape

(2, 5)

In [35]:
transformed.argmax(axis=1)

array([4, 2])

## Using LatentDirichletAllocation

In [None]:
def only_nouns(text):
    doc = nlp(text)
    noun_text = ' '.join(token.lemma_ for token in doc if token.pos_ == 'NOUN')
    return noun_text

In [53]:
class OnlyNouns(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, texts, y=None):
        output = []
        for text in texts:
            doc = nlp(text)
            noun_text = ' '.join(token.lemma_ for token in doc if token.pos_ == 'NOUN')
            output.append(noun_text)
        return output

In [54]:
n_topics = 5
pipeline = Pipeline([('only_nouns', OnlyNouns()),
                     ('vectorizer', TfidfVectorizer(stop_words='english')),
                     ('transformer', LatentDirichletAllocation(n_components=n_topics))])

In [55]:
pipeline.fit(df['text'])

Pipeline(memory=None,
         steps=[('only_nouns', OnlyNouns()),
                ('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=No...
                                 vocabulary=None)),
                ('transformer',
                 LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                                           evaluate_every=-1,
                                           learning_decay=0.7,
                                           learning_method='ba

In [56]:
feature_names = pipeline.named_steps['vectorizer'].get_feature_names()
components = pipeline.named_steps['transformer'].components_

In [61]:
for i, topic in enumerate(components):
    inds = topic.argsort()[-15:]
    words = ' '.join(np.array(feature_names)[inds])
    print(f"Topic {i}: {words}")

Topic 0: hole panda fix program writer patch axe leinster spyware software ballet attachment flaw virus patent
Topic 1: star nomination actress year role chart ceremony category band number singer director award actor film
Topic 2: seed week minute club year coach title victory injury season time match team player game
Topic 3: service month user profit price technology share phone music market company sale year firm people
Topic 4: law job policy rate campaign year country issue economy plan tax leader people government election


In [62]:
new_articles = [
    "Playstation network was down so many people were angry",
    "Germany scored 7 goals against Brazil in worldcup semi-finals"
]

In [70]:
new_features = pipeline.named_steps['vectorizer'].transform(new_articles)
transformed = pipeline.named_steps['transformer'].transform(new_features)

In [71]:
transformed.argmax(axis=1)

array([3, 4])