In [1]:
import os, sys
sys.path.insert(0, os.path.abspath('..'))

In [2]:
%load_ext autoreload
%autoreload 2

In [22]:
import pandas as pd
pd.set_option('max_colwidth', 100)

from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
import string
import re

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LdaMulticore
from gensim.models import CoherenceModel

import pyLDAvis.gensim_models
# Visualise inside a notebook
pyLDAvis.enable_notebook()

from preprocessing.text_preprocessor import (
    ToLower,
    StopwordsRemover,
    RegexMapper,
    RegexSubstituter,
    SequentialRegexSubstituter,
    CompositeTextPreprocessors
)

from preprocessing.text_tokenizer import (
    NLTKWordTokenizer,
    TextTokenizer

)

  from imp import reload


# 0. Data Preparation

## 0.1. Load Data

In [4]:
data, _ = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
    return_X_y=True,
)

print(f"number of articles: {len(data)}")

number of articles: 11314


In [5]:
df = pd.DataFrame({"text": data})

print(f"dataframe shape: {df.shape}")
df.head()

dataframe shape: (11314, 1)


Unnamed: 0,text
0,Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statemen...
1,"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism..."
2,"Although I realize that principle is not one of your strongest\npoints, I would still like to kn..."
3,"Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it? ATT's..."
4,"Well, I will have to change the scoring on my playoff pool. Unfortunately\nI don't have time ri..."


# 0.2. Preprocess Data

In [6]:
en_stop_words = set(stopwords.words('english'))

print(f"preview stopwords: {list(en_stop_words)[:10]}")

preview stopwords: ['and', 'few', 'in', "couldn't", 'into', 'whom', 'theirs', 'won', 'with', 'over']


In [7]:
punctuation = string.punctuation

print(f"preview punctuation: {list(punctuation)[:10]}")

preview punctuation: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*']


In [8]:
regex_mappers = [
    # remove punctuation
    RegexMapper(regex=f"[{punctuation}]", sub=''),
    
    # collapse whitespace(s) into one space
    RegexMapper(regex="\s+", sub=' '),
                
    # strip whitespace(s)
    RegexMapper(regex="^\s+|\s+$", sub=''),
]

preprocessors = [
    ToLower(),
    StopwordsRemover(list_stopwords=en_stop_words),
    SequentialRegexSubstituter(list_regex_mappers=regex_mappers),
]

preprocessing_funcs = CompositeTextPreprocessors(list_preprocessors=preprocessors)

In [9]:
# check preprocessing funcs
example_text = """  Yeah, do you expect people to read the FAQ, etc. \nand actually accept hard on \n """
print(f"> before applying preprocessing:\n{example_text}")

print(f"> after applying preprocessing:\n{preprocessing_funcs(example_text)}")

> before applying preprocessing:
  Yeah, do you expect people to read the FAQ, etc. 
and actually accept hard on 
 
> after applying preprocessing:
yeah expect people read faq etc actually accept hard


### Apply preprocessing to data

In [10]:
df['text_preprocessed'] = df['text'].apply(lambda x: preprocessing_funcs(x))

print(f"dataframe shape: {df.shape}")
df.head()

dataframe shape: (11314, 2)


Unnamed: 0,text,text_preprocessed
0,Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statemen...,well im sure story nad seem biased disagree statement us media ruin israels reputation rediculou...
1,"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism...",yeah expect people read faq etc actually accept hard atheism no need little leap faith jimmy log...
2,"Although I realize that principle is not one of your strongest\npoints, I would still like to kn...",although realize principle one strongest points would still like know ask question sort arab cou...
3,"Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it? ATT's...",notwithstanding legitimate fuss proposal much change it atts last product area a priced 1000 sus...
4,"Well, I will have to change the scoring on my playoff pool. Unfortunately\nI don't have time ri...",well change scoring playoff pool unfortunately time right now certainly post new scoring rules t...


# 1. LDA

## 1.1. Tokenize text

In [11]:
df['text_tokenized'] = df['text_preprocessed'].apply(lambda x: NLTKWordTokenizer()(x))

print(f"dataframe shape: {df.shape}")
df.head()

dataframe shape: (11314, 3)


Unnamed: 0,text,text_preprocessed,text_tokenized
0,Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statemen...,well im sure story nad seem biased disagree statement us media ruin israels reputation rediculou...,"[well, im, sure, story, nad, seem, biased, disagree, statement, us, media, ruin, israels, reputa..."
1,"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism...",yeah expect people read faq etc actually accept hard atheism no need little leap faith jimmy log...,"[yeah, expect, people, read, faq, etc, actually, accept, hard, atheism, no, need, little, leap, ..."
2,"Although I realize that principle is not one of your strongest\npoints, I would still like to kn...",although realize principle one strongest points would still like know ask question sort arab cou...,"[although, realize, principle, one, strongest, points, would, still, like, know, ask, question, ..."
3,"Notwithstanding all the legitimate fuss about this proposal, how much\nof a change is it? ATT's...",notwithstanding legitimate fuss proposal much change it atts last product area a priced 1000 sus...,"[notwithstanding, legitimate, fuss, proposal, much, change, it, atts, last, product, area, a, pr..."
4,"Well, I will have to change the scoring on my playoff pool. Unfortunately\nI don't have time ri...",well change scoring playoff pool unfortunately time right now certainly post new scoring rules t...,"[well, change, scoring, playoff, pool, unfortunately, time, right, now, certainly, post, new, sc..."


## 1.2. Build Corpus (BoW):

In [12]:
dictionary = Dictionary(df['text_tokenized'])

# preview tokens
list_tokens = list(dictionary.items())

print(f"number of tokens: {len(list_tokens)}")
print(f"preview tokens: {list_tokens[:5]}")

number of tokens: 106414
preview tokens: [(0, 'acts'), (1, 'all'), (2, 'at'), (3, 'atrocities'), (4, 'austria')]


In [13]:
# filter out irrelevant tokens
dictionary.filter_extremes(no_below=5, no_above=0.5)

# preview tokens
list_tokens = list(dictionary.items())

print(f"number of tokens: {len(list_tokens)}")
print(f"preview tokens: {list_tokens[:5]}")

number of tokens: 16713
preview tokens: [(0, 'acts'), (1, 'all'), (2, 'at'), (3, 'atrocities'), (4, 'austria')]


### Construct the Corpus/BoW

In [14]:
corpus = [ dictionary.doc2bow(doc) for doc in df["text_tokenized"] ]

print(f"preview corpus:\n{corpus[:1]}")

preview corpus:
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 4), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 4), (59, 1), (60, 1), (61, 1)]]


## 1.3. Determine Best Number of Topics

## 1.4. Train LDA

In [15]:
lda_model = LdaModel(corpus=corpus, id2word=dictionary,
                 iterations=50, num_topics=20, passes=10)

## 1.5. Visualize Topics

In [18]:
lda_model.print_topics()

[(0,
  '0.017*"space" + 0.008*"university" + 0.005*"data" + 0.005*"new" + 0.005*"nasa" + 0.005*"research" + 0.005*"center" + 0.004*"launch" + 0.004*"us" + 0.004*"national"'),
 (1,
  '0.140*"1" + 0.090*"2" + 0.065*"0" + 0.051*"3" + 0.038*"4" + 0.029*"5" + 0.021*"6" + 0.018*"7" + 0.017*"8" + 0.013*"10"'),
 (2,
  '0.009*"msg" + 0.009*"food" + 0.007*"one" + 0.007*"ball" + 0.007*"hit" + 0.005*"cubs" + 0.004*"much" + 0.004*"brain" + 0.004*"edge" + 0.004*"base"'),
 (3,
  '0.018*"thanks" + 0.017*"windows" + 0.016*"window" + 0.014*"anyone" + 0.013*"problem" + 0.012*"using" + 0.011*"would" + 0.011*"use" + 0.011*"know" + 0.011*"please"'),
 (4,
  '0.016*"car" + 0.011*"good" + 0.008*"like" + 0.008*"bike" + 0.007*"much" + 0.007*"well" + 0.006*"cars" + 0.006*"engine" + 0.006*"one" + 0.005*"new"'),
 (5,
  '0.032*"image" + 0.019*"widget" + 0.014*"display" + 0.014*"color" + 0.014*"data" + 0.013*"code" + 0.012*"char" + 0.012*"images" + 0.010*"set" + 0.009*"value"'),
 (6,
  '0.016*"people" + 0.011*"one" +

In [19]:
df['text'][0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [20]:
lda_model[corpus][0]

[(6, 0.7416534), (9, 0.24565668)]

In [23]:
lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
