# 1. Load the 20newsgroups dataset

In [63]:
# Import librairies and load data

import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models, similarities

from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

import string

# categories   = ['sci.space','comp.graphics', 'sci.med', 'rec.motorcycles', 'rec.sport.baseball']

categories   = ['talk.politics.guns','comp.graphics', 'sci.med', 'rec.motorcycles', 'rec.sport.baseball']


dataset = fetch_20newsgroups(subset='train',  categories=categories, shuffle=True, random_state=42)

for i in range(len(dataset.target_names)):
    print(" %s has %d texts "% (dataset.target_names[i], Counter(dataset.target)[i]     )  )


 comp.graphics has 584 texts 
 rec.motorcycles has 598 texts 
 rec.sport.baseball has 597 texts 
 sci.med has 594 texts 
 talk.politics.guns has 546 texts 


# 2. clean up the data

The clean_up function below 
* lowercase texts
* removes ponctuation
* tokenizes
* removes stop words
* and words with less than 3 letters


In [14]:
# stopwords



In [24]:
# stopwords
stop = set(stopwords.words('english'))
# add the ones you want
stop.update(['edu', 'com'])

# list of Punctuation characters
punctuation_chars = list(string.punctuation)

def cleanup(raw):
    # lowercase
    raw = raw.lower()
    # @ and dot in emails => keep recipients and domain names as words
    raw = re.sub('[@.]', ' ', raw)
    # ponctuation
    raw = ''.join(ch for ch in raw if ch not in punctuation_chars)
    # numbers
    raw = re.sub('[0-9]+', '', raw)
    # tokenize
    raw = word_tokenize(raw)
    # stop words
    raw = [w for w in raw if w not in stop]
    # at least 3 letters
    raw = [w for w in raw if len(w) > 2]

    return raw

tokenized = [ cleanup(raw) for raw in dataset.data ]

# you can also use map(function, data_list): applies function to each data item
# tokenized = list(map(cleanup, dataset.data))




In [26]:
tokenized[0:2]


[['todamhyp',
  'charles',
  'unlv',
  'brian',
  'huey',
  'subject',
  'krillean',
  'photography',
  'originator',
  'todamhyp',
  'charles',
  'unlv',
  'organization',
  'university',
  'nevada',
  'las',
  'vegas',
  'college',
  'engineering',
  'lines',
  'think',
  'thats',
  'correct',
  'spelling',
  'looking',
  'informationsupplies',
  'allow',
  'doityourselfers',
  'take',
  'krillean',
  'pictures',
  'thinking',
  'education',
  'suppliers',
  'schools',
  'might',
  'appartus',
  'sale',
  'dont',
  'know',
  'companies',
  'info',
  'greatly',
  'appreciated',
  'case',
  'dont',
  'know',
  'krillean',
  'photography',
  'best',
  'knowledge',
  'involves',
  'taking',
  'pictures',
  'time',
  'organic',
  'object',
  'charged',
  'plates',
  'picture',
  'show',
  'energy',
  'patterns',
  'spikes',
  'around',
  'object',
  'photographed',
  'depending',
  'type',
  'object',
  'spikes',
  'energy',
  'patterns',
  'vary',
  'one',
  'might',
  'extrapolate',
  '

# 3. Gensim dictionary

Lists all the words and assigns an id to each word

https://radimrehurek.com/gensim/corpora/dictionary.html

* doc2bow: for a given tokenized document, counts the occurences of each word
* filter_extremes(no_below=5, no_above=0.5, keep_n=100000)¶


In [41]:
# Dictionnary
dictionary = corpora.Dictionary(tokenized)
dictionary.save('20newgroup.dict')  # store the dictionary, for future reference

# 38777 tokens
print(dictionary)

# each token as a unique id
# print(dictionary.token2id)

# filter rare words
dictionary.filter_extremes(no_below=2, no_above=0.99)

print("after filtering rare words")
print(dictionary)


Dictionary(38777 unique tokens: ['stole', 'favored', 'tightens', 'bertrand', 'valley']...)
after filtering rare words
Dictionary(19842 unique tokens: ['stole', 'edge', 'floggings', 'attacking', 'valley']...)


# 4. Corpus

1. Create the document term matrix
2. TfIdf



In [46]:
# 4.1 Document term matrix

corpus = [dictionary.doc2bow(text) for text in tokenized]

# store to disk, for later use
corpora.MmCorpus.serialize('20newsgroup.mm', corpus)  

print("Document term matrix - 2 documents")
print()
for c in corpus[0:2]:
    print(c)

print('---------------')
# 4.2 Tf-Idf    

tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]

print("Document term TF-Idf matrix - 2 documents")
print()
for doc in corpus_tfidf[0:2]:
    print(doc)


Document term matrix - 2 documents

[(132, 1), (458, 1), (788, 1), (951, 1), (1102, 1), (1780, 1), (1854, 1), (2093, 1), (2633, 1), (2644, 1), (2933, 1), (3042, 1), (3234, 1), (3285, 1), (3556, 1), (3686, 1), (4351, 1), (4354, 1), (4378, 1), (4612, 1), (4843, 2), (5043, 2), (5142, 1), (5220, 1), (5324, 1), (5704, 1), (6338, 1), (6441, 2), (6534, 1), (7046, 3), (7081, 1), (7369, 1), (7431, 1), (7555, 1), (8479, 1), (8680, 4), (8771, 1), (8891, 3), (9222, 1), (9359, 1), (9798, 1), (9995, 1), (10182, 1), (10390, 1), (10707, 1), (10742, 2), (11462, 1), (11739, 2), (11867, 1), (11955, 1), (12251, 1), (12405, 1), (12407, 1), (12417, 1), (12526, 1), (12535, 2), (13741, 1), (13965, 2), (14083, 1), (14249, 1), (14712, 1), (14737, 1), (14744, 1), (14832, 1), (14833, 1), (15212, 2), (15223, 1), (15234, 1), (15304, 2), (15415, 3), (15492, 1), (15793, 2), (15884, 1), (15893, 1), (17070, 1), (17216, 1), (17596, 1), (17674, 1), (17915, 1), (18135, 1), (18236, 1), (19216, 1), (19837, 1)]
[(145, 1), (3

# 5. LSA / LSI

In [51]:
# initialize an LSI transformation with 5 topics
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
corpus_lsi = lsi[corpus_tfidf]

# save and load 
# lsi.save('20newsgroup.lsi') 
# lsi = models.LsiModel.load('20newsgroup.lsi')

lsi.print_topics(10, num_words = 10)


[(0,
  '-0.120*"space" + -0.103*"would" + -0.099*"pitt" + -0.093*"nasa" + -0.088*"dont" + -0.088*"one" + -0.084*"gordon" + -0.084*"geb" + -0.084*"banks" + -0.081*"like"'),
 (1,
  '-0.407*"geb" + -0.399*"pitt" + -0.392*"banks" + -0.389*"gordon" + -0.143*"shameful" + -0.143*"surrender" + -0.143*"dsl" + -0.143*"cadre" + -0.143*"njxp" + -0.143*"chastity"'),
 (2,
  '0.260*"space" + 0.207*"nasa" + 0.177*"alaska" + 0.170*"henry" + 0.141*"gov" + 0.128*"access" + 0.116*"digex" + 0.116*"toronto" + -0.107*"team" + -0.107*"baseball"'),
 (3,
  '0.371*"msg" + -0.173*"alaska" + 0.160*"food" + -0.155*"henry" + -0.144*"toronto" + -0.117*"team" + -0.106*"year" + -0.101*"jewish" + -0.096*"zoo" + -0.094*"baseball"'),
 (4,
  '0.543*"msg" + 0.225*"food" + -0.133*"graphics" + 0.132*"dyer" + 0.129*"sensitivity" + 0.125*"chinese" + 0.111*"superstition" + 0.105*"alaska" + 0.100*"henry" + -0.097*"bike"'),
 (5,
  '0.247*"henry" + 0.216*"toronto" + 0.198*"bike" + -0.158*"nasa" + 0.150*"zoo" + 0.129*"behanna" + 0.1

In [57]:
# see which doc belongs to which topic
for doc in corpus_lsi[0:2]:
    print()
    print(doc)
    

[(0, -0.094176330990150187), (1, 0.017489563483115533), (2, -0.008565810482037018), (3, 0.035013261917545141), (4, -0.022675746897196682), (5, -0.0086456778370520744), (6, -0.019942609692939102), (7, -0.015968122318651374), (8, -0.007537146304624316), (9, -0.01018110548576481), (10, 0.040224141496179845), (11, -0.0093313926044533754), (12, 0.037485444616863421), (13, 0.0035956442367594287), (14, 0.0041905048562194984)]
[(0, -0.10236062618271063), (1, 0.027621001089529276), (2, -0.06509513980806983), (3, -0.046894327287789341), (4, 0.0086447891487526074), (5, -0.034191072959614766), (6, -0.0071604064409135845), (7, 0.014992869111242025), (8, 0.00019409843778628511), (9, -0.009316413427374489), (10, -0.0095208575229727335), (11, -0.016802274101085309), (12, -0.018744476453832693), (13, -0.015644077883506323), (14, 0.0026116419697328904)]


# 6. LDA


In [61]:
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, alpha='auto', iterations = 250)
corpus_lda = lda[corpus]

lda.print_topics(5, num_words = 15)

# for doc in corpus_lda:
#     print(doc)


[(0,
  '0.007*organization + 0.007*writes + 0.006*one + 0.005*article + 0.005*would + 0.004*apr + 0.004*like + 0.004*dont + 0.003*think + 0.003*good + 0.003*time + 0.003*well + 0.003*university + 0.003*nntppostinghost + 0.003*know'),
 (1,
  '0.009*organization + 0.005*writes + 0.005*one + 0.005*article + 0.005*university + 0.004*dont + 0.004*nntppostinghost + 0.004*would + 0.004*get + 0.004*know + 0.003*like + 0.003*think + 0.003*dod + 0.003*apr + 0.002*good'),
 (2,
  '0.010*space + 0.006*organization + 0.004*nasa + 0.004*like + 0.004*article + 0.004*writes + 0.004*university + 0.004*also + 0.003*image + 0.003*one + 0.003*would + 0.003*research + 0.003*get + 0.003*use + 0.003*nntppostinghost'),
 (3,
  '0.007*would + 0.007*organization + 0.004*jpeg + 0.004*writes + 0.004*one + 0.004*nntppostinghost + 0.003*get + 0.003*article + 0.003*dont + 0.003*know + 0.003*university + 0.003*like + 0.003*image + 0.003*good + 0.003*new'),
 (4,
  '0.009*organization + 0.006*writes + 0.005*article + 0.0

# 7. refine

* add stop words
* remove high frequency words



In [66]:
stop.update(['like', 'dont', 'one', 'would', 'new', 'get', 'also', 'writes', 'article'])
punctuation_chars = list(string.punctuation)

print()
print("Tokenize")
tokenized = list(map(cleanup, dataset.data))

print()
print("Dictionary")
dictionary = corpora.Dictionary(tokenized)
# dictionary.filter_extremes(no_below=5, no_above=0.90)
print(dictionary)

print()
print("TfIdf")
corpus = [dictionary.doc2bow(text) for text in tokenized]
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]

print()
print("LDA")
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=9, alpha='auto', iterations = 150)
corpus_lda = lda[corpus]
lda.print_topics(9, num_words = 15)




Tokenize

Dictionary
Dictionary(38934 unique tokens: ['stole', 'biting', 'koreshians', 'mmkusoo', 'favored']...)

TfIdf

LDA


[(0,
  '0.006*lines + 0.006*organization + 0.005*subject + 0.004*well + 0.004*people + 0.004*year + 0.003*think + 0.003*right + 0.003*good + 0.003*better + 0.003*time + 0.003*first + 0.003*even + 0.003*gun + 0.003*last'),
 (1,
  '0.008*organization + 0.008*lines + 0.008*subject + 0.005*nntppostinghost + 0.004*people + 0.004*university + 0.003*think + 0.003*apr + 0.003*good + 0.003*could + 0.002*nasa + 0.002*distribution + 0.002*know + 0.002*two + 0.002*well'),
 (2,
  '0.006*subject + 0.006*lines + 0.005*organization + 0.004*team + 0.003*cubs + 0.003*nntppostinghost + 0.003*well + 0.003*ibm + 0.003*good + 0.003*first + 0.003*university + 0.003*think + 0.003*year + 0.002*last + 0.002*games'),
 (3,
  '0.006*lines + 0.006*subject + 0.005*organization + 0.004*file + 0.003*use + 0.003*university + 0.003*research + 0.002*nntppostinghost + 0.002*know + 0.002*image + 0.002*need + 0.002*program + 0.002*number + 0.002*apr + 0.002*bike'),
 (4,
  '0.006*graphics + 0.005*lines + 0.005*subject + 0.00

In [69]:
import pyLDAvis.gensim

import matplotlib.pyplot as plt
%matplotlib inline

ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(ldavis)

  inline backend."""
  'retina', 'jpeg', 'svg', 'pdf'.""")
  use `figure_formats` instead)""")
  """
  """)
  def _config_changed(self, name, old, new):
