# 1. Load the 20newsgroups dataset

In [63]:
# Import libraries and load data

import numpy as np
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
from gensim import corpora, models, similarities

from nltk import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

import string

# Alternative selection, kept for reference:
# categories   = ['sci.space','comp.graphics', 'sci.med', 'rec.motorcycles', 'rec.sport.baseball']

# Newsgroups used throughout this notebook
categories   = ['talk.politics.guns','comp.graphics', 'sci.med', 'rec.motorcycles', 'rec.sport.baseball']


# Training subset restricted to the categories above, shuffled with a fixed seed
dataset = fetch_20newsgroups(subset='train',  categories=categories, shuffle=True, random_state=42)

# Count the documents per label once, instead of rebuilding the Counter
# on every iteration of the loop; label i corresponds to target_names[i].
label_counts = Counter(dataset.target)
for label, name in enumerate(dataset.target_names):
    print(" %s has %d texts "% (name, label_counts[label]))


 comp.graphics has 584 texts 
 rec.motorcycles has 598 texts 
 rec.sport.baseball has 597 texts 
 sci.med has 594 texts 
 talk.politics.guns has 546 texts 


# 2. clean up the data

The `cleanup` function below
* lowercases the text
* replaces `@` and `.` with spaces and removes punctuation
* removes numbers
* tokenizes
* removes stop words
* removes words with fewer than 3 letters


In [73]:
# English stop words from the NLTK corpus, held in a set
stop = {*stopwords.words('english')}


In [76]:
# English stop words from NLTK, extended with the extra tokens to discard
stop = set(stopwords.words('english')) | {'edu', 'com'}

# Every ASCII punctuation character, one list entry per character
punctuation_chars = [*string.punctuation]

def cleanup(raw):
    """Turn one raw document into a list of cleaned-up tokens.

    Steps, in order: lowercase; replace '@' and '.' with spaces; delete the
    characters listed in the module-level ``punctuation_chars``; delete runs
    of digits; tokenize with ``word_tokenize``; drop tokens present in the
    module-level ``stop`` set; drop tokens shorter than 3 characters.

    Parameters
    ----------
    raw : str
        Text of a single document.

    Returns
    -------
    list of str
        The remaining tokens, in document order.
    """
    # lowercase
    text = raw.lower()
    # @ and dot in emails => keep recipients and domain names as words
    text = re.sub(r'[@.]', ' ', text)
    # punctuation: a single str.translate pass instead of testing every
    # character of the document against a list
    text = text.translate(str.maketrans('', '', ''.join(punctuation_chars)))
    # numbers
    text = re.sub(r'[0-9]+', '', text)
    # NOTE(review): punctuation is stripped before the stop-word filter, so a
    # stop-word entry that itself contains punctuation (e.g. an apostrophe)
    # can never match a token here.
    # tokenize, then keep tokens that are not stop words and have at least 3 letters
    return [w for w in word_tokenize(text) if w not in stop and len(w) > 2]

# Run cleanup over every document of the dataset
tokenized = list(map(cleanup, dataset.data))

# equivalent list-comprehension form:
# tokenized = [cleanup(raw) for raw in dataset.data]



AttributeError: 'list' object has no attribute 'shape'

In [75]:
# Preview the first two tokenized documents instead of dumping the whole corpus
tokenized[:2]

['hello', 'world', 'earth', 'alex', 'gmail']

In [77]:
# tokenized[0:2]


# 3. Gensim dictionary

Lists all the words and assigns an id to each word

https://radimrehurek.com/gensim/corpora/dictionary.html

* doc2bow: for a given tokenized document, counts the occurrences of each word
* filter_extremes(no_below=5, no_above=0.5, keep_n=100000): removes tokens that are too rare or too frequent


In [78]:
# Dictionary: assigns an integer id to every distinct token of the corpus
dictionary = corpora.Dictionary(tokenized)
# dictionary.save('20newgroup.dict')  # store the dictionary, for future reference

# vocabulary before filtering (the repr shows the number of unique tokens)
print(dictionary)

# each token has a unique id
# print(dictionary.token2id)

# prune the dictionary in place: drop tokens that appear in fewer than
# 2 documents or in more than 99% of the documents
dictionary.filter_extremes(no_below=2, no_above=0.99)

print("after filtering rare words", dictionary, sep="\n")


Dictionary(38943 unique tokens: ['stole', 'biting', 'koreshians', 'mmkusoo', 'favored']...)
after filtering rare words
Dictionary(19550 unique tokens: ['stole', 'biting', 'koreshians', 'floggings', 'attacking']...)


# 4. Corpus

1. Create the document term matrix
2. TfIdf



In [80]:
# 4.1 Document term matrix: one bag-of-words list of (token id, count) per document

corpus = list(map(dictionary.doc2bow, tokenized))

# store to disk, for later use
# corpora.MmCorpus.serialize('20newsgroup.mm', corpus)

print("Document term matrix - 2 documents\n")
for bow in corpus[0:2]:
    print(bow)

print('---------------')
# 4.2 Tf-Idf

# fit the TF-IDF model on the bag-of-words corpus, then apply it to that same corpus
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

print("Document term TF-Idf matrix - 2 documents\n")
for weighted_doc in corpus_tfidf[0:2]:
    print(weighted_doc)


Document term matrix - 2 documents

[(88, 1), (459, 1), (892, 2), (1508, 2), (1991, 1), (2172, 1), (3852, 1), (4482, 1), (4789, 1), (5515, 1), (5747, 1), (6013, 1), (6597, 2), (6653, 1), (7438, 1), (7659, 1), (8045, 1), (8354, 1), (9328, 2), (11084, 1), (12092, 1), (12102, 1), (12466, 2), (13347, 1), (13659, 1), (13944, 1), (14299, 1), (15245, 1), (15869, 1), (16078, 2), (16894, 1), (17567, 1), (18654, 1)]
[(150, 1), (329, 1), (459, 1), (686, 1), (873, 1), (1280, 1), (1545, 1), (1610, 1), (2479, 1), (2746, 1), (3598, 1), (3718, 2), (4330, 1), (4582, 4), (4917, 1), (5064, 1), (6113, 2), (6242, 1), (6977, 1), (7191, 2), (7659, 1), (7664, 1), (7712, 2), (7731, 1), (7831, 1), (8060, 1), (8354, 1), (8654, 1), (9179, 1), (9299, 1), (9473, 2), (9683, 1), (9822, 1), (10275, 1), (10829, 1), (11065, 1), (11157, 1), (11659, 1), (11836, 2), (11966, 1), (12472, 1), (12697, 1), (12700, 1), (12713, 1), (13294, 1), (13770, 1), (13969, 1), (13977, 1), (14086, 2), (15759, 1), (16046, 1), (16088, 1), (16

# 5. LSA / LSI

In [81]:
# fit an LSI transformation with 15 topics on the TF-IDF corpus
lsi = models.LsiModel(corpus_tfidf, num_topics=15, id2word=dictionary)
# project the TF-IDF corpus into the LSI topic space
corpus_lsi = lsi[corpus_tfidf]

# show 10 topics, 10 words each
lsi.print_topics(num_topics=10, num_words=10)


[(0,
  '0.107*"would" + 0.098*"gun" + 0.096*"people" + 0.093*"pitt" + 0.092*"dont" + 0.090*"one" + 0.082*"like" + 0.080*"think" + 0.079*"get" + 0.079*"banks"'),
 (1,
  '-0.413*"geb" + -0.401*"banks" + -0.395*"gordon" + -0.383*"pitt" + -0.145*"dsl" + -0.145*"shameful" + -0.145*"njxp" + -0.145*"chastity" + -0.144*"cadre" + -0.144*"skepticism"'),
 (2,
  '-0.205*"stratus" + -0.179*"gun" + 0.150*"team" + -0.125*"cdt" + -0.121*"guns" + 0.120*"baseball" + 0.118*"players" + 0.116*"jewish" + 0.115*"game" + 0.115*"games"'),
 (3,
  '-0.287*"msg" + 0.158*"stratus" + -0.132*"graphics" + -0.112*"bike" + -0.111*"food" + 0.109*"team" + 0.097*"year" + 0.096*"cdt" + -0.092*"image" + 0.091*"roby"'),
 (4,
  '0.604*"msg" + 0.237*"food" + 0.147*"sensitivity" + 0.143*"chinese" + 0.141*"dyer" + 0.124*"superstition" + -0.106*"bike" + -0.100*"graphics" + 0.087*"glutamate" + 0.070*"restaurant"'),
 (5,
  '-0.306*"stratus" + 0.266*"gun" + -0.185*"cdt" + 0.169*"guns" + -0.148*"roby" + -0.132*"udel" + -0.108*"fbi" +

In [82]:
# (topic id, weight) pairs of the first two documents, each preceded by a blank line
for doc_topics in corpus_lsi[0:2]:
    print("", doc_topics, sep="\n")
    


[(0, 0.031013269373317869), (1, 0.0031047534489101716), (2, 0.0056420173566220696), (3, -0.017235817978922926), (4, -0.01922738193925437), (5, 0.0039297795940552629), (6, -0.065209400479206717), (7, 0.0034089220372552219), (8, -0.01438172370065431), (9, 0.063910868518286743), (10, 0.03209516414150114), (11, 0.013149872397838869), (12, 0.011412402259838268), (13, 0.0053663091064188031), (14, 0.0096786610484557462)]

[(0, 0.11753025932963228), (1, 0.020651623938426126), (2, 0.026695616121991259), (3, -0.006520254963897373), (4, -0.015265482087941054), (5, -0.0061062783332224467), (6, -0.038997657667766195), (7, 0.0070042924346285229), (8, 0.018043204863352705), (9, 0.013751179021428148), (10, -0.082878727671468574), (11, 0.044854895551579503), (12, -0.0043921949217883179), (13, -0.017049232997016734), (14, -0.016465101326741519)]


# 6. LDA


In [83]:
# Fit LDA with 5 topics on the raw bag-of-words counts (not on the TF-IDF corpus).
# LDA training is stochastic: random_state pins the seed so that re-running the
# notebook yields the same topics.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=5, alpha='auto', iterations = 250, random_state=42)

# per-document topic distribution
corpus_lda = lda[corpus]

# show the 5 topics, 15 words each
lda.print_topics(5, num_words = 15)

# for doc in corpus_lda:
#     print(doc)


[(0,
  '0.007*organization + 0.005*one + 0.005*like + 0.004*writes + 0.004*article + 0.004*know + 0.003*would + 0.003*get + 0.003*nntppostinghost + 0.003*also + 0.003*dont + 0.003*time + 0.003*university + 0.002*people + 0.002*good'),
 (1,
  '0.006*organization + 0.005*article + 0.004*would + 0.004*writes + 0.004*one + 0.004*university + 0.003*people + 0.003*data + 0.003*image + 0.003*nntppostinghost + 0.003*new + 0.003*know + 0.003*use + 0.003*apr + 0.003*get'),
 (2,
  '0.006*organization + 0.006*writes + 0.006*would + 0.005*article + 0.005*like + 0.005*one + 0.004*dont + 0.004*get + 0.004*well + 0.003*good + 0.003*people + 0.003*file + 0.003*bike + 0.003*apr + 0.003*dod'),
 (3,
  '0.007*organization + 0.005*would + 0.005*one + 0.005*writes + 0.004*university + 0.004*article + 0.004*dont + 0.004*nntppostinghost + 0.004*think + 0.003*file + 0.003*know + 0.003*gun + 0.003*like + 0.003*year + 0.002*well'),
 (4,
  '0.007*organization + 0.006*would + 0.006*writes + 0.005*article + 0.005*on

# 7. refine

* add stop words
* remove high frequency words



In [84]:
# Add frequent but uninformative words to the stop-word set (updated in place),
# then rebuild the whole pipeline: tokens -> dictionary -> corpus -> TF-IDF -> LDA.
stop.update(['like', 'dont', 'one', 'would', 'new', 'get', 'also', 'writes', 'article'])
punctuation_chars = list(string.punctuation)

print()
print("Tokenize")
tokenized = list(map(cleanup, dataset.data))

print()
print("Dictionary")
dictionary = corpora.Dictionary(tokenized)
# NOTE(review): the frequency filter is disabled here, so very common tokens
# stay in the vocabulary; re-enable the line below to drop them.
# dictionary.filter_extremes(no_below=5, no_above=0.90)
print(dictionary)

print()
print("TfIdf")
corpus = [dictionary.doc2bow(text) for text in tokenized]
tfidf = models.TfidfModel(corpus) # fit the TF-IDF model on the bag-of-words corpus
corpus_tfidf = tfidf[corpus]

print()
print("LDA")
# LDA is trained on the raw counts (corpus), not on corpus_tfidf.
# random_state pins the seed so that re-running the cell yields the same topics.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=9, alpha='auto', iterations = 150, random_state=42)
corpus_lda = lda[corpus]
lda.print_topics(9, num_words = 15)




Tokenize

Dictionary
Dictionary(38934 unique tokens: ['stole', 'biting', 'koreshians', 'mmkusoo', 'favored']...)

TfIdf

LDA


[(0,
  '0.007*subject + 0.006*lines + 0.006*organization + 0.004*image + 0.004*university + 0.004*graphics + 0.003*email + 0.003*nntppostinghost + 0.003*program + 0.003*nazi + 0.002*nasa + 0.002*use + 0.002*know + 0.002*data + 0.002*file'),
 (1,
  '0.006*subject + 0.006*organization + 0.006*lines + 0.005*well + 0.004*year + 0.004*good + 0.003*people + 0.003*last + 0.003*know + 0.003*right + 0.003*first + 0.003*nntppostinghost + 0.003*dod + 0.003*gun + 0.003*game'),
 (2,
  '0.006*organization + 0.006*subject + 0.006*lines + 0.003*university + 0.003*know + 0.003*nntppostinghost + 0.003*may + 0.003*even + 0.002*think + 0.002*two + 0.002*points + 0.002*firearms + 0.002*gun + 0.002*use + 0.002*time'),
 (3,
  '0.010*organization + 0.009*lines + 0.009*subject + 0.004*nntppostinghost + 0.004*apr + 0.003*pitt + 0.003*university + 0.003*gordon + 0.003*know + 0.003*dod + 0.003*geb + 0.003*bnr + 0.003*good + 0.003*much + 0.002*banks'),
 (4,
  '0.006*lines + 0.006*subject + 0.005*organization + 0.0

In [85]:
import pyLDAvis.gensim

import matplotlib.pyplot as plt
%matplotlib inline

# Build the pyLDAvis data from the fitted LDA model, its bag-of-words corpus and the dictionary
ldavis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
# Render the prepared visualization in the notebook output
pyLDAvis.display(ldavis)