In [1]:
from sklearn.datasets import fetch_20newsgroups
# Load the predefined train and test splits of the 20 Newsgroups corpus
# (the data is downloaded on first use, as the log output below shows).
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [2]:
# List the category (newsgroup) names available in the training split.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
# Sanity check: the label array and the filename array have the same length.
newsgroups_train.target.shape, newsgroups_train.filenames.shape

((11314,), (11314,))

In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random number generator up front.
# NOTE(review): presumably intended to make the later model training repeatable — confirm.
np.random.seed(400)


In [12]:
import nltk
# Download the WordNet corpus that WordNetLemmatizer relies on.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/allen/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [13]:
# Lemmatizer example: with pos='v' the word is treated as a verb, so the
# irregular past tense 'went' is mapped back to its base form.
print(WordNetLemmatizer().lemmatize('went', pos = 'v'))

go


In [16]:
# Stemmer Example
import pandas as pd
# Stem a handful of sample words with the English Snowball stemmer and show
# each word next to its stem.
stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed',
    'owned', 'humbled', 'sized', 'meeting', 'stating', 'siezing',
    'itemization', 'sensational', 'traditional', 'reference', 'colonizer',
    'plotted',
]
singles = list(map(stemmer.stem, original_words))

pd.DataFrame({'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [17]:
# stemmer example 2
from nltk.stem import PorterStemmer
# Same comparison using the Porter stemmer; `singles` is rebound to the new stems.
ps = PorterStemmer()
singles = list(map(ps.stem, original_words))
pd.DataFrame({'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [18]:
# Pre-processing helpers that are applied to every document in the dataset.

# Build the NLTK helpers once instead of once per token, and keep them local to
# this cell so the function does not depend on the `stemmer` variable that is
# only created in the stemmer example cell.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = SnowballStemmer("english")


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (pos='v'); the resulting lemma is
    then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        One token (a single word), not a whole document.

    Returns
    -------
    str
        The stemmed lemma, e.g. 'failed' -> 'fail'.
    """
    return _STEMMER.stem(_LEMMATIZER.lemmatize(text, pos='v'))

# Tokenize and lemmatize
def preprocess(text, min_token_length=4):
    """Turn a raw document into a list of normalised tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stopwords, or that are shorter than ``min_token_length``
    characters, are dropped; every remaining token is passed through
    ``lemmatize_stemming``. The length check is applied to the token as
    produced by the tokenizer, i.e. before lemmatizing/stemming.

    Parameters
    ----------
    text : str
        Raw document text.
    min_token_length : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 4 reproduces the original ``len(token) > 3`` rule.

    Returns
    -------
    list of str
        Normalised tokens in their original order (duplicates are kept).
    """
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in gensim.parsing.preprocessing.STOPWORDS
        and len(token) >= min_token_length
    ]


In [19]:
# Preview a document before and after preprocessing.
# The sample is a literal sentence, so no document index is needed here.
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

print("Original document: ")
# Plain split on single spaces, shown only as a contrast to the processed tokens.
words = doc_sample.split(' ')
print(words)
print("\n\nTokenized and lemmatized document: ")
print(preprocess(doc_sample))

Original document: 
['This', 'disk', 'has', 'failed', 'many', 'times.', 'I', 'would', 'like', 'to', 'get', 'it', 'replaced.']


Tokenized and lemmatized document: 
['disk', 'fail', 'time', 'like', 'replac']


In [20]:
# Run the preprocessing step over every training document,
# producing one token list per document (same order as the input).
processed_docs = [preprocess(doc) for doc in newsgroups_train.data]

In [21]:
# Peek at the token lists of the first three processed documents.
print(processed_docs[:3])

[['lerxst', 'thing', 'subject', 'nntp', 'post', 'host', 'organ', 'univers', 'maryland', 'colleg', 'park', 'line', 'wonder', 'enlighten', 'door', 'sport', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'small', 'addit', 'bumper', 'separ', 'rest', 'bodi', 'know', 'tellm', 'model', 'engin', 'spec', 'year', 'product', 'histori', 'info', 'funki', 'look', 'mail', 'thank', 'bring', 'neighborhood', 'lerxst'], ['guykuo', 'carson', 'washington', 'subject', 'clock', 'poll', 'final', 'summari', 'final', 'clock', 'report', 'keyword', 'acceler', 'clock', 'upgrad', 'articl', 'shelley', 'qvfo', 'innc', 'organ', 'univers', 'washington', 'line', 'nntp', 'post', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'soul', 'upgrad', 'clock', 'oscil', 'share', 'experi', 'poll', 'send', 'brief', 'messag', 'detail', 'experi', 'procedur', 'speed', 'attain', 'rat', 'speed', 'card', 'adapt', 'heat', 'sink', 'hour', 'usag', 'floppi', 'disk', 'function', 'floppi', 'especi', 'request', 'summar', 'day',

In [22]:
# Build the vocabulary (token <-> integer id mapping) from all processed training documents.
dictionary = gensim.corpora.Dictionary(processed_docs)


In [23]:
# Prune the vocabulary in place. Per the gensim API: keep tokens that appear in at
# least 15 documents and in no more than 10% of all documents, then keep at most
# the 100000 most frequent of those.
# NOTE(review): this mutates `dictionary`; re-run the cell that builds the
# dictionary before re-running this one.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)


In [24]:
# Encode every document as a bag of words: a list of (token_id, count) pairs.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]


In [34]:
# Show the bag-of-words entries of one document: token id, token text and count.
document_num = 0
bow_doc_x = bow_corpus[document_num]

for token_id, token_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 0 ("addit") appears 1 time.
Word 1 ("bodi") appears 1 time.
Word 2 ("bring") appears 1 time.
Word 3 ("bumper") appears 1 time.
Word 4 ("call") appears 1 time.
Word 5 ("colleg") appears 1 time.
Word 6 ("door") appears 2 time.
Word 7 ("earli") appears 1 time.
Word 8 ("engin") appears 1 time.
Word 9 ("enlighten") appears 1 time.
Word 10 ("histori") appears 1 time.
Word 11 ("info") appears 1 time.
Word 12 ("late") appears 1 time.
Word 13 ("maryland") appears 1 time.
Word 14 ("model") appears 1 time.
Word 15 ("neighborhood") appears 1 time.
Word 16 ("park") appears 1 time.
Word 17 ("product") appears 1 time.
Word 18 ("rest") appears 1 time.
Word 19 ("separ") appears 1 time.
Word 20 ("small") appears 1 time.
Word 21 ("spec") appears 1 time.
Word 22 ("sport") appears 1 time.
Word 23 ("wonder") appears 1 time.


In [32]:
# Train an 8-topic LDA model on the bag-of-words corpus, making 10 passes over
# the corpus with 2 worker processes.
# random_state pins the model's own random generator, so the learned topics do
# not depend on how much of NumPy's global random stream earlier cells consumed
# or on the order in which cells were executed.
# NOTE(review): with several workers, run-to-run results may still differ
# slightly — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=8,
    id2word=dictionary,
    passes=10,
    workers=2,
    random_state=400,  # same value the notebook passes to np.random.seed
)

In [36]:
# For each topic, show the words occurring in that topic and their relative weights.
topic_template = "Topic: {}  \nWords: {}"
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))
    print("\n")

Topic: 0 , alt.atheism 
Words: 0.007*"bike" + 0.006*"game" + 0.005*"team" + 0.004*"run" + 0.004*"player" + 0.004*"virginia" + 0.004*"play" + 0.004*"pitch" + 0.004*"homosexu" + 0.003*"defens"


Topic: 1 , comp.graphics 
Words: 0.009*"govern" + 0.007*"armenian" + 0.005*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"countri" + 0.004*"weapon" + 0.004*"live"


Topic: 2 , comp.os.ms-windows.misc 
Words: 0.016*"game" + 0.014*"team" + 0.011*"play" + 0.009*"hockey" + 0.008*"player" + 0.005*"canada" + 0.005*"season" + 0.004*"leagu" + 0.004*"andrew" + 0.004*"score"


Topic: 3 , comp.sys.ibm.pc.hardware 
Words: 0.010*"card" + 0.010*"window" + 0.008*"driver" + 0.007*"sale" + 0.006*"price" + 0.005*"speed" + 0.005*"appl" + 0.005*"monitor" + 0.005*"video" + 0.005*"engin"


Topic: 4 , comp.sys.mac.hardware 
Words: 0.014*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"imag" + 0.006*"data" + 0.006*"avail" + 0.005*"version" + 0.004*

In [41]:
# Pick one document from the test split and show its raw text.
test_data = newsgroups_test.data[100]
print(test_data)

Subject: help
From: C..Doelle@p26.f3333.n106.z1.fidonet.org (C. Doelle)
Lines: 13

Hello All!

    It is my understanding that all True-Type fonts in Windows are loaded in
prior to starting Windows - this makes getting into Windows quite slow if you
have hundreds of them as I do.  First off, am I correct in this thinking -
secondly, if that is the case - can you get Windows to ignore them on boot and
maybe make something like a PIF file to load them only when you enter the
applications that need fonts?  Any ideas?


Chris

 * Origin: chris.doelle.@f3333.n106.z1.fidonet.org (1:106/3333.26)



In [42]:
# Preprocess the test document and encode it with the training dictionary;
# the result is a list of (token_id, count) pairs.
bow_vector = dictionary.doc2bow(preprocess(test_data))
bow_vector

[(98, 1),
 (175, 1),
 (189, 1),
 (228, 1),
 (237, 1),
 (259, 1),
 (284, 1),
 (307, 1),
 (350, 1),
 (515, 1),
 (727, 1),
 (746, 1),
 (766, 2),
 (971, 1),
 (988, 4),
 (1025, 1),
 (1072, 2),
 (1075, 2),
 (1095, 1),
 (1951, 1),
 (3114, 1),
 (3462, 1),
 (3983, 2)]

In [43]:
# Rank the topics inferred for the test document from highest to lowest score
# and print each one together with its five strongest words.
ranked_topics = sorted(lda_model[bow_vector], key=lambda pair: pair[1], reverse=True)
for index, score in ranked_topics:
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6057841777801514	 Topic: 0.010*"card" + 0.010*"window" + 0.008*"driver" + 0.007*"sale" + 0.006*"price"
Score: 0.3699764907360077	 Topic: 0.014*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip"


In [45]:
# Ground-truth label index of the same test document, for comparison with the inferred topics.
newsgroups_test.target[100]

2