#### Importing all required libraries:

In [19]:
from sklearn.datasets import fetch_20newsgroups

In [29]:
# pip install gensim

In [22]:
import nltk

In [24]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

unable to import 'smart_open.gcs', disabling that module


In [25]:
doc_sample = 'This disk has failed many times. I would like to get it replaced.'

#### We will tokenize the above doc_sample in two ways:

In [26]:
gensim.utils.simple_preprocess(doc_sample)

['this',
 'disk',
 'has',
 'failed',
 'many',
 'times',
 'would',
 'like',
 'to',
 'get',
 'it',
 'replaced']

In [28]:
nltk.word_tokenize(doc_sample)

['This',
 'disk',
 'has',
 'failed',
 'many',
 'times',
 '.',
 'I',
 'would',
 'like',
 'to',
 'get',
 'it',
 'replaced',
 '.']

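#### We can see from the above results that gensim.utils.simple_preprocess(text) lowercases the text and drops punctuation and the one-letter word 'I', whereas nltk.word_tokenize() keeps the original case and returns punctuation as separate tokens. This makes simple_preprocess the more convenient tokenizer for our purposes.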

#### This is how WordNetLemmatizer() works:

In [30]:
print(WordNetLemmatizer().lemmatize('went', pos = 'v')) # past tense to present tense

go


#### This is how SnowballStemmer works:

In [31]:
import pandas as pd

# Run a handful of inflected / derived English words through the Snowball
# stemmer and show each word next to its stem.
stemmer = SnowballStemmer("english")
original_words = [
    'caresses', 'flies', 'dies', 'mules', 'denied',
    'died', 'agreed', 'owned', 'humbled', 'sized',
    'meeting', 'stating', 'siezing', 'itemization', 'sensational',
    'traditional', 'reference', 'colonizer', 'plotted',
]
singles = list(map(stemmer.stem, original_words))

pd.DataFrame(data={'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


#### gensim.parsing.preprocessing.STOPWORDS is a 'frozenset' of stopwords:
#### (A frozenset is a set whose elements cannot be changed once it has been created)

In [32]:
 gensim.parsing.preprocessing.STOPWORDS

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

#### Now we will write a function to perform the preprocessing of the entire text:
#### Steps:
#### First split the input string into tokens using gensim.utils.simple_preprocess
#### Then for every token in the above list of tokens:
#### Skip the token if it belongs to the STOPWORDS frozenset or is 3 characters or shorter; otherwise lemmatize it (as a verb), stem it, and append it to the result list

In [33]:
stemmer = SnowballStemmer("english")
# Created once and reused, rather than constructing a new WordNetLemmatizer
# for every single token that gets normalized.
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatize_(word):
    """Normalize a single word by lemmatizing and then stemming it.

    Args:
        word: the token to normalize.

    Returns:
        The English Snowball stem of the WordNet lemma of ``word``, where the
        lemma is looked up treating the word as a verb (``pos='v'``).
    """
    return stemmer.stem(wordnet_lemmatizer.lemmatize(word, pos='v'))

def preprocess(text, min_token_length=4):
    """Tokenize ``text`` and return its filtered, normalized tokens.

    The text is split with ``gensim.utils.simple_preprocess``. Tokens that are
    gensim stopwords or shorter than ``min_token_length`` characters are
    dropped; every remaining token is passed through ``lemmatize_``.

    Args:
        text: the raw document as a string.
        min_token_length: minimum number of characters a token must have to be
            kept. The default of 4 reproduces the previously hard-coded
            ``len(token) > 3`` filter.

    Returns:
        A list of normalized tokens in their original order, e.g.
        ``['disk', 'fail', 'time', 'like', 'replac']`` for
        ``'This disk has failed many times. I would like to get it replaced.'``.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) >= min_token_length
    ]

#### Testing the above function on a sample piece of text:

In [35]:
doc_sample #this is defined above

'This disk has failed many times. I would like to get it replaced.'

In [36]:
doc_sample.split()

['This',
 'disk',
 'has',
 'failed',
 'many',
 'times.',
 'I',
 'would',
 'like',
 'to',
 'get',
 'it',
 'replaced.']

In [37]:
preprocess(doc_sample)

['disk', 'fail', 'time', 'like', 'replac']

#### Let's load the dataset:

In [38]:
# Load the 'train' and 'test' subsets of the 20 Newsgroups corpus, each with shuffling enabled.
# NOTE(review): presumably the data is downloaded and cached on first use -- confirm against the scikit-learn docs.
newsgroups_train = fetch_20newsgroups(subset='train', shuffle = True)
newsgroups_test = fetch_20newsgroups(subset='test', shuffle = True)

In [42]:
type(newsgroups_train)

sklearn.utils.Bunch

In [41]:
# Preview only the first two raw posts; displaying the whole list would dump
# every training document into the notebook output.
newsgroups_train['data'][:2]

["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n",
 "From: guykuo@carson.u.washington.edu (Guy Kuo)\nSubject: SI Clock Poll - Final Call\nSummary: Final call for SI clock reports\nKeywords: SI,acceleration,clock,upgrade\nArticle-I.D.: shelley.1qvfo9INNc3s\nOrganization: University of Washington\nLines: 

In [46]:
# Run every training document through preprocess(); the result is a list with
# one list of normalized tokens per document.
preprocessed_docs = [preprocess(doc) for doc in newsgroups_train['data']]

# Preview only the first two documents instead of dumping the entire corpus.
preprocessed_docs[:2]

[['lerxst',
  'thing',
  'subject',
  'nntp',
  'post',
  'host',
  'organ',
  'univers',
  'maryland',
  'colleg',
  'park',
  'line',
  'wonder',
  'enlighten',
  'door',
  'sport',
  'look',
  'late',
  'earli',
  'call',
  'bricklin',
  'door',
  'small',
  'addit',
  'bumper',
  'separ',
  'rest',
  'bodi',
  'know',
  'tellm',
  'model',
  'engin',
  'spec',
  'year',
  'product',
  'histori',
  'info',
  'funki',
  'look',
  'mail',
  'thank',
  'bring',
  'neighborhood',
  'lerxst'],
 ['guykuo',
  'carson',
  'washington',
  'subject',
  'clock',
  'poll',
  'final',
  'summari',
  'final',
  'clock',
  'report',
  'keyword',
  'acceler',
  'clock',
  'upgrad',
  'articl',
  'shelley',
  'qvfo',
  'innc',
  'organ',
  'univers',
  'washington',
  'line',
  'nntp',
  'post',
  'host',
  'carson',
  'washington',
  'fair',
  'number',
  'brave',
  'soul',
  'upgrad',
  'clock',
  'oscil',
  'share',
  'experi',
  'poll',
  'send',
  'brief',
  'messag',
  'detail',
  'experi',
  

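#### Next step is to create a bag of words of the preprocessed docs
#### A bag of words records, for each document, which words it contains and how many times each word occurs in that document
#### To build it we first create a gensim Dictionary, which assigns an integer id to every unique word in the preprocessed docs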

In [49]:
dictionary=gensim.corpora.Dictionary(preprocessed_docs) #dictionary is an object

In [50]:
dictionary

<gensim.corpora.dictionary.Dictionary at 0x14d62830>

In [51]:
# Print the first 11 (id, token) pairs of the dictionary.
# The standard mapping method items() is used rather than the Python-2-style
# iteritems(), and enumerate() replaces the hand-maintained counter.
for position, (k, v) in enumerate(dictionary.items()):
    if position > 10:
        break
    print(k, v)

0 addit
1 bodi
2 bricklin
3 bring
4 bumper
5 call
6 colleg
7 door
8 earli
9 engin
10 enlighten


In [52]:

'''
OPTIONAL STEP
Remove very rare and very common tokens from the dictionary:

- tokens appearing in fewer than 15 documents (no_below=15)
- tokens appearing in more than 10% of all documents (no_above=0.1)
- of the tokens that remain, keep at most the 100000 most frequent (keep_n=100000)
'''
# NOTE: the call result is not assigned, so `dictionary` itself is modified in
# place; the bag-of-words corpus built afterwards uses the filtered dictionary.
dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [55]:
'''
Build the bag-of-words representation of every preprocessed document: each document
is converted with dictionary.doc2bow and the results are collected, in document
order, in 'bow_corpus'
'''
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))

In [60]:
dictionary[bow_corpus[0][0][0]]

'addit'

#### bow_corpus is a list with one entry per document; each entry is a list of tuples
#### Each tuple consists of: (key, frequency of occurrence)
#### Where key is the id of the word in the dictionary created above

In [56]:
# Preview the bag-of-words of the first three documents only; displaying the
# whole corpus would flood the notebook output.
bow_corpus[:3]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 2),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1)],
 [(24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 2),
  (33, 5),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 2),
  (39, 1),
  (40, 2),
  (41, 2),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 3),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 2),
  (62, 1),
  (63, 1),
  (64, 3),
  (65, 1),
  (66, 4)],
 [(8, 2),
  (11, 2),
  (23, 1),
  (26, 1),
  (36, 2),
  (40, 2),
  (43, 1),
  (49, 1),
  (63, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 3),
  (72, 1),
  (73, 1),
  (74, 2),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 3),
  (81, 1),
  (82, 1

In [57]:
'''
Show the bag-of-words entries of one sample preprocessed document:
the id of each word, the word itself, and how often it occurs
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each entry is an (id, frequency) pair, so unpack it directly.
for word_id, frequency in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(word_id, dictionary[word_id], frequency))

Word 18 ("rest") appears 1 time.
Word 166 ("clear") appears 1 time.
Word 336 ("refer") appears 1 time.
Word 350 ("true") appears 1 time.
Word 391 ("technolog") appears 1 time.
Word 437 ("christian") appears 1 time.
Word 453 ("exampl") appears 1 time.
Word 476 ("jew") appears 1 time.
Word 480 ("lead") appears 1 time.
Word 482 ("littl") appears 3 time.
Word 520 ("wors") appears 2 time.
Word 721 ("keith") appears 3 time.
Word 732 ("punish") appears 1 time.
Word 803 ("california") appears 1 time.
Word 859 ("institut") appears 1 time.
Word 917 ("similar") appears 1 time.
Word 990 ("allan") appears 1 time.
Word 991 ("anti") appears 1 time.
Word 992 ("arriv") appears 1 time.
Word 993 ("austria") appears 1 time.
Word 994 ("caltech") appears 2 time.
Word 995 ("distinguish") appears 1 time.
Word 996 ("german") appears 1 time.
Word 997 ("germani") appears 3 time.
Word 998 ("hitler") appears 1 time.
Word 999 ("livesey") appears 2 time.
Word 1000 ("motto") appears 2 time.
Word 1001 ("order") appear

In [58]:

# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# (kept in sync with the hyper-parameters of the multicore call below)
# lda_model = gensim.models.LdaModel(bow_corpus,
#                                    num_topics = 8,
#                                    id2word = dictionary,
#                                    passes = 10,
#                                    random_state = 400)

# LDA multicore
'''
Train the LDA model with gensim.models.LdaMulticore and store it in 'lda_model'
'''
# random_state pins the seed for this call (same value as the np.random.seed(400)
# call in the imports cell), so re-running this cell does not depend on whatever
# the global NumPy RNG state happens to be at that point.
# NOTE(review): with several workers the result may still vary slightly between runs -- confirm.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics = 8,
                                       id2word = dictionary,
                                       passes = 10,
                                       workers = 2,
                                       random_state = 400)

In [59]:

'''
Print every topic of the trained model together with its listed words and their relative weights
'''
for topic_id, topic_terms in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, topic_terms))
    print("\n")

Topic: 0 
Words: 0.008*"bike" + 0.005*"game" + 0.005*"team" + 0.004*"run" + 0.004*"virginia" + 0.004*"player" + 0.004*"play" + 0.004*"homosexu" + 0.003*"pitch" + 0.003*"motorcycl"


Topic: 1 
Words: 0.009*"govern" + 0.007*"armenian" + 0.006*"israel" + 0.005*"kill" + 0.005*"isra" + 0.004*"american" + 0.004*"turkish" + 0.004*"countri" + 0.004*"weapon" + 0.004*"live"


Topic: 2 
Words: 0.016*"game" + 0.014*"team" + 0.011*"play" + 0.008*"hockey" + 0.008*"player" + 0.005*"season" + 0.005*"canada" + 0.004*"leagu" + 0.004*"score" + 0.004*"toronto"


Topic: 3 
Words: 0.010*"card" + 0.010*"window" + 0.008*"driver" + 0.007*"sale" + 0.006*"price" + 0.005*"speed" + 0.005*"appl" + 0.005*"monitor" + 0.005*"video" + 0.005*"drive"


Topic: 4 
Words: 0.014*"file" + 0.010*"program" + 0.009*"window" + 0.006*"encrypt" + 0.006*"chip" + 0.006*"data" + 0.006*"imag" + 0.006*"avail" + 0.005*"version" + 0.004*"code"


Topic: 5 
Words: 0.012*"space" + 0.009*"nasa" + 0.006*"scienc" + 0.005*"orbit" + 0.004*"resear