In [1]:
import xmltodict
import glob
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from time import time


# <Document>
# <Id>en_2014-01-12_99ed1bbd8128fe754e1d37ef5a13384f59a2b49</Id>
# <SourceName>Arutz Sheva</SourceName>
# <CaptureDateTime>01 Dec 2014 00:00:00</CaptureDateTime>
# <PublicationDateTime>01 Dec 2014 00:00:00</PublicationDateTime>
# <Title>Atty-General to Seek Controversial Mayor’s Ouster</Title>
# <TranslatedTitle/>
# <SourceCoverage>ISRAEL</SourceCoverage>
# <Url>http://www.israelnationalnews.com/News/News.aspx/176185</Url>
# <Encoding>UTF-8</Encoding>
# <Language>English</Language><SourceType/>
# <RobotName>israelnat4048</RobotName>
# <Text>Document Test</Text>
# </Document>

def removeNonAscii(s):
    """Return `s` with every character whose code point is 128 or higher dropped."""
    kept = []
    for ch in s:
        # Keep only characters inside the 7-bit ASCII range.
        if ord(ch) < 128:
            kept.append(ch)
    return "".join(kept)

# Tokenizer that keeps runs of word characters (regex \w+).
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')
# Set copy of the stop words so every membership test is a hash lookup rather
# than a scan of the list; `en_stop` itself stays the original list.
en_stop_set = set(en_stop)
# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
start = time()
corpus = []
path = "/home/sonic/sonic/EOS_DATA/XML_Export_6-8-2015/ContainingTerms/English/Ninewa/*.xml"
# glob returns files in arbitrary order; sort so the corpus (and everything
# derived from it) is built in the same document order on every run.
for fname in sorted(glob.glob(path)):
    with open(fname) as fd:
        doc = xmltodict.parse(fd.read())
    # clean (remove non-ascii) and tokenize the lower-cased document text
    raw = removeNonAscii(doc['Document']['Text'].lower())
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [token for token in tokens if token not in en_stop_set]
    try:
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(token) for token in stopped_tokens]
    except Exception as err:
        # Best effort: a document whose stemming fails is skipped, but the
        # file and the error are reported so the drop can be investigated.
        # Catching Exception (not a bare except) lets Ctrl-C still interrupt.
        print("Caught it!", fname, repr(err))
        continue
    # keep purely alphabetic stems and add the document to the corpus
    corpus.append([word for word in stemmed_tokens if word.isalpha()])

# Clean data. Some tokenized words end up as just "s" as the word.
# For example baby's could be split into 'baby', and 's'.
# Each document is rebuilt rather than calling list.remove() while iterating:
# removing during iteration skips the following element, so back-to-back 's'
# tokens would be left behind.
corpus = [[word for word in document if word != 's'] for document in corpus]

print('corpus size:', len(corpus))

# One line per document (the list's repr); the `with` block guarantees the
# file is flushed and closed.
with open('data/eos_tokens_stopwords.txt', 'w') as tokens_after_lemmas_and_rm_stopwords:
    for item in corpus:
        tokens_after_lemmas_and_rm_stopwords.write("%s\n" % item)

print('Cell took %.2f seconds to run.' % (time() - start))

Caught it!
Caught it!


Caught it!


Caught it!


Caught it!


Caught it!


Caught it!


Caught it!
Caught it!


Caught it!


Caught it!


Caught it!


corpus size: 11714


Cell took 108.47 seconds to run.


In [2]:
# Build the id <-> term dictionary from the tokenized documents.
dictionary = corpora.Dictionary(corpus)

# Swap every tokenized document for its bag-of-words form (the document-term
# matrix). NOTE: `corpus` is rebound here, so this cell expects the tokenized
# corpus produced by the preprocessing cell as its input.
corpus = list(map(dictionary.doc2bow, corpus))

In [4]:
# generate LDA model
# (single-process alternative:
#  gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20))
start = time()
ldamodel = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=20,
    chunksize=1000,
    passes=20,
    workers=8,
    # Fixed seed so repeated runs start from the same random initialisation.
    # With several worker processes the result may still vary slightly from
    # run to run.
    random_state=42,
)
print('Cell took %.2f seconds to run.' % (time() - start))

Cell took 839.30 seconds to run.


In [5]:
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`;
# prefer the standalone joblib package and fall back to the vendored copy on
# older installations. Either way the name `joblib` is bound afterwards.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the id <-> term dictionary as a text file.
dictionary.save_as_text('data/dictionary_EOS.txt')

# Persist the bag-of-words corpus to disk.
corpora.MmCorpus.serialize('data/nostopwords_corpus_EOS.mm', corpus)

# Pickle the trained LDA model; as the cell's last expression, the list of
# written file names is displayed.
joblib.dump(ldamodel, 'data/ldamodel_20_EOS.pkl')

['data/ldamodel_20_EOS.pkl']

In [6]:
# Reload the persisted dictionary, bag-of-words corpus and trained model from
# the files under data/.
# `joblib` must already be imported (it is imported in the cell that saves
# these files).
dictionary = corpora.Dictionary.load_from_text('data/dictionary_EOS.txt')
corpus = corpora.MmCorpus('data/nostopwords_corpus_EOS.mm')
lda = joblib.load('data/ldamodel_20_EOS.pkl')

# Show the 8 highest-weighted terms of each of the 20 topics.
lda.print_topics(num_topics=20, num_words=8)

[(0,
  '0.018*"israel" + 0.015*"isra" + 0.013*"palestinian" + 0.008*"minist" + 0.007*"said" + 0.007*"elect" + 0.006*"parti" + 0.006*"jerusalem"'),
 (1,
  '0.008*"will" + 0.006*"oil" + 0.006*"said" + 0.006*"year" + 0.004*"new" + 0.004*"price" + 0.004*"can" + 0.004*"compani"'),
 (2,
  '0.018*"court" + 0.012*"year" + 0.010*"mubarak" + 0.009*"egypt" + 0.009*"said" + 0.008*"sentenc" + 0.008*"charg" + 0.008*"prison"'),
 (3,
  '0.010*"qatar" + 0.009*"huang" + 0.007*"infect" + 0.006*"integr" + 0.005*"difficil" + 0.005*"cdi" + 0.004*"pronounc" + 0.004*"al"'),
 (4,
  '0.023*"report" + 0.022*"cia" + 0.014*"said" + 0.013*"interrog" + 0.011*"tortur" + 0.010*"senat" + 0.008*"intellig" + 0.007*"us"'),
 (5,
  '0.073*"de" + 0.028*"la" + 0.016*"le" + 0.015*"n" + 0.012*"en" + 0.009*"et" + 0.007*"un" + 0.006*"l"'),
 (6,
  '0.020*"said" + 0.014*"al" + 0.010*"state" + 0.008*"islam" + 0.008*"forc" + 0.008*"group" + 0.007*"attack" + 0.007*"kill"'),
 (7,
  '0.006*"offic" + 0.006*"will" + 0.006*"christma" + 0.0

In [16]:
import pyLDAvis
# pyLDAvis 3.x renamed the gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on both old and new installations.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Prepare the visualisation data from the loaded model, corpus and dictionary,
# then render it inline as the cell output.
lda_vis = gensimvis.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_vis)