In [59]:
import xmltodict
import glob
from nltk.tokenize import RegexpTokenizer
import nltk
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from gensim import corpora, models
import gensim
from time import time
from sklearn.externals import joblib
import gzip
import string
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize

import pyLDAvis
import pyLDAvis.gensim

In [60]:
# Tokens to drop during cleaning: every ASCII punctuation character plus a few
# extra multi-character tokens. Kept as a set because clean() performs one
# membership test per token, and a set lookup is constant time whereas a list
# lookup scans the whole list.
exclude = set(string.punctuation)
exclude.update(['\'s', '\'\'', '``', '–', '‘a', '--', '...'])
# print(exclude)
lemma = WordNetLemmatizer()
# English stop words, stored as a set for the same per-token lookup reason.
en_stop = set(get_stop_words('en'))
# Stemmers are instantiated here; clean() currently only lemmatizes and leaves
# its stemming step commented out.
p_stemmer = PorterStemmer()
sb_stemmer = SnowballStemmer("english")

def clean(doc):
    """Filter and lemmatize one tokenized document.

    A token is kept only if it is not in ``en_stop``, not in ``exclude`` and is
    longer than one character; every kept token is passed through
    ``lemma.lemmatize``.

    Args:
        doc: iterable of string tokens.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    normalized = []
    for token in doc:
        # Same three filters as before, evaluated in the same order per token.
        if token in en_stop or token in exclude or len(token) <= 1:
            continue
        normalized.append(lemma.lemmatize(token))
    # stem tokens
    # stemmed_tokens = [sb_stemmer.stem(i) for i in normalized]
    return normalized

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', "'s", "''", '``', '–', '‘a', '--', '...']


In [61]:
# NLTK data required by this notebook:
#   wordnet - used by WordNetLemmatizer
#   punkt   - tokenizer data that word_tokenize depends on; without it the
#             corpus-loading cell fails on a fresh environment
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package wordnet to /home/sonic/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
# <Document>
# <Id>en_2014-01-12_99ed1bbd8128fe754e1d37ef5a13384f59a2b49</Id>
# <SourceName>Arutz Sheva</SourceName>
# <CaptureDateTime>01 Dec 2014 00:00:00</CaptureDateTime>
# <PublicationDateTime>01 Dec 2014 00:00:00</PublicationDateTime>
# <Title>Atty-General to Seek Controversial Mayor’s Ouster</Title>
# <TranslatedTitle/>
# <SourceCoverage>ISRAEL</SourceCoverage>
# <Url>http://www.israelnationalnews.com/News/News.aspx/176185</Url>
# <Encoding>UTF-8</Encoding>
# <Language>English</Language><SourceType/>
# <RobotName>israelnat4048</RobotName>
# <Text>Document Test</Text>
# </Document>


# Parse every XML document under data/Karbala, tokenize and clean its <Text>
# field, collect the token lists in `corpus`, then dump them to a text file.
start = time()
corpus = []

# path = "data/Karbala/en_2014-01-12_a1e0ca1a503d6595d49e02207069a524d3d42.xml"
path = "data/Karbala/*.xml"
# glob returns matches in arbitrary order; sorting keeps document indices
# (e.g. corpus[0]) stable from run to run.
for fname in sorted(glob.glob(path)):
    with open(fname, 'r', encoding='utf-8') as fd:
        try:
            doc = xmltodict.parse(fd.read())
            tokens = word_tokenize(doc['Document']['Text'].lower())
            # add the cleaned tokens to the corpus
            corpus.append(clean(tokens))
        except Exception as e:
            # Best effort: a document that cannot be parsed or tokenized is
            # reported together with its file name and then skipped.
            print('%s: %s' % (fname, e))

print('corpus size:', len(corpus))

# Explicit UTF-8 because the tokens are not guaranteed to be ASCII; the context
# manager makes sure the file is flushed and closed.
with open('data/eos_tokens_stopwords.txt', 'w', encoding='utf-8') as tokens_rm_stopwords:
    for item in corpus:
        tokens_rm_stopwords.write("%s\n" % item)

print('Cell took %.2f seconds to run.' % (time() - start))

corpus size: 9839
Cell took 82.57 seconds to run.


In [64]:
# Sanity check: show the first entry of the corpus.
first_document = corpus[0]
print(first_document)

['eastern', 'libyan', 'government', 'conduct', 'air', 'strike', 'border', 'tunisia', 'fri', 'dec', '2014', '12:39pm', 'gmt', 'print', 'single', 'page', 'text', 'tripoli', 'dec', 'reuters', 'force', 'allied', 'one', 'two', 'rival', 'government', 'vying', 'power', 'libya', 'conducted', 'air', 'strike', 'near', 'tunisian', 'border', 'vowed', 'shut', 'main', 'land', 'border', 'crossing', 'two', 'country', 'official', 'said', 'libya', 'caught', 'conflict', 'two', 'government', 'one', 'self-declared', 'group', 'called', 'libya', 'dawn', 'took', 'tripoli', 'august', 'second', 'led', 'internationally', 'recognised', 'prime', 'minister', 'abdullah', 'al-thinni', 'forced', 'capital', 'now', 'operates', 'eastern', 'libya', 'different', 'account', 'air', 'strike', 'friday', 'latest', 'series', 'week', 'part', 'turmoil', 'gripping', 'oil-producing', 'country', 'three', 'year', 'ousting', 'muammar', 'gaddafi', 'mohamed', 'el', 'hejazi', 'spokesman', 'army', 'east', 'allied', 'thinni', 'said', 'force

In [65]:
# Build the id <-> term mapping from the tokenized documents and keep a
# plain-text copy of it on disk.
dictionary = corpora.Dictionary(corpus)
dictionary.save_as_text('data/dictionary_EOS.txt')

# Turn every token list into its bag-of-words vector and serialize the result
# in Matrix Market format.
# NOTE: `corpus` is rebound here from token lists to bag-of-words vectors, so
# this cell is not idempotent - a second run would be fed the vectors rather
# than the token lists produced by the loading cell.
corpus = list(map(dictionary.doc2bow, corpus))
corpora.MmCorpus.serialize('data/nostopwords_corpus_EOS.mm', corpus)

In [None]:
# Train a 20-topic LDA model on the bag-of-words corpus and persist it to disk.
# Single-core alternative that was used earlier:
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
start = time()
ldamodel = gensim.models.ldamulticore.LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=20,
    chunksize=1000,
    passes=20,
    workers=4,
    # Fixed seed so that the model initialisation does not change between runs.
    random_state=42,
)
# NOTE(review): joblib is imported from sklearn.externals, which newer
# scikit-learn releases no longer ship; kept because the loading cell reads
# this file with the same joblib.
joblib.dump(ldamodel, 'data/ldamodel_20_EOS.pkl')
print('Cell took %.2f seconds to run.' % (time() - start))

In [57]:
# Reload the serialized bag-of-words corpus and the trained LDA model from
# disk, then list the 8 highest-weighted words of each of the 20 topics.
corpus = corpora.MmCorpus('data/nostopwords_corpus_EOS.mm')
# joblib.load unpickles the file, so only load model files you produced yourself.
lda = joblib.load('data/ldamodel_20_EOS.pkl')

lda.print_topics(num_topics=20, num_words=8)

[(0,
  '0.015*"will" + 0.007*"company" + 0.006*"al" + 0.006*"development" + 0.006*"dubai" + 0.005*"project" + 0.005*"service" + 0.005*"business"'),
 (1,
  '0.017*"2014" + 0.014*"dec" + 0.013*"alert" + 0.012*"reply" + 0.012*"moderator" + 0.008*"n\'t" + 0.008*"will" + 0.007*"de"'),
 (2,
  '0.009*"court" + 0.007*"said" + 0.004*"case" + 0.004*"will" + 0.004*"police" + 0.004*"hospital" + 0.004*"new" + 0.004*"district"'),
 (3,
  '0.018*"said" + 0.015*"state" + 0.013*"islamic" + 0.013*"syria" + 0.012*"iraq" + 0.008*"group" + 0.007*"militant" + 0.006*"iraqi"'),
 (4,
  '0.020*"report" + 0.019*"cia" + 0.012*"said" + 0.010*"interrogation" + 0.008*"torture" + 0.007*"senate" + 0.007*"intelligence" + 0.006*"u"'),
 (5,
  '0.009*"oil" + 0.007*"said" + 0.007*"year" + 0.007*"price" + 0.006*"market" + 0.006*"point" + 0.006*"million" + 0.005*"game"'),
 (6,
  '0.024*"في" + 0.016*"من" + 0.010*"على" + 0.005*"أن" + 0.005*"bangladesh" + 0.004*"إلى" + 0.004*"التي" + 0.004*"الى"'),
 (7,
  '0.007*"patient" + 0.00

In [58]:
# Prepare the pyLDAvis data for the loaded model and render it in the notebook.
# `dictionary` is not reloaded from disk here; it must still be in memory from
# the cell that built it.
lda_vis = pyLDAvis.gensim.prepare(
    topic_model=lda,
    corpus=corpus,
    dictionary=dictionary,
)
pyLDAvis.display(lda_vis)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
