In [1]:
import re
import glob
import gensim
from stop_words import get_stop_words
from nltk import sent_tokenize,word_tokenize
from nltk.stem.porter import PorterStemmer

In [2]:
def tokenize(text):
    word = re.compile('^[a-z]+$',re.IGNORECASE)
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    tokens = []
    #for sent in sent_tokenize(text):
    for tok in word_tokenize(text,preserve_line=False):
        tok = tok.lower()
        if tok not in en_stop:
            term = word.match(tok)
            if term is not None:
                tokens.append(p_stemmer.stem(tok))
    return tokens

In [3]:
def parse(files,encoding="utf-8"):
    texts = []
    for filename in files: # for each relevant file
        texts.append(tokenize(open(filename,encoding=encoding).read()))
    return texts

In [4]:
def prepare(files,encoding="utf-8"):
    texts = parse(files,encoding=encoding)
    print('parsed texts')
    dictionary = gensim.corpora.Dictionary(texts)
    print('dictionary created')
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('corpus ready')
    return texts,dictionary,corpus

In [5]:
def train(files,num_topics=20,passes=20,encoding="utf-8"):
    texts,dictionary,corpus = prepare(files,encoding=encoding)
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=passes)
    print('model complete')
    return ldamodel

In [6]:
files_check = ['../../content/osc/test/free-lunch.txt']
files_train = glob.glob("../../content/osc/data/*.txt")

In [7]:
ldamodel = train(files_train,encoding="ISO-8859-1")
ldamodel.save('ldamodel_osc.pickle', separately=None, sep_limit=10485760, ignore=frozenset([]), pickle_protocol=2)

parsed texts
dictionary created
corpus ready
model complete


In [8]:
def printtopics(ldamodel):
    alltopics = ldamodel.print_topics(num_topics=20,num_words=20)
    topics = []
    for tops in alltopics:
        for top in tops[1].split('+'):
            parts = top.split('*')
            topics.append([float(parts[0]),parts[1].replace('"','').strip()])
    topics = sorted(topics,key=lambda x:x[0],reverse=True)
    for topic in topics:
        print(str(topic[0]) + '\t' + topic[1])
printtopics(ldamodel)

0.067	depend
0.063	solr
0.057	search
0.054	artifact
0.034	instal
0.023	field
0.022	confer
0.022	file
0.022	download
0.022	cassandra
0.022	document
0.02	scorer
0.019	solr
0.018	use
0.018	relev
0.018	patent
0.017	queri
0.017	quepid
0.017	queri
0.017	search
0.017	data
0.017	score
0.016	present
0.016	user
0.016	score
0.015	http
0.015	solr
0.015	data
0.014	rate
0.014	function
0.014	document
0.014	estim
0.014	open
0.013	can
0.013	result
0.013	podcast
0.013	like
0.013	command
0.013	deploy
0.013	central
0.013	use
0.013	life
0.013	search
0.013	agil
0.013	will
0.012	like
0.012	can
0.012	mvn
0.012	queri
0.012	use
0.012	second
0.012	term
0.012	use
0.012	search
0.012	sourc
0.011	will
0.011	groovi
0.011	http
0.011	use
0.011	project
0.011	rank
0.011	solr
0.011	document
0.011	use
0.011	can
0.011	test
0.011	work
0.01	code
0.01	like
0.01	use
0.01	java
0.01	directori
0.01	award
0.01	time
0.01	project
0.01	posit
0.01	ndcg
0.01	use
0.01	work
0.01	index
0.01	document
0.01	queri
0.01	queri
0.01	develop
0.01	

In [9]:
test_texts,test_dictionary,test_corpus = prepare(files_check)
test_topics = ldamodel[test_corpus]

parsed texts
dictionary created
corpus ready


In [10]:
for test_topic in test_topics:
    for t in test_topic:
        print(ldamodel.show_topic(t[0]))

[('confer', 0.022444598), ('present', 0.015677268), ('will', 0.011301638), ('data', 0.0091455989), ('great', 0.0089752432), ('session', 0.0081362063), ('peopl', 0.0081142662), ('talk', 0.0078431945), ('group', 0.007466238), ('discuss', 0.0069808643)]
[('like', 0.013304528), ('can', 0.01225144), ('use', 0.0088527529), ('user', 0.0070516146), ('basket', 0.0065877079), ('count', 0.0061847982), ('b', 0.005976839), ('one', 0.0058578565), ('databas', 0.0056053838), ('item', 0.0055015502)]
[('project', 0.010803971), ('custom', 0.009281191), ('will', 0.0087867724), ('work', 0.0084994696), ('can', 0.0078534093), ('use', 0.0068423925), ('go', 0.0066181845), ('need', 0.005911049), ('get', 0.0054968582), ('busi', 0.0054925266)]
[('queri', 0.01744362), ('search', 0.017039541), ('use', 0.012013937), ('solr', 0.011153456), ('document', 0.010794309), ('index', 0.010339779), ('can', 0.0094706248), ('field', 0.0094019668), ('match', 0.0091341082), ('synonym', 0.008787889)]
[('agil', 0.013197406), ('will

In [11]:
print(test_topics)

<gensim.interfaces.TransformedCorpus object at 0x11617cf28>
