In [10]:
import gensim, os, re
from elasticsearch import Elasticsearch

In [11]:
# Directory holding the plain-text input files; the cells below list this
# directory and read every file found in it.
txts_dir = "./txt/"

In [12]:
# Rebuild the "arxiv" Elasticsearch index from scratch and add one document
# per file found in txts_dir.
# NOTE: this cell calls clean_text(), which is defined in a later cell of this
# notebook -- on a fresh kernel that cell has to be executed first.
es = Elasticsearch()  # no hosts/options passed: client defaults are used
index = "arxiv"

# Drop any previous copy of the index so that re-running the cell does not
# accumulate duplicate documents. The index name is passed by keyword, which
# works on old clients and is required by newer elasticsearch-py releases.
if es.indices.exists(index=index):
    es.indices.delete(index=index)
es.indices.create(index=index)

for file_name in os.listdir(txts_dir):
    with open(os.path.join(txts_dir, file_name), 'r') as f:
        text = f.read()
    clean_txt = " ".join(clean_text(text))
    # Remove only a trailing ".txt"; str.replace would also delete a ".txt"
    # that happens to occur in the middle of the file name.
    if file_name.endswith(".txt"):
        name = file_name[:-len(".txt")]
    else:
        name = file_name
    body = {"name": name, "text": clean_txt}
    es.index(index=index, doc_type="file", body=body)

In [17]:
# Lower-case words that clean_text() leaves out of its output.
# Kept in alphabetical order, one initial letter per line.
stop_words = {
    "about", "after", "all", "also", "among", "an", "and", "any", "are", "as", "at",
    "be", "because", "below", "between", "by",
    "can", "comprises",
    "each", "eg", "etc", "ever", "every", "except",
    "for", "from",
    "generally", "greater",
    "had", "has", "have", "having", "her", "here", "his",
    "ie", "if", "in", "into", "is", "it", "its",
    "likely", "likewise",
    "more",
    "no", "not", "now",
    "of", "on", "only", "onto", "or", "other", "over",
    "primarily",
    "see", "shall", "should", "such",
    "than", "that", "the", "their", "them", "then", "these", "they", "this",
    "those", "through", "to",
    "under", "use",
    "via",
    "was", "we", "were", "what", "which", "while", "who", "will", "with",
    "within", "without",
    "you", "your",
}

def clean_text(txt, stopwords=None):
    """Split ``txt`` on whitespace and return its normalised content words.

    A token is considered only if its first and last characters are ASCII
    letters, so tokens with leading or trailing punctuation or digits
    (e.g. ``"model,"`` or ``"3d"``) and single-character tokens are dropped
    entirely. From the remaining tokens every character other than ASCII
    letters and ``-`` is removed and the result is lower-cased.

    The stop-word test is applied to that normalised form, so ``"The"`` and
    ``"i.e"`` are filtered just like ``"the"`` and ``"ie"``.

    Parameters:
        txt: the text to tokenise.
        stopwords: optional container of lower-case words to exclude;
            defaults to the notebook-level ``stop_words`` set.

    Returns:
        A list of cleaned words, in their order of appearance, each at least
        two characters long.
    """
    if stopwords is None:
        stopwords = stop_words
    clean_words = []
    for w in txt.split():
        if not re.search(r"^[A-Za-z].*[A-Za-z]$", w):
            continue
        # Normalise first, then filter: checking the raw token would let
        # capitalised or punctuated stop words slip through.
        w = re.sub(r"[^A-Za-z-]", "", w).lower()
        if len(w) > 1 and w not in stopwords:
            clean_words.append(w)
    return clean_words

In [14]:
# Train an HDP topic model on every file in txts_dir, then save the
# dictionary ("hdpDictionary"), the model ("hdpModel") and a text dump of the
# topics ("hdp_topics.txt").
docs = []
for file_name in os.listdir(txts_dir):
    # Separate names for the file name and the open handle, so the loop
    # variable is not rebound by the `with` statement.
    with open(os.path.join(txts_dir, file_name), 'r') as doc_file:
        docs.append(doc_file.read())

# One token list per document.
texts = [clean_text(d) for d in docs]

# Build the vocabulary and prune it; thresholds not given here keep the
# library defaults.
id2word = gensim.corpora.Dictionary(texts)
id2word.filter_extremes(no_below=10, keep_n=100000)
id2word.compactify()
id2word.save("hdpDictionary")

# Bag-of-words representation of each document over the pruned vocabulary.
id2freq = [id2word.doc2bow(t) for t in texts]

hdp = gensim.models.hdpmodel.HdpModel(corpus=id2freq, id2word=id2word)
hdp.save("hdpModel")

topics_file = "hdp_topics.txt"
# NOTE(review): `topics`/`topn` are the keyword names of the gensim release
# this notebook was written against -- confirm before upgrading gensim.
topics = hdp.show_topics(topics=-1, topn=15)
with open(topics_file, "w") as f:
    for topic in topics:
        f.write("{}\n".format(topic))
# Parenthesised single argument: prints the same text under Python 2 and 3.
print("done processing: " + topics_file)

done processing: hdp_topics.txt


In [15]:
import pickle


def _parse_topic(topic):
    """Parse one formatted topic string into ``(topic_num, word2weight)``.

    The parsing relies on this layout: the topic number sits between the
    first space and the first colon, and the rest of the string is a
    ``+``-separated list of ``weight*word`` pairs, e.g.
    ``"topic 3: 0.012*model + 0.009*data"``.

    Returns:
        ``topic_num`` as a string (not converted to int) and a dict mapping
        each word to its weight as a float.
    """
    start = topic.index(" ") + 1
    end = topic.index(":")
    topic_num = topic[start:end]
    # strip() rather than a fixed [:-1] slice: the slice cuts the last letter
    # off the final word whenever the string has no trailing newline.
    # NOTE(review): assumes the string carries no other trailing delimiter --
    # confirm against the installed gensim version.
    pairs = topic[end + 1:].strip()
    word2weight = dict()
    for pair in pairs.split("+"):
        pair = pair.strip()
        if not pair:
            # A topic with no terms yields one empty piece; skip it.
            continue
        weight, word = pair.split("*", 1)
        word2weight[word] = float(weight)
    return topic_num, word2weight


# topic number (string) -> {word: weight} for every topic produced above.
topic2words = dict()
for topic in topics:
    topic_num, word2weight = _parse_topic(topic)
    topic2words[topic_num] = word2weight

# Pickle data is binary: open the file in "wb" so the dump is portable
# (text mode breaks on Windows and on Python 3).
with open("topic2words", "wb") as f:
    pickle.dump(topic2words, f)