In [21]:
# https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
# Topic Modelling in Python with NLTK and Gensim
# Susan Li
# Mar 30, 2018

import spacy
from spacy.lang.en import English

# A blank English pipeline is sufficient here: tokenize() only reads the
# lexical attributes orth_, like_url and lower_ of each token.
# The earlier bare `spacy.load('en')` threw its result away, and the 'en'
# shortcut name is rejected by spaCy v3+, so that call has been dropped.
parser = English()

def tokenize(text):
    """Turn ``text`` into a list of normalised token strings.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped, URL-like tokens become the placeholder ``'URL'``,
    tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every other
    token is returned in lowercase.
    """
    def _normalise(tok):
        # Placeholders keep the vocabulary small for URLs and @-handles.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [24]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

def get_lemma(word):
    """Return the WordNet base form of ``word`` via ``wn.morphy``.

    When ``wn.morphy`` yields ``None`` the word is handed back unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /Users/ben/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so membership tests in prepare_text_for_lda are cheap.
nltk.download('stopwords')
en_stop = {*nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to /Users/ben/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [26]:
def prepare_text_for_lda(text):
    """Tokenize ``text`` and reduce it to lemmatized content words.

    Tokens come from ``tokenize``; only those longer than four characters
    and not present in ``en_stop`` are kept, and each survivor is passed
    through ``get_lemma``. Returns the resulting list of strings.
    """
    return [
        get_lemma(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

In [50]:
# Testing with ACL Anthology Reference Corpus version 20080325
# Site: http://acl-arc.comp.nus.edu.sg/#t
# File: http://acl-arc.comp.nus.edu.sg/aclArc-20080325.tgz

# First test opening a single file in the ACL corpus
# Appears that not all files are UTF-8 encoded
# See http://acl-arc.comp.nus.edu.sg/archives/acl-arc-090501d2/docs/README-20090501.txt
# "pdfbox converted versions of the pdf files in UTF-8 encoding where possible"

def print_topics(file_path, sample_rate=0.01, encoding='utf-8'):
    """Print the token lists of a random sample of lines from a text file.

    Args:
        file_path: Path of the text file to read.
        sample_rate: Probability in [0, 1] that any given line is kept.
            The default of 0.01 matches the previous hard-coded 1% sample;
            pass 1.0 to keep every line.
        encoding: Text encoding used to read the file.

    Each kept line is converted with ``prepare_text_for_lda`` and the list
    of resulting token lists is printed once at the end. Nothing is returned.
    """
    import random
    text_data = []
    # Not every corpus file is valid UTF-8, so undecodable bytes are replaced
    # rather than aborting the read with UnicodeDecodeError.
    with open(file_path, encoding=encoding, errors='replace') as f:
        for line in f:
            # Draw first so the tokenizer only runs on lines that are kept.
            if random.random() < sample_rate:
                text_data.append(prepare_text_for_lda(line))
    print(text_data)

In [32]:
fp1 = '../datasets/acl-arc/txt/pdfbox-0.72/X/X93/X93-1001.txt'
fp2 = '../datasets/acl-arc/txt/pdfbox-0.72/X/X93/X93-1002.txt'

# print_topics samples lines at random, so the printed token lists
# differ (and are usually very short) on every run.
for sample_path in (fp1, fp2):
    print_topics(sample_path)

[['commercial', 'government', 'application']]
[['describe', 'separate', 'section', 'detection', 'extrac-'], ['availability', 'funding', 'permit', 'addition'], ['license', 'develop', 'community']]


In [46]:
# Add topics to a single text_data list instead of printing them for every file
def add_topics(file_path, text_data, sample_rate=0.01, encoding='utf-8'):
    """Append token lists for a random sample of a file's lines to ``text_data``.

    Args:
        file_path: Path of the text file to read.
        text_data: List that is extended in place, one token list per kept line.
        sample_rate: Probability in [0, 1] that any given line is kept.
            The default of 0.01 matches the previous hard-coded 1% sample;
            pass 1.0 to keep every line.
        encoding: Text encoding used to read the file.

    Each kept line is converted with ``prepare_text_for_lda``. Nothing is
    returned; the caller's list is mutated.
    """
    import random
    # Not every corpus file is valid UTF-8, so undecodable bytes are replaced
    # rather than aborting the read with UnicodeDecodeError.
    with open(file_path, encoding=encoding, errors='replace') as f:
        for line in f:
            # Draw first so the tokenizer only runs on lines that are kept.
            if random.random() < sample_rate:
                text_data.append(prepare_text_for_lda(line))

In [37]:
# Gather the sampled token lists from both files into one shared list.
text_data = []
for sample_path in (fp1, fp2):
    add_topics(sample_path, text_data)

In [38]:
# Display the token lists gathered by add_topics (one inner list per kept line).
text_data

[['irrelevant', 'document', 'system'],
 ['crystal@arpa.mil'],
 ['evaluation', 'complete', 'system', 'development']]

In [42]:
# LDA with Gensim

from gensim import corpora
import pickle

# Build the token -> id vocabulary, then encode each token list as a
# bag-of-words vector against that vocabulary.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artifacts so the visualisation cell can reload them later.
# The context manager ensures the pickle is flushed and the handle closed;
# the previous bare open() left the file object dangling.
with open('../artifacts/acl-corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('../artifacts/dictionary.gensim')

In [44]:
# This is basically producing 5 different permutations of the same keywords.
# We need to back up and figure out why the preprocessing step yields so few keyword lists:
# shouldn't it produce roughly one list of keywords per input line?
# Likely cause: add_topics() randomly keeps only about 1% of the lines, so text_data holds
# just a handful of documents -- far too few to support 5 distinct topics.

import gensim

NUM_TOPICS = 5
# Fixed seed so that re-running the cell on the same corpus gives the same
# topics instead of a different random initialisation each time.
RANDOM_SEED = 42

ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)
# Saved so the visualisation cell can reload the trained model.
ldamodel.save('model5.gensim')

# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.143*"crystal@arpa.mil" + 0.143*"system" + 0.143*"complete" + 0.143*"development"')
(1, '0.273*"system" + 0.273*"document" + 0.273*"irrelevant" + 0.045*"crystal@arpa.mil"')
(2, '0.143*"crystal@arpa.mil" + 0.143*"system" + 0.143*"complete" + 0.143*"development"')
(3, '0.188*"evaluation" + 0.188*"development" + 0.188*"complete" + 0.187*"crystal@arpa.mil"')
(4, '0.143*"crystal@arpa.mil" + 0.143*"system" + 0.143*"complete" + 0.143*"development"')


In [49]:
# pyLDAvis

import pyLDAvis.gensim

# Reload the artifacts written by the earlier cells so the visualisation
# works from what is on disk rather than from in-memory state.
dictionary = gensim.corpora.Dictionary.load('../artifacts/dictionary.gensim')
# NOTE: pickle.load can execute arbitrary code from the file; this is only
# acceptable because the pickle was written by this notebook itself.
with open('../artifacts/acl-corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# sort_topics=False is passed through unchanged from the original call.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
