In [2]:
# Topic Modeling and Latent Dirichlet Allocation (LDA) in Python
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
# Susan Li
# May 30, 2018

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# NOTE(review): wildcard import; no name from it is visibly required by the
# cells shown here -- confirm before narrowing it to explicit names.
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator so each fresh run starts from the same state.
np.random.seed(2018)
import nltk
# WordNetLemmatizer (used in lemmatize_stemming) needs the WordNet corpus on disk.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/ben/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with NLTK's
    ``WordNetLemmatizer``; the resulting lemma is then stemmed with the
    English ``SnowballStemmer``.

    Args:
        text: The token to normalize.

    Returns:
        The stem of the token's verb lemma.
    """
    english_stemmer = SnowballStemmer('english')
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return english_stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn raw document text into a list of normalized tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is longer than 3 characters and is not in gensim's ``STOPWORDS``;
    every kept token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw text of one document.

    Returns:
        A list of normalized tokens, in the order they were produced.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [14]:
def import_files(file_paths, encoding='utf-8'):
    """Read every file in ``file_paths`` and return the contents as strings.

    Args:
        file_paths: Iterable of paths to text files.
        encoding: Text encoding used to decode each file. Defaults to
            ``'utf-8'`` so results do not depend on the platform's locale
            encoding; pass another codec (e.g. ``'latin-1'``) for files
            stored differently.

    Returns:
        A list with one string per path, in the same order as ``file_paths``.
        An empty iterable yields an empty list.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid in ``encoding``.
    """
    documents = []
    for file_path in file_paths:
        # Explicit encoding: a bare open() falls back to the locale default,
        # which differs between machines.
        with open(file_path, 'r', encoding=encoding) as f:
            documents.append(f.read())
    return documents

In [15]:
# Paths (relative to the notebook's working directory) of the two text files to model.
fp1 = '../datasets/acl-arc/txt/pdfbox-0.72/X/X93/X93-1001.txt'
fp2 = '../datasets/acl-arc/txt/pdfbox-0.72/X/X93/X93-1002.txt'
file_paths = [fp1, fp2]
# One raw-text string per file, in the same order as file_paths.
documents = import_files(file_paths)

In [16]:
# Normalize every raw document into its list of tokens.
processed_docs = [preprocess(raw_text) for raw_text in documents]
# Note: this slices the list of documents (up to 10 of them), not the tokens inside each one.
processed_docs[:10]

[['tipster',
  'program',
  'overview',
  'roberta',
  'merchant',
  'depart',
  'defens',
  'mead',
  'rhmerch',
  'afterlif',
  'ncsc',
  'tipster',
  'phase',
  'task',
  'tipster',
  'phase',
  'advanc',
  'state',
  'languag',
  'technolog',
  'document',
  'detect',
  'inform',
  'extract',
  'document',
  'detect',
  'includ',
  'subtask',
  'rout',
  'ning',
  'static',
  'queri',
  'stream',
  'data',
  'retriev',
  'run',
  'queri',
  'archiv',
  'data',
  'inform',
  'extract',
  'technolog',
  'specifi',
  'type',
  'inform',
  'locat',
  'free',
  'text',
  'extract',
  'place',
  'databas',
  'state',
  'document',
  'detect',
  'tipster',
  'tipster',
  'user',
  'search',
  'larg',
  'volum',
  'data',
  'queri',
  'inform',
  'retriev',
  'tool',
  'boolean',
  'keyword',
  'search',
  'system',
  'develop',
  'decad',
  'earlier',
  'charac',
  'terist',
  'boolean',
  'system',
  'recal',
  'user',
  'lose',
  'unknown',
  'quantiti',
  'use',
  'inform',
  'unabl',


In [17]:
# Build the token <-> integer id mapping over all processed documents.
dictionary = gensim.corpora.Dictionary(processed_docs)
# Preview the first 11 (id, token) pairs; count tracks how many have been printed.
count = 0
for count, (k, v) in enumerate(dictionary.iteritems(), start=1):
    print(k, v)
    if count > 10:
        break

0 accord
1 accur
2 accuraci
3 act
4 add
5 advanc
6 afterlif
7 algorithm
8 altern
9 appfic
10 appli


In [18]:
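# Filter out rare and very common tokens from the dictionary.
# Left disabled for now: uncomment once the corpus has more than 15 documents
# (no_below=15 drops every token that appears in fewer than 15 documents).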
#dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [20]:
# Convert each token list into its bag-of-words form: (token id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
# Display the bag of words of the second document.
bow_corpus[1]

[(5, 2),
 (7, 5),
 (11, 3),
 (13, 5),
 (16, 3),
 (29, 1),
 (30, 5),
 (33, 2),
 (36, 3),
 (43, 3),
 (47, 2),
 (50, 7),
 (51, 20),
 (52, 5),
 (55, 2),
 (61, 8),
 (69, 6),
 (73, 4),
 (77, 1),
 (78, 9),
 (81, 1),
 (86, 3),
 (87, 5),
 (88, 1),
 (89, 1),
 (90, 2),
 (94, 1),
 (101, 4),
 (102, 4),
 (109, 2),
 (114, 1),
 (117, 2),
 (121, 2),
 (127, 20),
 (130, 1),
 (135, 1),
 (137, 14),
 (138, 1),
 (139, 1),
 (149, 3),
 (150, 2),
 (152, 1),
 (155, 1),
 (160, 1),
 (162, 1),
 (166, 2),
 (173, 1),
 (175, 8),
 (176, 1),
 (177, 3),
 (180, 6),
 (182, 15),
 (184, 8),
 (186, 1),
 (187, 4),
 (191, 2),
 (200, 2),
 (205, 4),
 (206, 1),
 (207, 1),
 (208, 3),
 (209, 3),
 (210, 1),
 (211, 4),
 (212, 1),
 (213, 1),
 (214, 1),
 (215, 1),
 (216, 1),
 (217, 1),
 (218, 3),
 (219, 1),
 (220, 9),
 (221, 1),
 (222, 1),
 (223, 1),
 (224, 1),
 (225, 1),
 (226, 1),
 (227, 6),
 (228, 1),
 (229, 2),
 (230, 1),
 (231, 1),
 (232, 1),
 (233, 1),
 (234, 1),
 (235, 2),
 (236, 1),
 (237, 1),
 (238, 1),
 (239, 2),
 (240, 1),
 (

In [21]:
# Print a readable view of one document's bag of words:
# token id, the token it maps to in the dictionary, and its count.
bow_doc_1 = bow_corpus[1]
for word_id, word_count in bow_doc_1:
    word = dictionary[word_id]
    print("Word {} (\"{}\") appears {} time.".format(word_id, word, word_count))

Word 5 ("advanc") appears 2 time.
Word 7 ("algorithm") appears 5 time.
Word 11 ("applic") appears 3 time.
Word 13 ("arpa") appears 5 time.
Word 16 ("base") appears 3 time.
Word 29 ("concept") appears 1 time.
Word 30 ("confer") appears 5 time.
Word 33 ("continu") appears 2 time.
Word 36 ("data") appears 3 time.
Word 43 ("demonstr") appears 3 time.
Word 47 ("describ") appears 2 time.
Word 50 ("detect") appears 7 time.
Word 51 ("develop") appears 20 time.
Word 52 ("differ") appears 5 time.
Word 55 ("domain") appears 2 time.
Word 61 ("evalu") appears 8 time.
Word 69 ("extract") appears 6 time.
Word 73 ("follow") appears 4 time.
Word 77 ("good") appears 1 time.
Word 78 ("govern") appears 9 time.
Word 81 ("heavi") appears 1 time.
Word 86 ("improv") appears 3 time.
Word 87 ("includ") appears 5 time.
Word 88 ("increas") appears 1 time.
Word 89 ("independ") appears 1 time.
Word 90 ("indic") appears 2 time.
Word 94 ("initi") appears 1 time.
Word 101 ("languag") appears 4 time.
Word 102 ("larg") 

In [22]:
# TF-IDF
# Build a TF-IDF model from the bag-of-words corpus, then apply it to that
# same corpus to get a weighted version of every document.
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
# Preview only: pretty-print the first weighted document, then stop.
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.03163859985841663),
 (1, 0.03163859985841663),
 (2, 0.03163859985841663),
 (3, 0.03163859985841663),
 (4, 0.03163859985841663),
 (6, 0.03163859985841663),
 (8, 0.03163859985841663),
 (9, 0.03163859985841663),
 (10, 0.03163859985841663),
 (12, 0.03163859985841663),
 (14, 0.03163859985841663),
 (15, 0.03163859985841663),
 (17, 0.06327719971683327),
 (18, 0.12655439943366653),
 (19, 0.03163859985841663),
 (20, 0.03163859985841663),
 (21, 0.03163859985841663),
 (22, 0.06327719971683327),
 (23, 0.03163859985841663),
 (24, 0.03163859985841663),
 (25, 0.06327719971683327),
 (26, 0.03163859985841663),
 (27, 0.03163859985841663),
 (28, 0.03163859985841663),
 (31, 0.06327719971683327),
 (32, 0.06327719971683327),
 (34, 0.12655439943366653),
 (35, 0.03163859985841663),
 (37, 0.12655439943366653),
 (38, 0.03163859985841663),
 (39, 0.03163859985841663),
 (40, 0.03163859985841663),
 (41, 0.06327719971683327),
 (42, 0.03163859985841663),
 (44, 0.03163859985841663),
 (45, 0.06327719971683327),


In [24]:
# Train LDA on the raw bag-of-words counts, then print every topic.
lda_model = gensim.models.LdaMulticore(
    corpus=bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
)
# num_topics=-1 asks print_topics for all topics rather than a subset.
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.015*"phase" + 0.015*"develop" + 0.009*"extract" + 0.009*"tipster" + 0.009*"program" + 0.008*"system" + 0.008*"text" + 0.007*"contractor" + 0.006*"user" + 0.006*"detect"
Topic: 1 
Words: 0.011*"develop" + 0.010*"phase" + 0.009*"text" + 0.006*"system" + 0.006*"tipster" + 0.006*"contractor" + 0.006*"document" + 0.005*"govern" + 0.005*"extract" + 0.005*"detect"
Topic: 2 
Words: 0.013*"extract" + 0.013*"user" + 0.011*"document" + 0.011*"phase" + 0.011*"develop" + 0.010*"tipster" + 0.008*"system" + 0.007*"inform" + 0.007*"retriev" + 0.007*"text"
Topic: 3 
Words: 0.007*"text" + 0.007*"phase" + 0.006*"develop" + 0.005*"program" + 0.005*"system" + 0.005*"architectur" + 0.005*"extract" + 0.004*"tipster" + 0.004*"project" + 0.004*"detect"
Topic: 4 
Words: 0.015*"document" + 0.014*"user" + 0.014*"tipster" + 0.013*"phase" + 0.012*"develop" + 0.011*"system" + 0.010*"extract" + 0.010*"inform" + 0.007*"queri" + 0.007*"text"
Topic: 5 
Words: 0.013*"develop" + 0.012*"phase" + 0.009*"u

In [25]:
# Train LDA on the TF-IDF weighted corpus, then print every topic.
lda_model_tfidf = gensim.models.LdaMulticore(
    corpus=corpus_tfidf,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=4,
)
# num_topics=-1 asks print_topics for all topics rather than a subset.
for idx, topic in lda_model_tfidf.print_topics(num_topics=-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.010*"document" + 0.009*"user" + 0.007*"contractor" + 0.007*"inform" + 0.007*"queri" + 0.006*"architectur" + 0.006*"project" + 0.005*"plan" + 0.005*"share" + 0.005*"year"
Topic: 1 Word: 0.002*"document" + 0.002*"queri" + 0.002*"user" + 0.002*"inform" + 0.002*"year" + 0.002*"architectur" + 0.002*"share" + 0.002*"project" + 0.002*"contractor" + 0.002*"plan"
Topic: 2 Word: 0.002*"document" + 0.002*"user" + 0.002*"inform" + 0.002*"queri" + 0.002*"contractor" + 0.002*"share" + 0.002*"year" + 0.002*"cost" + 0.002*"build" + 0.002*"architectur"
Topic: 3 Word: 0.002*"user" + 0.002*"inform" + 0.002*"document" + 0.002*"queri" + 0.002*"contractor" + 0.002*"year" + 0.002*"architectur" + 0.002*"share" + 0.002*"plan" + 0.002*"build"
Topic: 4 Word: 0.002*"document" + 0.002*"user" + 0.002*"queri" + 0.002*"contractor" + 0.002*"inform" + 0.002*"architectur" + 0.002*"share" + 0.002*"project" + 0.002*"relev" + 0.002*"year"
Topic: 5 Word: 0.002*"document" + 0.002*"user" + 0.002*"queri" + 0.0