In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Phrases
from gensim.models.ldamulticore import LdaMulticore
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_numeric, strip_punctuation, strip_short, stem_text
from gensim.test.utils import common_corpus
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [3]:
# Add the parent directory to the module search path so that the
# project-local `util` module imported below can be resolved.
# NOTE(review): the relative path is resolved against the kernel's working
# directory, so this only works when the notebook is run from its own folder.
import sys
sys.path.append('../')
from util import load_data, load_gensim_data

In [4]:
# Load the dataset through the project's `load_data` helper, which returns
# three objects for the given data directory.
# NOTE(review): presumably `X`/`y` are features/targets derived from `data`
# -- confirm against util.load_data; none of the three is used in the cells
# visible below.
data, X, y = load_data(path='../../Files/')

In [8]:
# Load the pre-built gensim inputs for the two filing sections, keyed by
# item ('1a' and '7'): `corpus[item]` is the pickled corpus and
# `id2word[item]` is the matching pickled id->word mapping.
#
# The original cell called `unpickle`, which is neither imported nor defined
# in this notebook and therefore raised NameError on a fresh kernel; the
# files are read with the standard-library pickle module instead.
# NOTE(review): assumes `unpickle` was a plain pickle load -- confirm.
GENSIM_DIR = '../../Files/gensim/'


def _load_pickle(filename, directory=GENSIM_DIR):
    """Return the object stored in the pickle file `filename` inside `directory`."""
    # SECURITY: pickle.load can execute arbitrary code -- only use this on
    # files produced by this project's own preprocessing.
    with open(os.path.join(directory, filename), 'rb') as handle:
        return pickle.load(handle)


corpus = {
    '1a': _load_pickle('2013-2016_item1a_corpus.pkl'),
    '7': _load_pickle('2013-2016_item7_corpus.pkl'),
}

id2word = {
    '1a': _load_pickle('2013-2016_item1a_id2word.pkl'),
    '7': _load_pickle('2013-2016_item7_id2word.pkl'),
}

In [None]:
# Alternative loader: rebinds `corpus` and `id2word` from the project's
# `load_gensim_data` helper, replacing the values assigned in the cell above.
# NOTE(review): this cell has no execution count, and its path
# ('../Files/gensim') differs from the '../../Files/...' locations used by the
# other cells -- confirm which one is correct before relying on Run All.
# NOTE(review): [2014, 2017] is presumably a year range, while the pickles
# loaded above are named 2013-2016 -- verify the intended span.
corpus, id2word = load_gensim_data([2014, 2017], path='../Files/gensim')

In [11]:
# Fixed seed so that repeated runs of the (stochastic) LDA training start
# from the same random initialisation and the reported topics are comparable.
RANDOM_SEED = 42

# Keyword arguments shared by every LdaMulticore model trained below.
params = {
    'num_topics': 30,         # number of latent topics to extract
    'chunksize': 2000,        # documents handed to the trainer per chunk
    'passes': 20,             # full passes over the corpus
    'iterations': 400,        # per-document inference iteration cap
    'eval_every': None,       # skip periodic perplexity evaluation
    'alpha': 'symmetric',     # same document-topic prior for every topic
    'eta': 'auto',            # let gensim learn the topic-word prior
    'random_state': RANDOM_SEED,
}

In [12]:
# Train one LDA topic model per entry of `corpus`, using the matching
# id->word mapping and the shared hyper-parameters in `params`.
#
# The worker count used to be hard-coded to 32, which oversubscribes any
# machine with fewer cores. It is now capped at (available cores - 1), the
# value gensim itself uses by default, while never exceeding the original 32.
n_workers = max(1, min(32, (os.cpu_count() or 2) - 1))

models = {}
for item in corpus:
    models[item] = LdaMulticore(
        corpus=corpus[item],
        id2word=id2word[item],
        workers=n_workers,
        **params
    )

In [14]:
# Preview five topics of the '1a' model, five terms per topic.
# formatted=True returns each topic as a 'weight*"term" + ...' string.
print(
    models['1a'].show_topics(
        num_topics=5,
        num_words=5,
        formatted=True,
    )
)

# Look at the things you're throwing out in filtering
# Look at strange occurrences, e.g. "duke"
# Look at the total percent of words that are made up
# Split by sector

[(17, '0.078*"loan" + 0.060*"bank" + 0.021*"deposit" + 0.016*"real" + 0.015*"estat"'), (10, '0.030*"wireless" + 0.020*"spectrum" + 0.020*"fcc" + 0.020*"carrier" + 0.013*"telecommun"'), (29, '0.048*"candid" + 0.044*"clinic" + 0.035*"trial" + 0.033*"patent" + 0.026*"clinic_trial"'), (22, '0.037*"care" + 0.028*"healthcar" + 0.026*"medicar" + 0.017*"patient" + 0.016*"health_care"'), (19, '0.050*"game" + 0.037*"aircraft" + 0.022*"airlin" + 0.018*"china" + 0.014*"travel"')]


In [96]:
# For every document, record each item's topic assignments ordered from the
# highest to the lowest score. `results[i][item]` is a (topics, scores) pair
# of parallel lists.
# NOTE(review): `item1a`, `items` and `dictionaries` are not defined in any
# cell shown in this notebook -- they must come from earlier kernel state.
# NOTE(review): `dictionaries[item]` is indexed by topic id here; confirm it
# maps topic ids (not word ids) to labels.
results = []
for doc_idx in range(item1a.shape[0]):
    per_item = {}
    for item in items:
        lda_model = models[item]
        # lda_model[bow] yields (topic id, score) pairs for that document.
        ranked = sorted(
            lda_model[corpus[item][doc_idx]],
            key=lambda pair: pair[1],
            reverse=True,
        )
        topics = [dictionaries[item][topic_id] for topic_id, _ in ranked]
        scores = [weight for _, weight in ranked]
        per_item[item] = (topics, scores)
    results.append(per_item)

In [150]:
# Show, for each item's model, its `alpha` and `eta` attributes -- the values
# the model holds for the `alpha` / `eta` settings passed at construction.
for item in items:
    lda_model = models[item]
    # One print call with newline separators emits the same three lines.
    print(f'{item}:', lda_model.alpha, lda_model.eta, sep='\n')

item1a:
[0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334]
[0.07455198 0.47855908 0.11533958 ... 0.04148679 0.0340654  0.03422894]
item7:
[0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334]
[0.18083873 0.05202858 0.6822381  ... 0.08863842 0.05057403 0.06306964]


In [108]:
# Inspect the `eta` attribute of whichever model `lda_model` was last bound
# to by a loop above.
# NOTE(review): this cell's execution count (108) predates the cell above
# (150), and its saved output is a constant 0.02 vector rather than the varying
# values printed there -- the output looks stale; re-run after a clean restart.
lda_model.eta

array([0.02, 0.02, 0.02, ..., 0.02, 0.02, 0.02], dtype=float32)