In [1]:
import os.path

from collections import Counter
from random import sample

import pandas as pd
import numpy as np

from lda import LDA, datasets as lda_datasets

from newsbreaker.data import load_entries, save_entries

In [2]:
import os.path

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and only fall back to the vendored copy
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Best-effort reload of a previously saved model together with the vocabulary
# it was trained on. On any failure, start from the Reuters vocabulary shipped
# with `lda`.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# were produced by this notebook / a trusted source.
try:
    topic_model = joblib.load(os.path.join('topic_model', 'topic_model.pkl'))

    with open(os.path.join('topic_model', 'vocab.txt')) as f:
        vocab = f.read().split('\n')
except Exception:
    # `Exception` instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate.
    vocab = list(lda_datasets.load_reuters_vocab())
    print('Model not loaded')

Model not loaded


In [3]:
# Corpus directory: the `data` folder that sits next to this notebook's parent.
folder = os.path.join(os.pardir, 'data')

In [4]:
# Load the corpus from `folder` with the project helper. The result is iterated
# to build the feature matrix and later passed to random.sample().
entries = load_entries(folder)

In [5]:
def build_features(entry, vocabulary=None):
    """Count how often each vocabulary word occurs in an entry.

    Parameters
    ----------
    entry
        Object whose ``doc(False, False, False)`` call returns an iterable of
        tokens, each exposing a ``lower_`` attribute.
        NOTE(review): the meaning of the three flags is defined by the project
        and is not visible here.
    vocabulary : sequence of str, optional
        Words to count, in output order. When omitted, the module-level
        ``vocab`` is looked up at call time, which is the original behaviour.
        Passing it explicitly avoids depending on that mutable global.

    Returns
    -------
    numpy.ndarray
        1-D array with one count per vocabulary word, in vocabulary order;
        words absent from the entry get 0.
    """
    if vocabulary is None:
        vocabulary = vocab

    doc = entry.doc(False, False, False)

    # Counter returns 0 for missing keys, so absent words need no special case.
    counter = Counter(word.lower_ for word in doc)

    return np.array([counter[word] for word in vocabulary])

In [6]:
# One word-count vector per entry, in the same order as `entries`.
X = list(map(build_features, entries))

In [7]:
# Keep only the documents that contain at least one vocabulary word,
# i.e. whose count vector is not all zeros.
X = np.array([
    doc_counts
    for doc_counts in X
    if doc_counts.sum() != 0
])

In [8]:
# Indexes of vocabulary words that never occur in any training document
# (the all-zero columns of X).
drop_vocab_indexs = set(
    idx
    for idx, counts in enumerate(X.T)
    if counts.sum() == 0
)

# Remove those words from vocab in place, so that build_features only
# counts the words that were kept.
vocab[:] = [
    word
    for idx, word in enumerate(vocab)
    if idx not in drop_vocab_indexs
]

In [9]:
# Rebuild X from the columns that were not flagged in drop_vocab_indexs,
# so its columns line up with the pruned vocab again.
X = np.transpose(np.array([
    counts
    for idx, counts in enumerate(np.transpose(X))
    if idx not in drop_vocab_indexs
]))
# The index set is only valid for the unpruned matrix; discard it.
del drop_vocab_indexs

In [10]:
# Create and fit the model.
# random_state pins the sampler's seed so that re-running the notebook
# reproduces the same topics; left unset, topic ids differ between runs and
# the saved model cannot be regenerated.
topic_model = LDA(n_topics=20, random_state=1)
topic_model.fit(X)

<lda.lda.LDA at 0x167c077f0>

In [11]:
# Draw up to 100 random entries to eyeball the fitted topics.
# sample() raises ValueError when asked for more items than exist, so the
# sample size is clamped to the corpus size.
# NOTE(review): these are drawn from the same `entries` the model was fitted
# on, so this is a sanity check rather than a held-out evaluation.
test_entries = sample(entries, min(100, len(entries)))
test_vectors = np.array([
    build_features(entry)
    for entry in test_entries
])

In [12]:
# Run the fitted model over the sampled feature vectors. Each row of test_Y is
# reduced with argmax() below to pick that entry's topic.
test_Y = topic_model.transform(test_vectors)



In [13]:
# One row per sampled entry: the argmax of its row in test_Y (used as the
# topic id) followed by the entry's identifying metadata.
df = pd.DataFrame(
    [
        [topic_weights.argmax(), entry.feedname, entry.index, entry.title]
        for entry, topic_weights in zip(test_entries, test_Y)
    ],
    columns=['topic', 'feedname', 'index', 'title']
)

In [14]:
# Show the sampled entries ordered by assigned topic. sort_values returns a
# sorted copy, so df itself keeps its original order.
df.sort_values(by='topic')

Unnamed: 0,topic,feedname,index,title
86,0,ChicagoTribune,2625,Serbia's PM condemns 'brutal treatment' of mig...
69,0,NYTimes,1956,ISIS Damages Temple of Baal in Palmyra
59,0,StLouisPost,13451,"Obama, Indonesian leader expected to discuss I..."
27,0,WallStreetJournal,647,"Stymied at Channel, Migrants With Money Turn t..."
6,0,USAToday,6968,Canadian PM Justin Trudeau says cabinet is hal...
14,1,ChicagoTribune,4496,Walgreens nears deal to buy Rite Aid: report
13,1,LATimes,15042,Six Flags Entertainment reports attendance jum...
32,1,StLouisPost,1053,The rogues gallery of accounting scandals thro...
20,1,NYTimes,287,Apple Waits as App Developers Study Who’s Buyi...
53,1,DailyNews,4608,Surgeons make the most money: survey


# Save model

In [15]:
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there).
os.makedirs('topic_model', exist_ok=True)
joblib.dump(topic_model, os.path.join('topic_model', 'topic_model.pkl'))

['model_data/topic_model.pkl',
 'model_data/topic_model.pkl_01.npy',
 'model_data/topic_model.pkl_02.npy',
 'model_data/topic_model.pkl_03.npy',
 'model_data/topic_model.pkl_04.npy',
 'model_data/topic_model.pkl_05.npy',
 'model_data/topic_model.pkl_06.npy',
 'model_data/topic_model.pkl_07.npy']

In [16]:
# Persist the pruned vocabulary next to the model, one word per line with no
# trailing newline, so that reading it back with f.read().split('\n')
# reproduces the same list.
with open(os.path.join('topic_model', 'vocab.txt'), 'w') as f:
    f.write('\n'.join(vocab))