# Commands

In [1]:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [2]:
# True when no `__file__` name is visible in the current scope. The rest of the
# file uses this flag as the "running as a notebook, not as a script" signal.
isNotebook = '__file__' not in locals()

In [3]:
# Test mode simply follows the notebook flag; the config cell reads TEST to
# pick reduced settings (fewer documents, users and iterations).
TEST = isNotebook

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *

In [5]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
import gensim

In [6]:
# Pick the log file first, then build a single Logger from it: under
# tmpDir('logs') when running as a notebook, the bare "lda.log" otherwise.
logPath = tmpDir('logs') + "/lda.log" if isNotebook else "lda.log"
logger = Logger(logPath)
# Start the timer that reports through the same logger.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [7]:
# Run-wide settings. Several values are reduced when TEST is set.
config = {
    # Cap applied to the urls used to fit the model; also forwarded to
    # getEvalData as maxExtraNews.
    # NOTE(review): the remark "None = unlimited, 0 = no extra news" used to sit
    # on 'useExtraNews' but reads like it describes this value -- confirm.
    'maxDocuments': 500 if TEST else None,
    # When true, extra news come first in the url list used to fit the model.
    'useExtraNews': bool(TEST),
    'maxUsers': 20 if TEST else None, # Sub-sampling
    # Number of topics.
    'n_components': 100,
    'lowercase': True,
    # 'stop_words': 'english', # 'english' or None
    # Training iterations (sklearn max_iter / gensim passes).
    'max_iter': 2 if TEST else 30,
    # Document-frequency bounds used to filter the vocabulary.
    'min_df': 10,
    'max_df': 0.8,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': True,
    # True: scikit-learn implementation, False: gensim implementation.
    'useSklearn': True,
    'doLemmatization': True,
}

# Getting data

In [8]:
# Getting users and news
# NOTE(review): the meaning of the first positional argument (1) is not visible
# from this file -- confirm against getEvalData.
evalData = getEvalData(
    1,
    maxExtraNews=config['maxDocuments'],
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Pull the individual collections out of the returned mapping in one go.
evalDataKeys = ('trainUsers', 'testUsers', 'trainNews', 'testNews', 'candidates', 'extraNews')
trainUsers, testUsers, trainNews, testNews, candidates, extraNews = \
    (evalData[key] for key in evalDataKeys)
# Log the available keys and the dataset statistics.
bp(evalData.keys(), 5, logger)
log(b(evalData['stats']), logger)

--> tic: 6.36s | message: Eval data loaded
Unable to create index url in twinews news
twinews news (version 1.0) initialised.
--> tic: 8.36s | message: Extra news downloaded
--> toc total duration: 14.73s | message: Got Twinews evaluation data
{ candidates, created, extraNews, ranksLength, stats, testNews, testUsers, trainNews, trainUsers }
{ testMaxNewsPerUser: 164, testMeanNewsPerUser: 10.67, testMinNewsPerUser: 2, testNewsCount: 138785, totalNewsAvailable: 570210, trainMaxNewsPerUser: 443, trainMeanNewsPerUser: 28.0, trainMinNewsPerUser: 8, trainNewsCount: 323572, usersCount: 21239 }


In [9]:
# Materialise each news collection as a list so that they can be concatenated
# and indexed by position in the following cells.
extraNewsList = [*extraNews]
trainNewsList = [*trainNews]
testNewsList = [*testNews]

In [10]:
# We get urls for the LDA model:
# the flag only decides whether extra news come first or last before the cut.
if config['useExtraNews']:
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList
# Slicing with None keeps everything, so maxDocuments=None means "no cap".
urlsForModel = urlsForModel[:config['maxDocuments']]
# We get urls to vectorize for the training and the inference:
# start from the model urls, then append every train/test url that is missing.
# A set tracks what is already present so each membership test is O(1) instead
# of a linear scan over the growing list.
urlsToVectorize = list(urlsForModel)
knownUrls = set(urlsToVectorize)
for url in trainNewsList + testNewsList:
    if url not in knownUrls:
        knownUrls.add(url)
        urlsToVectorize.append(url)
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)

500 urls for urlsForModel:
[
  http://www.huffingtonpost.com/entry/little-havana-sits-in-darkness_us_59b9c341e4b086432b0433ff,
  http://knowledge.wharton.upenn.edu/article/jeremy-siegel-whats-ahead-u-s-economy-2018/,
  ...,
  http://www.huffingtonpost.com/entry/white-house-sidewalk-closed_us_58f884a4e4b0cb086d7e3c43,
  https://goo.gl/CUvP3R
]
19854 urls for urlsToVectorize:
[
  http://www.huffingtonpost.com/entry/little-havana-sits-in-darkness_us_59b9c341e4b086432b0433ff,
  http://knowledge.wharton.upenn.edu/article/jeremy-siegel-whats-ahead-u-s-economy-2018/,
  ...,
  https://biochem.wisc.edu/news/2017/news-biochemistry-youth-apprenticeship-2017-07-27,
  http://www.islandpacket.com/news/local/community/beaufort-news/article196262139.html
]


In [11]:
# We get sentences:
# getNewsSentences receives every url of urlsToVectorize. Judging by the printed
# output, the result holds one entry per url, each being a list of tokenized
# sentences -- presumably in the same order as the urls (a later cell asserts
# that the row count matches len(urlsToVectorize)).
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

Unable to create index url in twinews news
twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (51.671s left)
 19% [===                 ] (43.931s left)
 29% [=====               ] (38.791s left)
[
  [ [ MIAMI, Hurricane, ..., day, . ], [ By, Tuesday, ..., return, . ], ..., [ But, they, ..., anxious, . ], [ You, can, ..., eyes, . ] ],
  [ [ It, was, ..., hour, . ], [ According, to, ..., in, __int_4__ ], ..., [ I, think, ..., center, . ], [ Those, will, ..., investors, . ] ],
  ...,
  [ [ Jenna, Amro, ..., Madison, . ], [ She, can, ..., school, . ], ..., [ ", Having, ..., says, . ], [ ", So, ..., Wisconsin, . ] ],
  [ [ Order, Reprint, ..., Capt, . ], [ Bob, Bromage, ..., night, . ], ..., [ Our, journalism, ..., produce, . ], [ If, you, ..., today, . ] ]
]


In [12]:
# We flatten sentences:
# every document is replaced, in place, by the result of flattenLists on its
# list of sentences (a single flat token list, per the printed output).
for docIndex, docSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(docSentences)
# `docs` is the same list object as `sentences`, not a copy.
docs = sentences
bp(docs, logger)

[ [ MIAMI, Hurricane, ..., eyes, . ], [ It, was, ..., investors, . ], ..., [ Jenna, Amro, ..., Wisconsin, . ], [ Order, Reprint, ..., today, . ] ]


In [13]:
# Lower case:
# tokens are overwritten in place inside each document when the option is set.
if config['lowercase']:
    for tokens in docs:
        for position, token in enumerate(tokens):
            tokens[position] = token.lower()
bp(docs, logger)

[ [ miami, hurricane, ..., eyes, . ], [ it, was, ..., investors, . ], ..., [ jenna, amro, ..., wisconsin, . ], [ order, reprint, ..., today, . ] ]


In [14]:
# Lemmatization:
# tokens are overwritten in place by their WordNet lemma when the option is set.
# lemmatize() is called without a part-of-speech argument; the printed output
# shows e.g. "was" becoming "wa".
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    for tokens in docs:
        for position, token in enumerate(tokens):
            tokens[position] = lemmatizer.lemmatize(token)
bp(docs, logger)

[ [ miami, hurricane, ..., eye, . ], [ it, wa, ..., investor, . ], ..., [ jenna, amro, ..., wisconsin, . ], [ order, reprint, ..., today, . ] ]


# Inferring topic vectors

In [15]:
# Fit a topic model on the urlsForModel documents, then infer one topic vector
# per train/test news. Results end up in `inferedVectors`, one row per url of
# trainNewsList + testNewsList.
#
# Rows of the vectorized corpus follow the order of urlsToVectorize. The slicing
# below assumes the train and test news form one contiguous run in that list,
# starting at the first train url; locate where that run starts.
inferenceCount = len(trainNews) + len(testNews)
inferenceStart = 0
for url in urlsToVectorize:
    if url == trainNewsList[0]:
        break
    inferenceStart += 1
# The run must start right after the extra news, or at the very beginning.
assert inferenceStart == len(extraNews) or inferenceStart == 0

if config['useSklearn']:
    # Documents are already token lists, so both the tokenizer and the
    # preprocessor are identity functions.
    vectorizerParams = dict(
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        # lowercase=True, # Doesn't work because we erased preprocessor
        min_df=config['min_df'],
        max_df=config['max_df'],
    )
    if config['useTFIDF']:
        vectorizer = TfidfVectorizer(sublinear_tf=True, **vectorizerParams)
    else:
        vectorizer = CountVectorizer(**vectorizerParams)
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    # The model is fitted on the leading rows only (the urlsForModel documents).
    vectorsForModel = vectors[:len(urlsForModel)]
    # Compare with the real number of model urls: config['maxDocuments'] is None
    # outside TEST mode, which made the previous equality check always fail.
    assert vectorsForModel.shape[0] == len(urlsForModel)
    vectorsForInference = vectors[inferenceStart:inferenceStart + inferenceCount]
    assert vectorsForInference.shape[0] == inferenceCount
    learning_method = 'online'
    learning_offset = 1.0
    lda = LatentDirichletAllocation(
        n_components=config['n_components'],
        max_iter=config['max_iter'],
        learning_method=learning_method,
        learning_offset=learning_offset,
        random_state=0,  # fixed seed for reproducible topics
        n_jobs=cpuCount(),
    )
    lda.fit(vectorsForModel)
    inferedVectors = lda.transform(vectorsForInference)
    assert inferedVectors.shape[0] == inferenceCount
    assert inferedVectors[0].shape[0] == config['n_components']
else:
    dictionary = gensim.corpora.Dictionary(docs)
    # Apply both document-frequency bounds, as the sklearn branch does;
    # config['max_df'] used to be ignored here (it must be a fraction).
    dictionary.filter_extremes(no_below=config['min_df'], no_above=config['max_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    # The model is fitted on the leading documents only (the urlsForModel ones).
    bowForModel = bow[:len(urlsForModel)]
    # Same fix as in the sklearn branch: config['maxDocuments'] may be None.
    assert len(bowForModel) == len(urlsForModel)
    bowForInference = bow[inferenceStart:inferenceStart + inferenceCount]
    assert len(bowForInference) == inferenceCount
    lda_model = gensim.models.LdaMulticore(
        bowForModel,
        num_topics=config['n_components'],
        id2word=dictionary,
        passes=config['max_iter'],
        workers=cpuCount(),
    )
    inferedVectors = []
    for current in bowForInference:
        # The model returns (topic id, weight) pairs; topics that are not
        # returned keep a weight of 0.0 in the dense vector.
        topicProbDistrib = lda_model[current]
        currentVector = [0.0] * config['n_components']
        for t, v in topicProbDistrib:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == inferenceCount
    assert len(inferedVectors[0]) == config['n_components']

# Making a dict url --> topic vector

In [16]:
# Urls of the documents that received a topic vector: train news first, then
# test news. The following cells pair them with inferedVectors by position.
urlsForInf = trainNewsList + testNewsList

In [17]:
# Sanity check: exactly one inferred vector per url before pairing them up.
assert len(urlsForInf) == len(inferedVectors)

In [18]:
# Mapping url --> topic vector, filled by the next cell.
urlsVectors = {}

In [19]:
# Pair each url with the inferred vector found at the same position.
for position, url in enumerate(urlsForInf):
    urlsVectors[url] = inferedVectors[position]

In [20]:
# Print the url --> topic vector mapping through the logger.
bp(urlsVectors, logger)

{
  http://a.msn.com/00/en-ca/AAuP4NT?ocid=st: [6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 6.06557809e-04 6.06557809e-04
   6.06557809e-04 6.06557809e-04 9.32710769e-01 6.06557809e-04
   6.06557