# Commands

In [1]:
# cd twinews-logs ; jupython -o nohup-topicmodels-$HOSTNAME.out --venv st-venv ~/notebooks/twinews/hjmodels/topicmodels.ipynb

In [2]:
# Sources:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [3]:
# Hide every CUDA device from this process; this runs before the other imports of the notebook
# (presumably to force CPU-only execution — TODO confirm).
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
# True when `__file__` is not defined in the current namespace, which is taken here
# as the sign of an interactive notebook session (as opposed to a script run).
isNotebook = '__file__' not in locals()

In [5]:
# Notebook sessions use the reduced TEST values of the config cell.
TEST = isNotebook

In [6]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
from twinews.ranking import *

In [7]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [8]:
# Notebook sessions log under the temporary "logs" directory; script runs log to a per-host file.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/lda.log")
else:
    logger = Logger("lda-" + getHostname() + ".log")
# Timer shared by all the tic / toc checkpoints of the notebook.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
# Experiment configuration. After the family-specific filtering done in the next cell, this dict
# is passed to rankingExists / addRanking and hashed to name the serialized model, so changing
# any value identifies a different run.
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 2 if TEST else None, # Sub-sampling
    # Number of urls kept to fit the topic model; also passed as maxExtraNews to getEvalData:
    'maxDocuments': 30 if TEST else 10000,
    # True = extra news come first in the model corpus; False = train / test news come first and
    # extra news only fill the slots left below maxDocuments:
    'useExtraNews': False if TEST else False,
    'minDF': 1 / 500 if TEST else 1 / 2000, # Remove words whose document frequency ratio is lower than this value
    'maxDF': 300, # Remove top 300 voc elements
    
    'nbTopics': 30 if TEST else 30, # 30, 100
    'lowercase': False if TEST else True,
    'doLemmatization': False if TEST else True,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': True,
    
    'maxIter': 2 if TEST else 30, # 30 for lda, 200 for nmf
    
    'nmfInit': 'nndsvd', # None, 'nndsvd'
    'nmfL1Ratio': 0, # 0.0, 0.5, 1.0
    'nmfAlpha': 0.1, # 0.0, 0.1
    
    'ldaLearningMethod': 'online', # Only read by the sklearn-lda implementation
    'ldaLearningOffset': 1.0, # 1.0, 10.0
    'ldaLearningDecay': 0.7, # 0.5, 0.7, 0.9, 1.0
    
    'implementation': 'gensim-lda', # gensim-lda, sklearn-lda, sklearn-nmf
    'distance': 'cosine', # 'cosine', 'euclidean', 'kl', 'js'
    # The historyRef param is very important: for a given candidate, it chooses
    # how many train history items are used to compute the similarity between
    # the candidate and the user history.
    # Floats are ratios of the train history.
    # Integers are absolute numbers of train items in the history.
    # For example:
    #  * 1.0 averages the similarities of a candidate with all train history items
    #  * 1 uses only the most similar train item for the similarity of
    #    the candidate with the history of the user
    #  * 0.5 uses half of the history for each candidate
    #  * 3 uses the 3 train items most similar to the current candidate...
    'historyRef': 0.3, # 1, 1.0, 0.5, 0.3, 3, 10
}

In [10]:
# Derive the model family from the chosen implementation and drop the hyper-parameters that
# belong to the other family, so that they do not take part in the config of the run.
if 'nmf' in config['implementation']:
    modelName = "nmf"
    # 'useTFIDF' is dropped as well: the sklearn-nmf implementation always uses the TF-IDF
    # vectorizer. Filtering (instead of `del`) keeps this cell safe to re-run.
    config = {
        key: value for key, value in config.items()
        if not key.startswith('lda') and key != 'useTFIDF'
    }
elif 'lda' in config['implementation']:
    modelName = "lda"
    config = {key: value for key, value in config.items() if not key.startswith('nmf')}
else:
    # Fail right here instead of later with an undefined `modelName`.
    raise ValueError("Unknown implementation: " + str(config['implementation']))

In [11]:
# Script runs stop here when rankingExists reports an entry for this model name and config;
# notebook sessions skip the check entirely.
assert isNotebook or not rankingExists(modelName, config, logger=logger)

# Getting data

In [12]:
# Getting users and news, then binding each part of the evaluation data to its own variable:
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=config['maxDocuments'],
    maxUsers=config['maxUsers'],
    logger=logger,
)
trainUsers, testUsers, trainNews, testNews, candidates, extraNews = (
    evalData[part]
    for part in ('trainUsers', 'testUsers', 'trainNews', 'testNews', 'candidates', 'extraNews')
)
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 3.74s | message: Eval data loaded
twinews news (version 1.0) initialised.
--> tic: 8.22s | message: Extra news downloaded
--> toc total duration: 11.97s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [13]:
# Shuffled list version of each news collection, always with the fixed seed 0.
# NOTE(review): if these collections are sets, the order given by list() may differ between
# processes before the shuffle — confirm their type.
extraNewsList, trainNewsList, testNewsList = (
    shuffle(list(newsCollection), seed=0)
    for newsCollection in (extraNews, trainNews, testNews)
)

In [14]:
# We get urls for the topic model (the concatenation order decides what survives the truncation):
if config['useExtraNews']:
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList
urlsForModel = urlsForModel[:config['maxDocuments']]
# We get urls to vectorize for the training and the inference: the model urls first,
# followed by every train / test url that is not already among them.
urlsForModelSet = set(urlsForModel)
# The printed lists show plain url strings, so a shallow copy is enough; this also avoids
# depending on `copy`, which is only in scope through the star imports.
urlsToVectorize = list(urlsForModel)
urlsToVectorize.extend(
    url for url in trainNewsList + testNewsList if url not in urlsForModelSet
)
# We get urls to infer for the scoring:
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

30 urls for urlsForModel:
[
  https://buff.ly/2CVPqCO,
  https://www.theguardian.com/football/2017/dec/21/paulinho-barcelona-clasico-tottenham-china,
  ...,
  https://lnkd.in/d8xkM9M,
  http://ow.ly/BztX30hsqIZ
]
2017 urls for urlsToVectorize:
[
  https://buff.ly/2CVPqCO,
  https://www.theguardian.com/football/2017/dec/21/paulinho-barcelona-clasico-tottenham-china,
  ...,
  https://fb.me/2zj0SDL7r,
  http://on.wsj.com/2DrnLdR
]
2017 urls for urlsToInfere:
[
  https://buff.ly/2CVPqCO,
  https://www.theguardian.com/football/2017/dec/21/paulinho-barcelona-clasico-tottenham-china,
  ...,
  https://fb.me/2zj0SDL7r,
  http://on.wsj.com/2DrnLdR
]


In [15]:
# We get sentences: one entry per url of urlsToVectorize, each entry being a list of
# tokenized sentences (see the printed structure). The later slicing of the vectors
# assumes the entries are in the same order as urlsToVectorize — TODO confirm.
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (5.149s left)
 19% [===                 ] (4.62s left)
 29% [=====               ] (3.986s left)
[
  [ [ Leaders, of, ..., location, . ], [ But, if, ..., host, ? ], ..., [ Gates, said, ..., Denver, . ], [ ", If, ..., said, . ] ],
  [ [ The, Brazil, ..., more, . ], [ He, was, ..., play, . ], ..., [ That, football, . ], [ It, was, ..., am, . ] ],
  ...,
  [ [ BRUNERSBURG, The, ..., attendance, . ], [ ", I, ..., Brunersburg, . ], ..., [ ", Thousands, ..., said, . ], [ Another, stated, ..., locally, . ] ],
  [ [ District, attorneys, ..., unanimity, . ], [ Above, ,, ..., skyline, . ], ..., [ Mr., Marquis, ..., judge, . ], [ Write, to, ..., at, __email__ ] ]
]


In [16]:
# We flatten sentences: every document becomes a single flat token list (done in place).
for docIndex, documentSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(documentSentences)
# `docs` is the same list object as `sentences`, now holding flat documents.
docs = sentences
bp(docs, logger)

[ [ Leaders, of, ..., said, . ], [ The, Brazil, ..., am, . ], ..., [ BRUNERSBURG, The, ..., locally, . ], [ District, attorneys, ..., at, __email__ ] ]


In [17]:
# Lower case (in place, only when enabled in the config):
if config['lowercase']:
    for docIndex in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        tokens = docs[docIndex]
        for tokenIndex, token in enumerate(tokens):
            tokens[tokenIndex] = token.lower()
bp(docs, logger)

[ [ Leaders, of, ..., said, . ], [ The, Brazil, ..., am, . ], ..., [ BRUNERSBURG, The, ..., locally, . ], [ District, attorneys, ..., at, __email__ ] ]


In [18]:
# Lemmatization (in place, only when enabled in the config):
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        for tokenIndex, token in enumerate(tokens):
            tokens[tokenIndex] = lemmatizer.lemmatize(token)
        # One progress step per document.
        pbar.tic()
bp(docs, logger)

[ [ Leaders, of, ..., said, . ], [ The, Brazil, ..., am, . ], ..., [ BRUNERSBURG, The, ..., locally, . ], [ District, attorneys, ..., at, __email__ ] ]


In [19]:
# Filtering the corpus (vocabulary pruning driven by the minDF / maxDF settings):
docs = filterCorpus(
    docs,
    minDF=config['minDF'],
    maxDF=config['maxDF'],
    removeEmptyDocs=False,
    allowEmptyDocs=False,
    logger=logger,
)
# No document may end up empty after the filtering.
assert all(len(doc) > 0 for doc in docs)
bp(docs, logger)

Voc removed because of minDF (44267 elements):
{ 1000th, 100s, 100th, 100x, 103rd, 104th, 108th, 10Weather, 10X, 10x, ..., zooms, zoonotic, zoos, 💗, 😁, 😄, 😇, 😒, 😢, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., with, without, work, working, world, would, year, years, you, your }
73.9% of voc will be removed.
[ [ Leaders, Olympic, ..., side, Gates ], [ Brazil, midfielder, ..., chance, am ], ..., [ County, Society, ..., gas, locally ], [ District, attorneys, ..., Jacob, __email__ ] ]


In [20]:
# Timer checkpoint: logs the time elapsed since the previous tic together with the message.
tt.tic("Data preprocessed")

--> tic: 18.5s | message: Data preprocessed


18.5

# Inferring topic vectors

In [21]:
if config['implementation'] == 'gensim-lda':
    # Vocabulary and bag-of-words representation of every document to vectorize:
    dictionary = gensim.corpora.Dictionary(docs)
    # dictionary.filter_extremes(no_below=config['min_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    # The model is fitted on the first len(urlsForModel) documents only:
    bowForModel = bow[:len(urlsForModel)]
    assert len(bowForModel) == config['maxDocuments']
    # Position of the first train url in urlsToVectorize: the train + test documents used
    # for the inference form one contiguous slice starting there.
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    bowForInference = bow[i:i + len(trainNews) + len(testNews)]
    assert len(bowForInference) == len(trainNews) + len(testNews)
    lda_model = gensim.models.LdaMulticore\
    (
        bowForModel,
        num_topics=config['nbTopics'],
        id2word=dictionary,
        iterations=config['maxIter'],
        decay=config['ldaLearningDecay'],
        offset=config['ldaLearningOffset'],
        workers=cpuCount(),
        passes=3,
        # Fixed seed, like the sklearn models of this notebook (random_state=0), so that
        # two runs with the same config are comparable.
        random_state=0,
    )
    # Dense topic vector of every train / test document (topics that are not returned stay at 0.0).
    # NOTE(review): `lda_model[doc]` presumably omits topics below the model's minimum
    # probability, which matters for the 'kl' / 'js' distances — confirm.
    inferedVectors = []
    for current in bowForInference:
        topicProbDistrib = lda_model[current]
        currentVector = [0.0] * config['nbTopics']
        for t, v in topicProbDistrib:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == len(trainNews) + len(testNews)
    assert len(inferedVectors[0]) == config['nbTopics']
    # One dict word -> weight per topic, limited to the 100 terms returned by get_topic_terms:
    topics = []
    for topicIndex in range(lda_model.num_topics):
        current = dict()
        for termId, weight in lda_model.get_topic_terms(topicIndex, topn=100):
            current[dictionary[termId]] = weight
        topics.append(current)

In [22]:
if config['implementation'] in ('sklearn-lda', 'sklearn-nmf'):
    # Documents are already token lists, hence the identity tokenizer / preprocessor.
    # NMF always gets TF-IDF features; LDA gets them only when 'useTFIDF' is set.
    if config['implementation'] == 'sklearn-nmf' or config['useTFIDF']:
        vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    else:
        vectorizer = CountVectorizer(
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    # The model is fitted on the first len(urlsForModel) rows only:
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    # Row of the first train url (len(urlsToVectorize) when it is absent); the rows used
    # for the inference are the contiguous train + test slice starting there.
    inferenceStart = next(
        (position for position, url in enumerate(urlsToVectorize) if url == trainNewsList[0]),
        len(urlsToVectorize),
    )
    assert inferenceStart in (len(extraNews), 0)
    inferenceStop = inferenceStart + len(trainNews) + len(testNews)
    vectorsForInference = vectors[inferenceStart:inferenceStop]
    assert vectorsForInference.shape[0] == len(trainNews) + len(testNews)
    if config['implementation'] == 'sklearn-nmf':
        model = NMF(
            n_components=config['nbTopics'],
            random_state=0,
            alpha=config['nmfAlpha'],
            l1_ratio=config['nmfL1Ratio'],
            init=config['nmfInit'],
            max_iter=config['maxIter'],
        )
    else:
        model = LatentDirichletAllocation(
            n_components=config['nbTopics'],
            learning_method=config['ldaLearningMethod'],
            learning_offset=config['ldaLearningOffset'],
            learning_decay=config['ldaLearningDecay'],
            random_state=0,
            n_jobs=cpuCount(),
            max_iter=config['maxIter'],
        )
    model.fit(vectorsForModel)
    inferedVectors = model.transform(vectorsForInference)
    assert inferedVectors.shape[0] == len(trainNews) + len(testNews)
    assert inferedVectors[0].shape[0] == config['nbTopics']
    # One dict word -> weight per topic, limited to the 100 heaviest words:
    feature_names = vectorizer.get_feature_names()
    topics = []
    for topicWeights in model.components_:
        rankedWords = sortBy(list(zip(feature_names, topicWeights)), desc=True, index=1)[:100]
        topics.append(dict(rankedWords))

In [23]:
# Timer checkpoint: logs the time elapsed since the previous tic together with the message.
tt.tic("Model fitted and topic vectors infered")

--> tic: 9.37s | message: Model fitted and topic vectors infered


9.37

# Showing topics

In [24]:
def printTopics(topics, maxWords=10, logger=None):
    """
        Log one line per topic: its position in `topics`, then its first `maxWords` words.

        topics: list of dicts whose keys are words; the words are taken in dict order
        maxWords: maximum number of words displayed for each topic
        logger: forwarded to `log`
    """
    for topicIndex, topic in enumerate(topics):
        # `maxWords` drives the cut (it used to be ignored in favour of a hard-coded 10).
        log(str(topicIndex) + ": " + " ".join(list(topic.keys())[:maxWords]), logger)

In [25]:
def printTopicsOf(vector, topics, logger=None):
    """
        Log the dominant topics of one document.

        vector: sequence of topic scores, indexed by topic number
        topics: list of dicts whose keys are words, indexed by topic number
        logger: forwarded to `log`

        Only topics scoring above 0.1 are kept, at most the 3 best ones. A first line lists
        their numbers, then one line per topic shows its first 10 words, prefixed by the
        real topic number (not by its rank), so the lines match the full topic listing.
    """
    scoredTopics = [(topicIndex, score) for topicIndex, score in enumerate(vector) if score > 0.1]
    topTopics = sortBy(scoredTopics, desc=True, index=1)[:3]
    log("Top topics number are: " + " ".join(str(topicIndex) for topicIndex, _ in topTopics), logger)
    for topicIndex, _ in topTopics:
        log(str(topicIndex) + ": " + " ".join(list(topics[topicIndex].keys())[:10]), logger)

In [26]:
# Show the leading words (ten per topic) of every topic of the fitted model.
printTopics(topics, logger=logger)

0: Denver Madrid Olympics Barcelona Cup FC Ham players league Colorado
1: Ham Colorado match West Cup Hickenlooper Barcelona Messi Denver city
2: FC Dallas Messi Hunt Olympics Denver Barcelona games Rio Soccer
3: Messi Denver Barcelona Colorado Range city drainage housing Cup ball
4: Denver Colorado Olympics games Winter Hickenlooper city Goldberg Olympic Games
5: Dallas FC Panthers Hunt Denver Olympics Cup Fame club Stadium
6: Barcelona referee federation soccer games referees Denver video Messi play
7: Denver Cup Colorado No FIFA paramedic city Hickenlooper World Jones
8: Cup players World games Amazon game Korea Barcelona league play
9: jersey flag Spain colors yellow Cup Republican blue 1930s red
10: Denver Beck Barcelona Madrid Colorado Olympics Ms. Winter Dr. Olympic
11: Ham Denver Amazon match Colorado Panthers West defeat supporters Cup
12: Denver games Amazon Cup Dallas housing league Olympics players Colorado
13: Barcelona Front Denver Ham Colorado Range club Cup play team
14

# Serialization

In [None]:
# Persist the fitted model and its config (script runs only). This is best effort:
# any failure is logged and does not stop the run.
if not isNotebook:
    try:
        dirPath = nosaveDir() + "/sklearn-models"
        mkdir(dirPath)
        configHash = objectToHash(config)[:5]
        # The gensim branch stores its model in `lda_model`; only the sklearn branch defines
        # `model`. Always using `model` made the default gensim run fail here with a NameError.
        # NOTE(review): assumes `serialize` can handle a gensim model — confirm.
        fittedModel = lda_model if config['implementation'] == 'gensim-lda' else model
        serialize(fittedModel, dirPath + "/model-" + configHash + ".pickle")
        toJsonFile(config, dirPath + "/config-" + configHash + ".json")
    except Exception as e:
        logException(e, logger=logger)

# Making a dict url --> topic vector and a dict url --> text

In [27]:
# Map every url to its topic vector; both sequences are index-aligned, hence the length check.
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = dict(zip(urlsToInfere, inferedVectors))

In [None]:
if isNotebook:
    # Map every url to its text (notebook sessions only; the prints below read it).
    allTexts = getNewsText(urlsToInfere, logger=logger)
    urlsTexts = {url: allTexts[position] for position, url in enumerate(urlsToInfere)}

# Prints

In [None]:
if isNotebook:
    # Pick one user to inspect: index 11 when available, otherwise the last user. Notebook
    # sessions may hold fewer than 12 users when config['maxUsers'] is small.
    userIds = list(trainUsers.keys())
    userId = userIds[min(11, len(userIds) - 1)]
    # Train history of the user (x):
    xurls = list(trainUsers[userId])
    xvectors = np.array([urlsVectors[url] for url in xurls])
    # First candidate set of the user (y):
    yurls = list(candidates[userId][0])
    yvectors = np.array([urlsVectors[url] for url in yurls])
    # Cosine distances between the two sets; indexed below as distances[trainIndex][candidateIndex].
    distances = getDistances(xvectors, yvectors, metric='cosine')

In [None]:
# Printing some docs with topics:
if isNotebook:
    for _ in range(10):
        # Pick the train or the candidate pool at random, then one url of that pool.
        url = random.choice(random.choice([xurls, yurls]))
        text, vector = urlsTexts[url], urlsVectors[url]
        log(url, logger)
        printTopicsOf(vector, topics, logger=logger)
        log(text, logger)
        log("\n" * 2, logger)

In [None]:
# Printing some similar docs:
if isNotebook:
    # At most the first 101 train items of the user are shown.
    for i in range(min(distances.shape[0], 101)):
        # Get train:
        trainUrl = xurls[i]
        trainText = urlsTexts[trainUrl]
        trainVector = urlsVectors[trainUrl]
        log(trainUrl, logger)
        printTopicsOf(trainVector, topics, logger=logger)
        log(trainText[:2000], logger)
        log("\n", logger)
        # Get distances between this train item and every candidate:
        currentDistances = [
            (candidateUrl, urlsTexts[candidateUrl], distances[i][u])
            for u, candidateUrl in enumerate(yurls)
        ]
        topSim = sortBy(currentDistances, index=2, desc=False)[:3]
        topDissim = sortBy(currentDistances, index=2, desc=True)[:3]
        # Print similars, then dissimilars, with the same layout:
        for title, neighbours in (("MOST SIMILARS", topSim), ("MOST DISSIMILARS", topDissim)):
            log(title, logger)
            log("\n", logger)
            for url, text, dist in neighbours:
                log(dist, logger)
                log(url, logger)
                printTopicsOf(urlsVectors[url], topics, logger=logger)
                log(text[:2000], logger)
                log("\n", logger)
        log("\n", logger)
        log("\n" * 2 + '-' * 20 + "\n" * 2, logger)

# Ranking

In [28]:
# Manual override, currently disabled by the leading `False`: remove it to force
# historyRef to 1.0 in notebook sessions.
if False and isNotebook:
    config['historyRef'] = 1.0

In [29]:
historyRef = config['historyRef']
# Accepted values: an absolute count (int >= 1) or a ratio of the train history (float in ]0, 1]).
isValidCount = isinstance(historyRef, int) and historyRef >= 1
isValidRatio = isinstance(historyRef, float) and 0.0 < historyRef <= 1.0
assert isValidCount or isValidRatio

In [35]:
# Compute the rankings of every user from the topic vectors, with the configured distance
# and the validated historyRef.
rankings = usersRankingsByHistoryDistance(
    trainUsers, candidates, historyRef, urlsVectors,
    distanceMetric=config['distance'], logger=logger,
)



In [39]:
# Preview of the rankings structure (user id -> list of ranked url lists, see the output).
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      https://www.thenation.com/article/persistent-precarity/,
      https://buff.ly/2Few0du,
      https://pamboyd.wordpress.com/2018/01/02/while-thinking-over-2017/,
      http://www.baltimoresun.com/news/maryland/baltimore-county/bs-md-co-pretrial-pilot-20171222-story.html,
      http://ow.ly/tTw730hHaYI,
      http://bit.ly/2CZ2vPw,
      http://www.aljazeera.com/news/2018/01/har-gobind-khorana-google-honours-today-180108134719847.html,
      https://www.nytimes.com/2018/01/11/technology/facebook-news-feed.html,
      https://atlanta.eater.com/2017/12/28/16826246/atlanta-food-editors-name-their-biggest-dining-grievances-of-2017,
      http://windsorstar.com/news/local-news/university-of-windsor-professor-and-team-get-millions-to-study-great-lakes/,
      ...,
      https://www.washingtonpost.com/local/can-you-prepay-your-real-estate-taxes-before-the-tax-bill-takes-effect-find-out-here/2017/12/26/5539b14c-ea7c-11e7-8a6a-80acf0774e64_story.html?utm_term=.

In [37]:
# NOTE(review): same call as the previous cell, and its execution count (37) is lower than
# the previous one (39), i.e. the cells were run out of order — one of the two can go.
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      https://www.thenation.com/article/persistent-precarity/,
      https://buff.ly/2Few0du,
      https://pamboyd.wordpress.com/2018/01/02/while-thinking-over-2017/,
      http://www.baltimoresun.com/news/maryland/baltimore-county/bs-md-co-pretrial-pilot-20171222-story.html,
      http://ow.ly/tTw730hHaYI,
      http://bit.ly/2CZ2vPw,
      http://www.aljazeera.com/news/2018/01/har-gobind-khorana-google-honours-today-180108134719847.html,
      https://www.nytimes.com/2018/01/11/technology/facebook-news-feed.html,
      https://atlanta.eater.com/2017/12/28/16826246/atlanta-food-editors-name-their-biggest-dining-grievances-of-2017,
      http://windsorstar.com/news/local-news/university-of-windsor-professor-and-team-get-millions-to-study-great-lakes/,
      ...,
      https://www.washingtonpost.com/local/can-you-prepay-your-real-estate-taxes-before-the-tax-bill-takes-effect-find-out-here/2017/12/26/5539b14c-ea7c-11e7-8a6a-80acf0774e64_story.html?utm_term=.

In [None]:
# Timer checkpoint for the ranking step.
tt.tic("Rankings done")

# Adding rankings to the db

In [None]:
# Add the rankings to the db, together with the model name and the config that produced them.
addRanking(modelName, rankings, config, logger=logger)

In [None]:
# Timer checkpoint for the storage step.
tt.tic("Rankings stored")

In [None]:
# Stop the timer; the returned value is used as the total duration in the notification below.
totalDuration = tt.toc()

In [None]:
# Final notification: model name, first 5 characters of the config hash and total duration.
notif(modelName + '-' + objectToHash(config)[:5] + " done in " + secondsToHumanReadableDuration(totalDuration))