# Commands

In [None]:
# cd misc-logs ; ./mv-old-logs.sh ; jupython --venv st-venv ~/notebooks/twinews/lda.ipynb

In [1]:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [20]:
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-based libraries
# (presumably to keep this notebook on CPU):
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# At top level locals() is the global namespace: a script run defines `__file__`
# there, an interactive notebook kernel normally does not.
isNotebook = not ('__file__' in locals())

In [3]:
# Test mode is switched on whenever we run as a notebook; the config cell reads
# TEST to pick smaller values (fewer documents, users and iterations):
TEST = isNotebook

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *

In [5]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity
import gensim

In [6]:
# Notebook runs log into the tmp "logs" directory, script runs log next to the script:
if isNotebook:
    logger = Logger(tmpDir('logs') + "/lda.log")
else:
    logger = Logger("lda.log")
# Global timer; tic() stays the last expression so the notebook still displays its return value:
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [7]:
# All the tunable settings of the run (TEST selects the smaller values):
config = {
    # Cap on the number of urls used to fit the model; also given to getEvalData as maxExtraNews:
    'maxDocuments': 500 if TEST else 10000,
    # True = the extra news come first when choosing the urls used to fit the model.
    # NOTE(review): the previous comment here read "None = unlimited, 0 = no extra news",
    # which describes a count rather than this boolean; confirm the intent.
    'useExtraNews': True,
    'maxUsers': 20 if TEST else None, # Sub-sampling
    'n_components': 100,
    'lowercase': True,
    # 'stop_words': 'english', # 'english' or None
    'max_iter': 10 if TEST else 30,
    'min_df': 1,
    'max_df': 1.0,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': False,
    'useSklearn': True,
    # Lemmatization is skipped in test mode:
    'doLemmatization': not TEST,
}

# Getting data

In [8]:
# Getting users and news (the first positional argument, 1, is passed as-is;
# presumably a dataset / split version, TODO confirm against getEvalData):
evalData = getEvalData\
(
    1,
    maxExtraNews=config['maxDocuments'],
    maxUsers=config['maxUsers'],
    logger=logger,
)
# One variable per part of the evaluation data:
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
bp(evalData.keys(), 5, logger)
log(b(evalData['stats']), logger)

--> tic: 6.29s | message: Eval data loaded
Unable to create index url in twinews news
twinews news (version 1.0) initialised.
--> tic: 9.93s | message: Extra news downloaded
--> toc total duration: 16.22s | message: Got Twinews evaluation data
{ candidates, created, extraNews, ranksLength, stats, testNews, testUsers, trainNews, trainUsers }
{ testMaxNewsPerUser: 164, testMeanNewsPerUser: 10.67, testMinNewsPerUser: 2, testNewsCount: 138785, totalNewsAvailable: 570210, trainMaxNewsPerUser: 443, trainMeanNewsPerUser: 28.0, trainMinNewsPerUser: 8, trainNewsCount: 323572, usersCount: 21239 }


In [9]:
# Each url collection becomes a list shuffled with the same fixed seed
# (extra, then train, then test, as before):
extraNewsList, trainNewsList, testNewsList = \
(
    shuffle(list(newsUrls), seed=0)
    for newsUrls in (extraNews, trainNews, testNews)
)

In [10]:
# We get urls for the LDA model (extra news first when enabled), capped to maxDocuments:
if config['useExtraNews']:
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList
urlsForModel = urlsForModel[:config['maxDocuments']]
# We get urls to vectorize for the training and the inference: the model urls
# followed by every train / test url that is not already in the list.
# A set tracks what was already added, so the membership test no longer rescans
# the whole list for each url (which was quadratic in the number of urls):
urlsToVectorize = list(urlsForModel)
alreadyAdded = set(urlsToVectorize)
for url in trainNewsList + testNewsList:
    if url not in alreadyAdded:
        alreadyAdded.add(url)
        urlsToVectorize.append(url)
# We get urls to infer for the scoring:
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

500 urls for urlsForModel:
[
  http://bit.ly/2hRc6ws,
  http://www.whec.com/news/rhinos-2018-season-jeopardy/4671231/,
  ...,
  http://www.cbc.ca/news/canada/nova-scotia/ocean-technology-the-future-of-atlantic-canada-s-economy-1,
  https://patch.com/connecticut/fairfield/residents-urged-get-flu-shot-fairfield
]
19854 urls for urlsToVectorize:
[
  http://bit.ly/2hRc6ws,
  http://www.whec.com/news/rhinos-2018-season-jeopardy/4671231/,
  ...,
  http://www.spokesman.com/stories/2018/jan/30/idaho-agency-oks-funding-for-priest-lake-improveme/,
  https://www.nbcsandiego.com/news/local/Ramona-Snowboarder-Is-Olympic-Bound-471325214.html
]


In [11]:
# We get sentences: per the printed sample, each entry is a list of sentences and
# each sentence is a list of tokens. The slicing done later on the vectorized
# corpus assumes the entries are aligned with urlsToVectorize.
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

Unable to create index url in twinews news
twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (53.291s left)
 19% [===                 ] (45.291s left)
 29% [=====               ] (39.818s left)
[
  [ [ A, drip, ..., Folsom, . ], [ Today, __int_2__, ..., drought, . ], ..., [ Contact, him, at, __email__ ], [ Joe, Grindstaff, ..., Agency, . ] ],
  [ [ ", There, ..., statement, . ], [ ", We, ..., Rhinos, . ], ..., [ Capelli, Stadium, ..., County, . ], [ Monroe, County, ..., parks, . ] ],
  ...,
  [ [ The, Idaho, ..., lakes, . ], [ The, upgrades, ..., River, . ], ..., [ The, water, ..., funds, . ], [ Butch, Otter, ..., year, . ] ],
  [ [ NBC, __int_1__, ..., Russia, . ], [ Seamus, bleeds, ..., green, . ], ..., [ Then, on, ..., qualify, . ], [ estimates, his, ..., return, . ] ]
]


In [12]:
# We flatten sentences: each document goes from a list of sentences to a single
# list of tokens. This is done in place, so `docs` and `sentences` are the same object.
for docIndex, docSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(docSentences)
docs = sentences
bp(docs, logger)

[ [ A, drip, ..., Agency, . ], [ ", There, ..., parks, . ], ..., [ The, Idaho, ..., year, . ], [ NBC, __int_1__, ..., return, . ] ]


In [13]:
# Lower case (in place, token by token, inside each document):
if config['lowercase']:
    for tokens in docs:
        tokens[:] = [token.lower() for token in tokens]
bp(docs, logger)

[ [ a, drip, ..., agency, . ], [ ", there, ..., parks, . ], ..., [ the, idaho, ..., year, . ], [ nbc, __int_1__, ..., return, . ] ]


In [22]:
# Lemmatization (in place). The lemma of a token only depends on the token itself,
# so each distinct token is lemmatized once and then served from a cache instead
# of going through WordNet again for every occurrence:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    lemmaCache = dict()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        for u, token in enumerate(tokens):
            if token not in lemmaCache:
                lemmaCache[token] = lemmatizer.lemmatize(token)
            tokens[u] = lemmaCache[token]
        # One progress step per document:
        pbar.tic()
bp(docs, logger)

  0% [                    ]
  9% [=                   ] (2m 24.302s left)
 19% [===                 ] (1m 46.946s left)
 29% [=====               ] (1m 27.618s left)
[ [ a, drip, ..., agency, . ], [ ", there, ..., park, . ], ..., [ the, idaho, ..., year, . ], [ nbc, __int_1__, ..., return, . ] ]


# Inferring topic vectors

In [15]:
# Fits an LDA model (scikit-learn or gensim, depending on config['useSklearn']) on the
# first len(urlsForModel) documents, then infers one topic vector per train / test url.
# Produces `inferedVectors` (aligned with trainNewsList + testNewsList) and `topics`
# (one dict word --> weight per topic, top 100 words).

# Position of the train / test documents inside urlsToVectorize, hence inside the
# vectorized corpus. The code assumes they form one contiguous run starting at the
# first train url, located either right after the extra news or at the very beginning:
inferenceCount = len(trainNews) + len(testNews)
inferenceStart = 0
for url in urlsToVectorize:
    if url == trainNewsList[0]:
        break
    inferenceStart += 1
assert inferenceStart == len(extraNews) or inferenceStart == 0
if config['useSklearn']:
    # Documents are already token lists, so tokenizer and preprocessor are identities:
    if config['useTFIDF']:
        vectorizer = TfidfVectorizer\
        (
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
            min_df=config['min_df'],
            max_df=config['max_df'],
        )
    else:
        vectorizer = CountVectorizer\
        (
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
            min_df=config['min_df'],
            max_df=config['max_df'],
        )
    # The vocabulary is built on every document (model + inference ones):
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    vectorsForInference = vectors[inferenceStart:inferenceStart + inferenceCount]
    assert vectorsForInference.shape[0] == inferenceCount
    learning_method = 'online'
    learning_offset = 1.0
    lda = LatentDirichletAllocation\
    (
        n_components=config['n_components'],
        max_iter=config['max_iter'],
        learning_method=learning_method,
        learning_offset=learning_offset,
        random_state=0,
        n_jobs=cpuCount(),
    )
    lda.fit(vectorsForModel)
    inferedVectors = lda.transform(vectorsForInference)
    assert inferedVectors.shape[0] == inferenceCount
    assert inferedVectors[0].shape[0] == config['n_components']
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
    # favour of get_feature_names_out(); we keep the old call for older versions:
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Top 100 words of each topic. NOTE(review): lda.components_ holds unnormalized
    # topic-word weights (pseudo-counts per the scikit-learn docs), not probabilities
    # as in the gensim branch; confirm this is fine for the consumers of `topics`.
    topics = []
    for topicWeights in lda.components_:
        wordWeights = list(zip(feature_names, topicWeights))
        wordWeights = sortBy(wordWeights, desc=True, index=1)[:100]
        topics.append({word: weight for word, weight in wordWeights})
else:
    dictionary = gensim.corpora.Dictionary(docs)
    # NOTE(review): config['max_df'] is not applied here, filter_extremes keeps its
    # own defaults for no_above / keep_n; confirm this is intended.
    dictionary.filter_extremes(no_below=config['min_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    bowForModel = bow[:len(urlsForModel)]
    assert len(bowForModel) == config['maxDocuments']
    bowForInference = bow[inferenceStart:inferenceStart + inferenceCount]
    assert len(bowForInference) == inferenceCount
    # NOTE(review): config['max_iter'] feeds gensim's `iterations`, while the number
    # of training passes is hard-coded to 3; confirm this mapping is the intended one.
    lda_model = gensim.models.LdaMulticore\
    (
        bowForModel,
        num_topics=config['n_components'],
        id2word=dictionary,
        iterations=config['max_iter'],
        workers=cpuCount(),
        passes=3,
        # Seeded like the sklearn branch so that runs are comparable:
        random_state=0,
    )
    inferedVectors = []
    for current in bowForInference:
        # lda_model[...] yields (topic id, probability) pairs; topics that are
        # not listed keep a 0.0 in the dense vector:
        currentVector = [0.0] * config['n_components']
        for t, v in lda_model[current]:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == inferenceCount
    assert len(inferedVectors[0]) == config['n_components']
    # Top 100 words of each topic:
    topics = []
    for topicId in range(lda_model.num_topics):
        topTerms = lda_model.get_topic_terms(topicId, topn=100)
        topics.append({dictionary[wordId]: prob for wordId, prob in topTerms})

# Serialization

In [17]:
# Script runs stop here: the fitted model is pickled and the process exits, the
# rest of the notebook is interactive exploration only.
if not isNotebook:
    dirPath = nosaveDir() + "/lda-models"
    mkdir(dirPath)
    # The fitted model is `lda` on the sklearn path and `lda_model` on the gensim
    # path; always referring to `lda` raised a NameError when useSklearn was False:
    trainedModel = lda if config['useSklearn'] else lda_model
    serialize(trainedModel, dirPath + "/lda1.pickle")
    exit()

# Making a dict url --> topic vector

In [25]:
# One inferred topic vector per url, both sequences being aligned by position:
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = {url: inferedVectors[position] for position, url in enumerate(urlsToInfere)}
bp(urlsVectors, logger)

{
  http://a.msn.com/00/en-ca/AAuP4NT?ocid=st: [9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 8.04699325e-01 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 9.64320154e-06
   9.64320154e-06 9.64320154e-06 9.64320154e-06 1.92421385e-01
   9.64320

In [26]:
# Raw text of every url to infer, used further down to inspect the similarities by eye:
allTexts = getNewsText(urlsToInfere, logger=logger)
urlsTexts = {url: allTexts[position] for position, url in enumerate(urlsToInfere)}
bp(urlsTexts, logger)

  0% [                    ]
  9% [=                   ] (50.411s left)
 19% [===                 ] (44.011s left)
 29% [=====               ] (38.044s left)
{
  http://a.msn.com/00/en-ca/AAuP4NT?ocid=st: The Bank of Canada raised its benchmark interest rate to 1.25 per cent Wednesday and signalled that,,
  http://a.msn.com/00/en-us/AAuSDKo?ocid=st: Three weeks after picking up a controversial cargo in the U.K., a liquefied natural gas tanker made ,
  http://a.msn.com/00/en-us/AAv6fui?ocid=st: Last March, Royal Dutch Shell said it was selling most of its stake in Canada's oil sands, a vast pr,
  http://a.msn.com/00/en-us/AAva3us?ocid=st: Home Depot is awarding its hourly employees in the U.S. a one-time cash bonus of as much as $1,000 f,
  ...,
  https://yesmeansyesblog.wordpress.com/2009/11/24/predator-redux/: by Thomas
Meet The Predators , which featured a 2002 study co-authored by psychology professor David,
  https://zdubbzattmom.wordpress.com/2018/01/29/zakks-manifesto-or-hitting-t

In [27]:
# We take one user (the second key of trainUsers) as an example:
userId = list(trainUsers.keys())[1]
# x = the urls of the user in the train set, with their topic vectors:
xurls = list(trainUsers[userId])
xvectors = np.array([urlsVectors[url] for url in xurls])
bp(xurls, logger)
bp(xvectors, logger)
# y = the urls of the first candidates entry of the user, with their topic vectors:
yurls = list(candidates[userId][0])
yvectors = np.array([urlsVectors[url] for url in yurls])
bp(yurls, logger)
bp(yvectors, logger)

[
  http://bit.ly/2heH8Nm,
  http://wapo.st/2zbVacE,
  ...,
  https://fb.me/9Chq3ugeZ,
  https://www.nbcnews.com/news/us-news/trump-s-history-breaking-decorum-remarks-race-ethnicity-n837181
]
[[2.80898876e-05 2.80898876e-05 2.80898876e-05 ... 2.80898876e-05
  2.80898876e-05 2.80898876e-05]
 [5.31349628e-06 5.31349628e-06 5.31349628e-06 ... 5.31349628e-06
  5.31349628e-06 5.31349628e-06]
 [3.38983051e-05 3.38983051e-05 3.38983051e-05 ... 3.38983051e-05
  3.38983051e-05 3.38983051e-05]
 ...
 [2.71002710e-05 2.71002710e-05 2.71002710e-05 ... 2.71002710e-05
  2.71002710e-05 2.71002710e-05]
 [1.00806452e-05 1.00806452e-05 1.00806452e-05 ... 1.00806452e-05
  1.00806452e-05 1.00806452e-05]
 [8.65051903e-06 8.65051903e-06 8.65051903e-06 ... 8.65051903e-06
  8.65051903e-06 8.65051903e-06]]
[
  https://www.naplesnews.com/story/sports/high-school/girls-basketball/2018/02/08/prep-girls-basketbal,
  https://buff.ly/2EnN3sV,
  ...,
  https://mississippitoday.org/2018/01/30/who-gets-state-dollars-to-

In [28]:
# Pairwise cosine similarities: sims[a][b] compares the a-th vector of xvectors
# (train urls of the user) with the b-th vector of yvectors (candidate urls).
sims = cosine_similarity(xvectors, yvectors)

In [None]:
# TODO: score a candidate by averaging its similarity over only the top most similar train items.
# So the parameter is either a ratio relative to the number of train items, or an absolute number of train items...

In [None]:
# Disabled spot check, kept for reference: logs random (train, candidate) pairs
# with short texts whose similarity is either very low or very high.
if False:
    for attempt in range(10):
        trainIndex = random.choice(list(range(len(xurls))))
        testIndex = random.choice(list(range(len(yurls))))
        trainUrl, testUrl = xurls[trainIndex], yurls[testIndex]
        trainText, testText = urlsTexts[trainUrl], urlsTexts[testUrl]
        sim = sims[trainIndex][testIndex]
        bothShort = len(trainText) < 2000 and len(testText) < 2000
        extremeSim = sim < 0.2 or sim > 0.8
        if bothShort and extremeSim:
            separator = "\n" * 2 + '-' * 20 + "\n" * 2
            for message in (sim, "\n", trainUrl, trainText, "\n", testUrl, testText, separator):
                log(message, logger)

In [None]:
# Qualitative check: for the first train urls of the user, we log the 3 most and
# the 3 least similar candidates together with their texts.
# A user can have fewer than 10 train urls, so the loop is bounded by len(xurls)
# to never index past the rows of sims:
for i in range(min(10, len(xurls))):
    trainUrl = xurls[i]
    trainText = urlsTexts[trainUrl]
    # (candidate url, candidate text, similarity with the current train url):
    currentSims = []
    for u, candidateUrl in enumerate(yurls):
        currentSims.append((candidateUrl, urlsTexts[candidateUrl], sims[i][u]))
    topSim = sortBy(currentSims, index=2, desc=True)[:3]
    topDissim = sortBy(currentSims, index=2, desc=False)[:3]
    log(trainUrl, logger)
    log(trainText, logger)
    log("\n", logger)
    for title, ranked in (("MOST SIMILARS", topSim), ("MOST DISSIMILARS", topDissim)):
        log(title, logger)
        log("\n", logger)
        for url, text, sim in ranked:
            log(sim, logger)
            log(url, logger)
            log(text, logger)
            log("\n", logger)
    log("\n", logger)
    log("\n" * 2 + '-' * 20 + "\n" * 2, logger)