# Commands

In [None]:
# cd ~/twinews-logs ; jupython -o nohup-generic-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/generic.ipynb

# Init

In [1]:
# Set CUDA_VISIBLE_DEVICES to an empty string before the project imports below
# (presumably so that CUDA-aware libraries do not use any GPU — confirm).
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# The absence of `__file__` from the namespace is used as the signal that the
# code is running as notebook cells rather than as a script.
isNotebook = '__file__' not in locals()

In [3]:
# TEST switches on the sub-sampled configuration (see 'maxUsers' in config).
# Possible settings: isNotebook (test only when interactive), True or False.
TEST = isNotebook # isNotebook, True, False

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [5]:
# Interactive runs log into the directory returned by tmpDir('logs');
# script runs log into a per-host file given as a relative path.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/generic.log")
else:
    logger = Logger("generic-" + getHostname() + ".log")
# Timer used to checkpoint the main stages of the notebook.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [6]:
# Experiment configuration. It is passed to rankingExists / addRanking and
# hashed in the final notification, so it identifies this run.
config = \
{
    'splitVersion': 2,
    'maxUsers': 2 if TEST else None, # Sub-sampling
    # NOTE(review): the author reports that 'kl' and 'js' raise
    # "ValueError: math domain error" — confirm 'js' is usable with these vectors.
    'distance': 'js', # 'cosine', 'euclidean', 'kl', 'js' (ValueError math domain error for kl and js)
    'historyRef': 0.5, # 0.1, 0.3, 0.6, 1.0, 1, 3, 10, 30
}

In [7]:
def _getCache(key):
    """
    Open the MongoDB-backed SerializableDict stored under `key`.

    Credentials come from getMongoAuth for the user 'hayj'; the notebook-level
    `logger` is handed to the dict.

    :param key: name of the SerializableDict to open
    :return: the SerializableDict instance
    """
    mongoUser, mongoPassword, mongoHost = getMongoAuth(user='hayj')
    return SerializableDict(
        key,
        user=mongoUser,
        host=mongoHost,
        password=mongoPassword,
        useMongodb=True,
        logger=logger,
    )

In [8]:
def getVector(url, field, cache, newsCollection):
    """
    Return the cached vector of the news identified by `url`.

    The news document is fetched from `newsCollection`, the content of its
    `field` is hashed with objectToHash, and that hash is the key looked up
    in `cache`.

    :param url: url of the news to look up
    :param field: name of the document field whose content is hashed
    :param cache: mapping from content hash to vector
    :param newsCollection: collection exposing `findOne(query, projection=...)`
    :return: whatever `cache` holds for the content hash
    :raises ValueError: when `findOne` returns None for `url`
    """
    row = newsCollection.findOne({'url': url}, projection={field: True})
    if row is None:
        # Without this guard the failure is an opaque
        # "'NoneType' object is not subscriptable" that hides the faulty url.
        raise ValueError("No news found for url: " + str(url))
    return cache[objectToHash(row[field])]

In [9]:
# One-off migration, disabled by default: copies every vector of the
# hash-indexed cache into a second cache indexed by news url.
if False:
    cache = _getCache("twinews-dbert-94bef_ep32")
    urlCache = _getCache("twinews-dbert-94bef_ep32-url")
    # `newsCollection` was never defined at notebook level; open it here the
    # same way genFunct does.
    newsCollection = getNewsCollection()
    pbar = ProgressBar(len(newsCollection), logger=logger, printRatio=0.01)
    for row in newsCollection.find({}, projection={'detokText': True, 'url': True}):
        # Same key derivation as getVector: the hash of the text content.
        urlCache[row['url']] = cache[objectToHash(row['detokText'])]
        pbar.tic()
    # Stop here: the rest of the notebook is not part of the migration.
    exit()

In [21]:
# DBert-ft: the model selected for this run.
if True:
    modelName = 'dbert-ft'
    field = 'detokText'
    cacheKey = "twinews-dbert-94bef_ep32"
    # No model-specific override of the generic config for now.
    config = mergeDicts(config, {})
    # Deferred so that the cache is opened by whoever calls getCache();
    # cacheKey is resolved at call time.
    getCache = lambda: _getCache(cacheKey)

In [11]:
# DBert-base:
if False:
    modelName = 'dbert-base'
    cacheKey = "twinews-dbert-base"
    # TODO: this branch is incomplete — it sets neither `field` nor `getCache`,
    # which the MLIterator cell further down needs.

In [12]:
# Script runs refuse to recompute a ranking that already exists for this exact
# (model, config) pair; interactive runs skip the check.
if not isNotebook and rankingExists(modelName, config, logger=logger):
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

In [13]:
# Print the effective configuration through the logger.
bp(config, 5, logger)

{ 'distance': euclidean, 'historyRef': 30, 'maxUsers': 2, 'splitVersion': 2 }


# Getting data

In [14]:
# Load the evaluation split selected in the config, without any extra news.
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=0,
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Expose each part of the split under its own name.
trainUsers, testUsers, trainNews, testNews, candidates, extraNews = [
    evalData[part]
    for part in ('trainUsers', 'testUsers', 'trainNews', 'testNews', 'candidates', 'extraNews')
]
# Show the available keys and the metadata of the split.
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 32.57s | message: Eval data loaded
--> toc total duration: 32.84s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [15]:
# Urls are converted to lists so that they have a fixed order.
# They are shuffled so that the urls of one user are not grouped at the beginning,
# with a fixed seed so that the shuffle is reproducible.
# They are sorted first: if the inputs are sets, their iteration order can change
# from one interpreter run to the next (string hash randomisation), which would
# make the seeded shuffle start from a different order every time.
trainNewsList = shuffle(sorted(trainNews), seed=0)
testNewsList = shuffle(sorted(testNews), seed=0)
newsList = trainNewsList + testNewsList

In [16]:
# Report the size of each url list.
log("%d urls for trainNewsList" % len(trainNewsList), logger=logger)
log("%d urls for testNewsList" % len(testNewsList), logger=logger)
log("%d urls for newsList" % len(newsList), logger=logger)

28 urls for trainNewsList
1989 urls for testNewsList
2017 urls for newsList


In [17]:
def genFunct(containers, field, getCache, *args, **kwargs):
    """
    Yield a `(url, vector)` pair for every url found in `containers`.

    :param containers: a list of urls, or a list of lists of urls; an empty
        list yields nothing
    :param field: news field whose content identifies the vector (forwarded
        to getVector)
    :param getCache: zero-argument callable returning the vector cache
    :param args: ignored
    :param kwargs: ignored
    :raises AssertionError: when getVector returns None for a url
    """
    # Nothing to do; this also avoids an IndexError on containers[0] below.
    if len(containers) == 0:
        return
    # A flat list of urls is treated as a single container.
    if not isinstance(containers[0], list):
        containers = [containers]
    # Opened inside the generator, so every call builds its own cache and
    # collection handles.
    cache = getCache()
    newsCollection = getNewsCollection()
    for container in containers:
        for url in container:
            vector = getVector(url, field, cache, newsCollection)
            assert vector is not None, "No vector found for url: " + str(url)
            yield (url, vector)

In [22]:
# Split the urls into containers handed one by one to genFunct. The second
# argument of chunks appears to be the chunk size (2017 urls gave 101
# containers), here about 1% of the list. It is clamped to at least 1 because
# a list of fewer than 100 urls would otherwise give a chunk size of 0.
mli = MLIterator\
(
    chunks(newsList, max(1, len(newsList) // 100)),
    genFunct, genArgs=(field, getCache,),
    logger=logger,
    printRatio=0.03,
    parallelProcesses=cpuCount(),
)

With parallelProcesses > 1, this iterator is not consistent, meaning 2 iterations over same containers will not give items in the same order
101 containers to process.


In [23]:
# Collect every (url, vector) pair produced by the iterator into one lookup
# table; a url produced twice keeps the last vector received.
urlsVectors = dict()
for url, vector in mli:
    urlsVectors[url] = vector

serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


  0% [                    ]
serializabledict twinews-dbert-94bef_ep32 initialised.
  2% [                    ]


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
  5% [=                   ] (29.45s left)
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


  8% [=                   ] (27.804s left)
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


 11% [==                  ] (22.027s left)
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
 14% [==                  ] (17.658s left)
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
 17% [===                 ] (18.951s left)


twinews news (version 1.0) initialised.


 20% [====                ] (16.114s left)
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
 23% [====                ] (14.309s left)


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
 26% [=====               ] (14.772s left)


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
 29% [=====               ] (13.087s left)
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.


serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.
serializabledict twinews-dbert-94bef_ep32 initialised.


twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.
twinews news (version 1.0) initialised.




In [24]:
# Preview the collected vectors through the logger.
bp(urlsVectors, logger)

{
  http://a.msn.com/01/en-ie/BBHzIAS?ocid=st: [-8.46303463e-01 -2.66490638e-01  5.78416169e-01 -4.38786149e-01
    1.10970819e+00  1.19784212e+00  6.91489100e-01  2.85046875e-01
    2.11473122e-01 -1.82770282e-01 -1.36968708e+00 -4.53171790e-01
   -5.14011681e-01  8.22794557e-01  1.05704948e-01 -1.58570781e-02
    1.38759464e-01  3.88196021e-01  2.89313853e-01  4.99396205e-01
    2.29227722e-01  2.20747143e-01  3.05369467e-01 -6.90034389e-01
    6.25108480e-01  3.21680427e-01 -2.69037992e-01 -6.97937012e-01
    6.06300294e-01  2.30082631e-01  8.63310397e-01  1.77334309e+00
   -8.30943108e-01 -4.43621278e-02 -8.46084878e-02  4.88524318e-01
   -2.16439605e-01  3.46250474e-01 -2.54007876e-01  3.76045167e-01
   -4.62324321e-01 -8.00523996e-01 -2.22775340e-03  3.42337489e-01
   -4.67716336e-01  1.37141913e-01 -3.95035863e-01  6.27287030e-01
   -4.83058393e-01  9.81158853e-01  1.88732326e-01  6.79129243e-01
    5.61174154e-01  3.80510479e-01 -1.36917233e-02  1.40710676e+00
    4.31657463e-0

In [25]:
# Timer checkpoint: all vectors are loaded.
tt.tic("Data loaded")

--> tic: 2m 21.159s | message: Data loaded


141.16

# Ranking

In [26]:
# Compute the rankings from the train users, the candidates and the url vectors.
# Read the usersRankingsByHistoryDistance doc for the meaning of the history
# reference and of the distance metric.
rankings = usersRankingsByHistoryDistance(
    trainUsers,
    candidates,
    config['historyRef'],
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)



In [27]:
# Preview the computed rankings through the logger.
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      https://www.theguardian.com/football/2018/jan/07/video-assistant-referees-errors-mike-riley,
      https://wp.me/p2g4vo-1QEu,
      https://onmilwaukee.com/sports/articles/fausto-batella-first-and-ten-book.html,
      http://www.argusleader.com/story/sports/2018/01/12/heart-attack-referee/1027871001/,
      http://www.baltimoresun.com/sports/orioles/bs-sp-boog-20170329-story.html,
      https://wp.me/p2g4vo-1QzM,
      http://bit.ly/2l3TKIO,
      http://politi.co/2zGRjR3,
      https://medium.com/music-city-soccer/nashville-sc-signs-four-new-players-to-usl-squad-dcaeab3bdecf,
      https://usat.ly/2D5Obny,
      ...,
      https://www.courier-journal.com/story/opinion/contributors/2017/12/29/kentucky-prison-crowding-justice-system/987086001/,
      http://www.scnow.com/messenger/article_a4c053f0-f3fb-11e7-8186-575364924ac6.html,
      https://fb.me/9fcToJQ6Q,
      https://nyti.ms/2DxakZv,
      http://on.wsoctv.com/2AFtLfL,
      http://www.lcsun-ne

In [28]:
# Timer checkpoint: rankings are computed.
tt.tic("Rankings done")

--> tic: 1.32s | message: Rankings done


1.32

# Adding rankings to the db

In [32]:
# Read the doc!
# NOTE(review): unlike the rankingExists check, this call is not guarded by
# isNotebook, so interactive (sub-sampled) runs also add their rankings —
# confirm this is intended.
addRanking(modelName, rankings, config, logger=logger)

In [None]:
# Timer checkpoint: rankings are stored.
tt.tic("Rankings stored")

In [None]:
# Keep the value returned by the timer; it is reported in the notification below.
totalDuration = tt.toc()

In [None]:
# Send a notification naming the model, the first 5 characters of the config
# hash, the total duration and the host.
notif(modelName + '-' + objectToHash(config)[:5] + " done in " + secondsToHumanReadableDuration(totalDuration) + " on " + getHostname())