## Commands

In [1]:
# oomstopper --no-tail genericmodels ; killbill genericmodels ; cd ~/twinews-logs ; jupython -o nohup-genericmodels-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/genericmodels.ipynb

## Init

In [2]:
import os
# An empty device list hides every GPU from CUDA-based libraries; this is set before
# the heavy imports below so that they only ever see the CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# True when no `__file__` name is defined here, i.e. the code is not running as a plain script file.
isNotebook = not ('__file__' in locals())

In [4]:
# Test-mode switch. It follows `isNotebook` by default; set it to True or False to force it.
# When on, the run is limited to 2 users, uses smaller chunks and sends no final notification.
TEST = isNotebook # isNotebook, True, False

In [5]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [6]:
from twinews.models.genericutils import *

In [7]:
# Notebook sessions log into the tmp "logs" directory; script runs use a per-host log file name.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/generic.log")
else:
    logger = Logger("generic-" + getHostname() + ".log")
# Stopwatch for the whole run; it reports through the same logger.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

## Config

In [8]:
# from systemtools.jupyterutils import *
# argv = getIpynbArgv()

In [17]:
# Scratch check: print, in ascending order, the ratios paired with 'tfidf' in the list below.
print(sorted(
    ratio for (modelKey, ratio) in [
        ('tfidf', 0.05), ('tfidf', 0.25), ('tfidf', 0.1), ('tfidf', 0.15),
        ('tfidf', 0.65), ('tfidf', 0.75), ('tfidf', 0.85), ('tfidf', 0.9),
        ('tfidf', 0.7), ('tfidf', 0.8), ('tfidf', 0.95),
    ]
))

[0.05, 0.1, 0.15, 0.25, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]


In [10]:
# (model key, historyRef) pairs that still have to be run, grouped by model.
tuples = [
    ('doc2vec', 0.1), ('doc2vec', 0.2), ('doc2vec', 0.45),
    ('infersent', 0.1), ('infersent', 0.2), ('infersent', 0.5),
    ('dbert-base', 0.2), ('dbert-base', 0.45), ('dbert-base', 1.0), ('dbert-base', 0.55),
    ('dbert-ft', 0.5), ('dbert-ft', 0.55),
    ('sent2vec', 0.1), ('sent2vec', 0.2), ('sent2vec', 0.45),
    ('sent2vec', 0.5), ('sent2vec', 0.55), ('sent2vec', 0.6),
    ('bert', 0.1), ('bert', 0.2), ('bert', 0.55),
    ('usent', 0.2),
]
# Identifiers (as returned by tipiNumber()) of the machines available for those runs.
tipis = (
    "60 59 58 63 00 02 57 05 03 06 07 56 04 61 88 62 83 94 85 93 89 87 95 92 81 82 90 "
    "01 86 91 70 68 77 74 78 75 71 65 67 72 73 69 76 79 64"
).split()
# tuples = tuples[len(tipis):]
# Cut both lists down to the length of the shorter one so that they pair up one-to-one.
tipis, tuples = tipis[:len(tuples)], tuples[:len(tipis)]
associations = associate(tipis, tuples)
log("associations: " + b(associations, 5), logger)

associations: { '00': ( bert, 0.1 ), '02': ( bert, 0.2 ), '03': ( bert, 0.55 ), '04': ( dbert-base, 0.2 ), '05': ( dbert-base, 0.45 ), '06': ( dbert-base, 0.55 ), '07': ( dbert-base, 1.0 ), '56': ( dbert-ft, 0.5 ), '57': ( dbert-ft, 0.55 ), '58': ( doc2vec, 0.1 ), '59': ( doc2vec, 0.2 ), '60': ( doc2vec, 0.45 ), '61': ( infersent, 0.1 ), '62': ( infersent, 0.2 ), '63': ( infersent, 0.5 ), '83': ( sent2vec, 0.1 ), '85': ( sent2vec, 0.2 ), '87': ( sent2vec, 0.45 ), '88': ( sent2vec, 0.5 ), '89': ( sent2vec, 0.55 ), '93': ( sent2vec, 0.6 ), '94': ( usent, 0.2 ) }


In [None]:
# Manual default. Keys listed so far: 'dbert-ft', 'dbert-base', 'infersent', 'usent',
# 'sent2vec', 'doc2vec', 'bert', 'stylo', 'word2vec'.
cacheKey = 'bert'
# Disabled command-line override:
# if len(argv) == 2:
#     cacheKey = argv[0]
# NOTE: the manual default above is always replaced by the entry of `associations`
# selected by tipiNumber(); a missing entry raises KeyError.
cacheKey = associations[tipiNumber()][0]
cacheField = genericFields[cacheKey]
log("cacheKey: {}".format(cacheKey), logger)
log("cacheField: {}".format(cacheField), logger)

In [None]:
config = dict(
    splitVersion=2,
    maxUsers=2 if TEST else None,  # Sub-sampling
    # 'cosine', 'euclidean', 'kl', 'js' (kl and js raise "ValueError: math domain error")
    distance='cosine',
    historyRef=0.3,  # 0.1, 0.3, 0.4, 0.6, 1.0, 1, 3, 10, 30
)
# Disabled command-line override:
# if len(argv) == 2:
#     config['historyRef'] = argv[1]
# The historyRef default above is always replaced by the entry of `associations`
# selected by tipiNumber().
config['historyRef'] = associations[tipiNumber()][1]
log("config: " + b(config, 5), logger)

In [None]:
# config = mergeDicts(config, {})
# Rankings are stored under the name of the embedding cache they were computed from.
modelName = cacheKey


def getCache():
    """
    Return the generic cache for the current `cacheKey`, opened quietly.

    This is a named function rather than a lambda bound to a name (PEP 8), so it
    has a proper `__name__` in tracebacks and logs. `cacheKey` is looked up when
    the function is called, not when it is defined. The factory is handed to
    `genFunct`, which calls it once per generator.
    """
    return getGenericCache(cacheKey, verbose=False)

In [None]:
# Script runs abort early when a ranking for this model and this exact config already exists.
# In a notebook the check is skipped entirely (rankingExists is not even called).
if (not isNotebook) and rankingExists(modelName, config, logger=logger):
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

In [None]:
# Announce which model, which cached field and which config this run is about to use.
log(
    "".join((
        "Generating rankings for ", modelName,
        ' (`', cacheField, '` field) with the config:\n',
        b(config, 5),
    )),
    logger,
)

## Getting data

In [None]:
# Load the evaluation split (users, news, candidates) for the configured split version.
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=0,
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Expose each part of the split under its own name.
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

In [None]:
# The urls are turned into lists to get a stable order, so vectors can be retrieved by index.
# They are shuffled so that the urls of a single user do not all sit at the beginning,
# and the shuffle is seeded so that every run gets the same order.
trainNewsList, testNewsList = (
    shuffle(list(urls), seed=0) for urls in (trainNews, testNews)
)
# Train urls first, then test urls.
newsList = trainNewsList + testNewsList

In [None]:
# Report the size of each url list.
log("{} urls for trainNewsList".format(len(trainNewsList)), logger=logger)
log("{} urls for testNewsList".format(len(testNewsList)), logger=logger)
log("{} urls for newsList".format(len(newsList)), logger=logger)

In [None]:
def genFunct(containers, field, getCache, *args, **kwargs):
    """
    Yield a `(url, vector)` pair for every url found in `containers`.

    :param containers: either a list of urls or a list of lists of urls. A single
        list of urls is wrapped so both shapes are handled the same way. An empty
        container yields nothing.
    :param field: forwarded as-is to `getVector`.
    :param getCache: zero-argument callable returning the cache to read from. It is
        called once, when the generator starts, and not at all for empty input.
    :param args: accepted and ignored.
    :param kwargs: accepted and ignored.
    :raises AssertionError: when `getVector` returns None for a url.
    """
    # Nothing to do for an empty container. This also avoids the IndexError that
    # `containers[0]` would raise, and avoids opening the cache for no reason.
    if len(containers) == 0:
        return
    if not isinstance(containers[0], list):
        containers = [containers]
    cache = getCache()
    newsCollection = getNewsCollection(verbose=False)
    for container in containers:
        for url in container:
            vector = getVector(url, field, cache, newsCollection)
            # Name the url and the field so a missing vector can be tracked down.
            assert vector is not None, \
                "No vector found for field `" + str(field) + "` and url: " + str(url)
            yield (url, vector)

In [None]:
# Feed the urls to genFunct chunk by chunk (100 urls per chunk in test mode, 5000 otherwise),
# with one worker process per CPU as requested through `parallelProcesses`.
mli = MLIterator(
    chunks(newsList, 100 if TEST else 5000),
    # chunks(newsList, int(len(newsList) / 100)),
    genFunct,
    genArgs=(cacheField, getCache),
    logger=logger,
    printRatio=0.03,
    queuesMaxSize=1000,
    parallelProcesses=cpuCount(),
)

In [None]:
# Drain the iterator into a url -> vector mapping (a later pair for the same url wins).
urlsVectors = {url: vector for url, vector in mli}

In [None]:
# Sanity check: send a printout of the url -> vector mapping to the logger.
# NOTE(review): the argument order differs from `bp(evalData.keys(), 5, logger)` above — confirm bp accepts both.
bp(urlsVectors, logger)

In [None]:
# Timing checkpoint: all vectors are in memory at this point.
tt.tic("Data loaded")

## Ranking

In [None]:
# Build the rankings from the train users, their candidates and the url vectors, using the
# configured history reference and distance metric.
# Read the usersRankingsByHistoryDistance documentation before changing any argument!
rankings = usersRankingsByHistoryDistance(
    trainUsers,
    candidates,
    config['historyRef'],
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)

In [None]:
# Sanity check: send a printout of the computed rankings to the logger.
# NOTE(review): here the logger comes before the numeric argument, unlike `bp(evalData.keys(), 5, logger)` — confirm.
bp(rankings, logger, 4)

In [None]:
# Timing checkpoint: the rankings have been computed.
tt.tic("Rankings done")

## Adding rankings to the db

In [None]:
# Read the doc!
# Store the rankings under the model name, together with the config that produced them.
addRanking(modelName, rankings, config, logger=logger)

In [None]:
# Timing checkpoint: the rankings have been handed to addRanking.
tt.tic("Rankings stored")

In [None]:
# Stop the run-wide stopwatch. The value is passed to secondsToHumanReadableDuration below,
# so it is presumably a duration in seconds — TODO confirm.
totalDuration = tt.toc()

In [None]:
# Outside test mode, send a notification naming the model, the first 5 characters of the
# config hash, the total duration and the host.
if not TEST:
    notif("{}-{} done in {} on {}".format(
        modelName,
        objectToHash(config)[:5],
        secondsToHumanReadableDuration(totalDuration),
        getHostname(),
    ))

## Old stuff

In [None]:
# Disabled legacy snippet: the `if False` guard means it never runs.
# It copied vectors from a cache keyed by the hash of each news text into a cache keyed by url.
# NOTE(review): `newsCollection` is not defined at notebook level in the visible cells (it only
# exists inside genFunct), so this would likely fail if re-enabled as-is — confirm before reuse.
if False:
    (user, password, host) = getMongoAuth(user='hayj')
    # Source cache: vectors are looked up by the hash of the text.
    cache = SerializableDict\
    (
        "twinews-dbert-94bef_ep32",
        user=user, host=host, password=password,
        useMongodb=True, logger=logger,
    )
    # Destination cache: the same vectors, stored by url.
    urlCache = SerializableDict\
    (
        "twinews-dbert-94bef_ep32-url",
        user=user, host=host, password=password,
        useMongodb=True, logger=logger,
    )
    pbar = ProgressBar(len(newsCollection), logger=logger, printRatio=0.01)
    for row in newsCollection.find({}, projection={'detokText': True, 'url': True}):
        url = row['url']
        text = row['detokText']
        # The source cache key is the hash of the text.
        theHash = objectToHash(text)
        vector = cache[theHash]
        urlCache[url] = vector
        pbar.tic()
    # Stops the process here so that nothing after this snippet would run.
    exit()