## Commands and notes

In [1]:
# oomstopper --no-tail combinasons ; killbill combinasons ; cd ~/twinews-logs ; jupython -o nohup-combinasons-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/combinasons.ipynb

In [2]:
# TODO: convert "doc2vec" into "doc2vec-hash" --> by searching for the best ndcg
# (this concerns the parameter that determines which models are used for the merge)

## Init

In [3]:
# Empty CUDA_VISIBLE_DEVICES before the project imports below run, so that
# CUDA-aware libraries see no GPU device.
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
# True when no `__file__` name exists in this scope; the logger setup below
# uses it to choose between the "notebook" and the "script" log file.
isNotebook = '__file__' not in locals()

In [5]:
# Smoke-test switch: when True, `maxUsers` is set to 2 in the base config and
# only the first model pair is processed.
TEST = False # isNotebook, True, False

In [6]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *
import warnings

In [7]:
from twinews.models.genericutils import *
from twinews.models.ranking import *
from twinews.models.ranking import *

In [8]:
# Notebook runs log under tmpDir('logs'); other runs log to a per-host file name.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/combinasons.log")
else:
    logger = Logger("combinasons-" + getHostname() + ".log")
# Notebook-wide stopwatch, started immediately.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

## Functions

In [9]:
def getBestModelKey(model, splitVersion, maxUsers=None, metric='ndcg'):
    """
    Return the id of the best-scoring stored ranking of `model`.

    Every key of the Twinews rankings store is inspected. A ranking is a
    candidate when its meta data match `model`, `splitVersion` and `maxUsers`
    and a score row exists for `metric` in the Twinews scores store. The
    candidate with the highest score wins.

    :param model: value compared with the 'model' meta field
    :param splitVersion: value compared with the 'splitVersion' meta field
    :param maxUsers: value compared with the 'maxUsers' meta field; the
        default (None) keeps only rankings stored with `maxUsers` set to None
    :param metric: metric name used to look the score up
    :return: the 'id' meta field of the best candidate
    :raises IndexError: when no scored ranking matches the criteria
    """
    twinewsRankings = getTwinewsRankings(verbose=False)
    twinewsScores = getTwinewsScores(verbose=False)
    rows = []
    for key in twinewsRankings.keys():
        meta = twinewsRankings.getMeta(key)
        # Honour the `maxUsers` argument instead of a hard-coded None:
        if meta['splitVersion'] != splitVersion \
        or meta['maxUsers'] != maxUsers \
        or meta['model'] != model:
            continue
        scoreRow = twinewsScores.findOne({'id': meta['id'], 'metric': metric})
        if scoreRow is None:
            # No score for this metric: the ranking cannot be compared.
            continue
        # The meta dict is left untouched; only (id, score) pairs are kept.
        rows.append((meta['id'], scoreRow['score']))
    if len(rows) == 0:
        # Same exception type as the former `rows[0][0]`, with an explanation.
        raise IndexError("No scored ranking found for model=" + str(model)
                         + ", splitVersion=" + str(splitVersion)
                         + ", maxUsers=" + str(maxUsers)
                         + ", metric=" + str(metric))
    rows = sortBy(rows, index=1, desc=True)
    return rows[0][0]

In [10]:
rankingsCache = None
def getRankings(model, userIds=None):
    """
    Return the stored rankings of `model`, using a module-level cache.

    The Twinews rankings store is only opened when `model` is not already in
    `rankingsCache`. Setting `rankingsCache` back to None (as the main loop
    does when RAM runs low) drops all cached rankings.

    :param model: key of the rankings in the Twinews rankings store
    :param userIds: optional collection of user ids; when given, the result
        is `dictSelect(rankings, userIds)` instead of the full rankings
    :return: the rankings of `model`, possibly restricted to `userIds`
    """
    global rankingsCache
    if rankingsCache is None:
        rankingsCache = dict()
    if model in rankingsCache:
        rankings = rankingsCache[model]
    else:
        # Cache miss: this is the only case where the store is needed.
        rankings = getTwinewsRankings(verbose=False)[model]
        # Check before caching so that a failed lookup is never memorized.
        assert rankings is not None
        rankingsCache[model] = rankings
    assert rankings is not None
    if userIds is not None:
        rankings = dictSelect(rankings, userIds)
    return rankings

In [11]:
def mergeTwinewsRankings(modelsRankings, config, *args, logger=None, verbose=True, **kwargs):
    """
    Merge, user by user and slot by slot, the rankings of several models.

    :param modelsRankings: sequence of per-model mappings
        `userId -> list of rankings`; the first mapping drives the iteration
        (its users, and the number of rankings per user)
    :param config: dict from which 'rankAsScore', 'weights', 'alphas' and
        'betas' are selected and forwarded to `mergeRankings`
    :param logger: forwarded to `mergeRankings`
    :param verbose: accepted but not used here
    :return: dict `userId -> list of merged rankings`; users whose ranking
        list is empty in the first model get no entry
    """
    mergeOptions = dictSelect(config, {'rankAsScore', 'weights', 'alphas', 'betas'})
    reference = modelsRankings[0]
    combRankings = dict()
    for userId in reference.keys():
        slotCount = len(reference[userId])
        if slotCount == 0:
            # Nothing to merge for this user: no entry is created.
            continue
        combRankings[userId] = [
            mergeRankings(
                [perModel[userId][slot] for perModel in modelsRankings],
                **mergeOptions, returnScores=False, logger=logger
            )
            for slot in range(slotCount)
        ]
    return combRankings

In [12]:
userIdsCache = None
def generateRankings(config, logger=None, verbose=True):
    """
    Build the combined rankings of several models for one configuration.

    Steps: validate `config['models']`, fill in missing merge parameters,
    optionally restrict to the users of the evaluation data, replace each
    model name by the id of its best stored ranking (`getBestModelKey`), then
    merge the rankings in parallel chunks with `mergeTwinewsRankings`.

    `config` is modified in place: missing 'rankAsScore', 'weights', 'alphas'
    and 'betas' receive defaults and 'models' is replaced by the sorted list
    of best ranking ids.

    The module-level name `modelName` is read by this function and must be
    defined before the first call.

    :param config: dict holding 'models' (a list or set of at least two model
        names; a list must already be sorted), 'splitVersion' and 'maxUsers'
    :param logger: logger forwarded to the helpers
    :param verbose: verbosity forwarded to the helpers
    :return: `(config, combRankings)`, or `(None, None)` when the ranking
        already exists for this config or when the rankings of one of the
        models hold plain strings instead of scored items
    :raises Exception: when `config['models']` is a list that is not sorted
    """
    # We init user ids cache:
    global userIdsCache
    if userIdsCache is None:
        userIdsCache = dict()
    # We get models:
    assert "models" in config
    assert isinstance(config["models"], list) or isinstance(config["models"], set)
    assert len(config["models"]) >= 2
    # The check reads config["models"]: the local `models` is only assigned
    # on the next statement and reading it here raised UnboundLocalError.
    if isinstance(config["models"], list) and config["models"] != sorted(config["models"]):
        raise Exception("You need to give models by alphabetic order")
    models = sorted(list(config["models"]))
    # We set default params:
    if not dictContains(config, 'rankAsScore'):
        config['rankAsScore'] = [True] * len(models)
        logWarning("You didn't set rankAsScore", logger=logger, verbose=verbose)
    if not dictContains(config, 'weights'):
        config['weights'] = [1.0 / len(models)] * len(models)
        logWarning("You didn't set weights", logger=logger, verbose=verbose)
    if not dictContains(config, 'alphas'):
        config['alphas'] = [0.5] * len(models)
        logWarning("You didn't set alphas", logger=logger, verbose=verbose)
    if not dictContains(config, 'betas'):
        config['betas'] = [NormalizedLawBeta.LOG] * len(models)
        logWarning("You didn't set betas", logger=logger, verbose=verbose)
    # We get user ids (None means "all users"), cached per maxUsers value:
    userIds = None
    if config["maxUsers"] is not None:
        if config["maxUsers"] in userIdsCache:
            userIds = userIdsCache[config["maxUsers"]]
        else:
            evalData = getEvalData(config['splitVersion'], maxExtraNews=0, maxUsers=config['maxUsers'],
                                   logger=logger, verbose=verbose)
            userIds = set(evalData['candidates'].keys())
            log("Users: " + b(userIds), logger)
            userIdsCache[config["maxUsers"]] = userIds
    # We get best models (model names become ranking ids from here on):
    models = sorted([getBestModelKey(model, config['splitVersion']) for model in models])
    config['models'] = models
    # We check if rankings exists (`modelName` is a module-level global):
    if rankingExists(modelName, config):
        return (None, None)
    # Init tt:
    tt = TicToc(logger=logger)
    tt.tic("Starting " + b(models))
    # We get rankings:
    modelsRankings = []
    for model in models:
        currentRks = getRankings(model, userIds=userIds)
        # First element of the first ranking of the first user: a plain
        # string is treated as "items without scores", which cannot be merged.
        element = currentRks[list(currentRks.keys())[0]][0][0]
        if isinstance(element, str):
            logWarning("No scores with items in rankings of " + model, logger, verbose=verbose)
            return (None, None)
        modelsRankings.append(currentRks)
    tt.tic("Got all rankings")
    # We chunk rankings (one group of users per CPU, same users for all models):
    userIdsChunks = split(sorted(list(modelsRankings[0].keys())), cpuCount())
    modelsRankingsChunks = []
    for userIdsChunk in userIdsChunks:
        current = []
        for currentRankings in modelsRankings:
            current.append(dictSelect(currentRankings, userIdsChunk))
        modelsRankingsChunks.append(current)
    modelsRankingsChunks = chunks(modelsRankingsChunks, 1)
    tt.tic("Rankings chunked")
    # We define the gen funct:
    def genFunct(containers, *args, **kwargs):
        for modelsRankings in containers:
            # with warnings.catch_warnings(): # Doesn't work
            #     if filterWarnings:
            #         warnings.filterwarnings('ignore', r'encountered in double')
            yield mergeTwinewsRankings(modelsRankings, *args, **kwargs)
    # We define the mli:
    mli = MLIterator(modelsRankingsChunks, genFunct=genFunct, genArgs=(config,),
                     parallelProcesses=cpuCount(), maxParallelProcesses=cpuCount(),
                     logger=logger, verbose=False)
    # We get all merges:
    allCombRankings = []
    for current in mli:
        allCombRankings.append(current)
    tt.tic("Got all merges from sub-processes")
    # We merge all:
    combRankings = mergeDicts(allCombRankings)
    tt.toc(b(models) + " DONE.")
    # And we return it:
    return (config, combRankings)

## Exec

In [13]:
# Base settings shared by every combination; the main loop merges the
# per-pair keys ('models', 'rankAsScore', 'weights', 'alphas', 'betas') into it.
config = \
{
    'splitVersion': 2,
    'maxUsers': 2 if TEST else None, # Sub-sampling
}

In [14]:
# Name under which the combined rankings are stored (passed to addRanking).
# It is also read as a global inside generateRankings (rankingExists call),
# so this cell must run before generateRankings is called.
modelName = 'combin'

In [15]:
# Model names from which the combinations are drawn.
modelsDomain = {'lda', 'nmf', 'tfidf', 'bm25', 'dbert-ft', 'dbert-base', 
          'stylo', 'infersent', 'sent2vec', 'word2vec', 'doc2vec', 'usent', 'bert'}
# NOTE(review): the three domains below are not read by any cell visible in
# this notebook; presumably kept for a parameter search -- confirm before removing.
rankAsScoreDomain = {True, False}
alphasDomain = {0.1, 0.25, 0.5, 0.75, 0.9}
betasDomain = {NormalizedLawBeta.LOG, NormalizedLawBeta.EXP}

In [16]:
def combineModels(modelsDomain, amount):
    """
    Return the distinct groups of models produced by `combine`.

    Candidates holding the same model more than once are dropped, and two
    candidates with the same members count as one group (the first one met
    is kept).

    :param modelsDomain: collection of model names given to `combine`
    :param amount: second argument given to `combine`
    :return: list of sets, ordered by the string form of their sorted members
    """
    seen = set()
    uniques = []
    for candidate in combine(modelsDomain, amount):
        members = set(candidate)
        if len(members) != len(candidate):
            # A model appears more than once in this candidate.
            continue
        signature = frozenset(members)
        if signature in seen:
            continue
        seen.add(signature)
        uniques.append(members)
    uniques.sort(key=lambda group: str(sorted(list(group))))
    return uniques

In [17]:
# Every group of 2 distinct models from modelsDomain (sets, order-insensitive).
modelsComb2 = combineModels(modelsDomain, 2)

In [18]:
# In test mode only the first pair is kept, so the run stays short.
if TEST:
    modelsComb2 = modelsComb2[:1]

In [19]:
# modelsComb2 = [e for e in modelsComb2 if "word2vec" in str(e)]

In [20]:
# Print/log the pairs that will be processed.
# NOTE(review): the meaning of the second argument (5) is defined by `bp` -- confirm.
bp(modelsComb2, 5, logger)

[ { bert, bm25 }, { bert, dbert-base }, { bert, dbert-ft }, { bert, doc2vec }, { bert, infersent }, { bert, lda }, { bert, nmf }, { bert, sent2vec }, { bert, stylo }, { bert, tfidf }, { bert, usent }, { bert, word2vec }, { bm25, dbert-base }, { bm25, dbert-ft }, { bm25, doc2vec }, { bm25, infersent }, { bm25, lda }, { bm25, nmf }, { bm25, sent2vec }, { bm25, stylo }, { bm25, tfidf }, { bm25, usent }, { bm25, word2vec }, { dbert-base, dbert-ft }, { dbert-base, doc2vec }, { dbert-base, infersent }, { dbert-base, lda }, { dbert-base, nmf }, { dbert-base, sent2vec }, { dbert-base, stylo }, { dbert-base, tfidf }, { dbert-base, usent }, { dbert-base, word2vec }, { dbert-ft, doc2vec }, { dbert-ft, infersent }, { dbert-ft, lda }, { dbert-ft, nmf }, { dbert-ft, sent2vec }, { dbert-ft, stylo }, { dbert-ft, tfidf }, { dbert-ft, usent }, { dbert-ft, word2vec }, { doc2vec, infersent }, { doc2vec, lda }, { doc2vec, nmf }, { doc2vec, sent2vec }, { doc2vec, stylo }, { doc2vec, tfidf }, { doc2vec, usen

In [21]:
# Merge parameters applied to every pair: one value per model of the pair,
# forwarded to mergeRankings through the config of each run.
rankAsScore = [False, False]
weights = [0.5, 0.5]
alphas = [0.5, 0.5]
betas = [NormalizedLawBeta.LOG, NormalizedLawBeta.LOG]

In [None]:
# Generate and store the combined rankings of every selected model pair.
for models in pb(modelsComb2, logger=logger, printRatio=0.01):
    currentConfig = mergeDicts\
    (
        config,
        {
            'models': models,
            'rankAsScore': rankAsScore,
            'weights': weights,
            'alphas': alphas,
            'betas': betas,
        }
    )
    (currentConfig, rankings) = generateRankings(currentConfig, logger=logger)
    if currentConfig is None:
        # generateRankings returns (None, None) both when the ranking already
        # exists and when a model has no item scores, so the message covers both.
        log(b(models) + " skipped (already added or no item scores)", logger)
        continue
    addRanking(modelName, rankings, currentConfig, logger=logger)
    warnFreeRAM(logger=logger)
    # Stop when free RAM gets low and drop the rankings cache used by getRankings.
    # NOTE(review): the unit of freeRAM() is presumably gigabytes -- confirm.
    if freeRAM() < 4:
        rankingsCache = None
        log("Breaking the loop", logger)
        break

{ bert, bm25 } already added
  1% [                    ]
{ bert, dbert-base } already added
  2% [                    ]
{ bert, dbert-ft } already added
  3% [                    ]
{ bert, doc2vec } already added
  5% [=                   ] (32.93s left)
{ bert, infersent } already added
  6% [=                   ] (34.31s left)
--> tictoc starts... | message: Starting [ bert-65e2b, lda-d4f1d ]
--> tic: 29.93s | message: Got all rankings
--> tic: 0.05s | message: Rankings chunked
--> tic: 18.39s | message: Got all merges from sub-processes
--> toc total duration: 48.41s | message: [ bert-65e2b, lda-d4f1d ] DONE.
62.8g of RAM remaining.
  7% [=                   ] (14m 6.959s left)
--> tictoc starts... | message: Starting [ bert-65e2b, nmf-9cd4f ]
--> tic: 20.74s | message: Got all rankings
--> tic: 0.03s | message: Rankings chunked
--> tic: 23.37s | message: Got all merges from sub-processes
--> toc total duration: 44.18s | message: [ bert-65e2b, nmf-9cd4f ] DONE.
60.43g of RAM remaini