## Commands and notes

In [None]:
# oomstopper --no-tail combinasons ; killbill combinasons ; cd ~/twinews-logs ; jupython -o nohup-combinasons-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/combinasons.ipynb

In [None]:
# TODO: convert a model name such as "doc2vec" into a concrete key "doc2vec-hash" --> by searching for the best nDCG.
# This applies to the parameter that determines which models are used for the merge.

## Init

In [1]:
# Empty CUDA_VISIBLE_DEVICES: presumably hides every GPU so the libraries imported
# in the following cells run CPU-only — it is set before those imports on purpose.
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# True when `__file__` is absent from the current namespace; this is used as the
# signal that the code runs interactively (notebook) rather than as a script.
isNotebook = '__file__' not in locals()

In [3]:
# Test-mode switch: when truthy, later cells restrict the run to a couple of combinations.
# Possible values are listed in the trailing comment; by default it follows `isNotebook`.
TEST = isNotebook # isNotebook, True, False

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [5]:
from twinews.models.genericutils import *
from twinews.models.ranking import *
from twinews.models.ranking import *

In [6]:
# Interactive runs log into the temporary "logs" directory; script runs log into a
# host-specific file (relative path) so that several machines do not share a log file.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/combinasons.log")
else:
    logger = Logger("combinasons-" + getHostname() + ".log")
# Global timer reporting through the same logger:
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

## Functions

In [7]:
def getBestModelKey(model, splitVersion, maxUsers=None, metric='ndcg'):
    """
    Return the id of the stored ranking of `model` having the highest `metric` score.

    Only rankings whose metadata match `splitVersion`, `maxUsers` and `model` are
    considered.

    :param model: base model name (compared with the 'model' field of the metadata)
    :param splitVersion: split version the ranking must have been computed on
    :param maxUsers: user sub-sampling the ranking must have been computed with
        (default None, i.e. rankings computed without sub-sampling)
    :param metric: name of the score used to compare rankings
    :return: the 'id' field of the best matching ranking
    :raises IndexError: when no matching ranking with a score for `metric` exists
    """
    twinewsRankings = getTwinewsRankings(verbose=False)
    twinewsScores = getTwinewsScores(verbose=False)
    candidates = []
    for key in twinewsRankings.keys():
        meta = twinewsRankings.getMeta(key)
        # The `maxUsers` argument is honored here instead of a hard-coded None
        # (the default value keeps the previous behavior):
        if meta['splitVersion'] != splitVersion \
        or meta['maxUsers'] != maxUsers \
        or meta['model'] != model:
            continue
        scoreRow = twinewsScores.findOne({'id': meta['id'], 'metric': metric})
        # A ranking without any score for this metric cannot be compared, we skip it
        # (assumes findOne returns None when nothing matches — TODO confirm):
        if scoreRow is None:
            continue
        # We collect plain tuples so that the metadata dict is left untouched:
        candidates.append((meta['id'], scoreRow['score']))
    if len(candidates) == 0:
        raise IndexError("No scored ranking found for model=" + str(model)
                         + ", splitVersion=" + str(splitVersion)
                         + ", maxUsers=" + str(maxUsers)
                         + ", metric=" + str(metric))
    # Highest score wins (the first one encountered in case of a tie):
    best = max(candidates, key=lambda row: row[1])[0]
    return best

In [8]:
rankingsCache = None
def getRankings(model, userIds=None):
    """
    Return the rankings stored under the key `model`, optionally restricted to `userIds`.

    Loaded rankings are memoized in the module-level `rankingsCache` dict (created
    lazily, so that setting it back to None elsewhere simply empties the cache).

    :param model: key of the rankings in the Twinews rankings store
    :param userIds: optional collection of user ids to keep (None keeps everything)
    :return: the rankings, passed through `dictSelect` when `userIds` is given
    """
    global rankingsCache
    if rankingsCache is None:
        rankingsCache = {}
    # The store handle is obtained on every call, even on a cache hit:
    store = getTwinewsRankings(verbose=False)
    if model not in rankingsCache:
        rankingsCache[model] = store[model]
    found = rankingsCache[model]
    assert found is not None
    if userIds is None:
        return found
    return dictSelect(found, userIds)

In [9]:
userIdsCache = None
def generateRankings(config, logger=None, verbose=True):
    """
    Merge the rankings of several models into combined rankings.

    `config` is modified in place: missing 'rankAsScore', 'weights', 'alphas' and
    'betas' entries receive default values (with a warning), and 'models' is replaced
    by the sorted keys returned by `getBestModelKey` for each given model name.

    :param config: dict with at least 'models' (list or set of at least 2 model names,
        a list must already be in alphabetic order), 'splitVersion' and 'maxUsers'
    :param logger: logger handed to the logging helpers
    :param verbose: verbosity handed to the logging helpers
    :return: a tuple (config, combRankings) where combRankings maps each user id of
        the first model's rankings to the list of its merged rankings
    :raises Exception: when 'models' is a list which is not in alphabetic order
    """
    # We init user ids cache:
    global userIdsCache
    if userIdsCache is None:
        userIdsCache = dict()
    # We get models:
    assert "models" in config
    assert isinstance(config["models"], (list, set))
    assert len(config["models"]) >= 2
    # The check must read config["models"]: the local `models` does not exist yet at this point.
    if isinstance(config["models"], list) and config["models"] != sorted(config["models"]):
        raise Exception("You need to give models by alphabetic order")
    models = sorted(list(config["models"]))
    # We set default params:
    if not dictContains(config, 'rankAsScore'):
        config['rankAsScore'] = [True] * len(models)
        logWarning("You didn't set rankAsScore", logger=logger, verbose=verbose)
    if not dictContains(config, 'weights'):
        config['weights'] = [1.0 / len(models)] * len(models)
        logWarning("You didn't set weights", logger=logger, verbose=verbose)
    if not dictContains(config, 'alphas'):
        config['alphas'] = [0.5] * len(models)
        logWarning("You didn't set alphas", logger=logger, verbose=verbose)
    if not dictContains(config, 'betas'):
        config['betas'] = [NormalizedLawBeta.LOG] * len(models)
        logWarning("You didn't set betas", logger=logger, verbose=verbose)
    # We get user ids (None means no restriction):
    userIds = None
    if config["maxUsers"] is not None:
        # The sampled users depend on the split version too, so both are part of the cache key:
        cacheKey = (config['splitVersion'], config["maxUsers"])
        if cacheKey in userIdsCache:
            userIds = userIdsCache[cacheKey]
        else:
            evalData = getEvalData(config['splitVersion'], maxExtraNews=0, maxUsers=config['maxUsers'],
                                   logger=logger, verbose=verbose)
            userIds = set(evalData['candidates'].keys())
            log("Users: " + b(userIds), logger)
            userIdsCache[cacheKey] = userIds
    # We get best models:
    # NOTE(review): the keys are sorted again, the per-model parameters (weights, alphas...)
    # are presumably matched by position with this final order — confirm against mergeRankings.
    models = sorted([getBestModelKey(model, config['splitVersion']) for model in models])
    log("Setting models as: " + b(models), logger)
    config['models'] = models
    # We get rankings:
    modelsRankings = []
    for model in models:
        modelsRankings.append(getRankings(model, userIds=userIds))
    # We generate rankings:
    mergeRankingsKwargs = dictSelect(config, {'rankAsScore', 'weights', 'alphas', 'betas'})
    combRankings = dict()
    # Users and ranking counts are taken from the first model, the other models are
    # expected to provide the same users and the same number of rankings per user:
    for userId in modelsRankings[0].keys():
        for rkIndex in range(len(modelsRankings[0][userId])):
            currentRankings = [modelRankings[userId][rkIndex] for modelRankings in modelsRankings]
            merged = mergeRankings(currentRankings, **mergeRankingsKwargs, returnScores=False, logger=logger)
            if userId not in combRankings:
                combRankings[userId] = [None] * len(modelsRankings[0][userId])
            combRankings[userId][rkIndex] = merged
    # And we return it:
    return (config, combRankings)

## Exec

In [10]:
# Base configuration shared by every combination run.
# NOTE(review): both branches of the `maxUsers` conditional are None, so no user
# sub-sampling is applied whether TEST is set or not; edit the first branch to
# sub-sample test runs.
config = \
{
    'splitVersion': 2,
    'maxUsers': None if TEST else None, # Sub-sampling
}

In [11]:
# Model name under which the combined rankings are stored (first argument of addRanking below):
modelName = 'combin'

In [12]:
# Base model names that can be combined with each other:
modelsDomain = {'lda', 'nmf', 'tfidf', 'bm25', 'dbert-ft', 'dbert-base', 
          'stylo', 'infersent', 'sent2vec', 'word2vec', 'doc2vec', 'usent', 'bert'}
# Candidate values for the merge parameters.
# NOTE(review): these three domains are not referenced in the visible part of the
# notebook, fixed parameter values are used instead — presumably kept for a future search.
rankAsScoreDomain = {True, False}
alphasDomain = {0.1, 0.25, 0.5, 0.75, 0.9}
betasDomain = {NormalizedLawBeta.LOG, NormalizedLawBeta.EXP}

In [13]:
def combineModels(modelsDomain, amount):
    """
    Build every distinct group of different models from `modelsDomain`.

    Tuples produced by `combine(modelsDomain, amount)` that contain a repeated
    element are dropped, the remaining ones are turned into sets, and duplicated
    sets are removed.

    :param modelsDomain: collection of model names
    :param amount: handed to `combine` (presumably the size of each group — TODO confirm)
    :return: list of sets, ordered by the string form of their sorted elements
    """
    uniques = []
    alreadySeen = set()
    for candidate in combine(modelsDomain, amount):
        group = set(candidate)
        # A smaller set means the tuple repeated at least one element:
        if len(group) != len(candidate):
            continue
        signature = frozenset(group)
        if signature in alreadySeen:
            continue
        alreadySeen.add(signature)
        uniques.append(group)
    uniques.sort(key=lambda group: str(sorted(group)))
    return uniques

In [14]:
# Model groups built with amount=2:
modelsComb2 = combineModels(modelsDomain, 2)

In [15]:
# In test mode only the first two combinations are kept, to make the run short:
if TEST:
    modelsComb2 = modelsComb2[:2]

In [16]:
# Display the retained combinations through the logger
# (`bp` is a project printing helper; the meaning of the `5` argument is not visible here — TODO confirm):
bp(modelsComb2, 5, logger)

[ { bert, bm25 }, { bert, dbert-base } ]


In [17]:
# Fixed merge parameters used for every pair of models, one value per model
# (the same values generateRankings would set by default for 2 models):
rankAsScore = [True, True]
weights = [0.5, 0.5]
alphas = [0.5, 0.5]
betas = [NormalizedLawBeta.LOG, NormalizedLawBeta.LOG]

In [None]:
# For every model combination: build its configuration, merge the rankings and store them.
# (`pb` presumably wraps the iterable with a progress display — TODO confirm.)
# The base config is combined with the combination-specific parameters through mergeDicts
# (assumes mergeDicts returns the merged dict — TODO confirm whether `config` itself is modified).
for models in pb(modelsComb2, logger=logger, printRatio=0.01):
    currentConfig = mergeDicts\
    (
        config,
        {
            'models': models,
            'rankAsScore': rankAsScore,
            'weights': weights,
            'alphas': alphas,
            'betas': betas,
        }
    )
    # generateRankings returns the config (with 'models' replaced by concrete ranking keys)
    # together with the merged rankings:
    (currentConfig, rankings) = generateRankings(currentConfig, logger=logger)
    try:
        addRanking(modelName, rankings, currentConfig, logger=logger)
    except Exception as e:
        # Best effort: a failure to store one combination is only logged, the loop goes on.
        log(str(e), logger)
    warnFreeRAM()
    # Low-memory guard (the unit of the threshold is presumably GB — TODO confirm):
    # the rankings cache is dropped and the remaining combinations are NOT processed.
    if freeRAM() < 4:
        rankingsCache = None
        break

Setting models as: [ bert-65e2b, bm25-1eb2a ]
