In [None]:
# Expose no CUDA device to this process: CUDA_VISIBLE_DEVICES is set to an
# empty string before any other library is imported.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [None]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from datastructuretools.cache import *
from twinews.utils import *
from twinews.evaluation.utils import *
from twinews.models.ranking import *

In [None]:
from dataviztools import bokehutils
from bokeh.plotting import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
# Configure bokeh so that figures passed to show() are rendered inline in the notebook.
output_notebook()

In [None]:
# Development helper: re-import bokehutils so that edits made to the module
# are picked up without restarting the kernel.
import importlib
importlib.reload(bokehutils)

In [None]:
# Shared logger handed to the helpers used in this notebook (log, pb, bp, Cache).
logger = Logger()

# Printing stats on rankings

In [None]:
# Build the sorted list of ranking ids to report on: one ranking per model
# stored for split version 1 without a user limit, plus two hand-picked
# combinations.
twinewsRankings = getTwinewsRankings()
# Filter shared by both queries below (kept in one place so they cannot drift).
rankingsFilter = {"meta.splitVersion": 1, 'meta.maxUsers': None}
models = set()
for row in twinewsRankings.collection.find(rankingsFilter):
    if row['meta']['model'] != 'combin':
        models.add(row['meta']['model'])
# Drop the baseline models. A set difference is used instead of remove() so
# that a baseline which has no ranking for this filter does not raise KeyError.
models -= {'worst', 'ideal', 'random'}
keys = set()
for model in models:
    # Only the first matching ranking of each model is kept.
    keys.add(twinewsRankings.collection.find_one({'meta.model': model, **rankingsFilter})['id'])
keys.add("combin-8f846")
keys.add("combin-efad4")
keys = sorted(keys)
print(keys)

In [None]:
def rankingReport(rankings, maxUsers=None, logger=None, verbose=True, pbarVerbose=True):
    """
    Log summary statistics of per-user ranking scores and display histograms.

    For every user, the scores of ``rankings[userId][0]`` (a list of tuples
    whose second element is the score) are passed through
    ``normalizeRankingScores``. The per-user median, mean and
    ``rankingVariance`` are aggregated over all users and logged, then a
    histogram of all scores and a grid of at most 3 x 3 per-user histograms
    are shown with bokeh.

    :param rankings: mapping ``userId -> rankings`` where ``rankings[userId][0]``
        is a list of ``(item, score)`` tuples (checked by an assertion).
    :param maxUsers: when not None, the report is restricted to the first
        ``maxUsers`` user ids of a seeded (``seed=0``) shuffle.
    :param logger: logger forwarded to ``log`` and ``pb``.
    :param verbose: forwarded to ``log``.
    :param pbarVerbose: forwarded to ``pb`` as its ``verbose`` argument.
    :return: None. Nothing is displayed when there is no user to report on.
    """
    if maxUsers is not None:
        keys = shuffle(list(rankings.keys()), seed=0)[:maxUsers]
        rankings = dictSelect(rankings, keys)
    userIds = list(rankings.keys())
    if len(userIds) == 0:
        # Without any user there is nothing to aggregate or to plot.
        log("No rankings to report.", logger, verbose=verbose)
        return
    # We find all data:
    medians = []
    means = []
    rkvs = []
    scoress = []
    allScores = []
    for userId in pb(userIds, logger=logger, verbose=pbarVerbose):
        assert isinstance(rankings[userId][0][0], tuple)
        scores = [e[1] for e in rankings[userId][0]]
        scores = normalizeRankingScores(scores)
        medians.append(np.median(scores))
        means.append(np.mean(scores))
        rkvs.append(rankingVariance(scores))
        scoress.append(scores)
        allScores.extend(scores)
    # We print means and std:
    for values, label in \
    [
        (medians, "Median"), (means, "Mean"),
        (rkvs, "Ranking variance")
    ]:
        # NOTE(review): the value printed after "±" is the standard deviation
        # over users (np.std), not a 95% confidence interval as the "(95%)"
        # suffix suggests. The label is left untouched to keep the log output
        # stable.
        std = np.std(values)
        # std = mean_confidence_interval(values)
        std = truncateFloat(std, 2)
        mean = truncateFloat(np.mean(values), 2)
        text = label + ": " + str(mean) + " ±" + str(std) + " (95%)"
        log(text, logger, verbose=verbose)
    # We show all scores histogram:
    log("Histogram of all scores (as distances):", logger=logger, verbose=verbose)
    show(bokehutils.barplot(allScores, 30, hideTools=True, hideXAxis=True, hideYAxis=True, title="", height=100, width=300))
    # We show a sample of scores:
    log("Histogram samples of scores (as distances):", logger=logger, verbose=verbose)
    scoress = shuffle(list(scoress))
    figures = []
    for row in range(3):
        currentFigures = []
        for col in range(3):
            if len(scoress) == 0:
                # Fewer than 9 users: stop instead of popping from an empty list.
                break
            scores = scoress.pop()
            f = bokehutils.barplot(scores, 10, hideTools=True, hideXAxis=True, hideYAxis=True, title="", height=40, width=100)
            currentFigures.append(f)
        if len(currentFigures) > 0:
            figures.append(currentFigures)
    if len(figures) > 0:
        grid = gridplot(figures, toolbar_options={'logo': None})
        show(grid)

In [None]:
# Print the ranking report of every selected ranking id and time each step.
tt = TicToc()
tt.tic()
for key in keys:
    print("{0} {1} {0}".format("#" * 20, key))
    rankings = twinewsRankings[key]
    tt.tic("Got {} rankings".format(key))
    if rankings is None:
        print("No rankings")
        continue
    # Only rankings made of (item, score) tuples can be reported on; the
    # first user's ranking is used as a probe.
    if not isinstance(rankings[list(rankings.keys())[0]][0][0], tuple):
        print("No scores")
        continue
    try:
        rankingReport(rankings, maxUsers=None, pbarVerbose=False)
    except Exception as e:
        # A failing report is logged and the loop moves on to the next key.
        logException(e)
    tt.tic("Stats printed for {}".format(key))
tt.toc()

# Avantage d'un modèle dans une combinaison

In [None]:
def _dominationRatio(domin1, domin2):
    """Share of decisive comparisons won by model 1; 0.5 when there is none."""
    decisive = domin1 + domin2
    if decisive == 0:
        # Every comparison so far was a tie: neither model dominates.
        return 0.5
    return domin1 / decisive


def getDominance(combin, logger=None, verbose=False):
    """
    Estimate how often the first model of a combination "dominates" the second.

    Random (user, position) pairs are sampled from the combined ranking. For
    each sampled url, the distance between its position in the combined
    ranking and its position in each model's ranking is computed; the model
    with the smaller distance wins the comparison and ties are ignored.
    Sampling stops after 1,000,000 draws, or earlier when the ratio measured
    at two consecutive checkpoints (every 10,000 draws) differs by less than
    0.0005.

    :param combin: id of the combination ranking; its meta must expose the two
        combined model ids under ``meta['models']``.
    :param logger: logger forwarded to ``log``.
    :param verbose: when True, the intermediate ratio is logged at checkpoints.
    :return: ``domin1 / (domin1 + domin2)``, i.e. the share of non-tied
        comparisons won by the first model, or 0.5 when all comparisons are
        ties (the original code raised ZeroDivisionError in that case).
    """
    meta = twinewsRankings.getMeta(combin)
    model1, model2 = meta['models'][0], meta['models'][1]
    combinRankings = rankingsCache[combin]
    model1Rankings = rankingsCache[model1]
    model2Rankings = rankingsCache[model2]
    userIds = list(combinRankings.keys())
    domin1 = 0
    domin2 = 0
    dominationRatio1 = None
    for i in range(1000000):
        userId = random.choice(userIds)
        crk = pruneScores(combinRankings[userId][0])
        m1rk = pruneScores(model1Rankings[userId][0])
        m2rk = pruneScores(model2Rankings[userId][0])
        if len(crk) == 0:
            # Nothing to sample for this user.
            continue
        # Sample among the first 1000 positions as before, but never beyond
        # the end of a shorter ranking (the hard-coded 999 raised IndexError).
        urlIndex = getRandomInt(0, min(len(crk), 1000) - 1)
        url = crk[urlIndex]
        position1 = m1rk.index(url)
        position2 = m2rk.index(url)
        distance1 = abs(position1 - urlIndex)
        distance2 = abs(position2 - urlIndex)
        if distance1 == distance2:
            pass
        elif distance1 < distance2:
            domin1 += 1
        else:
            domin2 += 1
        # Convergence checkpoint; "i > 1000" skips the very first iteration.
        if i > 1000 and i % 10000 == 0:
            newDominationRatio1 = _dominationRatio(domin1, domin2)
            if dominationRatio1 is not None:
                if abs(dominationRatio1 - newDominationRatio1) < 0.0005:
                    break
            dominationRatio1 = newDominationRatio1
            log("dominationRatio1: " + str(truncateFloat(dominationRatio1, 2)), logger, verbose=verbose)
            log('-' * 10, logger, verbose=verbose)
    return _dominationRatio(domin1, domin2)

In [None]:
# Re-create the rankings accessor and wrap its `get` method in a Cache:
# getDominance reads rankings through rankingsCache[...].
# NOTE(review): minFreeRAM=20 is presumably a free-memory threshold that
# triggers cache cleaning; confirm its unit in datastructuretools.cache.
twinewsRankings = getTwinewsRankings()
rankingsCache = Cache(twinewsRankings.get, minFreeRAM=20, logger=logger)

In [None]:
# Ids of the combination rankings returned by getCombinIds for the arguments
# 1 and 2 (presumably the split version; confirm in twinews), printed for inspection.
combinsSplit1 = getCombinIds(1)
combinsSplit2 = getCombinIds(2)
bp(combinsSplit1, logger)
bp(combinsSplit2, logger)

In [None]:
# Dict-like store mapping a combination id to its dominance ratio; the entries
# it already holds are printed here.
# NOTE(review): presumably persisted between runs (the "SD" suffix); confirm in twinews.
dominancesSD = getDominancesSD()
bp(list(dominancesSD.items()), logger)

In [None]:
# Compute and store the dominance ratio of every combination (both splits)
# that is not already present in dominancesSD.
for combin in pb([*combinsSplit1, *combinsSplit2], logger=logger, printRatio=0.01):
    if combin in dominancesSD:
        continue
    meta = twinewsRankings.getMeta(combin)
    model1 = meta['models'][0]
    model2 = meta['models'][1]
    rankAsScore = meta['rankAsScore']
    log("{0} {1} ({2} / {3}) {0}".format("#" * 10, combin, model1, model2), logger)
    log("rankAsScore: {!s}".format(rankAsScore), logger)
    dominationRatio1 = getDominance(combin, logger=logger, verbose=False)
    log("Dominance prob of {}: {!s}".format(model1, truncateFloat(dominationRatio1, 2)), logger)
    dominancesSD[combin] = dominationRatio1

# Trash

In [None]:
# Dead code kept for reference: the `if False:` guard means getDominances is
# never defined when this cell runs.
if False:
    def getDominances(combin):
        """
        Earlier variant of getDominance (never defined, see the guard above).

        Samples random (user, position) pairs from the combined ranking and
        records, for each of the two combined models, whether the sampled url
        sits at a strictly smaller index in that model's ranking than in the
        combined ranking.

        :param combin: id of the combination ranking.
        :return: tuple (p1, p2), the fraction of samples for which that
            condition held for model 1 and for model 2.
        """
        meta = twinewsRankings.getMeta(combin)
        model1, model2 = meta['models'][0], meta['models'][1]
        combinRankings = rankingsCache[combin]
        model1Rankings = rankingsCache[model1]
        model2Rankings = rankingsCache[model2]
        userIds = list(combinRankings.keys())
        # One boolean per sample for each model; both lists grow with every iteration.
        p1s = []
        p2s = []
        p1 = None
        p2 = None
        for i in range(1000000):
            userId = random.choice(userIds)
            crk = pruneScores(combinRankings[userId][0])
            m1rk = pruneScores(model1Rankings[userId][0])
            m2rk = pruneScores(model2Rankings[userId][0])
            # Hard-coded upper bound: assumes at least 1000 entries per ranking.
            urlIndex = getRandomInt(0, 999)
            url = crk[urlIndex]
            position1 = m1rk.index(url)
            position2 = m2rk.index(url)
            p1s.append(position1 < urlIndex)
            p2s.append(position2 < urlIndex)
            # Every 10000 samples: stop once both fractions moved by less than 0.001.
            if i % 10000 == 0:
                newp1 = p1s.count(True) / len(p1s)
                newp2 = p2s.count(True) / len(p2s)
                if p1 is not None and p2 is not None:
                    if abs(p1 - newp1) < 0.001 and abs(p2 - newp2) < 0.001:
                        break
                p1, p2 = newp1, newp2
                # print("p1: " + str(truncateFloat(p1, 2)))
                # print("p2: " + str(truncateFloat(p2, 2)))
                # print('-' * 10)
        p1 = p1s.count(True) / len(p1s)
        p2 = p2s.count(True) / len(p2s)
        return (p1, p2)