In [1]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from datastructuretools.cache import *
from twinews.utils import *
from twinews.evaluation.utils import *
from twinews.models.ranking import *

In [3]:
from dataviztools import bokehutils
from bokeh.plotting import output_notebook, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
output_notebook()

In [4]:
import importlib
importlib.reload(bokehutils)

<module 'dataviztools.bokehutils' from '/hosthome/Workspace/Python/Utils/DatavizTools/dataviztools/bokehutils.py'>

In [5]:
logger = Logger()

# Printing stats on rankings

In [6]:
twinewsRankings = getTwinewsRankings()
models = set()
for row in twinewsRankings.collection.find({"meta.splitVersion": 1, 'meta.maxUsers': None}):
    if row['meta']['model'] != 'combin':
        models.add(row['meta']['model'])
models.remove('worst')
models.remove('ideal')
models.remove('random')
keys = set()
for model in models:
    keys.add(twinewsRankings.collection.find_one({'meta.model': model, "meta.splitVersion": 1, 'meta.maxUsers': None})['id'])
keys.add("combin-8f846")
keys.add("combin-efad4")
keys = sorted(list(keys))
print(keys)

['bert-a7c9a', 'bm25-933f7', 'combin-8f846', 'combin-efad4', 'dbert-base-d092a', 'dbert-ft-7847a', 'doc2vec-e013a', 'infersent-77ec7', 'jaccard-1d3f1', 'lda-82272', 'nmf-6078e', 'sent2vec-32f92', 'stylo-5c321', 'tfidf-4a1dc', 'usent-64ec0', 'word2vec-f3c38']


In [7]:
def rankingReport(rankings, maxUsers=None, logger=None, verbose=True, pbarVerbose=True):
    if maxUsers is not None:
        keys = shuffle(list(rankings.keys()), seed=0)[:maxUsers]
        rankings = dictSelect(rankings, keys)
    # We find all data:
    medians = []
    means = []
    rkvs = []
    scoress = []
    allScores = []
    for userId in pb(list(rankings.keys()), logger=logger, verbose=pbarVerbose):
        assert isinstance(rankings[userId][0][0], tuple)
        scores = [e[1] for e in rankings[userId][0]]
        scores = normalizeRankingScores(scores)
        medians.append(np.median(scores))
        means.append(np.mean(scores))
        rkvs.append(rankingVariance(scores))
        scoress.append(scores)
        allScores += list(scores)
    # We print means and std:
    for values, label in \
    [
        (medians, "Median"), (means, "Mean"),
        (rkvs, "Ranking variance")
    ]:
        std = np.std(values)
        # std = mean_confidence_interval(values)
        std = truncateFloat(std, 2)
        mean = truncateFloat(np.mean(values), 2)
        text = label + ": " + str(mean) + " ±" + str(std) + " (95%)"
        log(text, logger, verbose=verbose)
    # We show all scores histogram:
    log("Histogram of all scores (as distances):", logger=logger, verbose=verbose)
    show(bokehutils.barplot(allScores, 30, hideTools=True, hideXAxis=True, hideYAxis=True, title="", height=100, width=300))
    # We show a sample of scores:
    log("Histogram samples of scores (as distances):", logger=logger, verbose=verbose)
    scoress = shuffle(list(scoress))
    figures = []
    for row in range(3):
        currentFigures = []
        for col in range(3):
            scores = scoress.pop()
            f = bokehutils.barplot(scores, 10, hideTools=True, hideXAxis=True, hideYAxis=True, title="", height=40, width=100)
            currentFigures.append(f)
        figures.append(currentFigures)
    grid = gridplot(figures, toolbar_options={'logo': None})
    show(grid)

In [8]:
tt = TicToc()
tt.tic()
for key in keys:
    print("#" * 20 + " " + key + " " + "#" * 20)
    rankings = twinewsRankings[key]
    tt.tic("Got " + key + " rankings")
    if rankings is None:
        print("No rankings")
    else:
        if isinstance(rankings[list(rankings.keys())[0]][0][0], tuple):
            try:
                rankingReport(rankings, maxUsers=None, pbarVerbose=False)
            except Exception as e:
                logException(e)
            tt.tic("Stats printed for " + key)
        else:
            print("No scores")
tt.toc()

--> tictoc starts...
#################### bert-a7c9a ####################
--> tic: 26.17s | message: Got bert-a7c9a rankings
Median: 0.18 ±0.04 (95%)
Mean: 0.2 ±0.04 (95%)
Ranking variance: 0.04 ±0.03 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 42.539s | message: Stats printed for bert-a7c9a
#################### bm25-933f7 ####################
--> tic: 31.03s | message: Got bm25-933f7 rankings
Median: 0.75 ±0.09 (95%)
Mean: 0.73 ±0.09 (95%)
Ranking variance: 0.03 ±0.04 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 1m 52.48s | message: Stats printed for bm25-933f7
#################### combin-8f846 ####################
--> tic: 53.13s | message: Got combin-8f846 rankings
No scores
#################### combin-efad4 ####################
--> tic: 42.32s | message: Got combin-efad4 rankings
No scores
#################### dbert-base-d092a ####################
--> tic: 26.77s | message: Got dbert-base-d092a rankings
Median: 0.19 ±0.06 (95%)
Mean: 0.21 ±0.06 (95%)
Ranking variance: 0.03 ±0.02 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 35.33s | message: Stats printed for dbert-base-d092a
#################### dbert-ft-7847a ####################
--> tic: 28.97s | message: Got dbert-ft-7847a rankings
Median: 0.56 ±0.06 (95%)
Mean: 0.55 ±0.06 (95%)
Ranking variance: 0.01 ±0.0 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 10.09s | message: Stats printed for dbert-ft-7847a
#################### doc2vec-e013a ####################
--> tic: 32.5s | message: Got doc2vec-e013a rankings
Median: 0.6 ±0.06 (95%)
Mean: 0.59 ±0.05 (95%)
Ranking variance: 0.01 ±0.01 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 7.31s | message: Stats printed for doc2vec-e013a
#################### infersent-77ec7 ####################
--> tic: 37.75s | message: Got infersent-77ec7 rankings
Median: 0.24 ±0.05 (95%)
Mean: 0.26 ±0.05 (95%)
Ranking variance: 0.03 ±0.03 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 29.46s | message: Stats printed for infersent-77ec7
#################### jaccard-1d3f1 ####################
--> tic: 30.99s | message: Got jaccard-1d3f1 rankings
Median: 0.85 ±0.04 (95%)
Mean: 0.83 ±0.04 (95%)
Ranking variance: 0.04 ±0.04 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 1m 39.959s | message: Stats printed for jaccard-1d3f1
#################### lda-82272 ####################
--> tic: 32.31s | message: Got lda-82272 rankings
Median: 0.9 ±0.06 (95%)
Mean: 0.84 ±0.06 (95%)
Ranking variance: 0.01 ±0.01 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 1m 51.86s | message: Stats printed for lda-82272
#################### nmf-6078e ####################
--> tic: 31.45s | message: Got nmf-6078e rankings
Median: 0.77 ±0.08 (95%)
Mean: 0.73 ±0.07 (95%)
Ranking variance: 0.01 ±0.0 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 0.069s | message: Stats printed for nmf-6078e
#################### sent2vec-32f92 ####################
--> tic: 30.57s | message: Got sent2vec-32f92 rankings
Median: 0.34 ±0.04 (95%)
Mean: 0.36 ±0.04 (95%)
Ranking variance: 0.01 ±0.01 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 26.24s | message: Stats printed for sent2vec-32f92
#################### stylo-5c321 ####################
--> tic: 33.24s | message: Got stylo-5c321 rankings
Median: 0.01 ±0.01 (95%)
Mean: 0.02 ±0.01 (95%)
Ranking variance: 0.19 ±0.18 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 47.58s | message: Stats printed for stylo-5c321
#################### tfidf-4a1dc ####################
--> tic: 32.32s | message: Got tfidf-4a1dc rankings
Median: 0.79 ±0.06 (95%)
Mean: 0.77 ±0.06 (95%)
Ranking variance: 0.04 ±0.05 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 1m 57.73s | message: Stats printed for tfidf-4a1dc
#################### usent-64ec0 ####################
--> tic: 33.68s | message: Got usent-64ec0 rankings
Median: 0.57 ±0.07 (95%)
Mean: 0.56 ±0.06 (95%)
Ranking variance: 0.0 ±0.0 (95%)
Histogram of all scores (as distances):


Histogram samples of scores (as distances):


--> tic: 2m 13.28s | message: Stats printed for usent-64ec0
#################### word2vec-f3c38 ####################
--> tic: 35.75s | message: Got word2vec-f3c38 rankings


  return [(i - theMin) / (theMax - theMin) for i in theList]


Exception type: <class 'ValueError'>
Exception: could not convert string to float: 'nan.00'
Traceback (most recent call last):
  File "<ipython-input-8-39a2eb1ae2aa>", line 12, in <module>
    rankingReport(rankings, maxUsers=None, pbarVerbose=False)
  File "<ipython-input-7-5e8707b244ad>", line 28, in rankingReport
    std = truncateFloat(std, 2)
  File "/hosthome/Workspace/Python/Utils/SystemTools/systemtools/number.py", line 135, in truncateFloat
    return float('.'.join([i, (d+'0'*n)[:n]]))
ValueError: could not convert string to float: 'nan.00'

--> tic: 1m 49.5s | message: Stats printed for word2vec-f3c38
--> toc total duration: 39m 42.51s


2382.51

# Avantage d'un modèle dans une combinaison

In [9]:
def getDominance(combin, logger=None, verbose=False):
    meta = twinewsRankings.getMeta(combin)
    model1, model2 = meta['models'][0], meta['models'][1]
    combinRankings = rankingsCache[combin]
    model1Rankings = rankingsCache[model1]
    model2Rankings = rankingsCache[model2]
    userIds = list(combinRankings.keys())
    domin1 = 0
    domin2 = 0
    dominationRatio1 = None
    for i in range(1000000):
        userId = random.choice(userIds)
        crk = pruneScores(combinRankings[userId][0])
        m1rk = pruneScores(model1Rankings[userId][0])
        m2rk = pruneScores(model2Rankings[userId][0])
        urlIndex = getRandomInt(0, 999)
        url = crk[urlIndex]
        position1 = m1rk.index(url)
        position2 = m2rk.index(url)
        distance1 = abs(position1 - urlIndex)
        distance2 = abs(position2 - urlIndex)
        if distance1 == distance2:
            pass
        elif distance1 < distance2:
            domin1 += 1
        else:
            domin2 += 1
        if i > 1000 and i % 10000 == 0:
            newDominationRatio1 = domin1 / (domin1 + domin2)
            if dominationRatio1 is not None:
                if abs(dominationRatio1 - newDominationRatio1) < 0.0005:
                    break
            dominationRatio1 = newDominationRatio1
            log("dominationRatio1: " + str(truncateFloat(dominationRatio1, 2)), logger, verbose=verbose)
            log('-' * 10, logger, verbose=verbose)
    dominationRatio1 = domin1 / (domin1 + domin2)
    return dominationRatio1

In [10]:
twinewsRankings = getTwinewsRankings()
rankingsCache = Cache(twinewsRankings.get, minFreeRAM=20, logger=logger)

In [11]:
combinsSplit1 = getCombinIds(1)
combinsSplit2 = getCombinIds(2)
bp(combinsSplit1, logger)
bp(combinsSplit2, logger)

{ combin-00a04, combin-01931, ..., combin-fbf78, combin-fe28e }
{ combin-01199, combin-018e2, ..., combin-faf94, combin-fc69a }


In [12]:
dominancesSD = getDominancesSD()
bp(list(dominancesSD.items()), logger)

[ ( combin-3a676, 0.4 ), ( combin-3c5f3, 0.07 ), ..., ( combin-7f69a, 0.32 ), ( combin-a6f82, 0.33 ) ]


serializabledict twinews-dominances initialised.


In [13]:
for combin in pb(list(combinsSplit1) + list(combinsSplit2), logger=logger, printRatio=0.01):
    if combin not in dominancesSD:
        meta = twinewsRankings.getMeta(combin)
        model1, model2 = meta['models'][0], meta['models'][1]
        rankAsScore = meta['rankAsScore']
        log("#" * 10 + " " + combin + " (" + model1 + " / " + model2 + ") " + "#" * 10, logger)
        log("rankAsScore: " + str(rankAsScore), logger)
        dominationRatio1 = getDominance(combin, logger=logger, verbose=False)
        log("Dominance prob of " + model1 + ": " + str(truncateFloat(dominationRatio1, 2)), logger)
        # log("#" * 60, logger)
        dominancesSD[combin] = dominationRatio1

  0% [                    ]
  0% [                    ]
  1% [                    ]
  2% [                    ]
  3% [                    ]
  4% [                    ]
  5% [=                   ] (0.326s left)
  6% [=                   ] (0.277s left)
  7% [=                   ] (0.36s left)
  8% [=                   ] (0.316s left)
  9% [=                   ] (0.376s left)
 10% [==                  ] (0.338s left)
 11% [==                  ] (0.306s left)
 12% [==                  ] (0.35s left)
 13% [==                  ] (0.321s left)
 14% [==                  ] (0.356s left)
 15% [===                 ] (0.329s left)
 16% [===                 ] (0.358s left)
 17% [===                 ] (0.334s left)
 18% [===                 ] (0.357s left)
 19% [===                 ] (0.335s left)
 20% [====                ] (0.316s left)
 21% [====                ] (0.335s left)
 22% [====                ] (0.316s left)
 23% [====                ] (0.333s left)
 24% [====                ] (0.316s 

# Trash

In [None]:
if False:
    def getDominances(combin):
        meta = twinewsRankings.getMeta(combin)
        model1, model2 = meta['models'][0], meta['models'][1]
        combinRankings = rankingsCache[combin]
        model1Rankings = rankingsCache[model1]
        model2Rankings = rankingsCache[model2]
        userIds = list(combinRankings.keys())
        p1s = []
        p2s = []
        p1 = None
        p2 = None
        for i in range(1000000):
            userId = random.choice(userIds)
            crk = pruneScores(combinRankings[userId][0])
            m1rk = pruneScores(model1Rankings[userId][0])
            m2rk = pruneScores(model2Rankings[userId][0])
            urlIndex = getRandomInt(0, 999)
            url = crk[urlIndex]
            position1 = m1rk.index(url)
            position2 = m2rk.index(url)
            p1s.append(position1 < urlIndex)
            p2s.append(position2 < urlIndex)
            if i % 10000 == 0:
                newp1 = p1s.count(True) / len(p1s)
                newp2 = p2s.count(True) / len(p2s)
                if p1 is not None and p2 is not None:
                    if abs(p1 - newp1) < 0.001 and abs(p2 - newp2) < 0.001:
                        break
                p1, p2 = newp1, newp2
                # print("p1: " + str(truncateFloat(p1, 2)))
                # print("p2: " + str(truncateFloat(p2, 2)))
                # print('-' * 10)
        p1 = p1s.count(True) / len(p1s)
        p2 = p2s.count(True) / len(p2s)
        return (p1, p2)