This notebook allows you to print the results of models as tables.

In [1]:
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-aware libraries
# loaded later in this kernel, so it has to be set before they are imported.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# True when no `__file__` name exists in the current namespace.
# NOTE(review): presumably used to tell an interactive notebook session from a
# script run, where `__file__` is defined -- confirm against the code using it.
isNotebook = not ('__file__' in locals())

In [3]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from twinews.utils import *

In [4]:
import pandas as pd
from IPython.display import display, HTML

In [17]:
def printResults\
(
    model=None,
    splitVersion=None,
    metaFilter=None, # A dict that maps fields to mandatory values
    metricsFilter=None, # A set of allowed metrics
    noSubsampling=True,
    logger=None,
    sortBy=None,
):
    """
    Display and return a table of the stored rankings matching a filter.

    Each row is the metadata of one ranking (`getTwinewsRankings().getMeta(key)`)
    extended with one column per metric found in `getTwinewsScores()` for the
    same `id`. When several rows are selected, the metadata fields holding the
    same value on every row are logged once and removed from the table.

    Parameters:
        model: if not None, keep only rankings whose `model` field equals it
            (ignored when `metaFilter` already has a "model" key).
        splitVersion: if not None, keep only rankings whose `splitVersion`
            field equals it (ignored when `metaFilter` already has that key).
        metaFilter: dict mapping a metadata field to its mandatory value.
            None or {} adds no constraint. Note that `historyRef` is compared
            after being converted to `str`.
        metricsFilter: collection of metric names to show, None for all.
        noSubsampling: if True and "maxUsers" is not in `metaFilter`, keep
            only rankings whose `maxUsers` is None.
        logger: forwarded to `log`.
        sortBy: column used for the descending sort. Defaults to the first
            metric in alphabetical order; ignored when it is not a column.

    Returns:
        The displayed DataFrame, or None when no ranking matches.

    Raises:
        Exception: when a filtered field is missing from the metadata of a
            ranking, or when the selected rows do not all have the same keys.
    """
    twinewsRankings = getTwinewsRankings(verbose=False)
    twinewsScores = getTwinewsScores(verbose=False)
    if metaFilter is None:
        metaFilter = {}
    # The shortcut arguments never override an explicit entry of metaFilter:
    if noSubsampling and "maxUsers" not in metaFilter:
        metaFilter = mergeDicts(metaFilter, {"maxUsers": None})
    if model is not None and "model" not in metaFilter:
        metaFilter = mergeDicts(metaFilter, {"model": model})
    if splitVersion is not None and "splitVersion" not in metaFilter:
        metaFilter = mergeDicts(metaFilter, {"splitVersion": splitVersion})
    # We select the rows:
    data = []
    for key in twinewsRankings.keys():
        # We work on a shallow copy because rows are edited below (fields
        # removed, scores added) and the object handed out by getMeta must
        # stay untouched in case it is shared or cached:
        meta = dict(twinewsRankings.getMeta(key))
        if 'historyRef' in meta:
            meta['historyRef'] = str(meta['historyRef'])
        toKeep = True
        for filtKey in metaFilter:
            if filtKey not in meta:
                raise Exception(str(filtKey) + " is not in " + b(meta, 5))
            if metaFilter[filtKey] != meta[filtKey]:
                toKeep = False
                break
        if toKeep:
            data.append(meta)
    if len(data) == 0:
        log("No data found", logger)
        return None
    # All rows must share the same fields (explicit check instead of an
    # assert, which would be skipped under `python -O`):
    refKeys = data[0].keys()
    for e in data:
        if e.keys() != refKeys:
            raise Exception("Some data keys doesn't match:\n" + b(data, 5))
    if len(data) > 1:
        # We report once, then drop, the fields having the same value on
        # every row. 'id' is always kept because the score lookup and the
        # column ordering below need it:
        sameValues = dict()
        for key, value in data[0].items():
            if key == 'id':
                continue
            if not any(value != current[key] for current in data[1:]):
                sameValues[key] = value
        log("These values are common to all rows:\n", logger)
        for key, value in sameValues.items():
            log("\t- " + str(key) + ": " + str(value), logger)
        for current in data:
            for key in sameValues:
                del current[key]
    # We add scores:
    metrics = set()
    for current in data:
        for score in twinewsScores.find({'id': current['id']}):
            if metricsFilter is None or score['metric'] in metricsFilter:
                metrics.add(score['metric'])
                current[score['metric']] = score['score']
    metrics = sorted(metrics)
    if sortBy is None and len(metrics) > 0:
        sortBy = metrics[0]
    df = pd.DataFrame(data)
    df = reorderDFColumns(df, start=['id'], end=metrics)
    # An unknown sort column is silently ignored:
    if sortBy is not None and sortBy in df.columns:
        df = df.sort_values(sortBy, ascending=False)
    display(df)
    return df

In [18]:
# Table of the LDA rankings for split version 2, highest ndcg first.
df = printResults(
    model='lda',
    splitVersion=2,
    noSubsampling=True,
    metaFilter={},  # optional extra constraints, e.g. {"implementation": "gensim-lda"}
    metricsFilter=None,  # or a subset such as {'ndcg', 'ndcg@10'}
    sortBy='ndcg',
)

These values are common to all rows:

	- model: lda
	- maxUsers: None
	- maxDF: 300
	- useExtraNews: False
	- splitVersion: 2
	- ldaLearningDecay: 0.7
	- lowercase: True
	- ldaLearningMethod: online


Unnamed: 0,id,distance,doLemmatization,historyRef,implementation,ldaLearningOffset,maxDocuments,maxIter,minDF,nbTopics,useTFIDF,ndcg,ndcg@10
6,lda-d1220,cosine,False,0.3,gensim-lda,1.0,100000,60,0.0005,100,False,0.386483,0.157042
2,lda-af807,cosine,False,30.0,gensim-lda,1.0,10000,60,0.0005,100,False,0.379418,0.157717
9,lda-759de,cosine,False,0.6,gensim-lda,1.0,10000,60,0.0005,100,False,0.374841,0.151466
1,lda-64d0c,cosine,False,1.0,gensim-lda,1.0,10000,60,0.0005,100,False,0.374784,0.152779
3,lda-2fa8e,euclidean,False,0.3,gensim-lda,1.0,10000,60,0.0005,100,False,0.361689,0.13783
14,lda-b849e,cosine,False,0.3,gensim-lda,1.0,10000,60,0.0005,300,False,0.356511,0.132101
7,lda-7786b,cosine,True,0.3,gensim-lda,1.0,10000,60,0.0005,100,False,0.355055,0.127154
10,lda-d3b82,cosine,False,0.3,gensim-lda,1.0,10000,60,0.0005,100,False,0.35335,0.125064
0,lda-94782,cosine,False,3.0,gensim-lda,1.0,10000,60,0.0005,100,False,0.338766,0.104603
15,lda-186ce,cosine,False,1.0,gensim-lda,1.0,10000,60,0.0005,100,False,0.316092,0.080907


In [None]:
# Table of the NMF rankings for split version 2, highest ndcg first.
df = printResults(
    model='nmf',
    splitVersion=2,
    noSubsampling=True,
    metaFilter={},  # optional extra constraints as {field: mandatory value}
    metricsFilter=None,  # or a subset such as {'ndcg', 'ndcg@10'}
    sortBy='ndcg',
)