# Commands

In [None]:
# cd twinews-logs ; jupython -o nohup-topicmodels-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/topicmodels.ipynb

In [None]:
# Sources:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [1]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""
#import sys ; sys.path.append('/home/yuting/PycharmProjects/Twinews')

In [2]:
isNotebook = '__file__' not in locals()

In [3]:
TEST = isNotebook

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
from twinews.models.ranking import *

In [5]:
from nlptools.basics import *

In [6]:
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [7]:
logger = Logger(tmpDir('logs') + "/lda.log") if isNotebook else Logger("lda-" + getHostname() + ".log")
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

In [8]:
logger = Logger(tmpDir('logs') + "/lda.log") if isNotebook else Logger("lda-" + getHostname() + ".log")
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 2 if TEST else None, # Sub-sampling
    'maxDocuments': 30 if TEST else 10000,
    'useExtraNews': False if TEST else False, # None = unlimited, 0 = no extra news
    'minDF': 1 / 500 if TEST else 1 / 2000, # Remove words that have a document frequency ratio lower than 1 / 500
    'maxDF': 300, # Remove top 300 voc elements
    #maybe TF-IDF here?
    
    'nbTopics': 30 if TEST else 100, # 30, 100
    'lowercase': False if TEST else True,
    'doLemmatization': False if TEST else False,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': True,
    
    'maxIter': 2 if TEST else 60, # 30 for lda, 200 for nmf
    
    'nmfInit': 'nndsvd', # None, 'nndsvd'
    'nmfL1Ratio': 0, # 0.0, 0.5, 1.0
    'nmfAlpha': 0.1, # 0.0, 0.1
    
    'ldaLearningMethod': 'online',
    'ldaLearningOffset': 1.0, # 1.0, 10.0
    'ldaLearningDecay': 0.7, # 0.5, 0.7, 0.9, 1.0
    
    'implementation': 'gensim-lda', # gensim-lda, sklearn-lda, sklearn-nmf
    'distance': 'cosine', # 'cosine', 'euclidean', 'kl', 'js'
    # The historyRef param is very important, it allow to choose, for a particular candidate,
    # how many train history items will be used to calculate the similarity with
    # the user history.
    # Float are ratio on train history
    # Integers are absolute number of train item in the history
    # For example:
    #  * 1.0 will allow to mean similarities of a candidate with all train history items
    #  * 1 will allow to use only the most similar train item for the similarity of
    #    the candidate with the history of the user
    #  * 0.5 will allow to use the half of history for each candidates
    #  * 3 to use 3 most similar items with the current candidate...
    'historyRef': 0.3, # 1, 1.0, 0.5, 0.3, 3, 10
}

In [10]:
# Since this script gather nmf + lda, we tweak the config so that it will correspond to parameters of one model:
if 'nmf' in config['implementation']:
    modelName = "nmf"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('lda'):
            newConfig[key] = value
    config = newConfig
    del config['useTFIDF']
elif 'lda' in config['implementation']:
    modelName = "lda"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('nmf'):
            newConfig[key] = value
    config = newConfig

In [11]:
# Since this script gather nmf + lda, we tweak the config so that it will correspond to parameters of one model:
if 'nmf' in config['implementation']:
    modelName = "nmf"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('lda'):
            newConfig[key] = value
    config = newConfig
    del config['useTFIDF']
elif 'lda' in config['implementation']:
    modelName = "lda"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('nmf'):
            newConfig[key] = value
    config = newConfig

In [12]:
# Here we check if we already generated ranking for this model with this specific config:
if not isNotebook:
    assert not rankingExists(modelName, config, logger=logger)

# Getting data

In [13]:
# Getting users and news
evalData = getEvalData(config['splitVersion'], maxExtraNews=config['maxDocuments'],
                       maxUsers=config['maxUsers'], logger=logger)
(trainUsers, testUsers, trainNews, testNews, candidates, extraNews) = \
(evalData['trainUsers'], evalData['testUsers'], evalData['trainNews'],
 evalData['testNews'], evalData['candidates'], evalData['extraNews'])
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 17.07s | message: Eval data loaded
twinews news (version 1.0) initialised.


KeyboardInterrupt: 

In [1]:
len(testNews)

NameError: name 'testNews' is not defined

In [14]:
# Here it is important to convert urls to lists because we want the same order to retrieve vectors by index...
# And we shuffle it so we do not stick urls a a user at the begin...
# But we seed the random to always have same order...
extraNewsList = shuffle(list(extraNews), seed=0)
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)

In [15]:
print(len(extraNewsList))
print(len(trainNewsList))
print(len(testNewsList))

30
28
1989


In [33]:
# We get urls for the LDA model:
if config['useExtraNews']: # ? it's always False, so extraNewsList is always at the end.
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList

# print(len(urlsForModel))
# print(config['maxDocuments'])
urlsForModel = urlsForModel[:config['maxDocuments']] # control the # of news, 30 for test, 10000 for training
# print(len(urlsForModel))

# We get urls to vectorize for the training and the inference:
urlsForModelSet = set(urlsForModel) # delet the repeated urls
# print(len(urlsForModelSet))

urlsToVectorize = copy.deepcopy(urlsForModel) # deep copy, a new object will not be influence by the origal data
# print(len(urlsToVectorize))
for url in trainNewsList + testNewsList:
    if url not in urlsForModelSet:
        urlsToVectorize.append(url) # to make sure we have all the urls
# print(len(urlsToVectorize))
# We get urls to infere for the scoring:
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

30 urls for urlsForModel:
[
  https://www.si.com/soccer/2017/12/20/el-clasico-real-madrid-barcelona-preview-zidane-valverde-messi-,
  http://www.nydailynews.com/sports/soccer/spain-world-cup-jersey-sparking-controversy-article-1.36166,
  ...,
  http://english.alarabiya.net/en/perspective/features/2018/01/01/ANALYSIS-How-these-protests-in-Iran-,
  https://www.amny.com/news/bill-de-blasio-monuments-1.16133418
]
2017 urls for urlsToVectorize:
[
  https://www.si.com/soccer/2017/12/20/el-clasico-real-madrid-barcelona-preview-zidane-valverde-messi-,
  http://www.nydailynews.com/sports/soccer/spain-world-cup-jersey-sparking-controversy-article-1.36166,
  ...,
  https://www.dailykos.com/story/2018/1/1/1728967/-On-the-road-to-somewhere-some-Scientific-Proof-of-C,
  http://berkshireeagle.com/stories/our-opinion-governor-is-playing-politicswith-homeless-shelters,528
]
2017 urls for urlsToInfere:
[
  https://www.si.com/soccer/2017/12/20/el-clasico-real-madrid-barcelona-preview-zidane-valverde-mess

In [34]:
# We get sentences:
sentences = getNewsSentences(urlsToVectorize, logger=logger)
#print(len(sentences))
bp(sentences, logger)

  0% [                    ]
  9% [=                   ] (58.726s left)
 19% [===                 ] (49.936s left)
 29% [=====               ] (43.334s left)
[
  [ [ Back, in, ..., performance, . ], [ ", They, ..., conference, . ], ..., [ Regardless, of, ..., rival, . ], [ We, find, ..., season, . ] ],
  [ [ MADRID, The, ..., the, 1930s ], [ The, jersey, ..., from, __int_4__ ], ..., [ Its, main, ..., red, . ], [ Spain, is, ..., country, . ] ],
  ...,
  [ [ by, Angela, ..., Thailand, . ], [ More, than, ..., heat, . ], ..., [ Sometimes, we, ..., trees, ... ], [ Enjoy, it, ..., gone, . ] ],
  [ [ Try, one, ..., cats, . ], [ Add, a, ..., process, . ], ..., [ If, Bay, ..., neighbors, . ], [ And, if, ..., polls, . ] ]
]


In [35]:
sentences[1]

[['MADRID',
  'The',
  'Spain',
  'jersey',
  'that',
  'will',
  'be',
  'used',
  'in',
  'the',
  'World',
  'Cup',
  'has',
  'sparked',
  'controversy',
  'after',
  'being',
  'linked',
  'to',
  'the',
  'Republican',
  'flag',
  'of',
  'the',
  '1930s'],
 ['The',
  'jersey',
  'has',
  'the',
  'colors',
  'red',
  ',',
  'yellow',
  'and',
  'blue',
  ',',
  'but',
  'some',
  'say',
  'that',
  'from',
  'afar',
  'the',
  'blue',
  'appears',
  'to',
  'have',
  'the',
  'same',
  'purple',
  'tone',
  'of',
  'the',
  'Republican',
  'flag',
  'used',
  'from',
  '__int_4__'],
 ['The',
  'flag',
  'is',
  'still',
  'used',
  'by',
  'those',
  'who',
  'oppose',
  'the',
  'Spanish',
  'monarch',
  '.'],
 ['Apparel',
  'maker',
  'Adidas',
  'said',
  'there',
  'is',
  'no',
  'political',
  'implications',
  'in',
  'the',
  'jersey',
  ',',
  'and',
  'that',
  'the',
  'design',
  'was',
  'agreed',
  'upon',
  'with',
  'the',
  'Spanish',
  'soccer',
  'federation',

In [36]:
# We flatten sentences:
for i in range(len(sentences)):
    sentences[i] = flattenLists(sentences[i])
docs = sentences
bp(docs, logger)

[ [ Back, in, ..., season, . ], [ MADRID, The, ..., country, . ], ..., [ by, Angela, ..., gone, . ], [ Try, one, ..., polls, . ] ]


In [37]:
docs[1]

['MADRID',
 'The',
 'Spain',
 'jersey',
 'that',
 'will',
 'be',
 'used',
 'in',
 'the',
 'World',
 'Cup',
 'has',
 'sparked',
 'controversy',
 'after',
 'being',
 'linked',
 'to',
 'the',
 'Republican',
 'flag',
 'of',
 'the',
 '1930s',
 'The',
 'jersey',
 'has',
 'the',
 'colors',
 'red',
 ',',
 'yellow',
 'and',
 'blue',
 ',',
 'but',
 'some',
 'say',
 'that',
 'from',
 'afar',
 'the',
 'blue',
 'appears',
 'to',
 'have',
 'the',
 'same',
 'purple',
 'tone',
 'of',
 'the',
 'Republican',
 'flag',
 'used',
 'from',
 '__int_4__',
 'The',
 'flag',
 'is',
 'still',
 'used',
 'by',
 'those',
 'who',
 'oppose',
 'the',
 'Spanish',
 'monarch',
 '.',
 'Apparel',
 'maker',
 'Adidas',
 'said',
 'there',
 'is',
 'no',
 'political',
 'implications',
 'in',
 'the',
 'jersey',
 ',',
 'and',
 'that',
 'the',
 'design',
 'was',
 'agreed',
 'upon',
 'with',
 'the',
 'Spanish',
 'soccer',
 'federation',
 '.',
 'The',
 'jersey',
 ',',
 'that',
 'has',
 'the',
 'colors',
 'red',
 ',',
 'yellow',
 'and'

In [38]:
# Lower case:
if config['lowercase']:
    for i in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        for u in range(len(docs[i])):
            docs[i][u] = docs[i][u].lower()
bp(docs, logger)

[ [ Back, in, ..., season, . ], [ MADRID, The, ..., country, . ], ..., [ by, Angela, ..., gone, . ], [ Try, one, ..., polls, . ] ]


In [39]:
# Lemmatization:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for i in range(len(docs)):
        for u in range(len(docs[i])):
            docs[i][u] = lemmatizer.lemmatize(docs[i][u])
        pbar.tic()
bp(docs, logger)

[ [ Back, in, ..., season, . ], [ MADRID, The, ..., country, . ], ..., [ by, Angela, ..., gone, . ], [ Try, one, ..., polls, . ] ]


In [40]:
# Filtering the corpus:
docs = filterCorpus(docs, minDF=config['minDF'], maxDF=config['maxDF'],
                    removeEmptyDocs=False, allowEmptyDocs=False, logger=logger)
for doc in docs: assert len(doc) > 0
bp(docs, logger)

Voc removed because of minDF (44267 elements):
{ 1000th, 100s, 100th, 100x, 103rd, 104th, 108th, 10Weather, 10X, 10x, ..., zooms, zoonotic, zoos, 💗, 😁, 😄, 😇, 😒, 😢, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., with, without, work, working, world, would, year, years, you, your }
73.9% of voc will be removed.
[ [ Back, August, ..., champion, season ], [ Spain, jersey, ..., independence, rest ], ..., [ Angela, Fritz, ..., til, gone ], [ Try, cents, ..., remember, polls ] ]


In [41]:
print(len(docs))

2017


In [42]:
tt.tic("Data preprocessed")

--> tic: 12m 6.289s | message: Data preprocessed


726.29

# Infering topic vectors

At this step we generate `inferedVectors` and `topics`

In [43]:
if config['implementation'] == 'gensim-lda':
    dictionary = gensim.corpora.Dictionary(docs)
    # dictionary.filter_extremes(no_below=config['min_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]                             # bag-of-words with tfdif, why here again filtering the corpus?
    assert len(bow) == len(urlsToVectorize)
    bowForModel = bow[:len(urlsForModel)]            # 30 for TEST 
    assert len(bowForModel) == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    bowForInference = bow[i:i + len(trainNews) + len(testNews)] # make sure the bow for inference == train + test
    assert len(bowForInference) == len(trainNews) + len(testNews)
    lda_model = gensim.models.LdaMulticore\
    (
        bowForModel,
        num_topics=config['nbTopics'],
        id2word=dictionary,
        iterations=config['maxIter'],
        decay=config['ldaLearningDecay'],
        offset=config['ldaLearningOffset'],
        workers=cpuCount(),
        passes=3, # what is passes
    )
    inferedVectors = []
    for current in bowForInference:
        topicProbDistrib = lda_model[current]
        currentVector = [0.0] * config['nbTopics']
        for t, v in topicProbDistrib:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == len(trainNews) + len(testNews)
    assert len(inferedVectors[0]) == config['nbTopics']
    topics = []
    for i in range(lda_model.num_topics):
        current = dict()
        for x in lda_model.get_topic_terms(i, topn=100):
            current[dictionary[x[0]]] = x[1]
        topics.append(current)

In [44]:
if config['implementation'] == 'sklearn-lda' or config['implementation'] == 'sklearn-nmf':
    if config['implementation'] == 'sklearn-nmf' or config['useTFIDF']:
        vectorizer = TfidfVectorizer\
        (
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    else:
        vectorizer = CountVectorizer\
        (
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    vectors = vectorizer.fit_transform(docs)  # W 
    assert vectors.shape[0] == len(urlsToVectorize)
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    vectorsForInference = vectors[i:i + len(trainNews) + len(testNews)]
    assert vectorsForInference.shape[0] == len(trainNews) + len(testNews)
    if config['implementation'] == 'sklearn-lda':
        model = LatentDirichletAllocation\
        (
            n_components=config['nbTopics'],
            learning_method=config['ldaLearningMethod'],
            learning_offset=config['ldaLearningOffset'],
            learning_decay=config['ldaLearningDecay'],
            random_state=0,
            n_jobs=cpuCount(),
            max_iter=config['maxIter'],
        )
    else:
        model = NMF\
        (
            n_components=config['nbTopics'],
            random_state=0,
            alpha=config['nmfAlpha'],
            l1_ratio=config['nmfL1Ratio'],
            init=config['nmfInit'],
            max_iter=config['maxIter'],
        )
    model.fit(vectorsForModel)
    inferedVectors = model.transform(vectorsForInference)
    assert inferedVectors.shape[0] == len(trainNews) + len(testNews)
    assert inferedVectors[0].shape[0] == config['nbTopics']
    topics = []
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        wordProb = []
        for i in range(len(topic)):
            prob = topic[i]
            word = feature_names[i]
            wordProb.append((word, prob))
        wordProb = sortBy(wordProb, desc=True, index=1)[:100]
        current = dict()
        for word, prob in wordProb:
            current[word] = prob
        topics.append(current)

In [52]:
inferedVectors

[array([0.02236069, 0.35153988, 0.02236069, 0.02236069, 0.02236069,
        0.02236069, 0.02236069, 0.02236069, 0.02236069, 0.02236069,
        0.02236069, 0.02236069, 0.02236069, 0.02236069, 0.02236069,
        0.02236069, 0.02236069, 0.02236069, 0.02236069, 0.02236069,
        0.02236069, 0.02236069, 0.02236069, 0.02236069, 0.02236069,
        0.02236069, 0.02236069, 0.02236069, 0.02236069, 0.02236069],
       dtype=float32),
 array([0.01742257, 0.01742257, 0.01742257, 0.01742257, 0.01742257,
        0.01742257, 0.01742257, 0.49474543, 0.01742257, 0.01742257,
        0.01742257, 0.01742257, 0.01742257, 0.01742257, 0.01742257,
        0.01742257, 0.01742257, 0.01742257, 0.01742257, 0.01742257,
        0.01742257, 0.01742257, 0.01742257, 0.01742257, 0.01742257,
        0.01742257, 0.01742257, 0.01742257, 0.01742257, 0.01742257],
       dtype=float32),
 array([0.02058528, 0.02058528, 0.02058528, 0.02058528, 0.02058528,
        0.02058528, 0.02058528, 0.02058528, 0.02058528, 0.4030269 ,


# Showing topics

In [49]:
def printTopics(topics, maxWords=10, logger=None):
    for i in range(len(topics)):
        log(str(i) + ": " + str(" ".join(list(topics[i].keys())[:10])), logger)

In [51]:
topics

[{'Denver': 9.01615e-05,
  'housing': 8.701018e-05,
  'Dallas': 8.4640844e-05,
  'regime': 8.2603074e-05,
  'Cup': 8.090944e-05,
  'FC': 7.979674e-05,
  'team': 7.937162e-05,
  'World': 7.9021396e-05,
  'Iranian': 7.862996e-05,
  'city': 7.853943e-05,
  'players': 7.7328965e-05,
  'Olympics': 7.708867e-05,
  'teams': 7.6353324e-05,
  'Colorado': 7.630897e-05,
  'games': 7.5242344e-05,
  'No': 7.494265e-05,
  'Hunt': 7.4444244e-05,
  'protesters': 7.4433876e-05,
  'FIFA': 7.402797e-05,
  'club': 7.37847e-05,
  'game': 7.363704e-05,
  'soccer': 7.359024e-05,
  'Olympic': 7.356183e-05,
  'Italy': 7.330939e-05,
  'Barcelona': 7.211587e-05,
  'Iran': 7.1738345e-05,
  'Rio': 7.171344e-05,
  'Stadium': 7.15983e-05,
  'play': 7.148905e-05,
  'Brazil': 7.128731e-05,
  'rankings': 7.114538e-05,
  'Games': 7.095193e-05,
  'league': 7.0877446e-05,
  'Korea': 7.07556e-05,
  'Hancock': 7.074866e-05,
  'Beck': 7.0040835e-05,
  'Front': 6.996175e-05,
  'Dr.': 6.995203e-05,
  'Blasio': 6.993096e-05,
  

In [50]:
def printTopicsOf(vector, topics, logger=None):
    topicsRepr = ""
    topTopics = sortBy([(i, score) for i, score in enumerate(vector) if score > 0.1], desc=True, index=1)[:3]
    log("Top topics number are: " + str(" ".join([str(e[0]) for e in topTopics])), logger)
    currentTopics = [topics[e[0]] for e in topTopics]
    printTopics(currentTopics, logger=logger)

In [48]:
printTopics(topics, logger=logger)

0: Denver housing Dallas regime Cup FC team World Iranian city
1: Barcelona Messi Ham Fame Soccer Frisco Hunt Cup FC club
2: Colorado Denver city coal Cup Hickenlooper methane values Dallas West
3: Olympics Dallas games FC Games Denver Olympic hosting Colorado players
4: Denver Colorado Amazon taxes bids city Hickenlooper tax housing Cup
5: Ham drainage Goldberg Denver widening Cup city Beck project games
6: Denver FC Barcelona Dallas city Blasio Colorado Olympics Cup Hunt
7: jersey flag Cup Spain Barcelona tax Amazon colors World Republican
8: Denver city Cup jersey Colorado Winter Beck tax Olympic players
9: Amazon bids finalists Denver Colorado Cup housing taxes auction bidding
10: Colorado Denver Olympic Games Ham Olympics Winter Hickenlooper city West
11: Colorado tax Cup cry midfielder city fan taxes Denver Go
12: Denver Cup Brazil Barcelona Olympics games Beck players Rio football
13: Messi Barcelona jersey flag Cup Spain colors games yellow federation
14: Madrid Barcelona Ham D

# Making a dict url --> topic vector

In [53]:
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = dict()
for i in range(len(urlsToInfere)):
    urlsVectors[urlsToInfere[i]] = inferedVectors[i]

In [55]:
bp(urlsVectors)

{
  http://a.msn.com/01/en-ie/BBHzIAS?ocid=st: [0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334],
  http://a.msn.com/06/en-us/BBH7EiO?ocid=st: [0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334],
  http://abcn.ws/2CbqxlW: [0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.03333334 0.03333334
   0.03333334 0.03333334 0.03333334 0.03333334 0.0333333

--> tic: 24.4s | message: Rankings stored


24.4

In [3]:
tt.tic("Rankings stored")

NameError: name 'tt' is not defined

# Old stuff

In [46]:
# Serialization:
if False:
    try:
        dirPath = nosaveDir() + "/sklearn-models"
        mkdir(dirPath)
        configHash = objectToHash(config)[:5]
        serialize(model, dirPath + "/model-" + configHash + ".pickle")
        toJsonFile(config, dirPath + "/config-" + configHash + ".json")
    except Exception as e:
        logException(e, logger=logger)

In [2]:
# Read the doc!
rankings = usersRankingsByHistoryDistance\
(
    trainUsers,
    candidates,
    config['historyRef'],
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)


NameError: name 'usersRankingsByHistoryDistance' is not defined

In [None]:
bp(rankings, logger, 4)

In [None]:
bp(rankings, logger, 4)