# Commands

In [1]:
# cd twinews-logs ; jupython -o nohup-topicmodels-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/topicmodels.ipynb

In [2]:
# Sources:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [3]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
# `__file__` is only defined when the code runs as a script, so its absence
# means we are executing interactively inside a notebook kernel.
isNotebook = '__file__' not in locals()

In [5]:
# Test mode is tied to notebook mode: the config cell uses TEST to select
# its small, sub-sampled settings.
TEST = isNotebook

In [6]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
from twinews.models.ranking import *

In [7]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [8]:
# Notebook runs log to a file in the temp "logs" directory; script runs log to a per-host file.
# NOTE(review): the file name is always "lda", even when an NMF implementation is selected later.
logger = Logger(tmpDir('logs') + "/lda.log") if isNotebook else Logger("lda-" + getHostname() + ".log")
# Global timer: started here, checkpointed with tt.tic(...) after each stage, read with tt.toc() at the end.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
# Full experiment configuration. It is later reduced to the parameters of the selected model
# and used (together with the model name) to identify the stored rankings, so every value matters.
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 2 if TEST else None, # Sub-sampling (passed to getEvalData as maxUsers)
    'maxDocuments': 30 if TEST else 10000, # Number of documents used to fit the model (also passed as maxExtraNews)
    'useExtraNews': False if TEST else False, # Boolean: when True, extra news are placed first in the model corpus (disabled in both modes)
    'minDF': 1 / 500 if TEST else 1 / 2000, # Remove words whose document frequency ratio is lower than this value
    'maxDF': 300, # Remove top 300 voc elements
    
    'nbTopics': 30 if TEST else 100, # 30, 100
    'lowercase': False if TEST else True,
    'doLemmatization': False if TEST else False,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': False,
    
    'maxIter': 2 if TEST else 60, # 30 for lda, 200 for nmf
    
    'nmfInit': 'nndsvd', # None, 'nndsvd'
    'nmfL1Ratio': 0, # 0.0, 0.5, 1.0
    'nmfAlpha': 0.1, # 0.0, 0.1
    
    'ldaLearningMethod': 'online', # Only read by the sklearn-lda implementation
    'ldaLearningOffset': 1.0, # 1.0, 10.0
    'ldaLearningDecay': 0.7, # 0.5, 0.7, 0.9, 1.0
    
    'implementation': 'gensim-lda', # gensim-lda, sklearn-lda, sklearn-nmf
    'distance': 'cosine', # 'cosine', 'euclidean', 'kl', 'js'
    # The historyRef param is very important: for a given candidate, it sets how many
    # items of the user's train history are used to compute the similarity between
    # the candidate and that history.
    # Floats are ratios of the train history.
    # Integers are absolute numbers of train items in the history.
    # For example:
    #  * 1.0 averages the similarities of a candidate with all train history items
    #  * 1 uses only the train item most similar to the candidate
    #  * 0.5 uses half of the history for each candidate
    #  * 3 uses the 3 train items most similar to the current candidate...
    'historyRef': 30, # 1, 1.0, 0.6, 0.3, 3, 10, 30
}

In [10]:
# This script handles both NMF and LDA, so the config is reduced to the parameters of the
# selected model: the stored config then describes exactly one model.
if 'nmf' in config['implementation']:
    modelName = "nmf"
    # Drop every LDA-only parameter:
    config = {key: value for key, value in config.items() if not key.startswith('lda')}
    # TF-IDF is not optional for NMF (the sklearn cell always uses it for 'sklearn-nmf'),
    # so the flag is removed from the NMF config:
    del config['useTFIDF']
elif 'lda' in config['implementation']:
    modelName = "lda"
    # Drop every NMF-only parameter:
    config = {key: value for key, value in config.items() if not key.startswith('nmf')}
else:
    # Fail fast: otherwise modelName stays undefined and no model cell runs,
    # which only surfaces much later as a NameError.
    raise ValueError("Unknown implementation '" + str(config['implementation'])
                     + "', expected one of: gensim-lda, sklearn-lda, sklearn-nmf")

In [11]:
# In script mode, abort right away when rankings were already generated for this
# model with this exact config (the check is skipped in notebook mode):
alreadyRanked = (not isNotebook) and rankingExists(modelName, config, logger=logger)
if alreadyRanked:
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [12]:
# Load the evaluation split, sub-sampled according to the config:
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=config['maxDocuments'],
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Expose each part of the split under its own name:
trainUsers, testUsers = evalData['trainUsers'], evalData['testUsers']
trainNews, testNews = evalData['trainNews'], evalData['testNews']
candidates, extraNews = evalData['candidates'], evalData['extraNews']
# Show what was loaded, then the metadata of the split:
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 31.89s | message: Eval data loaded
twinews news (version 1.0) initialised.
--> tic: 7.11s | message: Extra news downloaded
--> toc total duration: 39.01s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [13]:
# The url collections are converted to lists because vectors are later retrieved by index,
# which requires a stable order.
# They are shuffled so that the urls of a single user are not grouped at the beginning,
# with a fixed seed so that the shuffle itself is repeatable.
# NOTE(review): if these collections are sets, the order produced by `list(...)` may itself
# vary between interpreter runs (string hash randomization) — confirm, or sort before shuffling.
extraNewsList = shuffle(list(extraNews), seed=0)
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)

In [14]:
# Urls used to fit the topic model: extra news come first only when `useExtraNews` is set,
# and the list is truncated to `maxDocuments`.
labeledNewsList = trainNewsList + testNewsList
if config['useExtraNews']:
    urlsForModel = extraNewsList + labeledNewsList
else:
    urlsForModel = labeledNewsList + extraNewsList
urlsForModel = urlsForModel[:config['maxDocuments']]
# Urls to vectorize: the model urls first (same order), followed by every train/test url
# that did not make it into the model corpus, so that all of them can be inferred later.
urlsForModelSet = set(urlsForModel)
urlsToVectorize = urlsForModel + [url for url in labeledNewsList if url not in urlsForModelSet]
# Urls whose topic vectors are needed for the scoring (train urls, then test urls):
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

30 urls for urlsForModel:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://thehill.com/media/368673-fox-friends-host-criticized-trump-shithole-comment-moments-before-he,
  https://boston.curbed.com/boston-development/2018/1/10/16870266/allston-project-would-add-74-apartme
]
2017 urls for urlsToVectorize:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://www.sfchronicle.com/entertainment/article/The-street-vendor-who-makes-peace-his-profession-12,
  http://www.foxnews.com/politics/2018/01/04/house-russia-probe-ending-as-it-began-mess.amp.html?__twi
]
2017 urls for urlsToInfere:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://www.sfchronicle.com/entertainment/article/The-street-vendor-who-m

In [15]:
# We get the tokenized sentences of every url to vectorize
# (one entry per url, each entry being a list of token lists, as the printed sample shows):
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (37.223s left)
 19% [===                 ] (31.335s left)
 29% [=====               ] (27.459s left)
[
  [ [ Juventus, midfielder, ..., media, . ], [ The, -, ..., page, . ], ..., [ Sturaro, has, ..., league, . ], [ Juventus, do, ..., Sunday, . ] ],
  [ [ Barcelona, has, ..., until, __int_4__ ], [ The, previous, ..., June, __int_4__ ], ..., [ Messi, has, ..., marksman, . ], [ Collective, honors, ..., Cups, . ] ],
  ...,
  [ [ Kate, Winslet, ..., vendors, . ], [ On, April, ..., him, . ], ..., [ Murphy, beamed, ..., to, . ], [ Beth, Spotswood, ..., Datebook, . ] ],
  [ [ Want, FOX, ..., day, ? ], [ Sign, up, here, . ], ..., [ Brianna, McClelland, ..., report, . ], [ Want, FOX, ..., day, ? ] ]
]


In [16]:
# Merge the sentences of each document into a single flat token list.
# This is done in place; `docs` is then just another name for the same list.
for docIndex, docSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(docSentences)
docs = sentences
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [17]:
# Optionally lower-case every token, rewriting each document in place:
if config['lowercase']:
    docIndexes = list(range(len(docs)))
    for docIndex in pb(docIndexes, logger=logger, message="Lower casing"):
        tokens = docs[docIndex]
        tokens[:] = [token.lower() for token in tokens]
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [18]:
# Optionally lemmatize every token with WordNet, rewriting each document in place
# and ticking the progress bar once per document:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        tokens[:] = [lemmatizer.lemmatize(token) for token in tokens]
        pbar.tic()
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [19]:
# Remove vocabulary according to the minDF / maxDF settings. Documents must be kept
# (their positions are aligned with urlsToVectorize), and none of them may end up empty.
docs = filterCorpus(
    docs,
    minDF=config['minDF'],
    maxDF=config['maxDF'],
    removeEmptyDocs=False,
    allowEmptyDocs=False,
    logger=logger,
)
assert all(len(doc) > 0 for doc in docs)
bp(docs, logger)

Voc removed because of minDF (44267 elements):
{ 1000th, 100s, 100th, 100x, 103rd, 104th, 108th, 10Weather, 10X, 10x, ..., zooms, zoonotic, zoos, 💗, 😁, 😄, 😇, 😒, 😢, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., with, without, work, working, world, would, year, years, you, your }
73.9% of voc will be removed.
[ [ midfielder, issued, ..., Stadium, Sunday ], [ Barcelona, confirmed, ..., Club, World ], ..., [ Kate, gets, ..., column, appears ], [ Want, News, ..., Report, inbox ] ]


In [20]:
# Timer checkpoint: end of the data loading / preprocessing stage.
tt.tic("Data preprocessed")

--> tic: 1m 19.37s | message: Data preprocessed


79.37

# Inferring topic vectors

At this step we generate `inferedVectors` and `topics`

In [21]:
# Gensim LDA: fits the model on the first len(urlsForModel) documents, then infers one
# topic vector per train/test url and extracts the top terms of every topic.
if config['implementation'] == 'gensim-lda':
    nbInferenceDocs = len(trainNews) + len(testNews)
    # Bag-of-words corpus (optionally re-weighted with TF-IDF), aligned with urlsToVectorize:
    dictionary = gensim.corpora.Dictionary(docs)
    bow = [dictionary.doc2bow(tokens) for tokens in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    # Documents used to fit the model are the first ones of the corpus:
    bowForModel = bow[:len(urlsForModel)]
    assert len(bowForModel) == config['maxDocuments']
    # Documents to infer start at the first train url, which must be located either at the
    # very beginning or right after the extra news:
    firstTrainIndex = next(
        (index for index, url in enumerate(urlsToVectorize) if url == trainNewsList[0]),
        len(urlsToVectorize),
    )
    assert firstTrainIndex in (0, len(extraNews))
    bowForInference = bow[firstTrainIndex:firstTrainIndex + nbInferenceDocs]
    assert len(bowForInference) == nbInferenceDocs
    # NOTE(review): no random_state is given here (the sklearn models use random_state=0)
    # and the number of passes is hard-coded — confirm this is intended.
    lda_model = gensim.models.LdaMulticore(
        bowForModel,
        num_topics=config['nbTopics'],
        id2word=dictionary,
        iterations=config['maxIter'],
        decay=config['ldaLearningDecay'],
        offset=config['ldaLearningOffset'],
        workers=cpuCount(),
        passes=3,
    )
    # Dense topic vectors: topics not returned by the model for a document stay at 0.0
    inferedVectors = []
    for docBow in bowForInference:
        topicVector = [0.0] * config['nbTopics']
        for topicId, probability in lda_model[docBow]:
            topicVector[topicId] = probability
        inferedVectors.append(np.array(topicVector))
    assert len(inferedVectors) == nbInferenceDocs
    assert len(inferedVectors[0]) == config['nbTopics']
    # One dict word --> weight per topic, limited to the 100 top terms:
    topics = [
        {
            dictionary[termId]: weight
            for termId, weight in lda_model.get_topic_terms(topicId, topn=100)
        }
        for topicId in range(lda_model.num_topics)
    ]

In [22]:
# Scikit-learn LDA / NMF: vectorizes the pre-tokenized documents, fits the model on the first
# len(urlsForModel) documents, then infers one topic vector per train/test url and extracts
# the top words of every topic.
if config['implementation'] == 'sklearn-lda' or config['implementation'] == 'sklearn-nmf':
    # Documents are already token lists, hence the identity tokenizer / preprocessor.
    # NMF always uses TF-IDF; the check order matters because 'useTFIDF' is removed
    # from the NMF config.
    if config['implementation'] == 'sklearn-nmf' or config['useTFIDF']:
        vectorizer = TfidfVectorizer\
        (
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    else:
        vectorizer = CountVectorizer\
        (
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    # Documents used to fit the model are the first rows of the matrix:
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    # Documents to infer start at the first train url, which must be located either at the
    # very beginning or right after the extra news:
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    vectorsForInference = vectors[i:i + len(trainNews) + len(testNews)]
    assert vectorsForInference.shape[0] == len(trainNews) + len(testNews)
    if config['implementation'] == 'sklearn-lda':
        model = LatentDirichletAllocation\
        (
            n_components=config['nbTopics'],
            learning_method=config['ldaLearningMethod'],
            learning_offset=config['ldaLearningOffset'],
            learning_decay=config['ldaLearningDecay'],
            random_state=0,
            n_jobs=cpuCount(),
            max_iter=config['maxIter'],
        )
    else:
        # NOTE(review): recent scikit-learn releases replaced the `alpha` parameter of NMF
        # with `alpha_W` / `alpha_H` — confirm against the installed version before upgrading.
        model = NMF\
        (
            n_components=config['nbTopics'],
            random_state=0,
            alpha=config['nmfAlpha'],
            l1_ratio=config['nmfL1Ratio'],
            init=config['nmfInit'],
            max_iter=config['maxIter'],
        )
    model.fit(vectorsForModel)
    inferedVectors = model.transform(vectorsForInference)
    assert inferedVectors.shape[0] == len(trainNews) + len(testNews)
    assert inferedVectors[0].shape[0] == config['nbTopics']
    # `get_feature_names` was removed from recent scikit-learn releases in favour of
    # `get_feature_names_out`; use the new accessor when available, the old one otherwise.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # One dict word --> weight per topic, limited to the 100 heaviest words:
    topics = []
    for topic in model.components_:
        wordProb = [(feature_names[wordIndex], topic[wordIndex]) for wordIndex in range(len(topic))]
        wordProb = sortBy(wordProb, desc=True, index=1)[:100]
        topics.append({word: prob for word, prob in wordProb})

In [23]:
# Timer checkpoint: end of the model fitting / inference stage.
tt.tic("Model fitted and topic vectors infered")

--> tic: 11.54s | message: Model fitted and topic vectors infered


11.54

# Showing topics

In [24]:
def printTopics(topics, maxWords=10, logger=None):
    """
        Logs one line per topic: its position in `topics` followed by its first words.

        :param topics: list of dicts word --> weight; the words are taken in the dict order.
        :param maxWords: maximum number of words shown per topic.
        :param logger: forwarded to `log`.
    """
    for topicIndex, topic in enumerate(topics):
        # maxWords used to be ignored in favour of a hard-coded 10:
        log(str(topicIndex) + ": " + " ".join(list(topic.keys())[:maxWords]), logger)

In [25]:
def printTopicsOf(vector, topics, logger=None, minScore=0.1, maxTopics=3, maxWords=10):
    """
        Logs the dominant topics of a document: first their ids, then one line per topic
        with its id and its first words.

        :param vector: topic scores of the document, indexed by topic id.
        :param topics: list of dicts word --> weight, indexed by topic id.
        :param logger: forwarded to `log`.
        :param minScore: only topics scoring strictly above this value are shown.
        :param maxTopics: maximum number of topics shown (highest scores first).
        :param maxWords: maximum number of words shown per topic.
    """
    scoredTopics = [(topicId, score) for topicId, score in enumerate(vector) if score > minScore]
    topTopics = sorted(scoredTopics, key=lambda e: e[1], reverse=True)[:maxTopics]
    log("Top topics number are: " + " ".join(str(topicId) for topicId, _ in topTopics), logger)
    # Each topic is labelled with its real id; labelling by position in the selection
    # (0, 1, 2) did not match the ids printed on the line above.
    for topicId, _ in topTopics:
        log(str(topicId) + ": " + " ".join(list(topics[topicId].keys())[:maxWords]), logger)

In [26]:
# Log the first words of every topic of the fitted model:
printTopics(topics, logger=logger)

0: Olympics Cup Barcelona Madrid Dallas games Rio league players FC
1: Ham jersey Cup Denver flag Colorado widening drainage World Spain
2: Denver Amazon city taxes property bids ad values Dallas finalists
3: Denver Cup Colorado Barcelona Ham housing players team match city
4: Ham Denver Fox games Colorado Cup Olympic Games club Friends
5: Colorado Denver Cup Beck Winter Barcelona Hancock Messi city Olympics
6: Cup Messi Denver Barcelona players housing games Beck Colorado World
7: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
8: Barcelona Colorado Cup Messi No players Denver World Madrid Ham
9: Cup Denver players play Colorado Dallas housing games game club
10: Cup jersey World Ham flag Spain Beck Dr. colors soccer
11: Ham players Denver Italy Cup ball Barcelona Colorado team supporters
12: Denver Colorado Cup Winter Olympics players league World Games play
13: Ham West match club Madrid relegation Denver supporters Barcelona League
14: jersey Denver flag Olympics Boston 

# Making a dict url --> topic vector

In [27]:
# Urls and vectors are paired by position, so both sequences must have the same length:
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = dict(zip(urlsToInfere, inferedVectors))

# Ranking

In [28]:
# Manual notebook override of historyRef, currently disabled by the leading `False`:
if False and isNotebook:
    config['historyRef'] = 1.0

In [29]:
historyRef = config['historyRef']
# Either an absolute number of history items (integer >= 1)...
isAbsoluteCount = isinstance(historyRef, int) and historyRef >= 1
# ...or a ratio of the history (float in ]0.0, 1.0]):
isHistoryRatio = isinstance(historyRef, float) and 0.0 < historyRef <= 1.0
assert isAbsoluteCount or isHistoryRatio

In [30]:
# Read the doc!
# Rank the candidates of every user from the distance between the topic vector of each
# candidate and the topic vectors of the user's train history; `historyRef` sets how much
# of that history is taken into account (see the config cell).
rankings = usersRankingsByHistoryDistance\
(
    trainUsers,
    candidates,
    historyRef,
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)



In [31]:
# Print a truncated view of the rankings (a dict keyed by user id, as the output shows):
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      http://ontario.ca/b3v8,
      http://nyti.ms/2zaO6c6,
      http://www.courant.com/opinion/editorials/hc-ed-ct-needs-left-lane-law-20180101-story.html,
      http://bit.ly/2qPkanf,
      http://wsvn.com/news/local/burglar-returns-bike-he-stole-outside-of-ne-miami-dade-home/,
      https://medium.com/reaching-out/when-the-church-kicked-me-out-this-is-where-i-went-2e214adedf92?source=linkShare-35084a546d22-1515599098,
      https://fb.me/8AgCIOtvd,
      https://trib.al/HSlqzZ8,
      http://bit.ly/2CUajSH,
      https://uofacesmg.wordpress.com/2018/01/07/wonder-and-kindness/,
      ...,
      https://medium.com/shanghaiist/chinese-shoe-company-tricks-people-into-swiping-instagram-ad-with-fake-strand-of-hair-54d8a2d8ec1d,
      https://buff.ly/2DfcIEr,
      http://www.stltoday.com/news/local/govt-and-politics/protesters-volunteers-urge-city-to-do-more-to-help-homeless/article_4c7990fc-6d8d-579b-bb7a-649fbe225926.html?utm_medium=social&utm_source=facebo

In [32]:
# Timer checkpoint: end of the ranking stage.
tt.tic("Rankings done")

--> tic: 0.29s | message: Rankings done


0.29

# Adding rankings to the db

In [33]:
# Read the doc!
# Store the rankings for this model name and config.
# NOTE(review): in notebook mode the rankingExists() check is skipped, so this call can target
# an already stored ranking (the saved output shows a gridfs FileExists error) — confirm intended.
addRanking(modelName, rankings, config, logger=logger)

Exception type: <class 'gridfs.errors.FileExists'>
Exception: file with _id ObjectId('5e7d19898175116238a94a07') already exists
Traceback (most recent call last):
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/gridfs/grid_file.py", line 296, in __flush
    self._file, session=self._session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 693, in insert_one
    session=session),
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 607, in _insert
    bypass_doc_val, session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 595, in _insert_one
    acknowledged, _insert_command, session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/mongo_client.py", line 1243, in _retryable_write
    return self._retry_with_session(retryable, func, s, None)

In [34]:
# Timer checkpoint: end of the storage stage.
tt.tic("Rankings stored")

--> tic: 0.25s | message: Rankings stored


0.25

In [35]:
# Total duration since the timer was started, reused in the final notification:
totalDuration = tt.toc()

--> toc total duration: 1m 31.489s


In [36]:
# Notify that the run is over; the message identifies the run by the model name and the
# first 5 characters of the config hash, and gives the total duration.
notif(modelName + '-' + objectToHash(config)[:5] + " done in " + secondsToHumanReadableDuration(totalDuration))

# Old notebook stuff

In [37]:
# Notebook only: map every inferred url to its raw text (texts are returned in url order).
if isNotebook:
    allTexts = getNewsText(urlsToInfere, logger=logger)
    urlsTexts = {url: allTexts[index] for index, url in enumerate(urlsToInfere)}

  0% [                    ]
  9% [=                   ] (35.777s left)
 19% [===                 ] (31.817s left)
 29% [=====               ] (27.998s left)


In [38]:
# Notebook only: for the second user, compute the cosine distances between the topic
# vectors of the train history (x) and those of the first candidate list (y).
if isNotebook:
    userId = list(trainUsers.keys())[1]
    xurls = list(trainUsers[userId])
    xvectors = np.array([urlsVectors[url] for url in xurls])
    yurls = list(candidates[userId][0])
    yvectors = np.array([urlsVectors[url] for url in yurls])
    distances = getDistances(xvectors, yvectors, metric='cosine')

In [39]:
# Notebook only: log 10 documents picked at random (from the history or from the
# candidates of the selected user) together with their dominant topics.
if isNotebook:
    for _ in range(10):
        url = random.choice(random.choice([xurls, yurls]))
        text = urlsTexts[url]
        vector = urlsVectors[url]
        log(url, logger)
        printTopicsOf(vector, topics, logger=logger)
        log(text, logger)
        log("\n" * 2, logger)

http://ow.ly/dHbA30hEO4n
Top topics number are: 
Ever since the recent wave of protests in Iran began on December 28, Western media coverage has disproportionately focused on socioeconomic causes as the main drivers behind citizens' anger. Economic hardship is undeniably a key root cause, but to ignore the underlying political grievances is to lose sight of the bigger fight for the country's future. Although Iranian President Hassan Rouhani may be neither the primary target nor the proximate cause of the demonstrations, his record in office since winning reelection last May has been an enormous disappointment to the nearly 24 million Iranians who voted for his second term. Instead of seeking to be his own man, Rouhani has repeatedly fallen back into following the playbook of the unelected Supreme Leader Ayatollah Ali Khamenei, widening the already dangerous disconnect between ordinary Iranians and the ruling Shiite Islamist elite that purports to represent them. Nothing short of major 




http://www.delmarvanow.com/story/news/2018/01/09/chesapeake-housing-mission-reaches-300-homes-milestone/1014566001/
Top topics number are: 
Jim Richardson expressed his gratitude for the work by the Salisbury City Police Department and Chesapeake Housing Mission. (Photo: Chesapeake Housing Mission)
"Making the homes of those less fortunate more livable and secure" — that's the mission of Chesapeake Housing Mission.
The nonprofit organization has done just that for 300 families since it was founded in 2009.
A milestone build — project number 300 — was for a Salisbury couple, Jim Richardson, 65, and his wife, Roseanne, who is 67.
Richardson's story began 15 years ago, with her struggle with depression and anxiety following the violent death of a close family member.
Last summer, she had a fall in her home, breaking ribs and a bone in her neck, and fracturing vertebrae, in addition to other injuries. She was also diagnosed with Parkinson's Syndrome, and Lewy body dementia, which can im




https://www.theguardian.com/football/2017/nov/20/west-ham-fans-stop-calling-999
Top topics number are: 13
0: Ham West match club Madrid relegation Denver supporters Barcelona League
• Essex police tweet message to disgruntled Hammers supporters • West Ham in relegation zone after defeat to Watford on Sunday
West Ham United fans have been warned to stop calling 999 to complain about their team. The side suffered a 2-0 defeat at Watford on Sunday in David Moyes's first match in charge, with goals from Will Hughes and Richarlison settling the match at Vicarage Road.
West Ham's Andy Carroll leaves his mark on Watford but barely troubles the match | Simon Burnton Read more
It seems some supporters have been taking their grievances too far though, with the Essex Police Force Control Room urging them to stop using the emergency number.
A tweet from the @EPControlRoom account read: "Ringing 999 because @WestHamUtd have lost again and you aren't sure what to do is not acceptable! It is a com




https://fb.me/849QGCqbZ
Top topics number are: 
Jerry Van Dyke played a nightclub comic in the short-lived '60s sitcom "Accidental Family." (NBC)
He was the younger brother of a more famous performer, but it is perfectly possible that Jerry Van Dyke was the sibling whose work you knew or liked best.
On and off television from the early 1960s to 2015, including eight seasons of "Coach" and a late-career recurring role on "The Middle, " Van Dyke, who died Friday at age 86, may have shared a name and DNA with the star of "Bye, Bye, Birdie, " "Mary Poppins" and, of course, "The Dick Van Dyke Show."
But he was an original, not a knock-off, funny and charming — funny and testy, later — in ways unmistakably his own.
The Van Dykes hailed from Danville, Ill., in the rural Midwest, and compared to Dick's bright-eyed, polished persona, Jerry kept something of the country about him — he was a dropper of Gs, a user of "ain't." His manner was softer than that of his older brother, sweet and sleep

In [40]:
# Notebook only: for (at most) the first 101 history items of the selected user, log the
# item, then its 3 closest and its 3 farthest candidates according to `distances`.
if isNotebook:
    for i in range(min(distances.shape[0], 101)):
        # The history item itself:
        trainUrl = xurls[i]
        trainText = urlsTexts[trainUrl]
        trainVector = urlsVectors[trainUrl]
        log(trainUrl, logger)
        printTopicsOf(trainVector, topics, logger=logger)
        log(trainText[:2000], logger)
        log("\n", logger)
        # Every candidate with its text and its distance to the history item:
        currentDistances = [
            (candidateUrl, urlsTexts[candidateUrl], distances[i][u])
            for u, candidateUrl in enumerate(yurls)
        ]
        topSim = sortBy(currentDistances, index=2, desc=False)[:3]
        topDissim = sortBy(currentDistances, index=2, desc=True)[:3]
        # Same report for both ends of the distance ordering:
        for title, neighbours in (("MOST SIMILARS", topSim), ("MOST DISSIMILARS", topDissim)):
            log(title, logger)
            log("\n", logger)
            for url, text, dist in neighbours:
                log(dist, logger)
                log(url, logger)
                printTopicsOf(urlsVectors[url], topics, logger=logger)
                log(text[:2000], logger)
                log("\n", logger)
        # Separator between two history items:
        log("\n", logger)
        log("\n" * 2 + '-' * 20 + "\n" * 2, logger)

https://www.theguardian.com/football/2017/nov/06/slaven-bilic-sacked-west-ham-david-moyes-manager
Top topics number are: 13
0: Ham West match club Madrid relegation Denver supporters Barcelona League
West Ham United are set to appoint David Moyes on an initial six-month contract and hand him the responsibility of hauling them away from relegation danger after finally losing faith in Slaven Bilic's ability to reverse the team's decline.
The club hope the former Sunderland manager can take training as early as on Tuesday morning with Alan Irvine, Chris Woods and possibly Stuart Pearce joining him as part of his backroom staff. Moyes will be offered a fresh two-year deal if he keeps West Ham in the Premier League.
Bilic, whose contract was due to run out next summer, was sacked on Monday morning after a dismal start to the season that has left West Ham in 18th place and with two wins from their first 11 Premier League games.
While the club's owners, David Gold and David Sullivan, had hope



MOST DISSIMILARS


0.8828763
http://bit.ly/2AkNlxU
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
U.S. Soccer enticed more than 6,000 teenagers nationwide to join development academies by trumpeting that the DA would accelerate the development of world-class female players, that it would offer advantages in recruiting and that it would be the primary pathway to being selected for a youth national team.
That plan changed when Oklahoma pledge Taylor Tufts found a way to continue playing for Southlake Carroll through a loophole that not everyone knows exists.
One of the best soccer players in Texas was planning to spend her junior year as a team manager, just so she could continue to be around her high school team while playing for a development academy.
"I think academy is starting to figure out that they may have to let these kids play high school soccer if they want to keep them, " Coppell coach Ryan Dunlevy said. "They're trying to figure out 



0.0
http://nyti.ms/2zaO6c6
Top topics number are: 
China Shrugs Off Debt Worries as Xi Takes Firmer Economic Grip
China's new statement on economic priorities illustrates President Xi Jinping's growing clout, as well as a subtle shift in how the country may address its debt pile.CreditFred Dufour/Agence France-Presse — Getty Images
SHANGHAI — It's Xi Jinping's economy now, and he isn't too worried about debt.
China signaled its economic priorities on Wednesday at the end of a meeting of top Communist Party economic leaders with a statement indicating that President Xi is fully in charge. Labeled "Xi Jinping Thought on Socialist Economy With Chinese Characteristics, " the statement called for trimming industrial overcapacity, controlling the supply of money and other moves that have been staples of China's other recent declarations.
"Prudent monetary policy should be kept neutral, the floodgates of monetary supply should be controlled, and credit and social financing should see reason







--------------------


http://soccer.nbcsports.com/2017/11/07/var-problems-causing-major-headache-for-bundesliga/
Top topics number are: 
BERLIN (AP) Accusations of manipulation and incompetence have dogged the Bundesliga's trial of the video assistant referee this season, forcing the German soccer federation to act quickly to pacify angry team officials.
The DFB dismissed Hellmut Krug as head of its VAR project this week after tabloid claims he influenced two key decisions in a game between Schalke, his hometown club, and visiting Wolfsburg on Oct. 28.
[ VOTE: PST's Big American soccer survey ]
Lutz Michael Froehlich, the head of the federation's refereeing department, will take over.
Krug and Marco Fritz, the video assistant operating in Cologne that day last month, denied the allegations, and the DFB said it was impossible for supervisors to intervene during games. But the federation evidently felt a fresh start was needed after weeks of controversy since VAR started this seas



0.5711882
https://sportsday.dallasnews.com/soccer/soccer/2018/01/01/fc-dallas-buy-new-years-gift
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
Waking up to the new year FC Dallas fans might be somewhat surprised to find out their club may have acquired a veteran, European, World Cup experienced center back. Frisco, meet Reto Ziegler.
Update 10:08 am - FC Dallas' replied to inquires about Ziegler with the following: "Our technical staff is constantly exploring options to improve our club. We won't have anything further at this time." So, there's that.
While nothing has been released from Toyota Stadium, the rest of the soccer world seem to think this is a done deal. Many of the player databases are showing Ziegler as a Huntsman and coming to Texas on a free transfer. How the center back will be accounted for in one of MLS' different salary cap loopholes is not yet known. No matter the price, Oscar Pareja is getting a rather well traveled player



0.0
http://www.courant.com/opinion/editorials/hc-ed-ct-needs-left-lane-law-20180101-story.html
Top topics number are: 
The state legislature might have more pressing issues, but when it convenes in February, legislators should pass a law making it clear to drivers that the left lane is for passing and turning left.
"Slower Traffic Keep Right" signs already line hilly roads in Connecticut, but it's a good idea for drivers everywhere, whether on a steep hill or a flat interstate.
Connecticut has an epidemic of left-lane lollygaggers, not only on its highways but on roads such as Route 44 west of Hartford and the Berlin Turnpike. Though many drivers complain about weavers who switch lanes to get around slower vehicles, the cause is often a car loitering in the left lane.
Many states have laws restricting the left lanes of divided highways to vehicles that are passing traffic, and there's a good reason for it: The right-hand side of any vehicle has a larger blind spot. Passing a vehicle 



MOST SIMILARS


1.2695789e-05
https://medium.com/music-city-soccer/nashville-sc-signs-four-new-players-to-usl-squad-dcaeab3bdecf
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
Nashville SC signs four new players to USL squad
This is Link-Up Play, the daily roundup of Nashville soccer news from Music City Soccer. If you would like to have it delivered directly to your inbox, click here .
Nashville SC announced the signing of four new players to its USL squad on Thursday, bringing the total players on board for their inaugural season in the second flight to 17. It's a mix of low-ceiling, high-floor veterans and young, unproven talents.
29-year-old defender Justin Davis began his professional career with Minnesota United in 2011 when the Loons were in the NASL. He made the jump with the club to MLS in 2017, starting out as United's first-choice left fullback. Davis lost his spot, however, and saw his minutes disappear in the second half of the sea



0.6529234
http://ow.ly/L28b30hsjkh
Top topics number are: 2
0: Denver Amazon city taxes property bids ad values Dallas finalists
Assistant City Manager Jessica Binnquist will succeed Mark Yokoyama as Alhambra's city manager, the city announced Tuesday.
Binnquist has served as interim city manager since Yokoyama's retirement Nov. 2. Yokoyama was the city's police chief for about five years before making the switch to city manager in May 2016.
In her eight years with the city, Binnquist has spearheaded the city's use of federal grant funding from the U.S. Department of Housing, including contracting with Pasadena's Union Station Homeless Services to provide services in Alhambra.
While the city had hired consultant Bob Murray & Associates to conduct a search for candidates, the City Council found after interviewing the candidates that Binnquist was the right choice, Mayor Stephen Sham said.
"The City Council has been extremely impressed with Ms. Binnquist's ability to step into the inte



0.06485641
http://inthesetimes.com/article/20828/paula-jean-swearengin-joe-manchin-senate-west-virginia
Top topics number are: 13
0: Ham West match club Madrid relegation Denver supporters Barcelona League
Paula Jean Swearengin is running to unseat Sen. Joe Manchin in West Virginia and rein in the power of the coal industry in the state. (Youtube/The Young Turks)
Paula Jean Swearengin supported Bernie Sanders in 2016. Now she's challenging Big Coal and the Senate's most conservative Democrat.
Our incumbents, Democrats and Republicans, have been coal industry servants. They haven't been servants for the people and definitely haven't been friends of coal miners.
On Dec. 27, 2017, self-professed "hillbilly" Paula Jean Swearengin flew from her hometown of West Virginia to the posh enclave of Beverly Hills, Calif., to participate in a forum of reform-minded women running for Congress. But Swearengin, who is running in the Democratic primary for West Virginia Senate against incumbent Joe M

Top topics number are: 2
0: Denver Amazon city taxes property bids ad values Dallas finalists
UPDATE: The Internal Revenue Service said late Wednesday that homeowners can only deduct prepaid property taxes from their next income tax filings if those property taxes were assessed and paid during 2017. Most Washington-area jurisdictions say they have not yet sent out property tax bills for the coming year which, according to the IRS, would mean property owners can not pay in advance and then claim a deduction before a new federal cap on deductions takes effect.
In other words, thousands of homeowners who scrambled to prepay their 2018 property taxes this week may have done so in vain.
The federal tax legislation that takes effect Jan. 1 will cap the state and local deductions on federal tax returns at $10,000. Residents who planned to pay next year's taxes before this year ends were hoping their efforts will allow them to take advantage of the deductions one last time. Here is a roundup o



MOST DISSIMILARS


0.58213973
http://bit.ly/2AkNlxU
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
U.S. Soccer enticed more than 6,000 teenagers nationwide to join development academies by trumpeting that the DA would accelerate the development of world-class female players, that it would offer advantages in recruiting and that it would be the primary pathway to being selected for a youth national team.
That plan changed when Oklahoma pledge Taylor Tufts found a way to continue playing for Southlake Carroll through a loophole that not everyone knows exists.
One of the best soccer players in Texas was planning to spend her junior year as a team manager, just so she could continue to be around her high school team while playing for a development academy.
"I think academy is starting to figure out that they may have to let these kids play high school soccer if they want to keep them, " Coppell coach Ryan Dunlevy said. "They're trying to figure out



MOST SIMILARS


0.062747
http://www.bbc.co.uk/sport/football/42683488
Top topics number are: 20
0: Barcelona Messi Denver Colorado Winter Olympics club Games Ham play
Jefferson Lerma (left) told Spanish TV after the game: "These racist actions shouldn't happen"
Celta Vigo's former Liverpool striker Iago Aspas has denied an allegation he racially abused an opposition player.
Levante midfielder Jefferson Lerma, 23, claimed Aspas used a racial slur during Celta's 1-0 La Liga victory on Sunday.
After he told Spanish television of the alleged abuse, his club issued a statement supporting him.
Celta also released a statement in which 30-year-old Spain international Aspas said he "didn't call" Lerma "what he said I did".
He added: "What is said on the pitch stays on the pitch. For that reason I'm not going to repeat what he said to me."
Celta's statement added they "strongly defend fair play and respect on and off the pitch", values the club said were "part of our identity".
Aspas joined Li



0.88544
https://www.washingtonpost.com/local/can-you-prepay-your-real-estate-taxes-before-the-tax-bill-takes-effect-find-out-here/2017/12/26/5539b14c-ea7c-11e7-8a6a-80acf0774e64_story.html?utm_term=.d3b4b6c86a6e
Top topics number are: 2
0: Denver Amazon city taxes property bids ad values Dallas finalists
UPDATE: The Internal Revenue Service said late Wednesday that homeowners can only deduct prepaid property taxes from their next income tax filings if those property taxes were assessed and paid during 2017. Most Washington-area jurisdictions say they have not yet sent out property tax bills for the coming year which, according to the IRS, would mean property owners can not pay in advance and then claim a deduction before a new federal cap on deductions takes effect.
In other words, thousands of homeowners who scrambled to prepay their 2018 property taxes this week may have done so in vain.
The federal tax legislation that takes effect Jan. 1 will cap the state and local deductions on

When Trump got to Washington, the biggest prison companies were already making more money than ever from immigrant detention, and today their prospects have never looked rosier. Fueled by the administration's deportation dragnet, Immigration and Customs Enforcement has predicted a surge in its daily population of detainees, from around 34,000 in July to more than 51,000 over the next year — and prison companies are more than happy to accommodate. In April, GEO Group executives won the administration's first private immigration detention contract, for a facility in Conroe, Texas, that's expected to bring in $44 million annually. This year the company also started operating an ICE detention center in Folkston, Georgia, that could boost i


0.0
http://www.stltoday.com/entertainment/movies/reviews/aaron-sorkin-makes-all-the-right-moves-with-molly-s/article_6716412c-5679-52e5-ad59-b6c9d76cc676.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share
Top topics number are: 
Molly Bl







--------------------


https://www.dallasnews.com/news/frisco/2017/12/12/new-soccer-hall-fame-frisco-will-kick-offin-october-induction-ceremony-games
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
Get Unlimited Digital Access Your first month is less than a dollar. $0.99 for first 4 weeks Subscribe Now
FRISCO — The goal is in sight for the National Soccer Hall of Fame.
Since the 2015 ground-breaking, much of the construction work at the south end of Toyota Stadium has been below ground level. That area will be home to the locker rooms, training rooms and coaches' offices, along with a new club space, patio and media center.
Above that, the new street-level space housing a more upscale club, premier seating and the Hall of Fame is just starting to take shape.
"I knew it was going to be big, but we made it Texas-sized, " FC Dallas president Dan Hunt said Tuesday during a news conference to announce the opening day festivities.
A new home
The 

Top topics number are: 2
0: Denver Amazon city taxes property bids ad values Dallas finalists
UPDATE: The Internal Revenue Service said late Wednesday that homeowners can only deduct prepaid property taxes from their next income tax filings if those property taxes were assessed and paid during 2017. Most Washington-area jurisdictions say they have not yet sent out property tax bills for the coming year which, according to the IRS, would mean property owners can not pay in advance and then claim a deduction before a new federal cap on deductions takes effect.
In other words, thousands of homeowners who scrambled to prepay their 2018 property taxes this week may have done so in vain.
The federal tax legislation that takes effect Jan. 1 will cap the state and local deductions on federal tax returns at $10,000. Residents who planned to pay next year's taxes before this year ends were hoping their efforts will allow them to take advantage of the deductions one last time. Here is a roundup o



0.0
http://nyti.ms/2zaO6c6
Top topics number are: 
China Shrugs Off Debt Worries as Xi Takes Firmer Economic Grip
China's new statement on economic priorities illustrates President Xi Jinping's growing clout, as well as a subtle shift in how the country may address its debt pile.CreditFred Dufour/Agence France-Presse — Getty Images
SHANGHAI — It's Xi Jinping's economy now, and he isn't too worried about debt.
China signaled its economic priorities on Wednesday at the end of a meeting of top Communist Party economic leaders with a statement indicating that President Xi is fully in charge. Labeled "Xi Jinping Thought on Socialist Economy With Chinese Characteristics, " the statement called for trimming industrial overcapacity, controlling the supply of money and other moves that have been staples of China's other recent declarations.
"Prudent monetary policy should be kept neutral, the floodgates of monetary supply should be controlled, and credit and social financing should see reason







--------------------


https://www.si.com/soccer/2017/12/20/el-clasico-real-madrid-barcelona-preview-zidane-valverde-messi-ronaldo
Top topics number are: 20
0: Barcelona Messi Denver Colorado Winter Olympics club Games Ham play
Back in August, after losing 5-1 on aggregate to Real Madrid in the Spanish Super Cup, Ernesto Valverde, having just finished his first preseason as Barcelona's new manager, sat in the press room and reflected on his team's poor performance.
"They [Real Madrid] had a sense of energy, pressed more intensely and punished on our mistakes, " he said speaking in the post-match conference.
But as the conference went on, Valverde highlighted another reason for the loss. "Things have happened this preseason that can affect the team's imbalance. So we have to find a new balance, play well, all-around and find results."
For a quiet, reserved, disciplined man such as Valverde, one can only imagine how difficult this past summer must have been. Neymar's transfer saga,



0.8151067
https://sportsday.dallasnews.com/soccer/soccer/2018/01/01/fc-dallas-buy-new-years-gift
Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
Waking up to the new year FC Dallas fans might be somewhat surprised to find out their club may have acquired a veteran, European, World Cup experienced center back. Frisco, meet Reto Ziegler.
Update 10:08 am - FC Dallas' replied to inquires about Ziegler with the following: "Our technical staff is constantly exploring options to improve our club. We won't have anything further at this time." So, there's that.
While nothing has been released from Toyota Stadium, the rest of the soccer world seem to think this is a done deal. Many of the player databases are showing Ziegler as a Huntsman and coming to Texas on a free transfer. How the center back will be accounted for in one of MLS' different salary cap loopholes is not yet known. No matter the price, Oscar Pareja is getting a rather well traveled player



0.54133135
https://www.forbes.com/sites/omribarzilay/2018/01/10/10-real-estate-startups-to-watch-in-2018/2/#5a298519332a
Top topics number are: 
The last decade has witnessed an exponential growth in real estate tech startups. Globally, the number of startups rose from 176 in 2008 to 1,274 by 2017. In the same period, cumulative investments in these startups soared from $2.4 billion to $33.7 billion.
2017 was a monumental year for real estate tech with some incredible financing rounds, IPOs, exits and ICOs. Softbank Vision Fund has shown great confidence in the sector, deploying $4.4 billion to the coworking behemoth WeWork, $450 million to tech enabled brokerage Compass and $120 million to home insurance company Lemonade. Fifth Wall Ventures raised $212 million for its first fund, the biggest dedicated real estate technology fund to date.
Redfin's IPO was one of the most successful IPOs of this year, soaring 45% in its first day. Multifamily listing company ForRent was acquired for 

Top topics number are: 7
0: FC Dallas Hunt Fame Soccer Frisco Cup Hall players Stadium
FC Dallas defender Maynor Figueroa (31, center) celebrates with team mates after he scored a goal in the sixth minute of the first half of a soccer game between the Colorado Rapids and FC Dallas on Wednesday, September 27, 2017 at Toyota Stadium in Frisco, Texas. (Ashley Landis/The Dallas Morning News)
"Lanus' edge is it goes out to seek out (players) where others don't go. That's the fundamental difference", Ricchetti said during an interview last month in Buenos Aires.
With a $5.5 million budget for 2016, Ricchetti was part of the coaching staff lead by Jorge Almiron who took Lanus to the Copa Libertadores final game in November after winning the Argentine league the previous year.
Pablo Ricchetti knows what working under a limited budget means, but he thinks it all comes down to using the funds wisely.
Toronto spent the most, with $20.1 million in salaries - They won the championship less than thr

Top topics number are: 2
0: Denver Amazon city taxes property bids ad values Dallas finalists
UPDATE: The Internal Revenue Service said late Wednesday that homeowners can only deduct prepaid property taxes from their next income tax filings if those property taxes were assessed and paid during 2017. Most Washington-area jurisdictions say they have not yet sent out property tax bills for the coming year which, according to the IRS, would mean property owners can not pay in advance and then claim a deduction before a new federal cap on deductions takes effect.
In other words, thousands of homeowners who scrambled to prepay their 2018 property taxes this week may have done so in vain.
The federal tax legislation that takes effect Jan. 1 will cap the state and local deductions on federal tax returns at $10,000. Residents who planned to pay next year's taxes before this year ends were hoping their efforts will allow them to take advantage of the deductions one last time. Here is a roundup o

# Old stuff

In [41]:
# Serialization (disabled):
# The guard below is a constant False, so this cell is skipped on every run.
# When enabled, it writes the fitted `model` and the `config` dict side by side
# in a "sklearn-models" directory, both files tagged with the first 5
# characters of objectToHash(config) so a model can be matched to its config.
if False:
    try:
        dirPath = nosaveDir() + "/sklearn-models"
        mkdir(dirPath)
        # Short tag derived from the config; presumably a hex digest string -- TODO confirm
        configHash = objectToHash(config)[:5]
        # Build both destinations up front so the I/O calls read as a pair
        pickleTarget = dirPath + "/model-" + configHash + ".pickle"
        jsonTarget = dirPath + "/config-" + configHash + ".json"
        serialize(model, pickleTarget)
        toJsonFile(config, jsonTarget)
    except Exception as e:
        # Best effort: any failure is handed to logException instead of propagating
        logException(e, logger=logger)