# Commands

In [1]:
# cd misc-logs ; ./mv-old-logs.sh ; jupython --venv st-venv ~/notebooks/twinews/topicmodels.ipynb

In [2]:
# Sources:
# https://towardsdatascience.com/lets-build-an-article-recommender-using-lda-f22d71b7143e
# https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

# Init

In [3]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
isNotebook = '__file__' not in locals()

In [5]:
TEST = isNotebook

In [6]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *

In [7]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [8]:
logger = Logger(tmpDir('logs') + "/lda.log") if isNotebook else Logger("lda.log")
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 20 if TEST else None, # Sub-sampling
    'maxDocuments': 500 if TEST else 10000,
    'useExtraNews': False if TEST else False, # None = unlimited, 0 = no extra news
    'minDF': 1 / 500, # Remove words that have a document frequency ratio lower than 1 / 500
    'maxDF': 300, # Remove top 300 voc elements
    
    'nbTopics': 30, # 30, 100
    'lowercase': True,
    'doLemmatization': False if TEST else True,
    # <https://www.quora.com/How-do-you-combine-LDA-and-tf-idf>
    # <https://www.quora.com/Why-is-the-performance-improved-by-using-TFIDF-instead-of-bag-of-words-in-LDA-clustering>
    'useTFIDF': False,
    
    'maxIter': 20 if TEST else 30, # 30 for lda, 200 for nmf
    
    'nmfInit': 'nndsvd', # None, 'nndsvd'
    'nmfL1Ratio': 0, # 0.0, 0.5, 1.0
    'nmfAlpha': 0.1, # 0.0, 0.1
    
    'ldaLearningMethod': 'online',
    'ldaLearningOffset': 1.0, # 1.0, 10.0
    'ldaLearningDecay': 0.7, # 0.5, 0.7, 0.9, 1.0
    
    'model': 'gensim-lda', # gensim-lda, sklearn-lda, sklearn-nmf
    'distance': 'euclidean', # 'cosine', 'euclidean', 'kl', 'js'
    # The historyRef param is very important, it allow to choose, for a particular candidate,
    # how many train history items will be used to calculate the similarity with
    # the user history.
    # Float are ratio on train history
    # Integers are absolute number of train item in the history
    # For example:
    #  * 1.0 will allow to mean similarities of a candidate with all train history items
    #  * 1 will allow to use only the most similar train item for the similarity of
    #    the candidate with the history of the user
    #  * 0.5 will allow to use the half of history for each candidates
    #  * 3 to use 3 most similar items with the current candidate...
    'historyRef': 1.0, # 1, 1.0, 0.5, 0.3, 3, 10
}

In [10]:
if 'nmf' in config['model']:
    modelName = "nmf"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('lda'):
            newConfig[key] = value
    config = newConfig
    del config['useTFIDF']
elif 'lda' in config['model']:
    modelName = "lda"
    newConfig = dict()
    for key, value in config.items():
        if not key.startswith('nmf'):
            newConfig[key] = value
    config = newConfig

# Getting data

In [11]:
# Getting users and news
evalData = getEvalData(config['splitVersion'], maxExtraNews=config['maxDocuments'],
                       maxUsers=config['maxUsers'], logger=logger)
(trainUsers, testUsers, trainNews, testNews, candidates, extraNews) = \
(evalData['trainUsers'], evalData['testUsers'], evalData['trainNews'],
 evalData['testNews'], evalData['candidates'], evalData['extraNews'])
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 3.92s | message: Eval data loaded
twinews news (version 1.0) initialised.
--> tic: 7.8s | message: Extra news downloaded
--> toc total duration: 11.73s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [12]:
extraNewsList = shuffle(list(extraNews), seed=0)
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)

In [13]:
# We get urls for the LDA model:
if config['useExtraNews']:
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList
urlsForModel = urlsForModel[:config['maxDocuments']]
# We get urls to vectorize for the training and the inference:
urlsToVectorize = copy.deepcopy(urlsForModel)
for url in trainNewsList + testNewsList:
    if url not in urlsToVectorize:
        urlsToVectorize.append(url)
# We get url to infere for the scoring:
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

500 urls for urlsForModel:
[
  http://www.dailymail.co.uk/news/article-5108873/Joe-Barton-apologizes-leaked-nude-photo.html,
  https://www.vanityfair.com/news/2017/12/trump-cant-believe-no-ones-thrown-a-parade-in-his-honor,
  ...,
  https://www.vox.com/policy-and-politics/2017/10/26/16554682/voxcare-childrens-health-insurance-progr,
  https://www.nytimes.com/interactive/2017/12/08/us/puerto-rico-hurricane-maria-death-toll.html?ref=to
]
18009 urls for urlsToVectorize:
[
  http://www.dailymail.co.uk/news/article-5108873/Joe-Barton-apologizes-leaked-nude-photo.html,
  https://www.vanityfair.com/news/2017/12/trump-cant-believe-no-ones-thrown-a-parade-in-his-honor,
  ...,
  http://detroit.cbslocal.com/2018/01/14/report-patriots-dc-matt-patricia-expected-to-be-named-lions-h,
  https://www.huffingtonpost.com/entry/hartnett-white-confirm_us_5a39781fe4b06d1621b01f4b
]
18009 urls for urlsToInfere:
[
  http://www.dailymail.co.uk/news/article-5108873/Joe-Barton-apologizes-leaked-nude-photo.html,
 

In [14]:
# We get sentences:
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (48.807s left)
 19% [===                 ] (42.066s left)
 29% [=====               ] (36.215s left)
[
  [ [ Texas, Congressman, ..., it, . ], [ The, Republican, ..., wife, . ], ..., [ Barton, joined, ..., Texas, . ], [ He, is, ..., Committee, . ] ],
  [ [ On, Wednesday, ..., time, . ], [ The, momentous, ..., pants, . ], ..., [ Hashed, Out, ..., W.S.J., ) ], [ Gary, Cohn, ..., Hive, ) ] ],
  ...,
  [ [ Report, :, ..., Texas, . ], [ (, Photo, ..., over, . ], ..., [ If, -, ..., Quinn, . ], [ Detroit, publicly, ..., Cooter, . ] ],
  [ [ Democrats, are, ..., nomination, . ], [ With, Republican, ..., holidays, . ], ..., [ That, could, ..., Sens, . ], [ Lamar, Alexander, ..., issue, . ] ]
]


In [15]:
# We flatten sentences:
for i in range(len(sentences)):
    sentences[i] = flattenLists(sentences[i])
docs = sentences
bp(docs, logger)

[ [ Texas, Congressman, ..., Committee, . ], [ On, Wednesday, ..., Hive, ) ], ..., [ Report, :, ..., Cooter, . ], [ Democrats, are, ..., issue, . ] ]


In [16]:
# Lower case:
if config['lowercase']:
    for i in range(len(docs)):
        for u in range(len(docs[i])):
            docs[i][u] = docs[i][u].lower()
bp(docs, logger)

[ [ texas, congressman, ..., committee, . ], [ on, wednesday, ..., hive, ) ], ..., [ report, :, ..., cooter, . ], [ democrats, are, ..., issue, . ] ]


In [17]:
# Lemmatization:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for i in range(len(docs)):
        for u in range(len(docs[i])):
            docs[i][u] = lemmatizer.lemmatize(docs[i][u])
        pbar.tic()
bp(docs, logger)

Lemmatization   0% [                    ]
Lemmatization   9% [=                   ] (2m 3.728s left)
Lemmatization  19% [===                 ] (1m 32.497s left)
Lemmatization  29% [=====               ] (1m 16.097s left)
[ [ texas, congressman, ..., committee, . ], [ on, wednesday, ..., hive, ) ], ..., [ report, :, ..., cooter, . ], [ democrat, are, ..., issue, . ] ]


In [18]:
# Filtering the corpus:
docs = filterCorpus(docs, minDF=config['minDF'], maxDF=config['maxDF'],
                    removeEmptyDocs=False, allowEmptyDocs=False, logger=logger)
for doc in docs: assert len(doc) > 0
bp(docs, logger)

Voc removed because of minDF (116146 elements):
{ 0, 00s, 019568429x, 0640l, 0cean, 0day, 0f, 0x, 0x800f0845, 1, ..., 😃, 😄, 😇, 😉, 😍, 😒, 😠, 😢, 🙅, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., without, woman, work, working, world, would, year, yet, you, your }
90.1% of voc will be removed.
[ [ texas, congressman, ..., commerce, committee ], [ wednesday, congress, ..., face, tax ], ..., [ patriot, matt, ..., jim, bob ], [ democrat, mounting, ..., key, policy ] ]


In [19]:
tt.tic("Data preprocessed")

--> tic: 2m 56.5s | message: Data preprocessed


176.5

# Infering topic vectors

In [20]:
if config['model'] == 'gensim-lda':
    dictionary = gensim.corpora.Dictionary(docs)
    # dictionary.filter_extremes(no_below=config['min_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    bowForModel = bow[:len(urlsForModel)]
    assert len(bowForModel) == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    bowForInference = bow[i:i + len(trainNews) + len(testNews)]
    assert len(bowForInference) == len(trainNews) + len(testNews)
    lda_model = gensim.models.LdaMulticore\
    (
        bowForModel,
        num_topics=config['nbTopics'],
        id2word=dictionary,
        iterations=config['maxIter'],
        decay=config['ldaLearningDecay'],
        offset=config['ldaLearningOffset'],
        workers=cpuCount(),
        passes=3,
    )
    inferedVectors = []
    for current in bowForInference:
        topicProbDistrib = lda_model[current]
        currentVector = [0.0] * config['nbTopics']
        for t, v in topicProbDistrib:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == len(trainNews) + len(testNews)
    assert len(inferedVectors[0]) == config['nbTopics']
    topics = []
    for i in range(lda_model.num_topics):
        current = dict()
        for x in lda_model.get_topic_terms(i, topn=100):
            current[dictionary[x[0]]] = x[1]
        topics.append(current)

In [21]:
if config['model'] == 'sklearn-lda' or config['model'] == 'sklearn-nmf':
    if config['model'] == 'sklearn-nmf' or config['useTFIDF']:
        vectorizer = TfidfVectorizer\
        (
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    else:
        vectorizer = CountVectorizer\
        (
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    vectorsForInference = vectors[i:i + len(trainNews) + len(testNews)]
    assert vectorsForInference.shape[0] == len(trainNews) + len(testNews)
    if config['model'] == 'sklearn-lda':
        model = LatentDirichletAllocation\
        (
            n_components=config['nbTopics'],
            learning_method=config['ldaLearningMethod'],
            learning_offset=config['ldaLearningOffset'],
            learning_decay=config['ldaLearningDecay'],
            random_state=0,
            n_jobs=cpuCount(),
            max_iter=config['maxIter'],
        )
    else:
        model = NMF\
        (
            n_components=config['nbTopics'],
            random_state=0,
            alpha=config['nmfAlpha'],
            l1_ratio=config['nmfL1Ratio'],
            init=config['nmfInit'],
            max_iter=config['maxIter'],
        )
    model.fit(vectorsForModel)
    inferedVectors = model.transform(vectorsForInference)
    assert inferedVectors.shape[0] == len(trainNews) + len(testNews)
    assert inferedVectors[0].shape[0] == config['nbTopics']
    topics = []
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        wordProb = []
        for i in range(len(topic)):
            prob = topic[i]
            word = feature_names[i]
            wordProb.append((word, prob))
        wordProb = sortBy(wordProb, desc=True, index=1)[:100]
        current = dict()
        for word, prob in wordProb:
            current[word] = prob
        topics.append(current)

In [22]:
tt.tic("Model fitted and topic vectors infered")

--> tic: 41.34s | message: Model fitted and topic vectors infered


41.34

# Showing topics

In [23]:
def printTopics(topics, maxWords=10, logger=None):
    for i in range(len(topics)):
        log(str(i) + ": " + str(" ".join(list(topics[i].keys())[:10])), logger)

In [24]:
def printTopicsOf(vector, topics, logger=None):
    topicsRepr = ""
    topTopics = sortBy([(i, score) for i, score in enumerate(vector) if score > 0.1], desc=True, index=1)[:3]
    log("Top topics number are: " + str(" ".join([str(e[0]) for e in topTopics])), logger)
    currentTopics = [topics[e[0]] for e in topTopics]
    printTopics(currentTopics, logger=logger)

In [25]:
printTopics(topics, logger=logger)

0: harassment sexual republican speaker email senate lawmaker party pilot complaint
1: tax bill republican senate cut disclosure individual campaign policy income
2: mr. weinstein employee ms. china sexual lauer meeting executive producer
3: republican district court political vote party democrat conservative mr. democratic
4: political tax drug republican court health federal parking port policy
5: player game north ms. football play bank league mr. health
6: ms. louis mr. robertson sexual human goodman comedy consciousness brain
7: died gun student actor known photo friend fbi __float_2__ film
8: disclosure cdc analyst word veteran administration speech political budget investigation
9: journal health disclosure court board study paper environmental research speech
10: market credit security drug game sexual social mindfulness harassment bus
11: tax truck republican chip health congress funding bank enrollee billion
12: agency epa policy tax administration energy air power federal pr

# Serialization

In [26]:
if not isNotebook:
    try:
        dirPath = nosaveDir() + "/sklearn-models"
        mkdir(dirPath)
        configHash = objectToHash(config)[:5]
        serialize(model, dirPath + "/model-" + configHash + ".pickle")
        toJsonFile(config, dirPath + "/config-" + configHash + ".json")
    except Exception as e:
        logException(e, logger=logger)

# Making a dict url --> topic vector and a dict url --> text

In [27]:
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = dict()
for i in range(len(urlsToInfere)):
    urlsVectors[urlsToInfere[i]] = inferedVectors[i]

In [28]:
if isNotebook:
    urlsTexts = dict()
    allTexts = getNewsText(urlsToInfere, logger=logger)
    for i in range(len(urlsToInfere)):
        urlsTexts[urlsToInfere[i]] = allTexts[i]

  0% [                    ]
  9% [=                   ] (47.186s left)
 19% [===                 ] (39.984s left)
 29% [=====               ] (34.254s left)


# Prints

In [31]:
if isNotebook:
    userId = list(trainUsers.keys())[11]
    xvectors = []
    xurls = []
    for url in trainUsers[userId]:
        xvectors.append(urlsVectors[url])
        xurls.append(url)
    xvectors = np.array(xvectors)
    yvectors = []
    yurls = []
    for url in candidates[userId][0]:
        yvectors.append(urlsVectors[url])
        yurls.append(url)
    yvectors = np.array(yvectors)
    distances = getDistances(xvectors, yvectors, metric='cosine')

In [32]:
# Printing some docs with topics:
if isNotebook:
    for i in range(10):
        urls = random.choice([xurls, yurls])
        url = random.choice(urls)
        text = urlsTexts[url]
        vector = urlsVectors[url]
        log(url, logger)
        printTopicsOf(vector, topics, logger=logger)
        log(text, logger)
        log("\n" * 2, logger)

https://lnkd.in/dD7W2RP
Top topics number are: 7 5
0: died gun student actor known photo friend fbi __float_2__ film
1: player game north ms. football play bank league mr. health
Thousands of veterans will receive a unique experience this college football bowl season thanks to the nonprofit Veteran Ticket Foundation, or Vet Tix.
The organization, founded in 2008, has provided more than one million tickets to sporting events, concerts and other programs in the past year alone.
Out of 40 bowl games schedule between NCAA teams this year, Vet Tix will provide tickets to 19 of the games.
That includes the Goodyear Cotton Bowl between the University of Southern California and Ohio State University in Arlington, Texas, on Friday; the Fiesta Bowl between Washington and Penn State in Phoenix on Saturday; the College Football Playoff at the Rose Bowl between Georgia and Oklahoma on Jan. 1; and the College Football Playoff National Championship in Atlanta on Jan. 8.
Since its founding, Vet Tix ha




https://www.cnsnews.com/news/article/terence-p-jeffrey/gop-congress-presides-over-highest-spending-obamas-stimulus
Top topics number are: 14 26
0: mattress derek site prison treatment fiscal air federal spending affiliate
1: tax cut housing republican income bill denver affordable increase policy
(CNSNews.com) - Real federal spending in fiscal 2017, which ended on Sept. 30, was higher than in any year in the history of the United States other than fiscal 2009, which was the year that President Barack Obama's $840 billion stimulus law was enacted.
Fiscal 2017 also saw the second highest real federal individual income tax totals of any year in U.S. history, according to the Monthly Treasury Statement released today.
Total federal tax revenues were the third highest in U.S. history.
While it was collecting the third highest total tax revenues in U.S. history, the federal government ran a deficit $665,712,000,000 because of its high total spending.
Republicans have controlled the House 




http://tnne.ws/2CxOi8z
Top topics number are: 7 2 12
0: died gun student actor known photo friend fbi __float_2__ film
1: mr. weinstein employee ms. china sexual lauer meeting executive producer
2: agency epa policy tax administration energy air power federal pruitt
Last Slide Next Slide
In that suit, the plaintiffs alleged that managers have exposed veteran anchor Kalodimos — who is not a plaintiff in the suit — to "many acts of age-based discrimination and hostility."
Those acts include "public berating" and "ridicule, " spreading false rumors that Kalodimos would be retiring soon and referring to the veteran anchor as an "old maid, " according to the lawsuit.
► MORE: Bob Mueller slaps Channel 4 over handling of Demetria Kalodimos
Paul Karpowicz, president of Meredith Local Media Group, which owns WSMV, maintained that the claims are without merit. He denied that anyone at the station was mistreated or discriminated against.
The general manager and news director at the station's h




http://bit.ly/2im2zjm
Top topics number are: 12 1
0: agency epa policy tax administration energy air power federal pruitt
1: tax bill republican senate cut disclosure individual campaign policy income
It's a modern War of Roses over at the Consumer Financial Protection Bureau (CFPB).
Instead of dueling families, it's dueling directors: President Obama's appointee Leandra English claims the title of director is hers, given to her by departing director and fellow Obama-appointee Richard Cordray Richard Adams CordrayPoll: Majority of likely voters support consumer bureau mission Election Countdown: Takeaways from too-close-to-call Ohio special election | Trump endorsements cement power but come with risks | GOP leader's race now rated as 'toss-up' | Record numbers of women nominated | Latino candidates get prominent role in 2020 Michigan race shows two parties on different trajectories MORE. President Trump, however, has passed the title to his Senate-confirmed director of the Office o






In [33]:
# Printing some similar docs:
if isNotebook:
    for i in range(distances.shape[0]):
        if i > 100:
            break
        # Get train:
        trainUrl = xurls[i]
        trainText = urlsTexts[trainUrl]
        trainVector = urlsVectors[trainUrl]
        log(trainUrl, logger)
        printTopicsOf(trainVector, topics, logger=logger)
        log(trainText[:2000], logger)
        log("\n", logger)
        # Get distances:
        currentDistances = []
        for u in range(len(yurls)):
            currentDistances.append((yurls[u], urlsTexts[yurls[u]], distances[i][u]))
        topSim = sortBy(currentDistances, index=2, desc=False)[:3]
        topDissim = sortBy(currentDistances, index=2, desc=True)[:3]
        # Print similars:
        log("MOST SIMILARS", logger)
        log("\n", logger)
        for url, text, dist in topSim:
            log(dist, logger)
            log(url, logger)
            printTopicsOf(urlsVectors[url], topics, logger=logger)
            log(text[:2000], logger)
            log("\n", logger)
        # Print dissimilars:
        log("MOST DISSIMILARS", logger)
        log("\n", logger)
        for url, text, dist in topDissim:
            log(dist, logger)
            log(url, logger)
            printTopicsOf(urlsVectors[url], topics, logger=logger)
            log(text[:2000], logger)
            log("\n", logger)
        log("\n", logger)
        log("\n" * 2 + '-' * 20 + "\n" * 2, logger)        

http://thehill.com/opinion/campaign/353798-republicans-are-headed-for-a-disappointing-end-to-their-year-in-power#.WdTyKpBdNfI.twitter
Top topics number are: 1 22 25
0: tax bill republican senate cut disclosure individual campaign policy income
1: mr. email district republican zinke investigation rule marijuana pun senate
2: china air ozone asia mr. united leader pollution deal bill
The first year of unified Republican control of the government is winding to a disappointing end.
Key GOP campaign promises — ObamaCare repeal, tax reform, and a border wall, among them — remain undone. Congress hasn't passed a budget, nor have they stopped legislating by crisis. Over 200 of President Trump's nominees, the people responsible for implementing the president's agenda across the government, are still waiting for confirmation in the Senate.
So, what's the deal? Democratic obstruction? The 60-vote rule?
None of the above.
The real reason that Congress can't get anything done is because the McConne



MOST DISSIMILARS


1.0
https://buff.ly/2EmWoR4
Top topics number are: 5 27 20
0: player game north ms. football play bank league mr. health
1: furniture johnson employee feedback sofa ai canal price technology game
2: district mr. duke data daniel health student coal political water
Tournament Gives Elementary School Students Robotics Experience
(ABC 6 News) -- Dozens of Austin elementary school students took part in a robotics tournament at Neveln Elementary School Saturday, learning the ropes before a larger tournament takes place at Austin High School later this month.
Teams from multiple schools, including Neveln, Southgate, and IJ Holton, competed to see whose robot took home top honors in an event Neveln's robotics coach said was the first of its kind in the district for younger students.
"This is the only tournament we do with the elementaries, and so it's giving kids on the teams a chance to feel what the real tournament is like, " coach Clint Phillips said.
Students work in 



MOST SIMILARS


0.5702833244132943
https://www.nature.com/articles/s41531-017-0038-9
Top topics number are: 10 14 12
0: market credit security drug game sexual social mindfulness harassment bus
1: mattress derek site prison treatment fiscal air federal spending affiliate
2: agency epa policy tax administration energy air power federal pruitt
Full size table PKG sleep scores and PSG Forty-six subjects without PD who
were investigated with a sleep study (mostly for suspected sleep apnoea) wore
a PKG for the evening of the PSG. They were grouped according to the PSG
findings as: Normal (n = 10): no abnormality was described in the report and
sleep parameters were normal. Median age was 29 y. Normal minus (n = 8): these
were described as 'normal sleep study' but mention was made of some aspect
such as increased leg movements, oxygen saturation changes or sleep
fragmentation. Median age was 42 y. Abnormal (n = 28): these were all cases
reported as abnormal even though description ranged f

2: insurance blockchain internet product policy customer industry neutrality net technology
Baltimore, Md. (WMAR) -
On Thursday morning, a snow storm is expected to make for a rough morning commute for Baltimore drivers.
The biggest concern with this storm is the timing.
The brunt of it is expected to the Baltimore area during the morning commute.
Around six hundred snow crews will head out at midnight to start treating city streets and attacking the icy conditions.
Vincent Ayd the owner of Ayd Hardware had a busy day helping people make their own preparations.
"I don't recommend pre-salting or putting down ice melting compounds prior to the snow, " said Ayd. "It's been so cold that even when we do get whatever little bit is coming our way you'll probably get out there with a broom and sweep it off. The important thing is to have that done before people walk into it and compact and compress the snow and then it becomes ice and then you'll need the ice melting compounds for sure."
Maryl



0.12913383952727575
https://www.nashvillepost.com/politics/elections/article/20987172/randy-boyd-reports-42m-in-1516-earnings
Top topics number are: 1 3
0: tax bill republican senate cut disclosure individual campaign policy income
1: republican district court political vote party democrat conservative mr. democratic
Also: Mark Green touts new endorsements
Republican gubernatorial candidate Randy Boyd (pictured) has released documentation showing he and his wife, Jenny, earned more than $42 million in total income in 2015 and 2016, the two years he served as commissioner of the Tennessee Department of Economic and Community Development.
Boyd said he did not take a state salary and personally paid for travel and other expenses while working for the state, according to a release from his campaign. The East Tennessee businessman started Radio Systems Corporation in 1991; according to Boyd the company brings in annual revenues of more than $400 million and much of his investment income i



1.0
http://www.bnd.com/news/local/community/ofallon-progress/article191769774.html
Top topics number are: 12 4 5
0: agency epa policy tax administration energy air power federal pruitt
1: political tax drug republican court health federal parking port policy
2: player game north ms. football play bank league mr. health
Order Reprint of this Story
The O'Fallon-Shiloh Chamber of Commerce joined the St. Clair County Transit District to celebrate the opening of the MetroBikeLink connection between the Shiloh Wingate Subdivision (north of Southwestern Illinois College) and Scott Air Force Base. The ceremony was held Dec. 19.
This 2.9-mile section was the final extension to the now 11-mile MetroBikeLink Trail that runs throughout St. Clair County. Cyclists, walkers and joggers can enjoy this trail year-round, which now connects five MetroLink stations, beginning at Memorial Hospital and ending at Shiloh-Scott. SCCTD and engineering firm TWM collaborated on the entire trail design. The fund



0.11060598278741596
https://www.fredericknewspost.com/news/economy_and_business/services/i--project-attracts-interest-from-firms/article_d7582638-f940-5a1a-b4eb-53e4b1f32c06.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share
Top topics number are: 4 12 10
0: political tax drug republican court health federal parking port policy
1: agency epa policy tax administration energy air power federal pruitt
2: market credit security drug game sexual social mindfulness harassment bus
Maryland's State Highway Administration has received more than two dozen responses to its initial call for information on plans to combine public and private funds to add new lanes on interstates 270 and 495, but transportation advocates are skeptical that any plan will significantly decrease congestion.
The state received 27 responses from investors, contractors, engineers and other companies in seven countries to the state's Request for Information on the $7.6 billion public-private partnership to







--------------------


http://bit.ly/2im2zjm
Top topics number are: 12 1
0: agency epa policy tax administration energy air power federal pruitt
1: tax bill republican senate cut disclosure individual campaign policy income
It's a modern War of Roses over at the Consumer Financial Protection Bureau (CFPB).
Instead of dueling families, it's dueling directors: President Obama's appointee Leandra English claims the title of director is hers, given to her by departing director and fellow Obama-appointee Richard Cordray Richard Adams CordrayPoll: Majority of likely voters support consumer bureau mission Election Countdown: Takeaways from too-close-to-call Ohio special election | Trump endorsements cement power but come with risks | GOP leader's race now rated as 'toss-up' | Record numbers of women nominated | Latino candidates get prominent role in 2020 Michigan race shows two parties on different trajectories MORE. President Trump, however, has passed the title to his Senate-confirme



1.0
http://on.nfl.com/eya6bl
Top topics number are: 5 25
0: player game north ms. football play bank league mr. health
1: china air ozone asia mr. united leader pollution deal bill
▶ Preseason Power Rankings: Seahawks rise
"He came to me and he wanted to play and he wanted to do this thing , " Chiefs coach Andy Reid said of Hunt, via ESPN. "I fought him a little bit on it, and he won that fight. He told me he'd get it taken care of quick, and he did that. My hat goes off to him."
Hunt did it as quickly as possible, smashing past the necessary 14 yards to pass Rams running back Todd Gurley (who sat out Week 17) with one total carry, his 35-yard touchdown run on Kansas City's opening possession against Denver. One carry, one league lead, one touchdown, one risk taken, one injury avoided and one desire fulfilled. Simple enough.
Hunt was also called into duty out of need. The Chiefs didn't stand to gain anything (other than momentum) from a win over Denver, but with Charcandrick West out



0.12184636034288043
https://www.forbes.com/sites/jillgriffin/2018/01/02/five-tips-for-a-new-years-revolution/#792ceda55601
Top topics number are: 10 0 27
0: market credit security drug game sexual social mindfulness harassment bus
1: harassment sexual republican speaker email senate lawmaker party pilot complaint
2: furniture johnson employee feedback sofa ai canal price technology game
The New Year is upon us, and who knows, this might be your best year yet. Here are a few tips that I hope will be helpful to you as you get ready to tackle 2018:
Take some time to really learn about yourself. Take the Clifton StrengthsFinder or the Meyers-Briggs or any one of a number of terrific assessment tools out there and find out how you are wired. Learn what your signature strengths are. Most of us are wholly unaware of what we are pre-wired to do in this world. Don Clifton, the Grandfather of Positive Psychology and the inventor of the Clifton StrengthsFinder once said "Everyone on the planet 

Kathleen Sibert, president and CEO of Arlington Street People's Network (A-SPAN), presents Lora Rinker with a special award on the first year anniversary of the grand opening of the new Homeless Services Center (HSC) last October. Sibert explains that efforts to end homelessness in Arlington began 25 years ago when Rinker served meals from her front porch on N. Jackson Street in Arlington. Sibert said, "This has been her vision for the last 25 years. It began with a warm heart and much love." Rinker's efforts have led to a full service year-round resource center that provides shelter, hygiene, mental health care, counseling and medical care. Sibert said, "Kasia Shaw is our nurse practitioner. Kasia, raise your hand." Last season 129 people were treated for 343 medical visits. "This center is the first of its kind in the D.C. area. Our end goal is to move people from homelessness to a home." Last year A-SPAN helped 70 homeless people find a home.






--------------------


http://bit.



MOST DISSIMILARS


1.0
https://buff.ly/2EmWoR4
Top topics number are: 5 27 20
0: player game north ms. football play bank league mr. health
1: furniture johnson employee feedback sofa ai canal price technology game
2: district mr. duke data daniel health student coal political water
Tournament Gives Elementary School Students Robotics Experience
(ABC 6 News) -- Dozens of Austin elementary school students took part in a robotics tournament at Neveln Elementary School Saturday, learning the ropes before a larger tournament takes place at Austin High School later this month.
Teams from multiple schools, including Neveln, Southgate, and IJ Holton, competed to see whose robot took home top honors in an event Neveln's robotics coach said was the first of its kind in the district for younger students.
"This is the only tournament we do with the elementaries, and so it's giving kids on the teams a chance to feel what the real tournament is like, " coach Clint Phillips said.
Students work in 



MOST SIMILARS


0.10532679470622208
http://www.themonitor.com/news/article_6cec4284-f029-11e7-8e52-2fdc49ffdecd.html#utm_campaign%3Dblox%26utm_source%3Dtwitter%26utm_medium%3Dsocial
Top topics number are: 25 1 0
0: china air ozone asia mr. united leader pollution deal bill
1: tax bill republican senate cut disclosure individual campaign policy income
2: harassment sexual republican speaker email senate lawmaker party pilot complaint
U.S. Sen. John Cornyn will deliver a State of Congress address at the Knapp Medical Center in Weslaco Jan. 12.
The speech was originally planned for Dec. 1, but got postponed due to negotiations of the tax-overhaul bill.
Sergio Contreras, president of the Rio Grande Valley Partnership, said the Republican Majority Whip's speech will likely cover international trade, tax reform legislation, health care and the continuing education of the Rio Grande Valley workforce.
The RGV Partnership, an organization that tries to spur business growth in the Valley, is s



1.0
http://www.news1130.com/2018/01/03/possible-homicide-burnaby/
Top topics number are: 7 10
0: died gun student actor known photo friend fbi __float_2__ film
1: market credit security drug game sexual social mindfulness harassment bus
BURNABY (NEWS 1130) — Just three days into 2018, police are investigating the first homicide of the year in Metro Vancouver.
Around 9 p.m. on Tuesday, investigators responded to reports of a fight at a two-storey apartment building on Telford Avenue, behind Metrotown, in Burnaby.
RCMP say when officers arrived, a man was found "in medical distress."
"Despite all attempts to revive the individual, they succumbed to their injuries, " says Burnaby RCMP Corporal Daniela Panesar.
The Integrated Homicide Investigation Team has taken over the case.
One woman who lives next to the building, describes it to City TV as being "very loud, " with frequent parties, fire alarms going off, and police frequently being called to the area.
She also says on Tuesday night



0.13852961321749901
http://www.newsweek.com/nasa-hole-earths-ozone-layer-finally-closing-humans-did-something-771922
Top topics number are: 12 25
0: agency epa policy tax administration energy air power federal pruitt
1: china air ozone asia mr. united leader pollution deal bill
The study, published Thursday in Geophysical Research Letters, reveals that a decline in ozone-depleting chemicals has resulted in 20 percent less depletion since 2005. Specifically chlorine levels declined by 0.8 percent each year between 2005 and 2016.
"We see very clearly that chlorine from [chlorofluorocarbons] is going down in the ozone hole, and that less ozone depletion is occurring because of it, " Susan Strahan, lead author and atmospheric scientist from NASA's Goddard Space Flight Center, said in a statement.
The chlorofluorocarbons (CFCs) Strahan referred to are ozone-depleting chemicals that were once used in aerosol sprays, blowing agents for foams and packing materials, and refrigerants. Chlorin







--------------------


https://www.newsmax.com/newsfront/daca-dreamers-congressional-budget-office-economy/2017/12/15/id/832125/
Top topics number are: 12
0: agency epa policy tax administration energy air power federal pruitt
The Congressional Budget Office (CBO) announced Friday that legalizing 2 million young undocumented illegal immigrant "Dreamers" would cost the U.S. government $25.9 billion over a 10-year period, The Washington Times reported Friday .
Those costs would come from the tax, education and other benefits for which they would be eligible. In addition, the CBO said newly legalized Dreamers would be able to sponsor another 80,000 people from a process called chain migration. That allows non-nuclear family members to be eligible for immigration.
Immigration-rights activists have argued that legalizing Dreamers would be an overall benefit to the U.S. economy by the higher taxes they would pay because of their legal status. But, the CBO and the Joint Committee on Tax



MOST DISSIMILARS


1.0
https://buff.ly/2EmWoR4
Top topics number are: 5 27 20
0: player game north ms. football play bank league mr. health
1: furniture johnson employee feedback sofa ai canal price technology game
2: district mr. duke data daniel health student coal political water
Tournament Gives Elementary School Students Robotics Experience
(ABC 6 News) -- Dozens of Austin elementary school students took part in a robotics tournament at Neveln Elementary School Saturday, learning the ropes before a larger tournament takes place at Austin High School later this month.
Teams from multiple schools, including Neveln, Southgate, and IJ Holton, competed to see whose robot took home top honors in an event Neveln's robotics coach said was the first of its kind in the district for younger students.
"This is the only tournament we do with the elementaries, and so it's giving kids on the teams a chance to feel what the real tournament is like, " coach Clint Phillips said.
Students work in 



MOST SIMILARS


0.19455963530206233
https://ankitjoshi2409.wordpress.com/2017/12/31/sitecore-basics-introduction-to-personalization/
Top topics number are: 10
0: market credit security drug game sexual social mindfulness harassment bus
The scope of this blog post is to give an idea about personalization in Sitecore, why it's required and steps to create and configure personalization for the components, this is for beginners who are trying to understand the concept and how it can be implemented.
Personalization enables you to deliver targeted content to your visitors. For example, you can implement rules that show personalized content to visitors based on their browsing behavior and their accumulated profile values.
This is really important as you don't want to show something to users which really doesn't makes any sense to them.
As an example- For an e-Commerce application as an business you want to show relevant information to users to get most of the conversions, this depends on nu

ST. LOUIS COUNTY • An 8-year-old girl was shot in the leg early Wednesday morning while asleep in her bedroom in a North County home.
The girl was was shot about 3:45 a.m. in the 10400 block of Count Drive in Castle Point, police said.
She was inside along with several family members when multiple shots were fired into the home, police said. One bullet went through the walls and hit the child, according to the department.
No one else was injured.
Officers responded to a call about the shooting and an alert from ShotSpottter, which helps police detect gunshots. The 8-year-old was taken to a hospital for treatment of a non-life threatening injury, police said.
Police said the investigation to locate the shooter is ongoing.
In October, St. Louis County officials announced a $1 million grant for improvements and community policing in Castle Point, an unincorporated area with a population of about 3,000.
The community was identified by County Executive Steve Stenger at the time as one of th

1: agency epa policy tax administration energy air power federal pruitt
Republican leaders are considering skipping passage of a GOP budget this year — a blow to the party's weakened fiscal hawks that would squash all 2018 efforts to revamp entitlements or repeal Obamacare.
White House and Hill GOP leaders discussed the possibility of forgoing the painful budget process during last weekend's Camp David legislative summit, according to four sources familiar with the talks. Senate Majority Leader Mitch McConnell has argued that he cannot pass controversial deficit-reduction legislation using powerful budget procedures with his new 51-vote majority — and wasn't even sure he could find the votes for a fiscal blueprint in the first place.
Story Continued Below
Abandoning the budget, however, would be an embarrassment for Republicans, who for years railed against Democrats when they avoided one of the most basic responsibilities of Congress.
But more importantly, it would mean the GOP's 2018



1.0
http://www.kptv.com/story/37188160/caught-on-camera-driver-punched-in-road-rage-incident-in-se-portland#.Wk73eqIObgQ.twitter
Top topics number are: 10 5 16
0: market credit security drug game sexual social mindfulness harassment bus
1: player game north ms. football play bank league mr. health
2: death molly puerto jason rico hurricane digital mr. energy power
A car crash in southeast Portland turned into a case of road rage when one driver punched the other in the face before taking off, and it was all caught on camera.
The incident happened just after 6 p.m. Tuesday at Southeast Harney Drive and 52nd Avenue.
The driver who was punched caught the whole incident on his dashcam. He asked FOX 12 not to release his name for safety reasons.
Dashcam video shows that the driver in front was in the wrong lane and tried to turn around, but the second car also went to turn and t-boned the first car. He then followed the first car for a while.
When they finally pulled over, the two men got



0.2009094861300713
https://www.nashvillepost.com/politics/elections/article/20987172/randy-boyd-reports-42m-in-1516-earnings
Top topics number are: 1 3
0: tax bill republican senate cut disclosure individual campaign policy income
1: republican district court political vote party democrat conservative mr. democratic
Also: Mark Green touts new endorsements
Republican gubernatorial candidate Randy Boyd (pictured) has released documentation showing he and his wife, Jenny, earned more than $42 million in total income in 2015 and 2016, the two years he served as commissioner of the Tennessee Department of Economic and Community Development.
Boyd said he did not take a state salary and personally paid for travel and other expenses while working for the state, according to a release from his campaign. The East Tennessee businessman started Radio Systems Corporation in 1991; according to Boyd the company brings in annual revenues of more than $400 million and much of his investment income in







--------------------




# Ranking

In [34]:
if False and isNotebook:
    config['historyRef'] = 1.0

In [35]:
historyRef = config['historyRef']
assert (isinstance(historyRef, int) and historyRef >= 1) or (isinstance(historyRef, float) and historyRef > 0.0 and historyRef <= 1.0)

In [36]:
ranks = dict()
for userId in trainUsers.keys():
    if isinstance(historyRef, float):
        currentHistoryRef = int(historyRef * len(xurls))
    else:
        currentHistoryRef = historyRef
    xvectors = []
    xurls = []
    for url in trainUsers[userId]:
        xvectors.append(urlsVectors[url])
        xurls.append(url)
    xvectors = np.array(xvectors)
    ranks[userId] = []
    for currentCandidates in candidates[userId]:
        yvectors = []
        yurls = []
        for url in currentCandidates:
            yvectors.append(urlsVectors[url])
            yurls.append(url)
        yvectors = np.array(yvectors)
        distances = getDistances(xvectors, yvectors, metric=config['distance'], logger=logger)
        urlDistances = dict()
        for testIndex in range(len(yurls)):
            url = yurls[testIndex]
            currentDists = distances[:, testIndex]
            assert currentDists.shape[0] == len(xurls)
            currentDists = sorted(list(currentDists), reverse=False)
            currentDists = currentDists[:currentHistoryRef]
            currentDist = np.mean(currentDists)
            urlDistances[url] = currentDist
        rank = [e[0] for e in sortBy(urlDistances, index=1, desc=False)]
        ranks[userId].append(rank)

In [37]:
bp(ranks, logger, 4)

{
  '1198644368': 
  [
    [
      https://www.thewrap.com/heckler-of-donald-trump-animatronic-at-disney-world-responds-to-haters-check-your-privilege/,
      http://www.abajournal.com/news/article/chief_justice_roberts_says_judiciary_will_evaluate_its_handling_of_sexual_h/,
      https://www.theatlantic.com/politics/archive/2018/01/it-was-an-open-secret/549653/#%3Futm_source%3Dtwb,
      https://www.dailykos.com/story/2018/1/4/1729769/-Congressman-says-House-Intel-committee-has-already-heard-evidence-of-Trump-administration-crimes,
      https://www.americamagazine.org/politics-society/2018/01/12/catholic-leaders-condemn-trump-remarks-about-haiti-africa,
      http://www.tampabay.com/opinion/editorials/Editorial-Cracks-in-the-facade-of-normalcy-in-Tallahassee_164344048,
      http://goo.gl/oYC4yy,
      https://amp.thedailybeast.com/meet-the-porn-star-turned-academic-whos-revolutionizing-the-adult-industry?__twitter_impression=true,
      http://www.chicagotribune.com/suburbs/advertis

In [38]:
tt.tic("Ranks done")

--> tic: 4m 12.979s | message: Ranks done


252.98

# Adding ranks to the db

In [39]:
addRankings(modelName, ranks, config, logger=logger)

In [40]:
notif(modelName + " done")

In [41]:
tt.tic("Ranks stored")

--> tic: 26.62s | message: Ranks stored


26.62

In [42]:
tt.toc()

--> toc total duration: 8m 18.879s


498.88