# Commands

In [1]:
# cd twinews-logs ; jupython -o nohup-tfidf-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/tfidf.ipynb

# Init

In [3]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [4]:
isNotebook = '__file__' not in locals()

In [5]:
TEST = isNotebook

In [6]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from twinews.utils import *
from twinews.models.ranking import *

In [7]:
from nlptools.topicmodeling import *
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
logger = Logger(tmpDir('logs') + "/tfidf.log") if isNotebook else Logger("tfidf-" + getHostname() + ".log")
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 2 if TEST else None, # Sub-sampling
    'minDF': 1 / 500 if TEST else 1 / 2000, # Remove words that have a document frequency ratio lower than 1 / 500
    'maxDF': 300, # Remove top 300 voc elements
    
    'lowercase': False if TEST else True,
    'doLemmatization': False if TEST else False,
    'useTFIDF': False,
    
    'distance': 'cosine', # 'cosine', 'euclidean', 'kl', 'js'
    'historyRef': 30, # 1, 1.0, 0.6, 0.3, 3, 10, 30
}

In [11]:
# Here we check if we already generated ranking for this model with this specific config:
if not isNotebook:
    if rankingExists(modelName, config, logger=logger):
        raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [12]:
# Getting users and news
evalData = getEvalData(config['splitVersion'], maxExtraNews=config['maxDocuments'],
                       maxUsers=config['maxUsers'], logger=logger)
(trainUsers, testUsers, trainNews, testNews, candidates, extraNews) = \
(evalData['trainUsers'], evalData['testUsers'], evalData['trainNews'],
 evalData['testNews'], evalData['candidates'], evalData['extraNews'])
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 31.89s | message: Eval data loaded
twinews news (version 1.0) initialised.
--> tic: 7.11s | message: Extra news downloaded
--> toc total duration: 39.01s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [13]:
# Here it is important to convert urls to lists because we want the same order to retrieve vectors by index...
# And we shuffle it so we do not stick urls a a user at the begin...
# But we seed the random to always have same order...
extraNewsList = shuffle(list(extraNews), seed=0)
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)

In [14]:
# We get urls for the LDA model:
if config['useExtraNews']:
    urlsForModel = extraNewsList + trainNewsList + testNewsList
else:
    urlsForModel = trainNewsList + testNewsList + extraNewsList
urlsForModel = urlsForModel[:config['maxDocuments']]
# We get urls to vectorize for the training and the inference:
urlsForModelSet = set(urlsForModel)
urlsToVectorize = copy.deepcopy(urlsForModel)
for url in trainNewsList + testNewsList:
    if url not in urlsForModelSet:
        urlsToVectorize.append(url)
# We get urls to infere for the scoring:
urlsToInfere = trainNewsList + testNewsList
# Print all:
log(str(len(urlsForModel)) + " urls for urlsForModel:\n" + b(urlsForModel), logger=logger)
log(str(len(urlsToVectorize)) + " urls for urlsToVectorize:\n" + b(urlsToVectorize), logger=logger)
log(str(len(urlsToInfere)) + " urls for urlsToInfere:\n" + b(urlsToInfere), logger=logger)

30 urls for urlsForModel:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://thehill.com/media/368673-fox-friends-host-criticized-trump-shithole-comment-moments-before-he,
  https://boston.curbed.com/boston-development/2018/1/10/16870266/allston-project-would-add-74-apartme
]
2017 urls for urlsToVectorize:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://www.sfchronicle.com/entertainment/article/The-street-vendor-who-makes-peace-his-profession-12,
  http://www.foxnews.com/politics/2018/01/04/house-russia-probe-ending-as-it-began-mess.amp.html?__twi
]
2017 urls for urlsToInfere:
[
  http://on.si.com/2yDAEx9,
  http://www.forbes.com/sites/bobbymcmahon/2017/11/25/messi-and-barcelona-sign-a-new-contract-to-2021-,
  ...,
  http://www.sfchronicle.com/entertainment/article/The-street-vendor-who-m

In [15]:
# We get sentences:
sentences = getNewsSentences(urlsToVectorize, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (37.223s left)
 19% [===                 ] (31.335s left)
 29% [=====               ] (27.459s left)
[
  [ [ Juventus, midfielder, ..., media, . ], [ The, -, ..., page, . ], ..., [ Sturaro, has, ..., league, . ], [ Juventus, do, ..., Sunday, . ] ],
  [ [ Barcelona, has, ..., until, __int_4__ ], [ The, previous, ..., June, __int_4__ ], ..., [ Messi, has, ..., marksman, . ], [ Collective, honors, ..., Cups, . ] ],
  ...,
  [ [ Kate, Winslet, ..., vendors, . ], [ On, April, ..., him, . ], ..., [ Murphy, beamed, ..., to, . ], [ Beth, Spotswood, ..., Datebook, . ] ],
  [ [ Want, FOX, ..., day, ? ], [ Sign, up, here, . ], ..., [ Brianna, McClelland, ..., report, . ], [ Want, FOX, ..., day, ? ] ]
]


In [16]:
# We flatten sentences:
for i in range(len(sentences)):
    sentences[i] = flattenLists(sentences[i])
docs = sentences
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [17]:
# Lower case:
if config['lowercase']:
    for i in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        for u in range(len(docs[i])):
            docs[i][u] = docs[i][u].lower()
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [18]:
# Lemmatization:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for i in range(len(docs)):
        for u in range(len(docs[i])):
            docs[i][u] = lemmatizer.lemmatize(docs[i][u])
        pbar.tic()
bp(docs, logger)

[ [ Juventus, midfielder, ..., Sunday, . ], [ Barcelona, has, ..., Cups, . ], ..., [ Kate, Winslet, ..., Datebook, . ], [ Want, FOX, ..., day, ? ] ]


In [19]:
# Filtering the corpus:
docs = filterCorpus(docs, minDF=config['minDF'], maxDF=config['maxDF'],
                    removeEmptyDocs=False, allowEmptyDocs=False, logger=logger)
for doc in docs: assert len(doc) > 0
bp(docs, logger)

Voc removed because of minDF (44267 elements):
{ 1000th, 100s, 100th, 100x, 103rd, 104th, 108th, 10Weather, 10X, 10x, ..., zooms, zoonotic, zoos, 💗, 😁, 😄, 😇, 😒, 😢, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., with, without, work, working, world, would, year, years, you, your }
73.9% of voc will be removed.
[ [ midfielder, issued, ..., Stadium, Sunday ], [ Barcelona, confirmed, ..., Club, World ], ..., [ Kate, gets, ..., column, appears ], [ Want, News, ..., Report, inbox ] ]


In [20]:
tt.tic("Data preprocessed")

--> tic: 1m 19.37s | message: Data preprocessed


79.37

# Infering topic vectors

At this step we generate `inferedVectors` and `topics`

In [21]:
if config['implementation'] == 'gensim-lda':
    dictionary = gensim.corpora.Dictionary(docs)
    # dictionary.filter_extremes(no_below=config['min_df'])
    bow = [dictionary.doc2bow(doc) for doc in docs]
    if config['useTFIDF']:
        tfidf = gensim.models.TfidfModel(bow)
        bow = tfidf[bow]
    assert len(bow) == len(urlsToVectorize)
    bowForModel = bow[:len(urlsForModel)]
    assert len(bowForModel) == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    bowForInference = bow[i:i + len(trainNews) + len(testNews)]
    assert len(bowForInference) == len(trainNews) + len(testNews)
    lda_model = gensim.models.LdaMulticore\
    (
        bowForModel,
        num_topics=config['nbTopics'],
        id2word=dictionary,
        iterations=config['maxIter'],
        decay=config['ldaLearningDecay'],
        offset=config['ldaLearningOffset'],
        workers=cpuCount(),
        passes=3,
    )
    inferedVectors = []
    for current in bowForInference:
        topicProbDistrib = lda_model[current]
        currentVector = [0.0] * config['nbTopics']
        for t, v in topicProbDistrib:
            currentVector[t] = v
        inferedVectors.append(np.array(currentVector))
    assert len(inferedVectors) == len(trainNews) + len(testNews)
    assert len(inferedVectors[0]) == config['nbTopics']
    topics = []
    for i in range(lda_model.num_topics):
        current = dict()
        for x in lda_model.get_topic_terms(i, topn=100):
            current[dictionary[x[0]]] = x[1]
        topics.append(current)

In [22]:
if config['implementation'] == 'sklearn-lda' or config['implementation'] == 'sklearn-nmf':
    if config['implementation'] == 'sklearn-nmf' or config['useTFIDF']:
        vectorizer = TfidfVectorizer\
        (
            sublinear_tf=True,
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    else:
        vectorizer = CountVectorizer\
        (
            tokenizer=lambda x: x,
            preprocessor=lambda x: x,
            # lowercase=True, # Doesn't work because we erased preprocessor
        )
    vectors = vectorizer.fit_transform(docs)
    assert vectors.shape[0] == len(urlsToVectorize)
    vectorsForModel = vectors[:len(urlsForModel)]
    assert vectorsForModel.shape[0] == config['maxDocuments']
    i = 0
    for url in urlsToVectorize:
        if url == trainNewsList[0]:
            break
        i += 1
    assert i == len(extraNews) or i == 0
    vectorsForInference = vectors[i:i + len(trainNews) + len(testNews)]
    assert vectorsForInference.shape[0] == len(trainNews) + len(testNews)
    if config['implementation'] == 'sklearn-lda':
        model = LatentDirichletAllocation\
        (
            n_components=config['nbTopics'],
            learning_method=config['ldaLearningMethod'],
            learning_offset=config['ldaLearningOffset'],
            learning_decay=config['ldaLearningDecay'],
            random_state=0,
            n_jobs=cpuCount(),
            max_iter=config['maxIter'],
        )
    else:
        model = NMF\
        (
            n_components=config['nbTopics'],
            random_state=0,
            alpha=config['nmfAlpha'],
            l1_ratio=config['nmfL1Ratio'],
            init=config['nmfInit'],
            max_iter=config['maxIter'],
        )
    model.fit(vectorsForModel)
    inferedVectors = model.transform(vectorsForInference)
    assert inferedVectors.shape[0] == len(trainNews) + len(testNews)
    assert inferedVectors[0].shape[0] == config['nbTopics']
    topics = []
    feature_names = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        wordProb = []
        for i in range(len(topic)):
            prob = topic[i]
            word = feature_names[i]
            wordProb.append((word, prob))
        wordProb = sortBy(wordProb, desc=True, index=1)[:100]
        current = dict()
        for word, prob in wordProb:
            current[word] = prob
        topics.append(current)

In [23]:
tt.tic("Model fitted and topic vectors infered")

--> tic: 11.54s | message: Model fitted and topic vectors infered


11.54

# Making a dict url --> topic vector

In [27]:
assert len(urlsToInfere) == len(inferedVectors)
urlsVectors = dict()
for i in range(len(urlsToInfere)):
    urlsVectors[urlsToInfere[i]] = inferedVectors[i]

# Ranking

In [28]:
if False and isNotebook:
    config['historyRef'] = 1.0

In [29]:
historyRef = config['historyRef']
assert (isinstance(historyRef, int) and historyRef >= 1) or (isinstance(historyRef, float) and historyRef > 0.0 and historyRef <= 1.0)

In [30]:
# Read the doc!
rankings = usersRankingsByHistoryDistance\
(
    trainUsers,
    candidates,
    historyRef,
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)



In [31]:
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      http://ontario.ca/b3v8,
      http://nyti.ms/2zaO6c6,
      http://www.courant.com/opinion/editorials/hc-ed-ct-needs-left-lane-law-20180101-story.html,
      http://bit.ly/2qPkanf,
      http://wsvn.com/news/local/burglar-returns-bike-he-stole-outside-of-ne-miami-dade-home/,
      https://medium.com/reaching-out/when-the-church-kicked-me-out-this-is-where-i-went-2e214adedf92?source=linkShare-35084a546d22-1515599098,
      https://fb.me/8AgCIOtvd,
      https://trib.al/HSlqzZ8,
      http://bit.ly/2CUajSH,
      https://uofacesmg.wordpress.com/2018/01/07/wonder-and-kindness/,
      ...,
      https://medium.com/shanghaiist/chinese-shoe-company-tricks-people-into-swiping-instagram-ad-with-fake-strand-of-hair-54d8a2d8ec1d,
      https://buff.ly/2DfcIEr,
      http://www.stltoday.com/news/local/govt-and-politics/protesters-volunteers-urge-city-to-do-more-to-help-homeless/article_4c7990fc-6d8d-579b-bb7a-649fbe225926.html?utm_medium=social&utm_source=facebo

In [32]:
tt.tic("Rankings done")

--> tic: 0.29s | message: Rankings done


0.29

# Adding rankings to the db

In [33]:
# Read the doc!
addRanking(modelName, rankings, config, logger=logger)

Exception type: <class 'gridfs.errors.FileExists'>
Exception: file with _id ObjectId('5e7d19898175116238a94a07') already exists
Traceback (most recent call last):
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/gridfs/grid_file.py", line 296, in __flush
    self._file, session=self._session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 693, in insert_one
    session=session),
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 607, in _insert
    bypass_doc_val, session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/collection.py", line 595, in _insert_one
    acknowledged, _insert_command, session)
  File "/home/hayj/.local/share/virtualenvs/st-venv/lib/python3.6/site-packages/pymongo/mongo_client.py", line 1243, in _retryable_write
    return self._retry_with_session(retryable, func, s, None)

In [34]:
tt.tic("Rankings stored")

--> tic: 0.25s | message: Rankings stored


0.25

In [35]:
totalDuration = tt.toc()

--> toc total duration: 1m 31.489s


In [36]:
notif(modelName + '-' + objectToHash(config)[:5] + " done in " + secondsToHumanReadableDuration(totalDuration) + " on " + getHostname())