# Commands

In [1]:
# cd ~/twinews-logs ; jupython -o nohup-bm25-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/bm25.ipynb

# Init

In [2]:
# Empty CUDA_VISIBLE_DEVICES before the other imports run.
# NOTE(review): presumably so that no CUDA-aware dependency picks up a GPU — confirm.
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# A missing `__file__` in the namespace is used as the signal that this code
# runs inside a notebook rather than as a script.
isNotebook = not ('__file__' in locals())

In [4]:
# Switch for the reduced "test" run (it selects the values used in `config`).
# It follows `isNotebook` by default; hard-code True or False to override.
TEST = isNotebook # isNotebook, True, False

In [5]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from twinews.utils import *
from twinews.models.ranking import *

In [6]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [24]:
from gensim.summarization import bm25

In [8]:
# Pick the log file depending on how the code is run:
#  - notebook: "bm25.log" under the directory returned by tmpDir('logs');
#  - script: "bm25-<hostname>.log", a relative path (the launch command in the
#    first cell changes to the logs directory beforehand).
if isNotebook:
    logger = Logger(tmpDir('logs') + "/bm25.log")
else:
    logger = Logger("bm25-" + getHostname() + ".log")
# Timer used to log the duration of each stage of the notebook:
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [22]:
# Experiment configuration. The whole dict identifies a run: it is passed to
# rankingExists / addRanking and hashed for the final notification.
# Entries written as `X if TEST else Y` allow a lighter setting for the test run.
config = dict(
    splitVersion=2,

    # Number of users to keep (sub-sampling); None keeps all of them:
    maxUsers=2 if TEST else None,
    # Document-frequency bounds given to filterCorpus; the filtering step is skipped
    # when both are None, which is currently the case in both modes:
    minDF=None if TEST else None,
    maxDF=None if TEST else None,

    # Optional token normalisation steps (both currently disabled in both modes):
    lowercase=False if TEST else False,
    doLemmatization=False if TEST else False,

    # BM25 parameters forwarded to gensim:
    k1=1.5,
    b=0.75,
    epsilon=0.25,
)

In [10]:
# Name under which the rankings of this model are looked up (rankingExists),
# stored (addRanking) and reported in the final notification.
modelName = 'bm25'

In [11]:
# When run as a script, abort early if rankings were already generated for this
# model with this exact config (the check is skipped in a notebook):
if not isNotebook and rankingExists(modelName, config, logger=logger):
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [12]:
# Load the evaluation data of the configured split (no extra news, optional user sub-sampling):
evalData = getEvalData(config['splitVersion'], maxExtraNews=0, maxUsers=config['maxUsers'], logger=logger)
# Expose the main entries of evalData as individual variables:
trainUsers, testUsers, trainNews, testNews, candidates, extraNews = (
    evalData[key]
    for key in ('trainUsers', 'testUsers', 'trainNews', 'testNews', 'candidates', 'extraNews')
)
# Log the available keys and the metadata of the split:
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 32.19s | message: Eval data loaded
--> toc total duration: 32.47s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [13]:
# The urls are converted to lists because a fixed order is needed: documents are
# later retrieved by index (docs[i] corresponds to newsList[i]).
# The lists are shuffled so that the urls of a same user are not grouped at the beginning,
# and the shuffle is seeded so that the order is always the same from one run to the next.
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)
# Train urls first, then test urls:
newsList = trainNewsList + testNewsList

In [14]:
# Log the number of urls in each list:
for listName, urlList in (
    ("trainNewsList", trainNewsList),
    ("testNewsList", testNewsList),
    ("newsList", newsList),
):
    log(str(len(urlList)) + " urls for " + listName, logger=logger)

28 urls for trainNewsList
1989 urls for testNewsList
2017 urls for newsList


In [15]:
# We get the sentences of every news of newsList, in the same order.
# As the printed output shows, each news is a list of sentences and each sentence a list of tokens:
sentences = getNewsSentences(newsList, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (42.734s left)
 19% [===                 ] (36.558s left)
 29% [=====               ] (31.914s left)
[
  [ [ The, Brazil, ..., more, . ], [ He, was, ..., play, . ], ..., [ That, football, . ], [ It, was, ..., am, . ] ],
  [ [ Proponents, of, ..., north, . ], [ As, part, ..., years, . ], ..., [ The, next, ..., November, __int_4__ ], [ ", With, ..., Governments, . ] ],
  ...,
  [ [ The, Mission, ..., community, . ], [ According, to, ..., to, . ], ..., [ ", A, ..., said, . ], [ Mission, :, ..., Mission, St. ] ],
  [ [ When, Lexington, ..., stomach, . ], [ ", It, ..., Richmond, . ], ..., [ ", Lexington, ..., said, . ], [ ", Lexington, ..., Cemetery, . ] ]
]


In [16]:
# Turn each news from a list of sentences into a single flat list of tokens.
# This is done in place: after this cell `sentences` and `docs` are the same list object.
# NOTE(review): re-running this cell applies flattenLists to already-flat documents — confirm it is harmless.
for newsIndex, newsSentences in enumerate(sentences):
    sentences[newsIndex] = flattenLists(newsSentences)
docs = sentences
bp(docs, logger)

[ [ The, Brazil, ..., am, . ], [ Proponents, of, ..., Governments, . ], ..., [ The, Mission, ..., Mission, St. ], [ When, Lexington, ..., Cemetery, . ] ]


In [17]:
# Optionally lower-case every token, rewriting the content of each document in place:
if config['lowercase']:
    for docIndex in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        tokens = docs[docIndex]
        tokens[:] = [token.lower() for token in tokens]
bp(docs, logger)

[ [ The, Brazil, ..., am, . ], [ Proponents, of, ..., Governments, . ], ..., [ The, Mission, ..., Mission, St. ], [ When, Lexington, ..., Cemetery, . ] ]


In [18]:
# Optionally replace every token by its WordNet lemma, rewriting each document in place:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        tokens[:] = [lemmatizer.lemmatize(token) for token in tokens]
        # One progress step per document:
        pbar.tic()
bp(docs, logger)

[ [ The, Brazil, ..., am, . ], [ Proponents, of, ..., Governments, . ], ..., [ The, Mission, ..., Mission, St. ], [ When, Lexington, ..., Cemetery, . ] ]


In [19]:
# Filter the vocabulary by document frequency, only when at least one bound is configured:
if not (config['minDF'] is None and config['maxDF'] is None):
    docs = filterCorpus(docs, minDF=config['minDF'], maxDF=config['maxDF'],
                        removeEmptyDocs=False, allowEmptyDocs=False, logger=logger)
    # Documents are retrieved by index afterwards, so none of them may end up empty:
    assert all(len(doc) > 0 for doc in docs)
    bp(docs, logger)

In [20]:
# Log the time spent since the previous tic (data loading and preprocessing):
tt.tic("Data preprocessed")

--> tic: 1m 18.45s | message: Data preprocessed


78.45

# Ranking

In [30]:
# Map each url to its tokenized document (docs[i] is the document of newsList[i]):
urlDocs = {url: docs[position] for position, url in enumerate(newsList)}

In [35]:
# Build, for every user, one BM25 ranking per candidate set.
# Result: rankings[userId] is a list with one entry per candidate set of the user; each
# entry is the list of candidate urls ordered by sortBy on the BM25 score (desc=True).
rankings = dict()
pbar = ProgressBar(len(evalData['candidates']), logger=logger, message="Generating rankings", printRatio=0.01)
for userId in evalData['candidates']:
    # The query is the concatenation of the tokens of every news the user has in the train split:
    request = []
    for url in evalData['trainUsers'][userId]:
        request += urlDocs[url]
    currentRankings = []
    # The loop variable is not named `candidates`, so that the `candidates` variable
    # unpacked from evalData is not overwritten as a side effect of running this cell:
    for candidateUrls in evalData['candidates'][userId]:
        candidateUrls = list(candidateUrls)
        # Going through a dict drops duplicated urls; the (url, tokens) pairs give a fixed
        # order so that scores[i] can be mapped back to its url:
        urlCorpus = dict()
        for candidateUrl in candidateUrls:
            urlCorpus[candidateUrl] = urlDocs[candidateUrl]
        urlCorpus = list(urlCorpus.items())
        corpus = [tokens for (_, tokens) in urlCorpus]
        try:
            # This is the new version not yet pushed on PyPI:
            model = bm25.BM25(corpus, k1=config['k1'], b=config['b'], epsilon=config['epsilon'])
        except TypeError:
            # Only the "unexpected keyword argument" error of the older API triggers the
            # fallback; any other error (or a keyboard interrupt) is no longer swallowed.
            # This is the actual version with parameters as global variables:
            bm25.PARAM_K1 = config['k1']
            bm25.PARAM_B = config['b']
            bm25.EPSILON = config['epsilon']
            model = bm25.BM25(corpus)
        # One score per document of the corpus, for the user's query:
        scores = model.get_scores(request)
        scoresWithUrl = [(urlCorpus[i][0], scores[i]) for i in range(len(scores))]
        ranking = sortBy(scoresWithUrl, index=1, desc=True)
        ranking = [scoredUrl[0] for scoredUrl in ranking]
        currentRankings.append(ranking)
    rankings[userId] = currentRankings
    pbar.tic(str(userId))



In [36]:
# Print (and log) a truncated view of the generated rankings for a visual check:
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      https://trib.al/PiFii15,
      http://bit.ly/2AkNlxU,
      http://www.stltoday.com/sports/college/mizzou/drew-lock-puts-nfl-on-hold-will-return-to-mizzou/article_cb5dd7fa-783d-5458-9bef-e22ab0743a22.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share,
      https://mobile.nytimes.com/2018/01/06/world/asia/north-korea-nuclear-missile-intelligence.html,
      http://www.daily-chronicle.com/lists/2017/12/31/e8ec758e9f5a4d54b391fa43e2595e22/index.xml?page=1,
      https://www.theguardian.com/football/2018/jan/07/video-assistant-referees-errors-mike-riley,
      http://www.nzherald.co.nz/sport/news/article.cfm?c_id=4&objectid=11962735,
      http://nymag.com/daily/intelligencer/2017/08/felix-sater-donald-trump-russia-investigation.html,
      http://www.argusleader.com/story/sports/2018/01/12/heart-attack-referee/1027871001/,
      http://ow.ly/34Lp30hBCDI,
      ...,
      http://boston.cbslocal.com/2017/12/24/patriots-clinch-first-round-by

In [37]:
# Log the time spent generating the rankings:
tt.tic("Rankings done")

--> tic: 24m 56.019s | message: Rankings done


1496.02

# Adding rankings to the db

In [38]:
# Read the doc!
addRanking(modelName, rankings, config, logger=logger)

In [39]:
# Log the time spent storing the rankings:
tt.tic("Rankings stored")

--> tic: 7.91s | message: Rankings stored


7.91

In [40]:
# Stop the timer and keep the total duration for the final notification:
totalDuration = tt.toc()

--> toc total duration: 26m 22.819s


In [41]:
# Send a notification with the model name, the first 5 characters of the config hash,
# the total duration and the host name:
notif(" ".join([
    modelName + '-' + objectToHash(config)[:5],
    "done in",
    secondsToHumanReadableDuration(totalDuration),
    "on",
    getHostname(),
]))