# Commands

In [1]:
# oomstopper --no-tail bm25 ; killbill bm25 ; cd ~/twinews-logs ; jupython -o nohup-bm25-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/bm25.ipynb

# Init

In [2]:
# Blank out CUDA_VISIBLE_DEVICES before the other imports (presumably to keep this run off the GPUs).
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# The absence of '__file__' in the namespace is taken as the sign that this code runs
# inside a notebook kernel rather than as a script.
isNotebook = '__file__' not in locals()

In [4]:
# TEST selects the reduced values of the config dict (e.g. 'maxUsers'); defaults to test mode in a notebook.
TEST = isNotebook # Allowed values: isNotebook, True, False

In [5]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [6]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [7]:
from gensim.summarization import bm25

In [8]:
# Interactive runs log under the temp directory, scripted runs log to a host-specific relative path.
if isNotebook:
    logPath = tmpDir('logs') + "/bm25.log"
else:
    logPath = "bm25-" + getHostname() + ".log"
logger = Logger(logPath)
# Start the timer whose tic/toc messages mark the end of each later stage.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
# Experiment configuration: it is logged, checked against already stored rankings,
# stored with the rankings and hashed for the final notification further down.
config = \
{
    'splitVersion': 2,
    
    'maxUsers': 10 if TEST else None, # Sub-sampling
    'minDF': None if TEST else 1 / 2000, # 1 / 2000 or None
    'maxDF': None if TEST else 300, # 300 or None
    
    'lowercase': False if TEST else False, # True or False
    'doLemmatization': False if TEST else False, # True or False
    
    'k1': 2.4, # 1.5 (default), (from Yuting: 1.2 to 2.0 with step 0.1)
    # NOTE(review): 2.0 is outside the ranges quoted on the 'b' line below, and the saved output
    # of the cell that prints this config shows 'b': 1.5 — confirm which value is intended.
    'b': 2.0, # 0.75 (default), (from Yuting: 0.5 to 0.8 with step 0.05)
    'epsilon': 0.25, # 0.25 (default).
}

In [10]:
# Name under which the rankings of this model are looked up and stored.
modelName = "bm25"

In [11]:
# Print the active configuration through the logger.
bp(config, logger, 5)

{ 'b': 1.5, 'doLemmatization': False, 'epsilon': 0.25, 'k1': 2.4, 'lowercase': False, 'maxDF': None, 'maxUsers': 10, 'minDF': None, 'splitVersion': 2 }


In [12]:
# Scripted runs must not recompute a ranking already generated by this model with this exact config:
alreadyRanked = (not isNotebook) and rankingExists(modelName, config, logger=logger)
if alreadyRanked:
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

In [13]:
# Report the remaining free RAM before loading the data.
warnFreeRAM(logger=logger)

12.68g of RAM remaining.


# Getting data

In [14]:
# Load the evaluation split (users and news), then expose each part under its own name:
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=0,
    maxUsers=config['maxUsers'],
    logger=logger,
)
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
# Show which parts were loaded, then the metadata of the split:
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 32.11s | message: Eval data loaded
--> toc total duration: 32.41s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [15]:
# The urls are converted to lists because a stable order is needed to retrieve vectors by index.
# They are shuffled so that the urls of a same user do not stick together at the beginning...
# ...and the shuffle is seeded so that the order is always the same.
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)
# Train urls come first, test urls second; documents are later matched to urls by position in this list.
newsList = trainNewsList + testNewsList

In [16]:
# Report the size of each url list:
for urlList, suffix in (
    (trainNewsList, " urls for trainNewsList"),
    (testNewsList, " urls for testNewsList"),
    (newsList, " urls for newsList"),
):
    log(str(len(urlList)) + suffix, logger=logger)

299 urls for trainNewsList
9391 urls for testNewsList
9690 urls for newsList


In [17]:
# We get the sentences of every url of newsList, in the same order
# (the printed output shows one list of tokenized sentences per news):
sentences = getNewsSentences(newsList, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
 10% [==                  ] (3m 20.52s left)
 20% [====                ] (2m 54.719s left)
[
  [ [ Sen., Ted, ..., misconduct, . ], [ ", Roy, ..., interview, . ], ..., [ ", And, ..., groping, . ], [ That, a, ..., more, . ] ],
  [ [ Order, Reprint, ..., ago, . ], [ The, tweet, ..., morning, . ], ..., [ Frank, Luntz, ..., California, . ], [ Can, you, ..., please, ? ] ],
  ...,
  [ [ How, you, ..., round, . ], [ According, to, ..., camps, . ], ..., [ If, you, ..., here, . ], [ For, a, ..., here, . ] ],
  [ [ Lori, Clemons, ..., Commission, . ], [ Clemons, ,, ..., seats, . ], ..., [ Reach, Emily, ..., at, @emwest22 ], [ To, attend, ..., dates, . ] ]
]


In [18]:
# Collapse each news from a list of sentences into one flat list of tokens (done in place):
for i, newsSentences in enumerate(sentences):
    sentences[i] = flattenLists(newsSentences)
# `docs` is the very same list object as `sentences`, now holding one token list per news.
docs = sentences
bp(docs, logger)

[ [ Sen., Ted, ..., more, . ], [ Order, Reprint, ..., please, ? ], ..., [ How, you, ..., here, . ], [ Lori, Clemons, ..., dates, . ] ]


In [19]:
# Optional lower casing of every token; each document is rewritten in place:
if config['lowercase']:
    for i in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        docs[i][:] = [token.lower() for token in docs[i]]
bp(docs, logger)

[ [ Sen., Ted, ..., more, . ], [ Order, Reprint, ..., please, ? ], ..., [ How, you, ..., here, . ], [ Lori, Clemons, ..., dates, . ] ]


In [20]:
# Optional WordNet lemmatization of every token; each document is rewritten in place:
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for doc in docs:
        doc[:] = [lemmatizer.lemmatize(token) for token in doc]
        # One progress-bar step per document.
        pbar.tic()
bp(docs, logger)

[ [ Sen., Ted, ..., more, . ], [ Order, Reprint, ..., please, ? ], ..., [ How, you, ..., here, . ], [ Lori, Clemons, ..., dates, . ] ]


In [21]:
# Document-frequency filtering of the corpus, skipped when neither bound is configured:
if not (config['minDF'] is None and config['maxDF'] is None):
    docs = filterCorpus(
        docs,
        minDF=config['minDF'],
        maxDF=config['maxDF'],
        removeEmptyDocs=False,
        allowEmptyDocs=False,
        logger=logger,
    )
    # Every document must still contain at least one token after filtering.
    assert all(len(doc) > 0 for doc in docs)
    bp(docs, logger)

In [22]:
# Timer checkpoint: logs the time spent since the previous tic with this message.
tt.tic("Data preprocessed")

--> tic: 4m 9.56s | message: Data preprocessed


249.56

# Ranking

In [23]:
# The users to rank for are the keys of the candidates mapping.
userIds = list(evalData['candidates'].keys())

In [24]:
# Chunk size giving roughly 500 chunks of users, never smaller than one user per chunk.
chunksSize = max(1, len(userIds) // 500)

In [25]:
# Rebinds userIds from a flat list of ids to a list of chunks of ids (see the printed output).
# NOTE(review): presumably re-running this cell alone would chunk the chunks again — re-run from the cell defining userIds.
userIds = chunks(userIds, chunksSize)

In [26]:
# Show the chunks of user ids that will be handed to the iterator.
bp(userIds, 3, logger)

[ [ 790664132 ], [ 2617447752 ], [ 62594694 ], [ 839264223741956096 ], [ 2786961721 ], [ 1198644368 ], [ 21379050 ], [ 97081361 ], [ 3223646504 ], [ 3075850549 ] ]


In [27]:
# Map every url to its token list; docs and newsList are aligned by position.
urlDocs = {url: docs[position] for position, url in enumerate(newsList)}

In [28]:
def genFunct(containers, *args, **kwargs):
    """
    Generator yielding `(userId, currentRankings)` for every user id found in `containers`.

    `containers` is either a single container (a sequence of user ids) or a list of
    such containers. An empty input yields nothing.

    For each user, the token lists of all the user's train urls are concatenated into
    one BM25 query. Each candidate set of the user is then indexed by its own BM25
    model (parameters taken from `config`) and scored against that query.

    `currentRankings` holds, for each candidate set of the user, a list of
    `(url, score)` pairs sorted by decreasing score.

    Relies on the module-level `evalData`, `urlDocs` and `config`.
    Extra positional and keyword arguments are accepted and ignored.
    """
    # Nothing to iterate over (and `containers[0]` below would raise on an empty input):
    if len(containers) == 0:
        return
    if not isinstance(containers[0], list):
        containers = [containers]
    for container in containers:
        for userId in container:
            # The query is the concatenation of all train documents of the user:
            request = []
            for url in evalData['trainUsers'][userId]:
                request += urlDocs[url]
            currentRankings = []
            for candidates in evalData['candidates'][userId]:
                # Going through a dict drops duplicated urls while keeping the first-seen order:
                urlCorpus = {url: urlDocs[url] for url in candidates}
                urls = list(urlCorpus.keys())
                corpus = list(urlCorpus.values())
                try:
                    # This is the new version not yet pushed on pipy:
                    model = bm25.BM25(corpus, k1=config['k1'], b=config['b'], epsilon=config['epsilon'])
                except TypeError:
                    # Only the "unexpected keyword argument" case must reach this fallback: any other
                    # failure should surface instead of silently switching to another code path.
                    # This is the actual version with parameters as global variables:
                    bm25.PARAM_K1 = config['k1']
                    bm25.PARAM_B = config['b']
                    bm25.EPSILON = config['epsilon']
                    model = bm25.BM25(corpus)
                scores = model.get_scores(request)
                # One score per document of the corpus, in the same order as `urls`:
                scoresWithUrl = list(zip(urls, scores))
                ranking = sortBy(scoresWithUrl, index=1, desc=True)
                currentRankings.append(ranking)
            yield (userId, currentRankings)

In [29]:
# We use MLIterator for multi-processing (33 hours of processing if not...).
# genFunct is applied to the chunks of user ids, with parallelProcesses set to the CPU count;
# as the printed warning says, the items do not come back in a deterministic order.
mli = MLIterator(userIds, genFunct, logger=logger, printRatio=0.01, parallelProcesses=cpuCount())

With parallelProcesses > 1, this iterator is not consistent, meaning 2 iterations over same containers will not give items in the same order
10 containers to process.


In [30]:
# Report the remaining free RAM right before the ranking loop starts.
warnFreeRAM(logger=logger)

11.37g of RAM remaining.


In [31]:
# Collect the rankings of every user, checking the free RAM about once every 1% of the users.
rankings = dict()
modulo = len(evalData['candidates']) // 100 + 1
for i, (userId, currentRankings) in enumerate(mli):
    rankings[userId] = currentRankings
    if i % modulo != 0:
        continue
    warnFreeRAM(logger=logger)
    # Stop the whole run when free RAM drops below 2 (presumably gigabytes, cf. the "g" in the log lines).
    if freeRAM() < 2:
        exit()

10.48g of RAM remaining.
 10% [==                  ] (1m 2.729s left)
10.52g of RAM remaining.
 20% [====                ] (33.64s left)
10.44g of RAM remaining.
10.55g of RAM remaining.
10.67g of RAM remaining.
10.78g of RAM remaining.
10.9g of RAM remaining.
11.02g of RAM remaining.
11.12g of RAM remaining.
11.08g of RAM remaining.


In [32]:
# Show a sample of the computed rankings: per user, lists of (url, score) pairs.
bp(rankings, logger, 4)

{
  '1198644368': 
  [
    [
      (
        https://blackchefswhitehouse.wordpress.com/2017/12/25/merry-christmas-with-a-1889-white-house-menu-from-president-benjamin-harrison/,
        608438.0661806386
      ),
      (
        http://billingsgazette.com/news/state-and-regional/florida-man-dies-while-snowshoeing-in-glacier-national-park/article_49de6e4e-ed95-582e-bb99-a571bb251aec.html?utm_medium=social&utm_source=twitter&utm_campaign=user-share,
        540732.8914211949
      ),
      ( http://on.wsj.com/2AvHNAL, 500960.4962487661 ),
      (
        https://www.seattletimes.com/nation-world/sununu-to-endorse-lead-poisoning-prevention-bill/?utm_source=twitter&utm_medium=social&utm_campaign=article_left_1.1,
        394328.9451831466
      ),
      (
        http://zli.umich.edu/event/zli-startup-workshop-how-succeed-business-plan-competitions-0,
        378467.1065745893
      ),
      (
        http://www.business-standard.com/article/international/north-korea-the-costs-of-war-calc

In [33]:
# Timer checkpoint for the ranking stage.
tt.tic("Rankings done")

--> tic: 44.91s | message: Rankings done


44.91

# Adding rankings to the db

In [34]:
# Read the doc!
# Stores the rankings of this model together with the config that produced them.
addRanking(modelName, rankings, config, logger=logger)

In [35]:
# Timer checkpoint for the storage stage.
tt.tic("Rankings stored")

--> tic: 0.2s | message: Rankings stored


0.2

In [36]:
# Stop the timer and keep the total duration for the final notification.
totalDuration = tt.toc()

--> toc total duration: 4m 54.699s


In [37]:
# Completion notification: model name, first 5 characters of the config hash, total duration and host.
runId = modelName + '-' + objectToHash(config)[:5]
humanDuration = secondsToHumanReadableDuration(totalDuration)
notif(runId + " done in " + humanDuration + " on " + getHostname())