In [6]:
# CUDA_VISIBLE_DEVICES is emptied before any other import, presumably to keep this job off the GPUs (confirm).
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [7]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from twinews.evaluation.utils import *
from machinelearning.iterator import *

In [8]:
import re
import codecs
import math
from gensim import corpora
from gensim.summarization import bm25
from nltk.stem import WordNetLemmatizer

In [9]:
# Identifier passed to addRanking() together with the rankings and the config.
modelName = 'BM25_yf'

In [10]:
# True when no `__file__` name exists at top level, which is taken here as "running inside a notebook".
isNotebook = '__file__' not in locals()

In [11]:
# Test mode follows the notebook flag; it selects the k1 and b values in the config.
TEST = isNotebook

In [12]:
# In a notebook the log goes under tmpDir('logs'), otherwise to a file named after the host.
logger = Logger(tmpDir('logs') + "/bm25.log") if isNotebook else Logger("bm25-" + getHostname() + ".log")
# Timer bound to the logger, started right away.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# BM25 Config

In [13]:
# Run configuration; the whole dict is handed to addRanking() together with the rankings.
config = \
{
    'splitVersion': 2,
    
    # BM25 free smoothing parameters
    # NOTE(review): outside TEST mode both k1 and b are 0, presumably placeholders to be overridden (e.g. by a grid search) - confirm.
    'k1': 1.8 if TEST else 0,
    'b': 0.75 if TEST else 0,
    'epsilon': 0.25, # Used by gensim's BM25 for terms whose idf is negative (see the gensim documentation)
    
    'maxUsers': None, # Sub-sampling
    'maxDocuments': 10000, # Passed to getEvalData as maxExtraNews
    'useExtraNews': False, # NOTE(review): the former comment here ("None = unlimited, 0 = no extra news") does not fit a boolean and seems to describe 'maxDocuments' - confirm
    'minDF': 1 / 2000, # Remove words that have a document frequency ratio lower than 1 / 2000
    'maxDF': 300, # Remove top 300 voc elements
    
    'lowercase': False,
    'doLemmatization': True,
    
    'implementation': 'BM25', # Other values mentioned: bm25f, topic+bm25

}

# Data

In [14]:
# Load the evaluation split selected in the config.
# NOTE(review): maxExtraNews is fed from config['maxDocuments'] while config['useExtraNews'] is not read here - confirm this is intended.
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=config['maxDocuments'],
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Give every part of the split its own module-level name.
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
# Overview of the available keys and of the split meta data.
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

KeyboardInterrupt: 

In [12]:
# Preview of the train users: user id -> {news url: integer, apparently a Unix timestamp}.
bp(trainUsers,2)

{
  100022528: 
  {
    http://newsok.com/article/5566752?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507218223,
    http://newsok.com/article/5566980?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556930,
    http://newsok.com/article/5567097?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556848,
    http://newsok.com/article/5567140?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556832,
    http://newsok.com/article/5567338?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507642670,
    ...,
    http://newsok.com/article/5576420?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1513782942,
    http://newsok.com/article/5576758?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1513882745,
    http://newsok.com/article/5576875?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1513976221,
   

In [13]:
# Turn the train / test news into lists shuffled with a fixed seed (extra news are currently left out).
# extraNewsList = shuffle(list(extraNews), seed=0)
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)
print("nb of trainNews : %d" % len(trainNewsList))
print("nb of testNews : %d" % len(testNewsList))
#print("nb of extraNews : %d" % len(extraNewsList))

nb of trainNews : 237150
nb of testNews : 71781


In [14]:
# The users to rank for are the keys of the candidates mapping.
usersList = list(evalData["candidates"].keys())
bp(usersList, 2)
# Number of users (displayed as the cell result).
len(usersList)

[ 74803924, 250149639, ..., 297043321, 15779344 ]


15905

In [15]:
# All news urls, train first and then test. The documents built from this list are later matched to the urls by position, so the order must not change.
NewsList = trainNewsList + testNewsList
# Log the number of urls and a preview of them:
log(str(len(NewsList)) + " urls for NewsList:\n" + b(NewsList), logger=logger)

308931 urls for NewsList:
[
  https://www.defensenews.com/air/2017/10/27/turkey-terminates-local-jet-program-worth-billions/,
  http://www.star-telegram.com/news/politics-government/state-politics/article188454729.html,
  ...,
  https://www.stuff.co.nz/life-style/fashion/100077633/best--worst-dressed-celebrities-of-the-week,
  http://tcrn.ch/2lOXSvW
]


In [17]:
# Get the sentences of every news url.
# Assumes the result comes back in NewsList order (the url -> document mapping built later relies on it) - TODO confirm.
sentences = getNewsSentences(NewsList, logger=logger)
bp(sentences, logger=logger)

  0% [                    ]


KeyboardInterrupt: 

In [18]:
# Flatten every news (a list of sentences) into one list of tokens, in place.
for i, newsSentences in enumerate(sentences):
    sentences[i] = flattenLists(newsSentences)
# `docs` is another name for the same list object, not a copy.
docs = sentences
# NOTE(review): the logger is passed positionally here while other calls pass a level in that position (e.g. bp(x, 5, logger)) - confirm bp handles it.
bp(docs, logger)

[ [ In, __int_4__, ..., matter, . ], [ Texas, Democrats, ..., issues, . ], ..., [ Getty, Images, ..., instead, . ], [ Chinese, internet, ..., development, . ] ]


In [19]:
# Optional lower casing of every token; the documents are modified in place.
if config['lowercase']:
    for i in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        currentDoc = docs[i]
        for u, currentToken in enumerate(currentDoc):
            currentDoc[u] = currentToken.lower()
# NOTE(review): the logger is passed positionally here while other calls pass a level in that position - confirm bp handles it.
bp(docs, logger)
 

[ [ In, __int_4__, ..., matter, . ], [ Texas, Democrats, ..., issues, . ], ..., [ Getty, Images, ..., instead, . ], [ Chinese, internet, ..., development, . ] ]


In [20]:
# Optional lemmatization of every token; the documents are modified in place.
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for currentDoc in docs:
        for u, currentToken in enumerate(currentDoc):
            # No part-of-speech is given, so the lemmatizer's default applies.
            currentDoc[u] = lemmatizer.lemmatize(currentToken)
        # One progress tick per document.
        pbar.tic()
# NOTE(review): the logger is passed positionally here while other calls pass a level in that position - confirm bp handles it.
bp(docs, logger)

Lemmatization   0% [                    ]
Lemmatization   9% [=                   ] (12m 2.522s left)
Lemmatization  19% [===                 ] (10m 16.882s left)
Lemmatization  29% [=====               ] (8m 40.055s left)
[ [ In, __int_4__, ..., matter, . ], [ Texas, Democrats, ..., issue, . ], ..., [ Getty, Images, ..., instead, . ], [ Chinese, internet, ..., development, . ] ]


In [21]:
# Vocabulary filtering on document frequency. Documents are neither removed nor allowed to
# become empty, presumably so that docs[i] stays the document of NewsList[i] (confirm).
docs = filterCorpus(
    docs,
    minDF=config['minDF'],
    maxDF=config['maxDF'],
    removeEmptyDocs=False,
    allowEmptyDocs=False,
    logger=logger,
)
# Sanity check: every document still has at least one token.
assert all(len(doc) > 0 for doc in docs)
# NOTE(review): the logger is passed positionally here while other calls pass a level in that position - confirm bp handles it.
bp(docs, logger)

Voc removed because of minDF (635141 elements):
{ #B1, #ByeFelicia, #CarrieOnForever, #MeToo, #Salute, #Troublemaker, #United, #VegasStrong, #ohmy, #portland, ..., 😢, 😤, 😫, 😰, 😱, 😳, 😵, 😶, 🙅, 🙌 }
Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., :, ;, ?, ..., with, without, woman, work, working, world, would, year, you, your }
95.28% of voc will be removed.
[ [ Turkish, committed, ..., silent, matter ], [ Texas, Democrats, ..., politics, statewide ], ..., [ Getty, Images, ..., short, instead ], [ Chinese, internet, ..., driving, development ] ]


# BM25 ranking

In [22]:
# Start again from the flat list of user ids (keys of the candidates mapping) before chunking it.
usersList = list(evalData['candidates'].keys())

In [23]:
# Size of a chunk of users: one 500th of the users, and never less than one user.
chunksSize = max(1, len(usersList) // 500)

In [24]:
# Split the users into chunks of chunksSize ids; each chunk is one container for the iterator.
# NOTE(review): usersList is rebound from a flat list to a list of chunks, so re-running this cell alone would presumably chunk the chunks - confirm.
usersList = chunks(usersList, chunksSize)
bp(usersList, 3, logger)

[
  [ 74803924, 250149639, 252877167, 45215640, 16262059, ..., 250376045, 3214181795, 416830417, 1120236079, 748660045294952448 ],
  [ 3041245203, 2247446467, 176488015, 16406107, 2561716338, ..., 861400931379724288, 3307163917, 205726603, 73191309, 32388250 ],
  [ 129219624, 1199201359, 912581, 18801669, 49994069, ..., 826261123297116160, 44645640, 904386134, 36186336, 4358394552 ],
  [ 4377070103, 133365580, 235572729, 1543541996, 274809275, ..., 902691913, 3050412105, 271559689, 701681125027020801, 64871190 ],
  [ 915296223552315392, 20876855, 594231225, 2916717489, 9315842, ..., 242739989, 728096655484952576, 25671905, 2888717791, 635167769 ],
  ...,
  [ 514863780, 200184375, 539452166, 17667859, 911329833325617152, ..., 788228650470486017, 3345855166, 1858802611, 876169598126706690, 15136978 ],
  [ 43995880, 2775190792, 23970774, 1225674764, 14800339, ..., 93268575, 17088775, 799109747970633728, 376716249, 24806605 ],
  [ 252408723, 733451924, 102957405, 1068769698, 29924665, ...,

In [25]:
# Map every news url to its token list: docs[position] is the document of NewsList[position].
urlDocs = {url: docs[position] for position, url in enumerate(NewsList)}

bp(urlDocs, 2, logger)

{
  http://11alive.com/SellingGirls: [ Carol, looked, ..., Missing, Children ],
  http://11alive.com/canathon: [ Dear, Can, ..., following, location ],
  http://13wham.com/news/local/brockport-volunteer-ambulance-shuts-down-service: [ a.m., Tuesday, ..., becomes, available ],
  http://13wham.com/news/local/call-to-help-puerto-ricans-relocating-to-rochester-following-hurricane-maria: [ Hundreds, Puerto, ..., opportunity, website ],
  http://13wham.com/news/local/gates-fire-chief-announces-retirement-in-jan-2018: [ Gates, Fire, ..., Fire, Chiefs ],
  ...,
  https://zooksoftware.wordpress.com/2017/12/21/how-to-batch-convert-ost-to-pst-to-import-ost-data-to-outlook/: [ Get, easiest, ..., conversion, process ],
  https://zorzahealth.wordpress.com/2018/01/13/update-time-not-so-great/: [ promised, date, ..., reach, Thanks ],
  https://zouxzoux.wordpress.com/2018/01/11/interview-with-julie-kane-poet-co-editor-of-nasty-women-poets-poetry-writing-women/#more-6733: [ Saturday, January, ..., Write

In [26]:
def genFunct(containers, *args, **kwargs):
    """
    Generator yielding, for every user of the given container(s), the BM25 rankings of
    the user's candidate news.

    Reads the module-level names `evalData`, `urlDocs`, `config`, `bm25` and `sortBy`.

    :param containers: either one container (a flat list of user ids) or a list of containers.
        Nothing is yielded for an empty value.
    :param args, kwargs: accepted and ignored (presumably to match the signature expected
        by MLIterator - confirm).
    :return: yields `(userId, currentRankings)`, where `currentRankings` holds one list of
        urls, sorted by decreasing BM25 score, per candidate set of the user.
    """
    if len(containers) == 0:
        return
    # A single container (flat list of user ids) is wrapped so that both call styles work:
    if not isinstance(containers[0], list):
        containers = [containers]
    for container in containers:
        for userId in container:
            # The request is the concatenation of the tokens of all urls in the user's train history:
            request = []
            for url in evalData['trainUsers'][userId]:
                request += urlDocs[url]
            currentRankings = []
            for candidateUrls in evalData['candidates'][userId]:
                # Going through a dict keeps a single entry per url (first position kept):
                urlCorpus = list({url: urlDocs[url] for url in candidateUrls}.items())
                # Documents only, in the same order as urlCorpus:
                corpus = [tokens for (_, tokens) in urlCorpus]
                try:
                    # This is the new version not yet pushed on pipy:
                    model = bm25.BM25(corpus, k1=config['k1'], b=config['b'], epsilon=config['epsilon'])
                except TypeError:
                    # The constructor does not accept these keyword arguments: this is the
                    # actual version with parameters as global variables. Any other error is
                    # a real failure and is no longer swallowed here.
                    bm25.PARAM_K1 = config['k1']
                    bm25.PARAM_B = config['b']
                    bm25.EPSILON = config['epsilon']
                    model = bm25.BM25(corpus)
                # One score per document of the corpus, in corpus order:
                scores = model.get_scores(request)
                scoresWithUrl = [(urlCorpus[i][0], scores[i]) for i in range(len(scores))]
                ranking = sortBy(scoresWithUrl, index=1, desc=True)
                # Only urls, without scores:
                currentRankings.append([e[0] for e in ranking])
            yield (userId, currentRankings)

In [27]:
# We use MLIterator for multi-processing (33 hours of processing if not...):
# genFunct is given the chunks of users, with as many parallel processes as cpuCount() returns.
mli = MLIterator(usersList, genFunct, logger=logger, printRatio=0.01, parallelProcesses=cpuCount())

With parallelProcesses > 1, this iterator is not consistent, meaning 2 iterations over same containers will not give items in the same order
514 containers to process.


In [30]:
# Report the remaining RAM before collecting the rankings.
warnFreeRAM(logger=logger)

1.77g of RAM remaining.


In [31]:
# Collect the rankings of every user yielded by the iterator: user id -> list of rankings.
rankings = dict()
i = 0
# The RAM is checked about 100 times over the whole run.
modulo = int(len(evalData['candidates']) / 100) + 1
for (userId, currentRankings) in mli:
    rankings[userId] = currentRankings
    if i % modulo == 0:
        warnFreeRAM(logger=logger)
        # NOTE(review): exit() stops the session and the rankings collected so far are not saved; freeRAM() is presumably expressed in GB - confirm.
        if freeRAM() < 2:
            exit()
    i += 1

1.43g of RAM remaining.
  0% [                    ]
0.03g of RAM remaining.
  0% [                    ]
3.59g of RAM remaining.
  1% [                    ]
1.27g of RAM remaining.
  2% [                    ]
2.96g of RAM remaining.
  3% [                    ]
1.54g of RAM remaining.
  4% [                    ]
1.1g of RAM remaining.
  5% [=                   ] (1h 28m 46.581s left)
2.36g of RAM remaining.
  6% [=                   ] (1h 22m 5.351s left)
  7% [=                   ] (1h 15m 6.792s left)
5.08g of RAM remaining.
1.6g of RAM remaining.
  8% [=                   ] (1h 15m 36.376s left)
2.54g of RAM remaining.
  9% [=                   ] (1h 14m 51.148s left)
0.93g of RAM remaining.
 10% [==                  ] (1h 12m 33.072s left)
3.04g of RAM remaining.
 11% [==                  ] (1h 8m 58.966s left)
 12% [==                  ] (1h 5m 54.377s left)
4.05g of RAM remaining.
0.11g of RAM remaining.
 13% [==                  ] (1h 6m 55.789s left)
 14% [==                  ] (

In [2]:
# Inspection cell: shows the run configuration; needs the config cell to have been run in the current kernel.
config

Available objects for config:
     AliasManager
     DisplayFormatter
     HistoryManager
     IPCompleter
     IPKernelApp
     LoggingMagics
     MagicsManager
     OSMagics
     PrefilterManager
     ScriptMagics
     StoreMagics
     ZMQInteractiveShell


In [4]:
# Inspection cell: user ids having candidates; needs the data cell to have been run in the current kernel.
candidates.keys()

NameError: name 'candidates' is not defined

In [5]:
# Inspection cell: user ids already ranked; needs the ranking cell to have been run in the current kernel.
rankings.keys()

NameError: name 'rankings' is not defined

In [67]:
# Check the rankings against the candidates (maxUsers=None: presumably no sub-sampling of the users - confirm).
checkRankings(rankings,candidates,maxUsers=None)

In [46]:
# Number of rankings of the last user yielded by the iterator (userId is left over from the collecting loop).
len(rankings[userId])

[['http://ow.ly/gnmh30hxxKj',
  'http://politi.co/2ma7rpw',
  'http://www.baltimoresun.com/news/maryland/environment/bs-md-air-lawsuit-20171228-story.html',
  'http://goo.gl/alerts/GT7wU',
  'http://nymag.com/daily/intelligencer/2018/01/conservatives-outsourced-climate-to-lobbyists-and-kooks.html',
  'https://www.gjsentinel.com/news/western_colorado/report-cites-climate-costs-of-gas-project/article_313e11dc-f7a3-11e7-b99b-10604b9f7e7c.html',
  'http://www.heraldnews.com/opinion/20180102/guest-opinion-good-side-of-environment-news-after-year-of-trump',
  'https://www.bloomberg.com/news/articles/2018-01-10/ford-sued-by-truck-owners-claiming-diesel-engines-were-rigged',
  'https://insideclimatenews.org/news/23052017/methane-rules-trump-epa-obama-clean-power-plan-waste-management-delay',
  'https://www.washingtonpost.com/politics/how-the-trump-era-is-changing-the-federal-bureaucracy/2017/12/30/8d5149c6-daa7-11e7-b859-fb0995360725_story.html?utm_term=.44c6991f607a',
  'https://www.postandco

In [47]:
# Number of candidate sets of that same user; expected to match the number of rankings.
len(candidates[userId])

1

In [68]:
# Hand the rankings over to addRanking under modelName, together with the config that produced them.
addRanking(modelName, rankings, config, logger=logger)

# TF, IDF

In [11]:
def inition(docs):
    """
    Index a corpus for the hand-written BM25 functions.

    Fills three module-level containers, which must already exist:
    `f` (list) receives one {word: count} dict per document, `tf` (dict) receives for
    every word the number of documents containing it (a document frequency, despite the
    name), and `idf` (dict) receives log(D - df + 0.5) - log(df + 0.5) for every word.
    Calling it again without resetting these containers adds to the previous counts.

    :param docs: sequence of tokenized documents (raises ZeroDivisionError when empty)
    :return: (D, avgdl), the number of documents and their average length
    """
    D = len(docs)
    avgdl = sum(float(len(doc)) for doc in docs) / D
    for doc in docs:
        # Count the occurrences of every word of the current document:
        counts = {}
        for word in doc:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
        f.append(counts)
        # Each distinct word of the document adds one to its document frequency:
        for term in counts:
            tf[term] = tf.get(term, 0) + 1
    for term, docFreq in tf.items():
        idf[term] = math.log(D - docFreq + 0.5) - math.log(docFreq + 0.5)
    return D, avgdl

# Similarity

In [12]:
def sim(doc, index):
    """
    BM25 score of the query `doc` against the indexed document number `index`.

    Reads module-level state: `f` and `idf` (the containers filled by `inition`),
    `document` (the documents, used for their length), `avgdl`, and the parameters
    `k1` and `b`.
    NOTE(review): `document`, `avgdl`, `k1` and `b` are not assigned anywhere in the
    visible cells, and `b` is also the name of a printing helper called earlier
    (`b(evalData['meta'], 5)`) - confirm they are set before calling.

    :param doc: list of query words; words absent from the document are skipped,
        repeated words are counted each time
    :param index: position of the document in `f` and `document`
    :return: the score as a float (0.0 when no query word is in the document)
    """
    total = 0.0
    for term in doc:
        if term in f[index]:
            docLength = len(document[index])
            termCount = f[index][term]
            total += idf[term] * termCount * (k1 + 1) / (termCount + k1 * (1 - b + b * docLength / avgdl))
    return total

In [13]:
def simall(doc):
    """
    Score the query `doc` against every indexed document.

    Relies on the module-level `D` (number of documents) and on `sim`.

    :param doc: list of query words
    :return: list of `D` scores, one per document index, in index order
    """
    return [sim(doc, index) for index in range(D)]

# Small test for gensim bm25

In [38]:
# Toy corpus of three tokenized documents to check gensim's BM25 by hand (built with the default parameters).
corpus = [
     ["black", "cat", "white", "cat"],
     ["cat", "outer", "space"],
     ["wag", "dog"]
]
bm25Model = bm25.BM25(corpus)

In [39]:
# Mean of the idf values over all the terms known by the toy model.
average_idf = sum(float(value) for value in bm25Model.idf.values()) / len(bm25Model.idf)

In [40]:
# Display the average idf of the toy model.
average_idf

0.4508818005689612

In [48]:
# Score a whitespace-tokenized query against the three documents of the toy corpus.
query_str = 'outer dog'
query = query_str.strip().split()
# One score per document, kept in corpus order (sorting is left disabled on purpose):
scores = bm25Model.get_scores(query)
# scores.sort(reverse=True)
print(scores)


[0, 0.5108256237659907, 0.600971322077636]


In [108]:
def training():
    """
    Grid search over the BM25 parameters k1 and b.

    For every (k1, b) pair, the module-level `config` is updated in place, all the
    rankings are collected from `mli`, and they are handed to `addRanking` with the
    config. `config` is left at the last pair tried.
    NOTE(review): whether `mli` (and the ranking function it runs, possibly in other
    processes) picks up the updated `config`, and whether `mli` can be iterated several
    times, is not visible here - confirm.
    """
    # k1 from 1.2 to 2.0, step 0.2
    k1Values = [1.2, 1.4, 1.6, 1.8, 2.0]
    # b from 0.55 to 0.75, step 0.1
    bValues = [0.55, 0.65, 0.75]
    for k1 in k1Values:
        for b in bValues:
            config.update({'k1': k1, 'b': b})
            rankings = {userId: currentRankings for (userId, currentRankings) in mli}
            addRanking(modelName, rankings, config, logger=logger)