# Commands

In [1]:
# cd ~/twinews-logs ; jupython -o nohup-random-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/random.ipynb

# Init

In [2]:
# Expose no CUDA device to this process (empty CUDA_VISIBLE_DEVICES).
# Kept ahead of the other imports so it is set before any library reads it.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# `__file__` is expected to be absent from the namespace when the code runs
# inside a notebook kernel, and present when it is executed as a script.
isNotebook = not ('__file__' in locals())

In [4]:
# Test-mode switch. It follows `isNotebook` by default; assign True or False
# instead to force the mode regardless of how the code is executed.
TEST = isNotebook # isNotebook, True, False

In [5]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [6]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [7]:
from gensim.summarization import bm25

In [8]:
# Pick the log file: a temporary "logs" directory when in a notebook,
# otherwise a host-specific file relative to the current working directory.
if isNotebook:
    logPath = tmpDir('logs') + "/random.log"
else:
    logPath = "random-" + getHostname() + ".log"
logger = Logger(logPath)
# Timer used to report the duration of each stage of the run.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [9]:
# Baseline to generate: keep exactly one of the assignments below uncommented.
# The ranking cell handles 'worst', 'ideal' and 'random'.
# modelName = 'worst'
modelName = 'ideal'
# modelName = 'random'

In [10]:
# Run configuration. It is passed to rankingExists / addRanking together with
# the model name, and its hash is part of the final notification.
config = dict(
    splitVersion=1,
    maxUsers=None if TEST else None,  # Sub-sampling (both modes currently use all users)
    seed=0,
)

In [11]:
# Only the 'random' baseline reads config['seed'], so the key is dropped for
# the other models and their config does not carry an unused parameter.
# pop(..., None) rather than `del` keeps this cell re-runnable: executing it a
# second time no longer raises KeyError.
if modelName != 'random':
    config.pop('seed', None)

In [12]:
# Pretty-print the final config to the log so that each run records its parameters.
# NOTE(review): the `5` is presumably bp's verbosity level; confirm against systemtools.printer.
bp(config, logger, 5)

{ 'maxUsers': None, 'splitVersion': 1 }


In [13]:
# Script runs only: abort when a ranking was already generated for this model
# with this exact config. Notebook runs skip the check.
if not isNotebook and rankingExists(modelName, config, logger=logger):
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [14]:
# Load the evaluation data (users, news and candidates) for the configured split.
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=0,
    maxUsers=config['maxUsers'],
    logger=logger,
)
# Expose each part of the evaluation data under its own name.
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
# Log the available keys and the split metadata.
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 46.76s | message: Eval data loaded
--> toc total duration: 51.57s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.13.25, 'endDate': 2018-02-16, 'id': 1, 'ranksLength': 1000, 'splitDate': 2018-01-15, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 164, 'testMeanNewsPerUser': 10.67, 'testMinNewsPerUser': 2, 'testNewsCount': 138785, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 443, 'trainMeanNewsPerUser': 28.0, 'trainMinNewsPerUser': 8, 'trainNewsCount': 323572, 'usersCount': 21239 }


In [15]:
# Urls are converted to lists because a stable order is needed to retrieve
# vectors by index. Each list is shuffled so that the urls of a given user do
# not end up grouped at the beginning, and the shuffle is seeded so that the
# order is the same on every run.
trainNewsList, testNewsList = (
    shuffle(list(urls), seed=0) for urls in (trainNews, testNews)
)
newsList = trainNewsList + testNewsList

In [16]:
# Print all:
# Report the size of each url list.
for urls, suffix in (
    (trainNewsList, " urls for trainNewsList"),
    (testNewsList, " urls for testNewsList"),
    (newsList, " urls for newsList"),
):
    log(str(len(urls)) + suffix, logger=logger)

323572 urls for trainNewsList
138785 urls for testNewsList
462357 urls for newsList


In [17]:
# Timing checkpoint: logs the elapsed time together with the given message.
tt.tic("Data preprocessed")

--> tic: 52.55s | message: Data preprocessed


52.55

# Ranking

In [18]:
# Build, for every user, one ranking per candidate list according to `modelName`:
#   - 'worst':  non-relevant urls first, relevant urls last
#   - 'ideal':  relevant urls first, non-relevant urls last
#   - 'random': seeded shuffle of all the urls
# Relevant urls are the keys of the user's entry in evalData["testUsers"].
# Both groups are sorted so that the output is deterministic.

# Fail fast on an unknown model name. Without this check `ranking` would be
# unassigned (NameError on a fresh kernel) or, in a notebook re-run, silently
# reused from a previous execution while still passing the length assertion.
if modelName not in ('worst', 'ideal', 'random'):
    raise ValueError("Unknown modelName: " + str(modelName))

rankings = dict()
for userId in evalData["candidates"]:
    currentRankings = []
    # The relevant urls depend on the user only, so they are computed lazily
    # once per user instead of once per candidate list.
    relevantSet = None
    for currentCandidates in evalData["candidates"][userId]:
        if relevantSet is None:
            relevantSet = set(evalData["testUsers"][userId].keys())
            relevantUrls = sorted(relevantSet)
        unrelevantUrls = sorted(e for e in currentCandidates if e not in relevantSet)
        assert len(relevantUrls) > 0
        assert len(unrelevantUrls) > 0
        if modelName == 'worst':
            ranking = unrelevantUrls + relevantUrls
        elif modelName == 'ideal':
            ranking = relevantUrls + unrelevantUrls
        else:  # 'random', guaranteed by the validation above
            ranking = shuffle(relevantUrls + unrelevantUrls, seed=config['seed'])
        # Integrity check: every relevant url must be one of the candidates.
        assert len(ranking) == len(currentCandidates)
        currentRankings.append(ranking)
    rankings[userId] = currentRankings
bp(rankings, logger)

{
  100022528: 
  [
    [
      http://newsok.com/article/5579862?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twit,
      http://newsok.com/article/5579897?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twit,
      ...,
      https://www.zerohedge.com/news/2018-01-31/earths-magnetic-field-shifting-poles-may-flip-could-get-ba,
      https://ziffblog.wordpress.com/2018/01/26/stare-decisis-and-the-consubstantiality-of-the-washington-
    ]
  ],
  100024324: 
  [
    [
      https://buff.ly/2r33Cbc,
      https://fb.me/1V3tC1haw,
      ...,
      https://www.yahoo.com/news/brash-ex-milwaukee-sheriff-david-clarke-faces-civil-161831030--politics.h,
      https://www.zerohedge.com/news/2018-02-07/yuan-crashing-after-huge-china-trade-surprise
    ]
  ],
  100064338: 
  [
    [
      http://a.msn.com/0C/en-us/AAuJW0j?ocid=st,
      http://bbc.in/2EF1R7c,
      ...,
      https://www.yahoo.com/news/early-immigrants-weren-apos-t-105905842.html?.tsrc=fauxdal,
     

In [19]:
# Timing checkpoint: logs the elapsed time together with the given message.
tt.tic("Rankings done")

--> tic: 12.2s | message: Rankings done


12.2

# Adding rankings to the db

In [20]:
# Store the rankings under this model name and config.
# NOTE(review): how addRanking stores the data is not visible in this notebook; read its documentation before changing or re-running this cell.
addRanking(modelName, rankings, config, logger=logger)

In [21]:
# Timing checkpoint: logs the elapsed time together with the given message.
tt.tic("Rankings stored")

--> tic: 29.38s | message: Rankings stored


29.38

In [22]:
# Stop the timer and keep the total duration, which the final notification reports.
# NOTE(review): the value is presumably in seconds (it is passed to secondsToHumanReadableDuration); confirm.
totalDuration = tt.toc()

--> toc total duration: 1m 34.15s


In [23]:
# Send the completion message: model name, first 5 characters of the config
# hash, human-readable total duration and host name.
runTag = modelName + '-' + objectToHash(config)[:5]
humanDuration = secondsToHumanReadableDuration(totalDuration)
notif(runTag + " done in " + humanDuration + " on " + getHostname())