# Commands

In [None]:
# rsync -avhuP -e "ssh -p 2222" student@212.129.44.40:/data/twinews-splits . # pwd: <company_name>-<computer_science_field_acronym>-<school_acronym>-<company_street_number>

# Init

In [1]:
# True when `__file__` is not defined in the current namespace, which is used here
# as the signal that the code runs in an interactive kernel rather than as a script.
isNotebook = '__file__' not in locals()

In [2]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
import pymongo

In [3]:
from machinelearning.bokehutils import *
from bokeh.plotting import output_notebook, show
output_notebook()

In [4]:
# In a notebook the log file is placed under tmpDir('logs'); otherwise only a
# relative file name is given to Logger.
logger = Logger(tmpDir('logs') + "/twinews-splitting.log") if isNotebook else Logger("twinews-splitting.log")
# Timer shared by the tt.tic(...) / tt.toc() calls of the following cells.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

In [5]:
TEST = False

In [6]:
newsCollection = getNewsCollection(logger=logger)
usersCollection = getUsersCollection(logger=logger)

Unable to create index url in twinews news
twinews news (version 1.0) initialised.
Unable to create index user_id in twinews users
twinews users (version 1.0) initialised.


In [7]:
# Fail fast when one of the two collections is empty: every later cell needs data.
assert len(newsCollection) > 0
assert len(usersCollection) > 0

In [8]:
tt.tic("Init done")

--> tic: 0.14s | message: Init done


0.14

# Finding min and max dates

In [None]:
# Smallest `minTimestamp` of the news collection, converted with DATE_FORMAT.datetimeString.
ts = convertDate(newsCollection.findOne(sort=("minTimestamp", pymongo.ASCENDING))['minTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
# NOTE(review): `ts` was already converted on the previous line, so this second
# convertDate call looks redundant — confirm it is a no-op before removing it.
log("Overall min date: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
# Largest `minTimestamp` of the news collection, converted with DATE_FORMAT.datetimeString.
ts = convertDate(newsCollection.findOne(sort=("minTimestamp", pymongo.DESCENDING))['minTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
# NOTE(review): `ts` was already converted on the previous line, so this second
# convertDate call looks redundant — confirm it is a no-op before removing it.
log("Overall max date: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
# Smallest `maxTimestamp` of the news collection, converted with DATE_FORMAT.datetimeString.
ts = convertDate(newsCollection.findOne(sort=("maxTimestamp", pymongo.ASCENDING))['maxTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
# NOTE(review): `ts` was already converted on the previous line, so this second
# convertDate call looks redundant — confirm it is a no-op before removing it.
log("Overall min date when considering maxTimestamp: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
# Largest `maxTimestamp` of the news collection, converted with DATE_FORMAT.datetimeString.
ts = convertDate(newsCollection.findOne(sort=("maxTimestamp", pymongo.DESCENDING))['maxTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
# NOTE(review): `ts` was already converted on the previous line, so this second
# convertDate call looks redundant — confirm it is a no-op before removing it.
log("Overall max date when considering maxTimestamp: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

# News count

In [None]:
def labelEncoderFunct(x):
    """
        Build the label of a bar for `barplot`: the value `x` converted with
        DATE_FORMAT.datetimeString and prefixed by ">= ".
    """
    formattedDate = convertDate(x, dateFormat=DATE_FORMAT.datetimeString)
    return ">= " + formattedDate

In [None]:
timestamps = [e['minTimestamp'] for e in newsCollection.find(projection={"minTimestamp": True})]

In [None]:
show(barplot(timestamps, labelEncoderFunct=labelEncoderFunct, n=60))

# Defining the start and end time

In [9]:
# Bounds of the studied time window; they are given to newsSplit as start and end.
startDate = "2017-10-01"
endDate = "2018-02-16"
# Same bounds converted with DATE_FORMAT.timestamp (startTimestamp feeds the
# `$gt` filter on minTimestamp in a later cell).
startTimestamp = convertDate(startDate, dateFormat=DATE_FORMAT.timestamp)
endTimestamp = convertDate(endDate, dateFormat=DATE_FORMAT.timestamp)

# News count from the start date

In [None]:
timestamps = [e['minTimestamp'] for e in newsCollection.find({'minTimestamp': {'$gt': startTimestamp}}, projection={"minTimestamp": True})]

In [None]:
show(barplot(timestamps, labelEncoderFunct=labelEncoderFunct, n=60))

# Defining split functions

In [10]:
def newsSplit(startTs, splitTs, endTs, returnExtraNews=True, logger=None, verbose=True):
    """
        Dispatch the url of every document of the news collection in one of
        four sets according to its `minTimestamp`:
         * previous: minTimestamp < startTs
         * train: startTs <= minTimestamp < splitTs
         * test: splitTs <= minTimestamp <= endTs
         * after: all remaining news
        A bound given as a string is first converted with convertDate
        (DATE_FORMAT.timestamp).
        Returns (previous, train, test, after) when `returnExtraNews` is True,
        (train, test) otherwise.
    """
    def toTimestamp(bound):
        # Only string bounds are converted, other values are used as they are.
        if isinstance(bound, str):
            return convertDate(bound, dateFormat=DATE_FORMAT.timestamp)
        return bound
    startTs = toTimestamp(startTs)
    splitTs = toTimestamp(splitTs)
    endTs = toTimestamp(endTs)
    collection = getNewsCollection(logger=logger)
    before, train, test, after = set(), set(), set(), set()
    progress = ProgressBar(len(collection), logger=logger, verbose=verbose, printRatio=0.3)
    for doc in collection.find({}, projection={'minTimestamp': True, 'url': True}):
        url = doc['url']
        minTs = doc['minTimestamp']
        # Choose the target set first, then add the url once.
        if minTs < startTs:
            bucket = before
        elif minTs < splitTs:
            bucket = train
        elif minTs >= splitTs and minTs <= endTs:
            bucket = test
        else:
            bucket = after
        bucket.add(url)
        progress.tic()
    if not returnExtraNews:
        return train, test
    return before, train, test, after

In [11]:
def usersSplit(urlss, minUrls=None, logger=None, verbose=True):
    """
        This function gets a list of url collections (`urlss`) and builds, for
        each of them, a dict mapping user_id -> {url: timestamp} restricted to
        the urls of that collection.
        You need to specify the minimum of urls per user for each collection
        in `minUrls` (same order as `urlss`, 0 everywhere by default). A user
        who does not reach the minimum in at least one collection is removed
        from all the returned dicts.
        Returns the list of dicts, aligned with `urlss`.
    """
    if minUrls is None:
        minUrls = [0] * len(urlss)
    assert len(urlss) == len(minUrls)
    # Hash-based containers keep the `url in urls` test cheap even when the
    # caller gives lists or tuples.
    urlSets = [urls if isinstance(urls, (set, frozenset, dict)) else set(urls) for urls in urlss]
    usersCollection = getUsersCollection(logger=logger)
    usersTotal = len(usersCollection)
    bulks = [dict() for _ in urlSets]
    # The collection is read only once: each user is dispatched in every bulk
    # during the same pass instead of re-scanning the database per url set.
    pbar = ProgressBar(usersTotal, logger=logger, verbose=verbose, printRatio=0.3)
    for user in usersCollection.find({}, projection={'timestamps': True, 'news': True, 'user_id': True}):
        userId = user['user_id']
        userNews = user['news']
        for urls, current in zip(urlSets, bulks):
            shared = dict()
            for i, url in enumerate(userNews):
                if url in urls:
                    # timestamps[i] is the timestamp associated to news[i]
                    shared[url] = user['timestamps'][i]
            current[userId] = shared
        pbar.tic()
    # A user is dropped as soon as one bulk has less urls than required:
    toDeleteUsers = set()
    for theMin, users in zip(minUrls, bulks):
        for userId, userUrls in users.items():
            if len(userUrls) < theMin:
                toDeleteUsers.add(userId)
    remaining = usersTotal - len(toDeleteUsers)
    # Guard against an empty collection (no division by zero):
    percent = remaining / usersTotal * 100 if usersTotal > 0 else 0.0
    log(str(remaining) + " users remaining on " + str(usersTotal)
                + ", so " + str(truncateFloat(percent, 2)) + "%", logger)
    for users in bulks:
        for userId in toDeleteUsers:
            del users[userId]
    return bulks

# Count for a split at 2018-02-01

In [None]:
theSplit = "2018-02-01"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [0, 0], logger=logger)

In [None]:
show(barplot([len(n) for u, n in trainUsers.items()], title="Train counts for a split at " + theSplit, n=80))

In [None]:
show(barplot([len(n) for u, n in testUsers.items()], title="Test counts for a split at " + theSplit, n=80))

# Count for a split at 2018-01-15

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [0, 0], logger=logger)

In [None]:
show(barplot([len(n) for u, n in trainUsers.items()], title="Train counts for a split at " + theSplit, n=80))

In [None]:
show(barplot([len(n) for u, n in testUsers.items()], title="Test counts for a split at " + theSplit, n=80))

# User deletion for split at 2018-02-01 and mins (3, 2)

In [None]:
theSplit = "2018-02-01"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [3, 2], logger=logger)

# User deletion for split at 2018-01-15 and mins (3, 2)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [3, 2], logger=logger)

# User deletion for split at 2018-01-15 and mins (15, 5)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [15, 5], logger=logger)

# User deletion for split at 2018-01-15 and mins (8, 2)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [8, 2], logger=logger)

# Defining the split and mins

In [12]:
theSplit = "2018-01-15"

In [13]:
mins = [8, 2]

# Getting data

In [14]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

Unable to create index url in twinews news
twinews news (version 1.0) initialised.
  0% [                    ]


In [15]:
trainUsers, testUsers = usersSplit([trainNews, testNews], mins, logger=logger)

Unable to create index user_id in twinews users
twinews users (version 1.0) initialised.
  0% [                    ]
 29% [=====               ] (4.293s left)
21239 users remaining on 32474, so 65.4%


# Creating candidates

In [18]:
ranksLength = 1000

In [19]:
import random  # explicit import: the name was only reachable through the wildcard imports

# Sorting gives a stable order for the sampling pool (the iteration order of a set
# of strings may change from one interpreter run to another), and a dedicated
# seeded generator makes the padding below reproducible.
testNewsList = sorted(testNews)
candidatesRng = random.Random(0)
candidates = dict()
pbar = ProgressBar(len(testUsers), logger=logger, printRatio=0.01)
for userId, news in testUsers.items():
    # Start from the urls of the user in the test period...
    news = set(news.keys())
    # ...and pad with random test urls up to ranksLength. The second condition
    # stops the loop when the pool is too small to reach ranksLength (or empty),
    # which previously looped forever (or raised in random.choice).
    while len(news) < ranksLength and len(news) < len(testNewsList):
        news.add(candidatesRng.choice(testNewsList))
    # One candidate set per user, stored in a list:
    candidates[userId] = [news]
    pbar.tic()

  0% [                    ]
  0% [                    ] (46.616s left)
  1% [                    ] (42.71s left)
  2% [                    ] (41.141s left)
  3% [                    ] (40.637s left)
  4% [                    ] (39.977s left)
  5% [=                   ] (39.086s left)
  6% [=                   ] (38.604s left)
  7% [=                   ] (38.025s left)
  8% [=                   ] (37.486s left)
  9% [=                   ] (37.065s left)
 10% [==                  ] (36.484s left)
 11% [==                  ] (36.081s left)
 12% [==                  ] (35.678s left)
 13% [==                  ] (35.15s left)
 14% [==                  ] (34.755s left)
 15% [===                 ] (34.357s left)
 16% [===                 ] (33.958s left)
 17% [===                 ] (33.512s left)
 18% [===                 ] (33.071s left)
 19% [===                 ] (33.156s left)
 20% [====                ] (32.691s left)
 21% [====                ] (32.268s left)
 22% [====                ] 

In [20]:
bp(candidates, logger)

{
  100022528: 
  [
    {
      http://a.msn.com/00/en-ca/AAuP4NT?ocid=st,
      http://a.msn.com/01/en-us/BBIVU3P?ocid=st,
      ...,
      https://www.yorkregion.com/news-story/8076176-york-region-parents-educators-talk-about-poverty-in-sc,
      https://yaledailynews.com/blog/2018/01/30/gsa-lobbies-for-ombuds-office/
    }
  ],
  100024324: 
  [
    {
      http://13wham.com/news/local/identity-tech-start-up-run-by-rit-grads-gets-175m-in-state-funding,
      http://a.msn.com/0E/en-us/AAvaVyo?ocid=st,
      ...,
      https://www.yahoo.com/sports/reports-msu-president-lou-anna-k-simon-resign-wake-larry-nassar-scandal,
      https://yesmeansyesblog.wordpress.com/2009/11/24/predator-redux/
    }
  ],
  100064338: 
  [
    {
      http://a.msn.com/00/en-ca/AAuP4NT?ocid=st,
      http://a.msn.com/0C/en-us/AAuJW0j?ocid=st,
      ...,
      https://www.wxyz.com/news/opinion/wxyz-editorial-whats-next-for-detroit-after-amazon-bid-loss,
      https://yaledailynews.com/blog/2018/02/06/gupta-to

# Removing news from trainNews and testNews

In [21]:
# Getting urls:
urls = set()
for users in (trainUsers, testUsers):
    for userId, news in users.items():
        for n in news.keys():
            urls.add(n)
for userId, bulks in candidates.items():
    for news in bulks:
        for n in news:
            urls.add(n)

In [22]:
# Sub-sampling news:
trainNews = set([n for n in trainNews if n in urls])
testNews = set([n for n in testNews if n in urls])

# Analysis

In [23]:
bp(trainNews, logger)

{
  http://11alive.com/SellingGirls,
  http://11alive.com/canathon,
  ...,
  https://zrants.wordpress.com/2017/10/24/la-orange-county-transit-agencies-seek-their-own-ride-sharin,
  https://zwischenzugs.wordpress.com/2017/10/15/my-20-year-experience-of-software-development-methodol
}


In [24]:
bp(trainUsers, logger)

{
  100022528: 
  {
    http://newsok.com/article/5566752?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507218223,
    http://newsok.com/article/5566980?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556930,
    http://newsok.com/article/5567097?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556848,
    http://newsok.com/article/5567140?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507556832,
    http://newsok.com/article/5567338?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1507642670,
    ...,
    http://newsok.com/article/5579183?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1515698281,
    http://newsok.com/article/5579231?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1515779595,
    http://newsok.com/article/5579253?utm_source=NewsOK.com&utm_medium=Social&utm_campaign=ShareBar-Twitter: 1515717177,
   

In [25]:
# previousNews holds the news whose minTimestamp is before the start bound given to newsSplit.
log("We loose " + str(len(previousNews)) + " news because they are too old.", logger)
# trainNews / testNews were filtered by the sub-sampling cell, so these counts
# only cover news referenced by a remaining user or by a candidate set.
trainNewsCount = len(trainNews)
log("We have " + str(trainNewsCount) + " news in train.", logger)
testNewsCount = len(testNews)
log("We have " + str(testNewsCount) + " news in test.", logger)
# Size of the whole news collection, whatever the dates.
totalNewsAvailable = len(newsCollection)
log("Total available news in the dataset: " + str(totalNewsAvailable), logger)
# afterNews receives what newsSplit could not place in the three other sets; none is expected.
assert len(afterNews) == 0

We loose 75275 news because they are too old.
We have 323572 news in train.
We have 138785 news in test.
Total available news in the dataset: 570210


In [26]:
# Number of news per user on each side of the split, then their mean
# passed through truncateFloat(..., 2).
trainNewsPerUser = [len(userNews) for userNews in trainUsers.values()]
trainMeanNewsPerUser = truncateFloat(np.mean(trainNewsPerUser), 2)
log("Mean news count per user in train: " + str(trainMeanNewsPerUser), logger)
testNewsPerUser = [len(userNews) for userNews in testUsers.values()]
testMeanNewsPerUser = truncateFloat(np.mean(testNewsPerUser), 2)
log("Mean news count per user in test: " + str(testMeanNewsPerUser), logger)

Mean news count per user in train: 28.0
Mean news count per user in test: 10.67


In [27]:
# The per-user news counts are computed once and reused for the min and the max.
trainSizes = [len(userNews) for userNews in trainUsers.values()]
testSizes = [len(userNews) for userNews in testUsers.values()]
trainMinNewsPerUser = min(trainSizes)
log("Min news count per user in train: " + str(trainMinNewsPerUser), logger)
testMinNewsPerUser = min(testSizes)
log("Min news count per user in test: " + str(testMinNewsPerUser), logger)
trainMaxNewsPerUser = max(trainSizes)
log("Max news count per user in train: " + str(trainMaxNewsPerUser), logger)
testMaxNewsPerUser = max(testSizes)
log("Max news count per user in test: " + str(testMaxNewsPerUser), logger)

Min news count per user in train: 8
Min news count per user in test: 2
Max news count per user in train: 443
Max news count per user in test: 164


In [28]:
usersCount = len(trainUsers)
log("Users count: " + str(usersCount), logger)

Users count: 21239


# Adding a new eval

In [30]:
outputDir = nosaveDir() + "/twinews-splits"
mkdir(outputDir)

In [31]:
version = 1
filePath = outputDir + "/v" + str(version) + ".pickle.gzip"

In [32]:
evalDict = dict()

In [33]:
# Summary statistics stored next to the split itself.
evalDict['stats'] = {
    'usersCount': usersCount,
    'trainNewsCount': trainNewsCount,
    'testNewsCount': testNewsCount,
    'totalNewsAvailable': totalNewsAvailable,
    'trainMeanNewsPerUser': trainMeanNewsPerUser,
    'testMeanNewsPerUser': testMeanNewsPerUser,
    'trainMinNewsPerUser': trainMinNewsPerUser,
    'testMinNewsPerUser': testMinNewsPerUser,
    'trainMaxNewsPerUser': trainMaxNewsPerUser,
    'testMaxNewsPerUser': testMaxNewsPerUser,
}

In [34]:
evalDict['created'] = getDateSec()
evalDict['ranksLength'] = ranksLength

In [35]:
bp(evalDict, logger, 4)

{
  'created': 2020.03.19-18.55.55,
  'ranksLength': 1000,
  'stats': { 'testMaxNewsPerUser': 164, 'testMeanNewsPerUser': 10.67, 'testMinNewsPerUser': 2, 'testNewsCount': 138785, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 443, 'trainMeanNewsPerUser': 28.0, 'trainMinNewsPerUser': 8, 'trainNewsCount': 323572, 'usersCount': 21239 }
}


In [36]:
evalDict['trainNews'] = trainNews
evalDict['testNews'] = testNews
evalDict['trainUsers'] = trainUsers
evalDict['testUsers'] = testUsers

In [37]:
evalDict['candidates'] = candidates

In [38]:
serialize(evalDict, filePath)

# End

In [None]:
tt.toc()