# Commands

In [None]:
# rsync -avhuP -e "ssh -p 2222" student@212.129.44.40:/data/twinews-splits . # pwd: <company_name>-<computer_science_field_acronym>-<school_acronym>-<company_street_number>

# Init

In [None]:
# Flag telling whether `__file__` is missing from the current namespace; it is used below
# as the "running in a notebook" indicator (e.g. to choose the log file location):
isNotebook = '__file__' not in locals()

In [None]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
import pymongo

In [None]:
from machinelearning.bokehutils import *
from bokeh.plotting import output_notebook, show
output_notebook()

In [None]:
logger = Logger(tmpDir('logs') + "/twinews-splitting.log") if isNotebook else Logger("twinews-splitting.log")
tt = TicToc(logger=logger)
tt.tic()

In [None]:
TEST = False

In [None]:
newsCollection = getNewsCollection(logger=logger)
usersCollection = getUsersCollection(logger=logger)

In [None]:
assert len(newsCollection) > 0
assert len(usersCollection) > 0

In [None]:
tt.tic("Init done")

# Finding min and max dates

In [None]:
ts = convertDate(newsCollection.findOne(sort=("minTimestamp", pymongo.ASCENDING))['minTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
log("Overall min date: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
ts = convertDate(newsCollection.findOne(sort=("minTimestamp", pymongo.DESCENDING))['minTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
log("Overall max date: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
ts = convertDate(newsCollection.findOne(sort=("maxTimestamp", pymongo.ASCENDING))['maxTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
log("Overall min date when considering maxTimestamp: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

In [None]:
ts = convertDate(newsCollection.findOne(sort=("maxTimestamp", pymongo.DESCENDING))['maxTimestamp'], dateFormat=DATE_FORMAT.datetimeString)
log("Overall max date when considering maxTimestamp: " + convertDate(ts, dateFormat=DATE_FORMAT.datetimeString), logger)

# News count

In [None]:
def labelEncoderFunct(x):
    """
        Return the label `">= <date>"` where `<date>` is `x` converted
        by `convertDate` using the `DATE_FORMAT.datetimeString` format.
    """
    formattedDate = convertDate(x, dateFormat=DATE_FORMAT.datetimeString)
    return ">= " + formattedDate

In [None]:
timestamps = [e['minTimestamp'] for e in newsCollection.find(projection={"minTimestamp": True})]

In [None]:
show(barplot(timestamps, labelEncoderFunct=labelEncoderFunct, n=60))

# Defining the start and end time

In [None]:
# Bounds of the period, as date strings and converted to timestamps with `convertDate`:
startDate = "2017-10-01"
endDate = "2018-01-15" # '2018-02-16' for the main train / test split and '2018-01-15' for the train / validation
startTimestamp = convertDate(startDate, dateFormat=DATE_FORMAT.timestamp)
endTimestamp = convertDate(endDate, dateFormat=DATE_FORMAT.timestamp)

# News count from the start date

In [None]:
timestamps = [e['minTimestamp'] for e in newsCollection.find({'minTimestamp': {'$gt': startTimestamp}}, projection={"minTimestamp": True})]

In [None]:
show(barplot(timestamps, labelEncoderFunct=labelEncoderFunct, n=60))

# Defining split functions

In [None]:
def newsSplit(startTs, splitTs, endTs, returnExtraNews=True, logger=None, verbose=True):
    """
        Distribute the urls of all news of the news collection in 4 sets
        according to the `minTimestamp` of each news:
         * previous: `minTimestamp < startTs`
         * train: `startTs <= minTimestamp < splitTs`
         * test: `splitTs <= minTimestamp <= endTs` (the end bound is inclusive)
         * after: all the other news

        Each bound given as a string is first converted to a timestamp with `convertDate`.

        Returns `(previous, train, test, after)` when `returnExtraNews` is true,
        `(train, test)` otherwise.
    """
    def toTimestamp(value):
        # Only strings are converted, other values are used as they are:
        if isinstance(value, str):
            return convertDate(value, dateFormat=DATE_FORMAT.timestamp)
        return value
    startTs = toTimestamp(startTs)
    splitTs = toTimestamp(splitTs)
    endTs = toTimestamp(endTs)
    collection = getNewsCollection(logger=logger)
    previousNews, trainNews, testNews, afterNews = set(), set(), set(), set()
    pbar = ProgressBar(len(collection), logger=logger, verbose=verbose, printRatio=0.3)
    for doc in collection.find({}, projection={'minTimestamp': True, 'url': True}):
        url, ts = doc['url'], doc['minTimestamp']
        # Choosing the set which corresponds to the period of the news:
        if ts < startTs:
            target = previousNews
        elif startTs <= ts < splitTs:
            target = trainNews
        elif splitTs <= ts <= endTs:
            target = testNews
        else:
            target = afterNews
        target.add(url)
        pbar.tic()
    if not returnExtraNews:
        return trainNews, testNews
    return previousNews, trainNews, testNews, afterNews

In [None]:
def usersSplit(urlss, minUrls=None, logger=None, verbose=True):
    """
        Split the news shared by users according to several collections of urls.

        For each collection of urls in `urlss`, a dict `user_id -> {url: timestamp}` is
        built with the news of each user which belong to this collection. A user who
        has fewer urls than the corresponding minimum of `minUrls` in at least one
        collection is removed from all the returned dicts.

        :param urlss: a list of url collections (for instance `[trainNews, testNews]`)
        :param minUrls: the minimum of urls per user for each element of `urlss`
            (same length as `urlss`), `None` means no minimum
        :param logger: the logger given to the users collection, the progress bar and `log`
        :param verbose: the verbosity of the progress bar
        :return: a list of dicts, one per element of `urlss`, in the same order
    """
    if minUrls is None:
        minUrls = [0] * len(urlss)
    assert len(urlss) == len(minUrls)
    usersCollection = getUsersCollection(logger=logger)
    # The count is needed for the progress bar and for the final log, so we get it once:
    usersCount = len(usersCollection)
    bulks = []
    pbar = ProgressBar(usersCount * len(urlss), logger=logger, verbose=verbose, printRatio=0.3)
    for urls in urlss:
        # The membership is tested for each news of each user, so we make sure
        # it is a constant time lookup even if the caller gives a list:
        if not isinstance(urls, (set, frozenset, dict)):
            urls = set(urls)
        current = dict()
        for user in usersCollection.find({}, projection={'timestamps': True, 'news': True, 'user_id': True}):
            userNews = dict()
            current[user['user_id']] = userNews
            # `news` and `timestamps` are read at the same index:
            for i, url in enumerate(user['news']):
                if url in urls:
                    userNews[url] = user['timestamps'][i]
            pbar.tic()
        bulks.append(current)
    # A user is deleted from all bulks as soon as one of the minimums is not reached:
    toDeleteUsers = set()
    for theMin, users in zip(minUrls, bulks):
        for userId, userNews in users.items():
            if len(userNews) < theMin:
                toDeleteUsers.add(userId)
    remaining = usersCount - len(toDeleteUsers)
    # Prevents a ZeroDivisionError when the collection is empty:
    percent = remaining / usersCount * 100 if usersCount > 0 else 0.0
    log(str(remaining) + " users remaining on " + str(usersCount)
                + ", so " + str(truncateFloat(percent, 2)) + "%", logger)
    for users in bulks:
        for userId in toDeleteUsers:
            del users[userId]
    return bulks

# Count for a split at 2018-02-01

In [None]:
theSplit = "2018-02-01"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [0, 0], logger=logger)

In [None]:
show(barplot([len(n) for u, n in trainUsers.items()], title="Train counts for a split at " + theSplit, n=80))

In [None]:
show(barplot([len(n) for u, n in testUsers.items()], title="Test counts for a split at " + theSplit, n=80))

# Count for a split at 2018-01-15

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [0, 0], logger=logger)

In [None]:
show(barplot([len(n) for u, n in trainUsers.items()], title="Train counts for a split at " + theSplit, n=80))

In [None]:
show(barplot([len(n) for u, n in testUsers.items()], title="Test counts for a split at " + theSplit, n=80))

# User deletion for split at 2018-02-01 and mins (3, 2)

In [None]:
theSplit = "2018-02-01"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [3, 2], logger=logger)

# User deletion for split at 2018-01-15 and mins (3, 2)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [3, 2], logger=logger)

# User deletion for split at 2018-01-15 and mins (15, 5)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [15, 5], logger=logger)

# User deletion for split at 2018-01-15 and mins (8, 2)

In [None]:
theSplit = "2018-01-15"

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], [8, 2], logger=logger)

# Defining the split and mins

In [None]:
theSplit = "2017-12-25" # "2018-01-15" was used for version 1 (the original note was truncated; presumably the current value is the one of version 2, TODO confirm)

In [None]:
mins = [8, 2]

# Getting data

In [None]:
previousNews, trainNews, testNews, afterNews = newsSplit(startDate, theSplit, endDate, logger=logger)

In [None]:
trainUsers, testUsers = usersSplit([trainNews, testNews], mins, logger=logger)

# Creating candidates

In [None]:
ranksLength = 1000

In [None]:
# For each user, the candidates are the test news of the user completed with
# random test news until `ranksLength` distinct urls are gathered.
testNewsList = list(testNews)
# We can't gather more distinct candidates than the count of distinct test news,
# so we cap the target size, otherwise the sampling loop below would never end:
candidatesTarget = min(ranksLength, len(set(testNewsList)))
if candidatesTarget < ranksLength:
    log("Only " + str(candidatesTarget) + " distinct test news are available for a ranks length of " + str(ranksLength), logger)
candidates = dict()
pbar = ProgressBar(len(testUsers), logger=logger, printRatio=0.01)
for userId, news in testUsers.items():
    news = set(news.keys())
    # Already drawn urls are ignored by the set, so we loop until the size is reached:
    while len(news) < candidatesTarget:
        news.add(random.choice(testNewsList))
    # The candidates of a user are stored as a list containing a single set:
    candidates[userId] = [news]
    pbar.tic()

In [None]:
bp(candidates, logger)

# Removing news from trainNews and testNews

In [None]:
# Collecting all urls referenced by the train users, the test users and the candidates:
urls = set()
for users in (trainUsers, testUsers):
    for news in users.values():
        urls.update(news.keys())
for bulks in candidates.values():
    for news in bulks:
        urls.update(news)

In [None]:
# Keeping only the train and test news which are in `urls`:
trainNews = {url for url in trainNews if url in urls}
testNews = {url for url in testNews if url in urls}

# Analysis

In [None]:
bp(trainNews, logger)

In [None]:
bp(trainUsers, logger)

In [None]:
log("We loose " + str(len(previousNews)) + " news because they are too old.", logger)
trainNewsCount = len(trainNews)
log("We have " + str(trainNewsCount) + " news in train.", logger)
testNewsCount = len(testNews)
log("We have " + str(testNewsCount) + " news in test.", logger)
totalNewsAvailable = len(newsCollection)
log("Total available news in the dataset: " + str(totalNewsAvailable), logger)

In [None]:
log("We loose " + str(len(afterNews)) + " news because they are after the end date.", logger)

In [None]:
trainMeanNewsPerUser = truncateFloat(np.mean([len(n) for u, n in trainUsers.items()]), 2)
log("Mean news count per user in train: " + str(trainMeanNewsPerUser), logger)
testMeanNewsPerUser = truncateFloat(np.mean([len(n) for u, n in testUsers.items()]), 2)
log("Mean news count per user in test: " + str(testMeanNewsPerUser), logger)

In [None]:
# Smallest and largest count of news per user, for train and for test:
trainMinNewsPerUser = min(len(userNews) for userNews in trainUsers.values())
log("Min news count per user in train: " + str(trainMinNewsPerUser), logger)
testMinNewsPerUser = min(len(userNews) for userNews in testUsers.values())
log("Min news count per user in test: " + str(testMinNewsPerUser), logger)
trainMaxNewsPerUser = max(len(userNews) for userNews in trainUsers.values())
log("Max news count per user in train: " + str(trainMaxNewsPerUser), logger)
testMaxNewsPerUser = max(len(userNews) for userNews in testUsers.values())
log("Max news count per user in test: " + str(testMaxNewsPerUser), logger)

In [None]:
usersCount = len(trainUsers)
log("Users count: " + str(usersCount), logger)

# Adding a new eval

In [None]:
from databasetools.mongo import MongoFS
(user, password, host) = getMongoAuth(user='hayj')
mfs = MongoFS(dbName="twinews-splits", user=user, password=password, host=host)

In [None]:
version = 2

In [None]:
# Meta data stored with the split: the statistics computed above, the creation
# date and the parameters of the split.
evalMeta = {
    'usersCount': usersCount,
    'trainNewsCount': trainNewsCount,
    'testNewsCount': testNewsCount,
    'totalNewsAvailable': totalNewsAvailable,
    'trainMeanNewsPerUser': trainMeanNewsPerUser,
    'testMeanNewsPerUser': testMeanNewsPerUser,
    'trainMinNewsPerUser': trainMinNewsPerUser,
    'testMinNewsPerUser': testMinNewsPerUser,
    'trainMaxNewsPerUser': trainMaxNewsPerUser,
    'testMaxNewsPerUser': testMaxNewsPerUser,
    'created': getDateSec(),
    'ranksLength': ranksLength,
    'splitDate': theSplit,
    'startDate': startDate,
    'endDate': endDate,
}

In [None]:
bp(evalMeta, logger, 5)

In [None]:
# Data of the split: the news sets, the users dicts and the candidates of each user.
evalData = {
    'trainNews': trainNews,
    'testNews': testNews,
    'trainUsers': trainUsers,
    'testUsers': testUsers,
    'candidates': candidates,
}

In [None]:
checkEvalData(evalData)

In [None]:
del mfs[version]

In [None]:
mfs.insert(version, evalData, meta=evalMeta)

# End

In [None]:
tt.toc()