# Commands

In [None]:
# oomstopper --no-tail jaccard ; killbill jaccard ; cd ~/twinews-logs ; jupython -o nohup-jaccard-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/jaccard.ipynb

# Init

In [None]:
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-based libraries
# loaded after this line, so this must stay above the other imports.
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [None]:
# True when `__file__` is not defined at top level, which this file treats as
# "running interactively in a notebook" (a script run normally defines `__file__`).
isNotebook = '__file__' not in locals()

In [None]:
# TEST selects the reduced values in `config` below. By default it follows
# isNotebook; set it to True or False to force one mode.
TEST = isNotebook # isNotebook, True, False

In [None]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *
from machinelearning.iterator import *

In [None]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
# Interactive sessions log into the temporary "logs" directory; script runs
# log into a host-specific file in the current working directory.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/jaccard.log")
else:
    logger = Logger("jaccard-" + getHostname() + ".log")

# Global timer: every later `tt.tic(...)` is a checkpoint and `tt.toc()` ends the run.
tt = TicToc(logger=logger)
tt.tic()

# Config

In [None]:
# Run configuration. The whole dict is passed to rankingExists / addRanking and
# hashed for the final notification, so changing any value identifies a new run.
# Each entry is written `<TEST value> if TEST else <full-run value>`.
config = \
{
    # Passed to getEvalData to select the evaluation split:
    'splitVersion': 1,
    
    # Limit on the number of users loaded (None = no limit given to getEvalData):
    'maxUsers': 10 if TEST else None,
    # minDF / maxDF are forwarded to filterCorpus (both None disables the filtering step):
    'minDF': 2 if TEST else 2, # 2, 1 / 2000
    'maxDF': 100 if TEST else 300, # 100, 200, 300 or None
    
    # Toggles for the optional preprocessing steps applied to every token:
    'lowercase': True if TEST else True,
    'doLemmatization': False if TEST else False,
}

In [None]:
# Name under which the rankings are looked up (rankingExists), stored (addRanking)
# and reported in the final notification.
modelName = 'jaccard'

In [None]:
# Outside the notebook, abort early when rankings were already generated for
# this model with this exact config (nothing is checked in interactive mode):
if not isNotebook and rankingExists(modelName, config, logger=logger):
    raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [None]:
# Load users, news and candidates for the configured split (no extra news requested):
evalData = getEvalData(config['splitVersion'], maxExtraNews=0, maxUsers=config['maxUsers'], logger=logger)

# Expose each part of the evaluation data under its own name:
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']

# Log what was loaded: available keys, then the metadata of the split.
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

In [None]:
# URLs are converted to lists because a fixed order is needed to retrieve
# each document by index later on.
# The lists are shuffled so that the URLs of one user are not grouped together at the beginning,
# and the shuffle is seeded so the order is the same on every run.
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)
# Train URLs come first, then test URLs; document i always belongs to newsList[i].
newsList = trainNewsList + testNewsList

In [None]:
# Log the number of URLs in the train list, the test list and their concatenation:
log(str(len(trainNewsList)) + " urls for trainNewsList", logger=logger)
log(str(len(testNewsList)) + " urls for testNewsList", logger=logger)
log(str(len(newsList)) + " urls for newsList", logger=logger)

In [None]:
# Fetch the sentences of every URL in newsList.
# NOTE(review): the rest of the file pairs result i with newsList[i], so this
# assumes getNewsSentences preserves the input order — confirm.
sentences = getNewsSentences(newsList, logger=logger)
bp(sentences, logger)

In [None]:
# Replace, in place, the nested sentence structure of each document by its
# flattened version:
for docIndex, nestedSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(nestedSentences)
# `docs` is the same list object as `sentences`, not a copy.
docs = sentences
bp(docs, logger)

In [None]:
# Optional step: lowercase every token of every document, in place.
if config['lowercase']:
    for docIndex in pb(list(range(len(docs))), logger=logger, message="Lowercasing"):
        tokens = docs[docIndex]
        for tokenIndex, token in enumerate(tokens):
            tokens[tokenIndex] = token.lower()
bp(docs, logger)

In [None]:
# Optional step: replace every token by its WordNet lemma, in place.
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        for tokenIndex, token in enumerate(tokens):
            tokens[tokenIndex] = lemmatizer.lemmatize(token)
        # One progress tick per document:
        pbar.tic()
bp(docs, logger)

In [None]:
# Vocabulary filtering by document frequency, skipped only when both bounds are None.
# removeEmptyDocs=False is passed so that no document is dropped: docs[i] must keep
# matching newsList[i]. Every document is then required to be non-empty.
if not (config['minDF'] is None and config['maxDF'] is None):
    docs = filterCorpus(
        docs,
        minDF=config['minDF'],
        maxDF=config['maxDF'],
        removeEmptyDocs=False,
        allowEmptyDocs=False,
        logger=logger,
    )
    assert all(len(doc) > 0 for doc in docs)
    bp(docs, logger)

In [None]:
# Map every URL to the set of distinct tokens of its document
# (document i belongs to newsList[i]):
urlsDocs = {newsList[docIndex]: set(doc) for docIndex, doc in enumerate(docs)}
bp(urlsDocs, logger)

In [None]:
# Drop the references to the token lists: only `urlsDocs` is used from here on.
docs = sentences = None

In [None]:
# Timer checkpoint marking the end of the preprocessing stage:
tt.tic("Data preprocessed")

# Ranking

In [None]:
def jaccardDistance(doc1, doc2):
    """
        Return the Jaccard distance between two non-empty sets:
        1 - |doc1 & doc2| / |doc1 | doc2|

        The result is 0 for identical sets and 1 for disjoint sets.
        An AssertionError is raised when an argument is not a `set`
        or when one of the sets is empty.
    """
    for doc in (doc1, doc2):
        assert isinstance(doc, set)
    for doc in (doc1, doc2):
        assert len(doc) > 0
    shared = doc1 & doc2
    combined = doc1 | doc2
    return 1 - len(shared) / len(combined)

In [None]:
# Flat list of the ids of all users that have candidates to rank:
userIds = [userId for userId in candidates.keys()]

In [None]:
# One 500th of the number of users, but never less than 1.
# NOTE(review): how `chunks` interprets this value is not visible here — confirm.
chunksSize = max(1, int(len(userIds) / 500))

In [None]:
# `userIds` is rebound here: from a flat list of ids to the result of `chunks`
# (presumably a list of lists of ids, which genFunct iterates over — verify).
userIds = chunks(userIds, chunksSize)

In [None]:
# Log a preview of the chunked user ids:
bp(userIds, 3, logger)

In [None]:
def genFunct(containers, *args, **kwargs):
    """
        Generator given to MLIterator: yield one `(userId, rankings)` tuple
        for every user id found in `containers`.

        `containers` is either a list of lists of user ids or one flat list
        of user ids. Extra positional and keyword arguments are accepted but ignored.

        For each user, the history is the union of the token sets (`urlsDocs`)
        of all URLs in `trainUsers[userId]`. Each entry of `candidates[userId]`
        is a collection of candidate URLs; every URL is scored by its Jaccard
        distance to the history and the scores are ordered with
        `sortBy(..., index=1, desc=False)` (presumably by ascending distance,
        i.e. closest first — verify against sortBy). `rankings` holds one
        such ordered result per candidate collection.
    """
    # A single flat container is wrapped so both input shapes share one code path:
    if not isinstance(containers[0], list):
        containers = [containers]
    for container in containers:
        for userId in container:
            # Vocabulary of everything the user has in the train split:
            history = set().union(*(urlsDocs[url] for url in trainUsers[userId]))
            currentRankings = []
            for currentCandidates in candidates[userId]:
                distances = {
                    url: jaccardDistance(history, urlsDocs[url])
                    for url in list(currentCandidates)
                }
                currentRankings.append(sortBy(distances, index=1, desc=False))
            yield (userId, currentRankings)

In [None]:
# We use MLIterator for multi-processing: genFunct is applied to the chunks of
# user ids with 4 parallel processes.
# NOTE(review): genFunct reads the globals trainUsers, candidates and urlsDocs,
# so worker processes must be able to see them — confirm how MLIterator spawns workers.
mli = MLIterator(userIds, genFunct, logger=logger, printRatio=0.01, parallelProcesses=4)

In [None]:
# Memory check before the ranking loop starts (the same check is repeated during the loop):
warnFreeRAM(logger=logger)

In [None]:
# Collect the rankings produced by the iterator, keyed by user id.
rankings = dict()
# Interval between two memory checks: about one check per 1% of the users.
modulo = int(len(candidates) / 100) + 1
for doneCount, (userId, currentRankings) in enumerate(mli):
    rankings[userId] = currentRankings
    if doneCount % modulo == 0:
        warnFreeRAM(logger=logger)

In [None]:
# Log a preview of the final rankings at level 4. Arguments follow the same
# (object, level, logger) order as the other three-argument `bp` calls in this file.
bp(rankings, 4, logger)

In [None]:
# Timer checkpoint marking the end of the ranking stage:
tt.tic("Rankings done")

# Adding rankings to the db

In [None]:
# Store the rankings together with the model name and the config that produced them.
# Read the documentation of addRanking before changing the structure of `rankings`.
addRanking(modelName, rankings, config, logger=logger)

In [None]:
# Timer checkpoint marking the end of the storage stage:
tt.tic("Rankings stored")

In [None]:
# Final timer reading, formatted below by secondsToHumanReadableDuration
# (so presumably a number of seconds since the first tic — verify).
totalDuration = tt.toc()

In [None]:
# Send a completion notification: model name, first 5 characters of the config
# hash, total duration and host name.
runLabel = modelName + '-' + objectToHash(config)[:5]
humanDuration = secondsToHumanReadableDuration(totalDuration)
notif(runLabel + " done in " + humanDuration + " on " + getHostname())