# Commands

In [1]:
# cd twinews-logs ; jupython -o nohup-tfidf-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/tfidf.ipynb

# Init

In [2]:
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-based libraries imported afterwards.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [3]:
# `__file__` is only defined when the code runs as a script/module, not in an interactive kernel.
isNotebook = not ('__file__' in locals())

In [4]:
# Test mode: selects the reduced settings in `config` (e.g. 'maxUsers').
# Possible values: isNotebook (default), True, False.
TEST = isNotebook # isNotebook, True, False

In [5]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from twinews.utils import *
from twinews.models.ranking import *

In [30]:
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [7]:
# Interactive sessions log into the tmp "logs" directory; script runs log into a host-specific file.
if isNotebook:
    logPath = tmpDir('logs') + "/tfidf.log"
else:
    logPath = "tfidf-" + getHostname() + ".log"
logger = Logger(logPath)
# Start the timer used by every later `tt.tic(...)` / `tt.toc()` call.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [27]:
# Hyper-parameters of this run. The dict is passed to rankingExists / addRanking
# and hashed (objectToHash) for the final notification.
config = \
{
    'splitVersion': 2, # First argument of getEvalData
    
    'maxUsers': 2 if TEST else None, # Sub-sampling (passed to getEvalData as maxUsers)
    'minDF': None if TEST else None, # Passed to filterCorpus as minDF; None skips it. Original note: "remove words that have a document frequency ratio lower than 1 / 500" (TODO confirm)
    'maxDF': None if TEST else None, # Passed to filterCorpus as maxDF; None skips it. Original note: "remove top 300 voc elements" (TODO confirm)
    
    'lowercase': False if TEST else False, # Lower-case every token before vectorizing
    'doLemmatization': False if TEST else False, # Apply WordNetLemmatizer to every token
    'sublinearTF': True, # Passed to TfidfVectorizer as sublinear_tf
    
    'dimensions': 20, # 20, 100, 500, None (means no dimensionality reduction)
    'maxIter': 10, # Iteration for SVD (n_iter of TruncatedSVD)
    
    'distance': 'cosine', # 'cosine', 'euclidean', 'kl', 'js' (passed as distanceMetric)
    'historyRef': 30, # 1, 1.0, 0.6, 0.3, 3, 10, 30 (passed to usersRankingsByHistoryDistance)
}

In [28]:
# 'maxIter' is only read by the SVD step, so it is removed from the config when
# no dimensionality reduction is requested.
# pop() with a default keeps this cell idempotent: re-running it after the key
# was already removed no longer raises a KeyError.
if config['dimensions'] is None:
    config.pop('maxIter', None)

In [9]:
# Here we check if we already generated ranking for this model with this specific config.
# `modelName` has to be defined before the check: in script mode (not isNotebook)
# the cells run top to bottom, and relying on a later definition raises a NameError here.
modelName = 'tfidf'
if not isNotebook:
    if rankingExists(modelName, config, logger=logger):
        raise Exception(modelName + " with this config already exist:\n" + b(config, 5))

# Getting data

In [10]:
# Load the evaluation data of the configured split, then expose each part under its own name
evalData = getEvalData(config['splitVersion'], maxExtraNews=0, maxUsers=config['maxUsers'], logger=logger)
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
# Show the available keys and the split meta-data
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 32.09s | message: Eval data loaded
--> toc total duration: 32.37s | message: Got Twinews evaluation data
{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


In [13]:
# The urls are converted to lists because vectors are later retrieved by index,
# so the order must stay fixed.
# They are shuffled so that the urls of one user are not grouped at the beginning,
# with a fixed seed so the shuffle itself is repeatable.
# NOTE(review): if trainNews / testNews are sets, list() order may vary between runs — confirm.
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)
# Train urls first, then test urls
newsList = [*trainNewsList, *testNewsList]

In [14]:
# Log the size of each url list:
for listName, urlList in (
    ("trainNewsList", trainNewsList),
    ("testNewsList", testNewsList),
    ("newsList", newsList),
):
    log(str(len(urlList)) + " urls for " + listName, logger=logger)

28 urls for trainNewsList
1989 urls for testNewsList
2017 urls for newsList


In [15]:
# We get sentences:
# One entry per url of newsList; the preview below shows each entry as a list
# of tokenized sentences.
sentences = getNewsSentences(newsList, logger=logger)
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
  9% [=                   ] (36.5s left)
 19% [===                 ] (30.934s left)
 29% [=====               ] (27.201s left)
[
  [ [ West, Ham, ..., decline, . ], [ The, club, ..., staff, . ], ..., [ There, was, ..., Monday, . ], [ Bilic, fate, ..., Monday, . ] ],
  [ [ What, missing, ..., room, . ], [ ", I, ..., late, ! ], ..., [ I, know, ..., hurting, . ], [ I, know, ..., of, . ] ],
  ...,
  [ [ In, __int_4__, ..., project, . ], [ They, gathered, ..., years, . ], ..., [ To, order, ..., call, __int_3__ ], [ Free, UK, ..., only, . ] ],
  [ [ In, Rural, ..., shelters, . ], [ But, in, ..., nights, . ], ..., [ Wyoming, homeless, ..., summer, . ], [ It, may, ..., state, . ] ]
]


In [16]:
# Flatten each document in place: its sentences are merged by flattenLists
for docIndex, docSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(docSentences)
# `docs` is the same list object as `sentences`
docs = sentences
bp(docs, logger)

[ [ West, Ham, ..., Monday, . ], [ What, missing, ..., of, . ], ..., [ In, __int_4__, ..., only, . ], [ In, Rural, ..., state, . ] ]


In [17]:
# Optional lower casing of every token (tokens are replaced inside the existing lists):
if config['lowercase']:
    for docIndex in pb(list(range(len(docs))), logger=logger, message="Lower casing"):
        docs[docIndex][:] = [token.lower() for token in docs[docIndex]]
bp(docs, logger)

[ [ West, Ham, ..., Monday, . ], [ What, missing, ..., of, . ], ..., [ In, __int_4__, ..., only, . ], [ In, Rural, ..., state, . ] ]


In [18]:
# Optional lemmatization of every token (tokens are replaced inside the existing lists):
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        tokens[:] = [lemmatizer.lemmatize(token) for token in tokens]
        # One progress tick per document
        pbar.tic()
bp(docs, logger)

[ [ West, Ham, ..., Monday, . ], [ What, missing, ..., of, . ], ..., [ In, __int_4__, ..., only, . ], [ In, Rural, ..., state, . ] ]


In [19]:
# Vocabulary filtering, skipped when both document-frequency bounds are None:
if not (config['minDF'] is None and config['maxDF'] is None):
    docs = filterCorpus(
        docs,
        minDF=config['minDF'],
        maxDF=config['maxDF'],
        removeEmptyDocs=False,
        allowEmptyDocs=False,
        logger=logger,
    )
    # Every document must keep at least one token so rows stay aligned with newsList
    assert all(len(doc) > 0 for doc in docs)
    bp(docs, logger)

In [20]:
# Timing checkpoint: logs the duration of the preprocessing stage with this message.
tt.tic("Data preprocessed")

--> tic: 5m 38.019s | message: Data preprocessed


338.02

# Inferring vectors

In [22]:
# Documents are already lists of tokens, so both the tokenizer and the
# preprocessor are identity functions.
vectorizer = TfidfVectorizer(
    sublinear_tf=config['sublinearTF'],
    tokenizer=lambda tokens: tokens,
    preprocessor=lambda doc: doc,
    # lowercase=True, # Doesn't work because we erased preprocessor
)
# One TF-IDF row per document, in the same order as newsList
vectors = vectorizer.fit_transform(docs)
assert vectors.shape[0] == len(newsList)

In [23]:
# Timing checkpoint: logs the duration of the TF-IDF stage with this message.
tt.tic("TFIDF vectors generated")

--> tic: 4m 19.73s | message: TFIDF vectors generated


259.73

# Dimensionality reduction

In [31]:
# Optional dimensionality reduction with a truncated SVD; skipped when
# config['dimensions'] is None. The random state is fixed for repeatability.
# NOTE(review): `vectors` is overwritten, so re-running this cell alone would
# apply the SVD on already reduced vectors.
if config['dimensions'] is not None:
    svd = TruncatedSVD(n_components=config['dimensions'], n_iter=config['maxIter'], random_state=42)
    vectors = svd.fit_transform(vectors)

# Making a dict url --> vector

In [32]:
# When no SVD is applied (config['dimensions'] is None), `vectors` is still the
# scipy sparse matrix returned by TfidfVectorizer, and len() raises a TypeError
# on sparse matrices. shape[0] works for both sparse matrices and numpy arrays.
assert vectors.shape[0] == len(newsList)
# Row i of `vectors` belongs to newsList[i]
# NOTE(review): without SVD each value is a 1 x n sparse row — confirm that the
# ranking function accepts it.
urlsVectors = {url: vectors[i] for i, url in enumerate(newsList)}
bp(urlsVectors, logger)

{
  http://a.msn.com/01/en-ie/BBHzIAS?ocid=st: [ 0.25081041  0.03500825 -0.04648407  0.03995524  0.03818875  0.06587702
   -0.02661354  0.03366245  0.00424143  0.02856054  0.01719238 -0.01333474
   -0.01785074 -0.0228395  -0.03801074 -0.00253639 -0.03199948 -0.00770459
    0.01834506 -0.02179933],
  http://a.msn.com/06/en-us/BBH7EiO?ocid=st: [ 0.27661412  0.09373918 -0.03876081 -0.0035497   0.02332182  0.02478822
   -0.05780108  0.03557604 -0.0189743  -0.02488483 -0.00336932  0.01306875
   -0.02167153 -0.03175106 -0.06713755  0.0138377  -0.00976167  0.01452504
   -0.02321557 -0.01297543],
  http://abcn.ws/2CbqxlW: [ 0.29984423  0.11062978  0.13114013  0.00593856 -0.0616313  -0.01404553
    0.01645525  0.01150832 -0.04753689 -0.0607552   0.07225907  0.02000181
    0.05133595  0.0578758   0.10473919  0.00540235 -0.07055241 -0.01979625
    0.01783355 -0.01287394],
  http://abcn.ws/2D92PrQ: [ 0.29097796 -0.10831123  0.11796665 -0.10420321  0.09573399  0.03709742
   -0.07014571  0.01241763 

# Ranking

In [33]:
# Read the doc!
# Compute the rankings from the train users, the candidates and the
# url --> vector mapping, using config['historyRef'] and the configured distance.
# NOTE(review): the ranking semantics are defined in usersRankingsByHistoryDistance,
# which is not shown here. The recorded run logged about 6 hours at the following
# "Rankings done" checkpoint.
rankings = usersRankingsByHistoryDistance\
(
    trainUsers,
    candidates,
    config['historyRef'],
    urlsVectors,
    distanceMetric=config['distance'],
    logger=logger,
)



In [34]:
# Preview of the rankings (truncated pretty print).
# NOTE(review): argument order differs from `bp(evalData.keys(), 5, logger)` used earlier — confirm bp accepts both.
bp(rankings, logger, 4)

{
  '2617447752': 
  [
    [
      https://medium.com/music-city-soccer/nashville-sc-signs-four-new-players-to-usl-squad-dcaeab3bdecf,
      http://waitingfornextyear.com/2017/03/terry-francona-yandy-diaz-clevelan-indians/,
      https://www.theguardian.com/football/2018/jan/07/video-assistant-referees-errors-mike-riley,
      https://fb.me/7LMIWdJxR,
      http://www.kcchronicle.com/lists/2018/01/12/61518ccaaf8049ae909db455e3d72049/index.xml?page=2,
      https://sportsday.dallasnews.com/soccer/soccer/2018/01/01/fc-dallas-buy-new-years-gift,
      http://bit.ly/2DuVgdQ,
      https://www.theguardian.com/football/2018/jan/11/manchester-united-alexis-sanchez-arsenal,
      https://wp.me/p2g4vo-1QEu,
      http://wapo.st/2lHeuXi?tid=ss_tw&utm_term=.3126ce7425ba,
      ...,
      http://www.arkansasonline.com/news/2018/jan/08/little-rock-police-18-year-old-arrested-was-driver/,
      https://usat.ly/2milzx0,
      http://bit.ly/2AjIm0l,
      http://politi.co/2qlebpK,
      https://www.bl

In [35]:
# Timing checkpoint: logs the duration of the ranking stage with this message.
tt.tic("Rankings done")

--> tic: 6h 1m 12.919s | message: Rankings done


21672.92

# Adding rankings to the db

In [41]:
# Model identifier used when storing the rankings and in the final notification.
modelName = 'tfidf'

In [37]:
# Read the doc!
# Store the rankings under this model name together with the config that produced them.
addRanking(modelName, rankings, config, logger=logger)

In [38]:
# Timing checkpoint: logs the duration of the storage stage with this message.
tt.tic("Rankings stored")

--> tic: 1m 32.42s | message: Rankings stored


92.42

In [39]:
# Total duration of the run; converted to a human readable duration in the final notification.
totalDuration = tt.toc()

--> toc total duration: 6h 12m 43.97s


In [42]:
# Completion notification: "<model>-<first 5 chars of the config hash> done in <duration> on <host>"
notif("".join([
    modelName, '-', objectToHash(config)[:5],
    " done in ", secondsToHumanReadableDuration(totalDuration),
    " on ", getHostname(),
]))