In [35]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""
import sys

In [36]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
from twinews.models.ranking import *
from twinews.evaluation.utils import *

In [37]:
from nlptools.basics import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [38]:
from twinews.yfnotebooks.dssm.graph import Graph
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from twinews.yfnotebooks.load_data import hashIndex,load_hashed_data,char_index

In [39]:
isNotebook = '__file__' not in locals()
TEST = isNotebook

In [40]:
logger = Logger(tmpDir('logs') + "/dssm_title.log") if isNotebook else Logger("dssm_title-" + getHostname() + ".log")
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

In [41]:
modelName = "DSSM_title"

In [42]:
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
os.environ["TF_CPP_MIN_LOG_LEVEL’"] = "3"

# config

In [60]:
config = \
{
    'splitVersion': 2,
    'testVersion':2,
    
    # for input data
    'maxUsers': 30 if TEST else None, # Sub-sampling
    'maxDocuments': None,
    'useExtraNews': 0 if TEST else None, # None = unlimited, 0 = no extra news
    'minDF': 1 / 2000, # Remove words that have a document frequency ratio lower than 1 / 500
    'maxDF': 20, # Remove top 300 voc elements
    'lowercase': True,
    'doLemmatization': True,
    #'useTFIDF': True,
    
   # for model    
    'seq_length' : 100,
#     'char_embedding_size': 100,
    'learning_rate': 0.0005,
    'keep_prob': 0.7,
    'vocab_size': 3029,
    'class_size': 2,
    'epochs': 100,
    'batch_size': 1000,
    
    # for ouput
    'similarity': 'cosine',

#     'historyRef': 0.3, # 1, 1.0, 0.5, 0.3, 3, 10
}

# get data

In [63]:
# Getting users and news
evalData = getEvalData(config['splitVersion'], maxExtraNews=0,
                       maxUsers=config['maxUsers'])
(trainUsers, testUsers, trainNews, testNews, candidates, extraNews) = \
(evalData['trainUsers'], evalData['testUsers'], evalData['trainNews'],
 evalData['testNews'], evalData['candidates'], evalData['extraNews'])
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 29.58s | message: Eval data loaded


{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


--> toc total duration: 29.81s | message: Got Twinews evaluation data


In [64]:
trainNewsList = shuffle(list(trainNews), seed=0)
testNewsList = shuffle(list(testNews), seed=0)

In [65]:
# get all the url->title dict
newsList = trainNewsList + testNewsList
log(str(len(newsList)) + " urls for newsList:\n" + b(newsList), logger=logger)

25254 urls for newsList:
[
  http://www.wcpo.com/news/local-news/kenton-county/independence/five-critically-injured-in-kenton-cou,
  https://www.sevendaysvt.com/vermont/how-drug-treatment-policies-in-vermont-prisons-contribute-to-the,
  ...,
  http://ew.com/movies/2018/01/09/jacob-tremblay-wonder-prosthetic-transformation-photos/,
  https://jezebel.com/elon-musk-confirms-he-was-at-a-sex-party-and-didnt-even-1822009703
]


In [66]:
titles = getNewsTitles(newsList)



In [67]:
titlesToken = []
for i in pb(list(range(len(titles))), logger=logger, message="tokenizing"):
    titlesToken.append(wordTokenize(titles[i]))
bp(titlesToken,2)

tokenizing   0% [                    ]
tokenizing   9% [=                   ] (2.34s left)
tokenizing  19% [===                 ] (2.28s left)
tokenizing  29% [=====               ] (2.077s left)


[ [ Police, :, ..., ,, OH ], [ How, Drug, ..., Independent, Voice ], ..., [ See, Jacob, ..., |, EW.com ], [ Elon, Musk, ..., Know, It ] ]


In [68]:
for i in pb(list(range(len(titlesToken))), logger=logger, message="Lower casing"):
    if titlesToken[i] == None:
        print(titlesToken[i])
        continue
    else:
        for u in range(len(titlesToken[i])):
            titlesToken[i][u] = titlesToken[i][u].lower()
bp(titlesToken, logger)

Lower casing   0% [                    ]
Lower casing   9% [=                   ] (0.36s left)
Lower casing  19% [===                 ] (0.32s left)
Lower casing  29% [=====               ] (0.28s left)


None


[ [ police, :, ..., ,, oh ], [ how, drug, ..., independent, voice ], ..., [ see, jacob, ..., |, ew.com ], [ elon, musk, ..., know, it ] ]


None
None
None


In [69]:
lemmatizer = WordNetLemmatizer()
pbar = ProgressBar(len(titlesToken), logger=logger, message="Lemmatization")
for i in range(len(titlesToken)):
    if titlesToken[i] == None:
        print(titlesToken[i])
        continue
    else:
        for u in range(len(titlesToken[i])):
            titlesToken[i][u] = lemmatizer.lemmatize(titlesToken[i][u])
    pbar.tic()
bp(titlesToken, logger)

Lemmatization   0% [                    ]
Lemmatization   9% [=                   ] (1.17s left)
Lemmatization  19% [===                 ] (1.08s left)
Lemmatization  29% [=====               ] (0.933s left)


None




None
None
None


[ [ police, :, ..., ,, oh ], [ how, drug, ..., independent, voice ], ..., [ see, jacob, ..., |, ew.com ], [ elon, musk, ..., know, it ] ]


In [70]:
urlTitles= dict()
for i in range(len(titlesToken)):
    urlTitles[newsList[i]] = titlesToken[i]
bp(urlTitles, 2, logger)

{
  http://247wallst.com/healthcare-business/2017/12/30/30-big-biotech-events-coming-in-2018/3/: [ 30, big, ..., wall, st. ],
  http://247wallst.com/special-report/2017/12/08/americas-25-dying-industries-2: [ america, 's, ..., wall, st. ],
  http://247wallst.com/special-report/2018/01/11/25-best-fitness-tech-gadgets/: [ 25, best, ..., wall, st. ],
  http://a.msn.com/00/en-us/AAtmQOZ?ocid=st: [ america, 's, ..., live, in ],
  http://a.msn.com/00/en-us/AAuyJLc?ocid=st: [ what, living, ..., motley, fool ],
  ...,
  https://xtinaluvspink.wordpress.com/2016/01/17/vegan-chickpea-barley-soup-for-two/: [ vegan, chickpea, ..., healthy, ! ],
  https://yaledailynews.com/blog/2018/01/06/yale-psychiatrist-met-with-congressmen-about-trumps-mental-health/: [ yale, psychiatrist, ..., mental, health ],
  https://zdubbzattmom.wordpress.com/2018/01/03/ex-libris-the-eyes-of-madness-presents-2017s-bakers-dozen-zakks-favorite-reads-of-the-year/: [ ex, libris, ..., of, madness ],
  https://zdubbzattmom.wordp

# load the vocab

In [71]:
titleVocab = []
with open('/home/yuting/PycharmProjects/data/title_vocab.txt','r') as file:
    for line in file.readlines():
        titleVocab.append(line.strip('\n'))

bp(titleVocab,2)

[ !, #, ..., •, … ]


In [72]:
len(trainUsers.keys())

30

# build user history

In [73]:
# for multiple users
trainUserQuery = dict()
pbar = ProgressBar(len(trainUsers.keys()), logger=logger, message="build user history")

for usr in trainUsers.keys():
    queryFinal = []
    
    # choose 15 urls
    if len(list(trainUsers[usr])) < 15:
        query = list(trainUsers[usr])
    else:
        query = random.sample(list(trainUsers[usr]),15)
        
    # put 15 titles together, using extend
    temp = []
    for url in query:
        if urlTitles[url] == None:
            continue
        else:
            temp.extend(urlTitles[url])
            
    # filter what is not in the vocab
    tempCompact = []
    for i in range(len(temp)):
        if temp[i] in titleVocab:
            tempCompact.append(temp[i])
    
    # remove duplicated & detokenize
    queryCompact = detokenize(list(set(tempCompact)))
    
    queryFinal.append(queryCompact)
    trainUserQuery[usr] = queryFinal
    
    pbar.tic()

build user history   3% [                    ]
build user history  10% [==                  ] (1.53s left)
build user history  20% [====                ] (1.359s left)


# rankings

In [74]:
def computeSimilarity(p,h,y):
    
    tf.reset_default_graph() 
    model = Graph()
    saver = tf.train.Saver()

    with tf.Session()as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, '/home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt')
        loss, acc, logit = sess.run([model.loss, model.acc, model.logits],
                             feed_dict={model.p: p,
                                        model.h: h,
                                        model.y: y,
                                        model.keep_prob: 1})

        #print('loss: ', loss, ' acc:', acc)
        return logit
#logit = computeSimilarity(p,h,y)

In [75]:
# rankings of the candidates
pbar = ProgressBar(len(trainUsers.keys()), logger=logger, message="testing")
rankings = dict()
for usr in trainUsers.keys():
    usrRankings = []
    
    # get historical data as query(p)
    query = trainUserQuery[usr] * 1000
    
    for candidates in evalData['candidates'][usr]:
        candidates = list(candidates)
        # get 1000 h as a list(h)
        h_temp = []
        for url in candidates:
            h_temp.append(detokenize(urlTitles[url]))
        # get p,h
        p,h = char_index(query,h_temp)
        y = np.zeros((1000))                    # whatever initialization
        
        scoresCandidates = []
        logit = computeSimilarity(p,h,y)            # get the 1000 scores list
        scoresCandidates = zip(candidates,list(logit[:,1]))
        ranking = sortBy(scoresCandidates, index=1, desc=True)
        ranking = [e[0] for e in ranking]
        usrRankings.append(ranking)
    rankings[usr] = usrRankings
    
    pbar.tic()

INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing   3% [                    ]


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing  10% [==                  ] (23m 28.5s left)


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing  20% [====                ] (20m 52.4s left)


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




In [76]:
bp(rankings)

{
  1198644368: 
  [
    [
      https://www.theguardian.com/politics/blog/live/2018/jan/09/reshuffle-government-tory-cabinet-theresa,
      https://www.theglobeandmail.com/opinion/columnists/ubc-alumni-involved-in-steven-galloway-case-offer,
      ...,
      https://www.popmatters.com/matthew-oneill-gates-video-premiere-2524320883.html,
      http://ontario.ca/b3v8
    ]
  ],
  1447457215: 
  [
    [
      http://ow.ly/FAT030hwcUA,
      https://www.washingtonpost.com/local/a-battlefield-of-memory-asphalt-where-a-black-cemetery-is-said-,
      ...,
      https://sloanreview.mit.edu/article/how-office-seating-arrangements-can-boost-the-bottom-line/?utm_s,
      http://www.journal-news.com/sports/college-basketball-miami-holds-off-kent-state-for-mac-win/UEFYsov
    ]
  ],
  1596810348: 
  [
    [
      http://trib.in/2CeTle0,
      http://www.chicagotribune.com/news/local/breaking/ct-met-ethics-board-fines-assessor-berrios-cook-co,
      ...,
      http://bit.ly/2F3YIxH,
      http://bi

In [71]:
bp(evalData['candidates'])

{
  1198644368: 
  [
    {
      http://a.msn.com/00/en-us/AAuyJLc?ocid=st,
      http://a.msn.com/00/en-us/BBHoy0Q?ocid=st,
      ...,
      https://www.wxyz.com/news/opinion/wxyz-editorial-tv-7-newsmaker-of-the-year-gerard-gerry-anderson-en,
      https://www.zerohedge.com/news/2018-01-08/grand-jury-empaneled-10-million-fraud-probe-involving-jane
    }
  ],
  1447457215: 
  [
    {
      http://247wallst.com/special-report/2018/01/11/25-best-fitness-tech-gadgets/,
      http://abcn.ws/2BVXNxp,
      ...,
      https://www.zerohedge.com/news/2018-01-09/jamie-dimon-i-regret-calling-bitcoin-fraud,
      https://yaledailynews.com/blog/2018/01/06/yale-psychiatrist-met-with-congressmen-about-trumps-mental
    }
  ],
  1596810348: 
  [
    {
      http://a.msn.com/02/en-us/AAuzrQE?ocid=st,
      http://abcn.ws/2DfATlS,
      ...,
      https://www.wxyz.com/news/national/congress-is-back-and-here-s-what-s-on-the-agenda,
      https://www.zerohedge.com/news/2018-01-08/youll-never-believe-how-

In [23]:
bp(rankings)

{
  1198644368: 
  [
    [
      http://ontario.ca/b3v8,
      https://www.popmatters.com/matthew-oneill-gates-video-premiere-2524320883.html,
      ...,
      https://www.theguardian.com/higher-education-network/2017/dec/29/academics-anonymous-the-best-secret,
      https://www.theglobeandmail.com/opinion/columnists/ubc-alumni-involved-in-steven-galloway-case-offer
    ]
  ],
  137210010: 
  [
    [ http://bit.ly/2BBHhm1, https://buff.ly/2DQQMym, ..., http://nyti.ms/2A8wX3r, https://buff.ly/2DnBv9h ]
  ],
  1447457215: 
  [
    [
      https://lnkd.in/gnwtgAY,
      http://247wallst.com/special-report/2018/01/11/25-best-fitness-tech-gadgets/,
      ...,
      http://m.chron.com/business/retail/article/Port-Houston-container-imports-surged-last-year-12481938.,
      https://nyti.ms/2Egstu5
    ]
  ],
  14639840: 
  [
    [
      http://abc7news.com/2913203/,
      http://www.wthitv.com/weather/,
      ...,
      https://www.theglobeandmail.com/life/health-and-fitness/health/leslie-beck

In [77]:
checkRankings(rankings,evalData['candidates'],maxUsers=None)

In [78]:
addRanking(modelName, rankings, config, logger=logger)

# news classification

In [31]:
# for one user
query = random.sample(list(evalData['trainUsers']['1198644368']),15)
bp(query)

[
  http://ow.ly/KXjL30g7kRC,
  https://www.azcentral.com/story/news/politics/legislature/2017/11/07/arizona-state-rep-ugenti-rita-s,
  ...,
  https://www.cincinnati.com/story/news/politics/2017/10/18/ohio-republican-state-senator-who-resigned,
  http://www.philly.com/philly/business/cannabis/Las-Vegas-buying-marijuana-is-legal-but-theres-nowher
]


In [24]:
# for userId in trainUsers.keys():
temp = []
for url in query:
    if urlTitles[url] == None:
        continue
    else:
        temp.extend(urlTitles[url])

In [26]:
bp(temp)

[ sexual, harassment, ..., to, resign ]


In [79]:
tempTest = wordTokenize(temp)
bp(tempTest)
queryCompactTemp = list(set(tempTest))
bp(queryCompactTemp)

[ texas, journalist, ..., york, is ]


[ opinion, |, ..., texas, tribune ]


In [87]:
queryCompact = []
for i in range(len(queryCompactTemp)):
    if queryCompactTemp[i] in titleVocab:
        queryCompact.append(queryCompactTemp[i])

In [90]:
len(queryCompact)

77

In [94]:
# detokenize
queryCompact2 = detokenize(queryCompact)
queryCompact2

"texas journalist model wo must reality grade mueller post. year sexually making washington from han't capitol into themselves claim how lawmaker canceled senator america shooting accused it party be by now base admits report roy not state day behavior victim podcast investigation valley moore will trump dy powerful fellow fox harassment over reduce doctor republican trip were or holiday made opinion false tribune problem retirement jeff speech who an car been this sexual colorado york"

In [100]:
queryFinal = []
queryFinal.append(queryCompact2)

In [102]:
queryFinal = queryFinal * 1000

In [65]:
h_temp = []
for url in evalData['candidates']['1198644368'][0]:
    h_temp.append(urlTitles[url])
#h_temp

In [119]:
y = np.zeros((1000))

In [112]:
p,h = char_index(queryFinal,h_temp)

In [120]:
def computeSimilarity(p,h,y):
    
    tf.reset_default_graph() 
    model = Graph()
    saver = tf.train.Saver()

    with tf.Session()as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, '/home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt')
        loss, acc, logit = sess.run([model.loss, model.acc, model.logits],
                             feed_dict={model.p: p,
                                        model.h: h,
                                        model.y: y,
                                        model.keep_prob: 1})

        #print('loss: ', loss, ' acc:', acc)
        return logit
logit = computeSimilarity(p,h,y)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


In [127]:
logit

array([[ 0.01799134,  0.98200864],
       [-0.0375907 ,  1.0375907 ],
       [-0.0545145 ,  1.0545145 ],
       ...,
       [ 0.87169796,  0.12830204],
       [ 0.8784758 ,  0.12152421],
       [ 0.8359748 ,  0.16402519]], dtype=float32)

In [73]:
# rankings of the candidates
rankings = dict()
for usr in trainUsers.keys():
    # get historical data as query(p)
    query = np.zeros((1,7427))
    for url in evalData['trainUsers'][usr]:
        query += urlHasedDocs[url].toarray()
        p = np.tile(query,(1000,1)) 
    usrRankings = []
    for candidates in evalData['candidates'][usr]:
        candidates = list(candidates)
        # get 1000 h_doc as a list(h)
        h = np.zeros((1,7427))
        for url in candidates:
            h_temp = urlHasedDocs[url].toarray()
            h = np.vstack((h,h_temp))
        h = h[1:]                                   # cut off the first element
        y = np.zeros((1000))
        scoresCandidates = []
        logit = computeSimilarity(p,h,y)            # get the 1000 scores list
        scoresCandidates = zip(candidates,list(logit[:,0]))
        ranking = sortBy(scoresCandidates, index=1, desc=True)
        ranking = [e[0] for e in ranking]
        usrRankings.append(ranking)
    rankings[usr] = usrRankings

INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_9.ckpt
INFO:tensorflow:

In [77]:
checkRankings(rankings,evalData['candidates'],maxUsers=None)

In [29]:
addRanking(modelName, rankings, config, logger=logger)

In [56]:
modelName

'DSSM_yf'

# Extra notes

In [36]:
a = np.array([[1,2,3,4,5]])
a.shape

(1, 5)

In [51]:
b = np.tile(a,(5,1))

In [59]:
b.shape

(6, 5)

In [None]:
# rankings of the candidates
rankings = dict()
for usr in trainUsers.keys():
    query = np.zeros((1,7427))
    for url in evalData['trainUsers'][usr]:
        query += urlHasedDocs[url].toarray()
    usrRankings = []
    for candidates in evalData['candidates'][usr]:
        candidates = list(candidates)
        scoresCandidates = []
        for i in range(len(candidates)):
            candidateHashed = urlHasedDocs[candidates[i]].toarray()
            logit = computeSimilarity(query, candidateHashed, 0)
            score = logit[0][0]
            # score = 1
            scoresCandidates.append((candidates[i], score))
        ranking = sortBy(scoresCandidates, index=1, desc=True)
        ranking = [e[0] for e in ranking]
        usrRankings.append(ranking)
    rankings[usr] = usrRankings