In [35]:
import os ; os.environ["CUDA_VISIBLE_DEVICES"] = ""
import sys

In [36]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from newstools.goodarticle.utils import *
from nlptools.preprocessing import *
from nlptools.news import parser as newsParser
from machinelearning.iterator import *
from twinews.utils import *
from twinews.models.ranking import *
from twinews.evaluation.utils import *

In [37]:
from nlptools.basics import *
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim
from math import log2
from math import sqrt
from numpy import asarray

In [38]:
from twinews.yfnotebooks.dssm.graph import Graph
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
from twinews.yfnotebooks.load_data import hashIndex,load_hashed_data,char_index

In [39]:
# A script run by the interpreter has __file__ at module level; when that
# name is absent we treat the session as an interactive notebook.
isNotebook = not ('__file__' in locals())
# Notebook sessions run in the reduced "test" mode used by the config below.
TEST = isNotebook

In [40]:
# Pick the log destination: a temp "logs" directory for notebook sessions,
# otherwise a host-specific file name.
if isNotebook:
    logger = Logger(tmpDir('logs') + "/dssm_title.log")
else:
    logger = Logger("dssm_title-" + getHostname() + ".log")
# Timer that reports through the same logger.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

In [41]:
# Name under which the rankings are registered (passed to addRanking in the last cell).
modelName = "DSSM_title"

In [42]:
# NOTE(review): presumably works around the "duplicate OpenMP runtime" abort — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# The key used to end with a stray typographic quote (’), so the variable
# TensorFlow actually reads was never set. Use the exact name.
# NOTE(review): TensorFlow is imported in an earlier cell; confirm the setting
# still takes effect here, or move this cell above the import.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

# config

In [60]:
# Run configuration. It is read by the data-loading cell below and handed to
# addRanking together with the rankings at the end of the notebook.
# NOTE(review): 'maxDocuments', 'useExtraNews', 'minDF', 'maxDF', 'similarity' and
# the model hyper-parameters are not read by any cell visible in this notebook —
# confirm they describe what was actually run before relying on them.
config = \
{
    'splitVersion': 2,
    'testVersion':2, # with some parameters changed
    
    # for input data
    'maxUsers': 30 if TEST else None, # Sub-sampling
    'maxDocuments': None,
    # NOTE(review): the data-loading cell passes maxExtraNews=0 directly instead of this value
    'useExtraNews': 0 if TEST else None, # None = unlimited, 0 = no extra news
    'minDF': 1 / 2000, # Remove words that have a document frequency ratio lower than 1 / 2000
    'maxDF': 20, # Remove top 20 voc elements
    'lowercase': True,
    'doLemmatization': True,

    
    # for model
    'seq_length' : 100,
#     'char_embedding_size': 100,
    'learning_rate': 0.0005,
    'keep_prob': 0.7,
    'vocab_size': 3029,
    'class_size': 2,
    'epochs': 100,
    'batch_size': 1000,
    
    # for output
    # NOTE(review): the ranking cell sorts on the model's logits; verify that 'cosine' is accurate
    'similarity': 'cosine',

#     'historyRef': 0.3, # 1, 1.0, 0.5, 0.3, 3, 10
}

# get data

In [63]:
# Load the evaluation split selected in the config (extra news disabled).
evalData = getEvalData(
    config['splitVersion'],
    maxExtraNews=0,
    maxUsers=config['maxUsers'],
)
# Expose each part of the split under its own name.
trainUsers = evalData['trainUsers']
testUsers = evalData['testUsers']
trainNews = evalData['trainNews']
testNews = evalData['testNews']
candidates = evalData['candidates']
extraNews = evalData['extraNews']
# Log the available keys and the metadata of the split.
bp(evalData.keys(), 5, logger)
log(b(evalData['meta'], 5), logger)

--> tic: 29.58s | message: Eval data loaded


{ candidates, extraNews, meta, testNews, testUsers, trainNews, trainUsers }
{ 'created': 2020.03.24-14.28.06, 'endDate': 2018-01-15, 'id': 2, 'ranksLength': 1000, 'splitDate': 2017-12-25, 'startDate': 2017-10-01, 'testMaxNewsPerUser': 97, 'testMeanNewsPerUser': 7.22, 'testMinNewsPerUser': 2, 'testNewsCount': 71781, 'totalNewsAvailable': 570210, 'trainMaxNewsPerUser': 379, 'trainMeanNewsPerUser': 26.48, 'trainMinNewsPerUser': 8, 'trainNewsCount': 237150, 'usersCount': 15905 }


--> toc total duration: 29.81s | message: Got Twinews evaluation data


In [64]:
# Materialise both url collections as lists, shuffled with a fixed seed.
trainNewsList, testNewsList = (
    shuffle(list(urls), seed=0) for urls in (trainNews, testNews)
)

In [65]:
# Train urls first, then test urls.
newsList = trainNewsList + testNewsList
log(
    "".join([str(len(newsList)), " urls for newsList:\n", b(newsList)]),
    logger=logger,
)

25254 urls for newsList:
[
  http://www.wcpo.com/news/local-news/kenton-county/independence/five-critically-injured-in-kenton-cou,
  https://www.sevendaysvt.com/vermont/how-drug-treatment-policies-in-vermont-prisons-contribute-to-the,
  ...,
  http://ew.com/movies/2018/01/09/jacob-tremblay-wonder-prosthetic-transformation-photos/,
  https://jezebel.com/elon-musk-confirms-he-was-at-a-sex-party-and-didnt-even-1822009703
]


In [66]:
# Fetch the title of every url in newsList; these titles later feed the (url: title) dict.
# The following cells index `titles` by position, so it is presumably a sequence
# aligned with newsList — TODO confirm against getNewsTitles.
titles = getNewsTitles(newsList)



In [67]:
# Tokenize each title; positions stay aligned with `titles`.
titlesToken = [
    wordTokenize(titles[position])
    for position in pb(list(range(len(titles))), logger=logger, message="tokenizing")
]
bp(titlesToken,2)

tokenizing   0% [                    ]
tokenizing   9% [=                   ] (2.34s left)
tokenizing  19% [===                 ] (2.28s left)
tokenizing  29% [=====               ] (2.077s left)


[ [ Police, :, ..., ,, OH ], [ How, Drug, ..., Independent, Voice ], ..., [ See, Jacob, ..., |, EW.com ], [ Elon, Musk, ..., Know, It ] ]


In [68]:
# Lower-case every token of every title.
# The step now honours the 'lowercase' switch declared in the config, which was
# previously ignored (behaviour is unchanged while the switch is True).
if config['lowercase']:
    for i in pb(list(range(len(titlesToken))), logger=logger, message="Lower casing"):
        tokens = titlesToken[i]
        # Entries that are None carry no tokens and are left as they are.
        if tokens is None:
            continue
        titlesToken[i] = [token.lower() for token in tokens]
bp(titlesToken, logger)

Lower casing   0% [                    ]
Lower casing   9% [=                   ] (0.36s left)
Lower casing  19% [===                 ] (0.32s left)
Lower casing  29% [=====               ] (0.28s left)


None


[ [ police, :, ..., ,, oh ], [ how, drug, ..., independent, voice ], ..., [ see, jacob, ..., |, ew.com ], [ elon, musk, ..., know, it ] ]


None
None
None


In [69]:
# Lemmatize every token of every title.
# The step now honours the 'doLemmatization' switch declared in the config, which
# was previously ignored (behaviour is unchanged while the switch is True).
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(titlesToken), logger=logger, message="Lemmatization")
    for i, tokens in enumerate(titlesToken):
        # Entries that are None carry no tokens and are left as they are.
        if tokens is not None:
            titlesToken[i] = [lemmatizer.lemmatize(token) for token in tokens]
        # Tick for every entry, including the skipped ones: the bar was created
        # with len(titlesToken) steps and previously fell short on None titles.
        pbar.tic()
bp(titlesToken, logger)

Lemmatization   0% [                    ]
Lemmatization   9% [=                   ] (1.17s left)
Lemmatization  19% [===                 ] (1.08s left)
Lemmatization  29% [=====               ] (0.933s left)


None




None
None
None


[ [ police, :, ..., ,, oh ], [ how, drug, ..., independent, voice ], ..., [ see, jacob, ..., |, ew.com ], [ elon, musk, ..., know, it ] ]


In [70]:
# Map each url to its processed title tokens, pairing the two lists by position.
urlTitles = {
    newsList[position]: titlesToken[position]
    for position in range(len(titlesToken))
}
bp(urlTitles, 2, logger)

{
  http://247wallst.com/healthcare-business/2017/12/30/30-big-biotech-events-coming-in-2018/3/: [ 30, big, ..., wall, st. ],
  http://247wallst.com/special-report/2017/12/08/americas-25-dying-industries-2: [ america, 's, ..., wall, st. ],
  http://247wallst.com/special-report/2018/01/11/25-best-fitness-tech-gadgets/: [ 25, best, ..., wall, st. ],
  http://a.msn.com/00/en-us/AAtmQOZ?ocid=st: [ america, 's, ..., live, in ],
  http://a.msn.com/00/en-us/AAuyJLc?ocid=st: [ what, living, ..., motley, fool ],
  ...,
  https://xtinaluvspink.wordpress.com/2016/01/17/vegan-chickpea-barley-soup-for-two/: [ vegan, chickpea, ..., healthy, ! ],
  https://yaledailynews.com/blog/2018/01/06/yale-psychiatrist-met-with-congressmen-about-trumps-mental-health/: [ yale, psychiatrist, ..., mental, health ],
  https://zdubbzattmom.wordpress.com/2018/01/03/ex-libris-the-eyes-of-madness-presents-2017s-bakers-dozen-zakks-favorite-reads-of-the-year/: [ ex, libris, ..., of, madness ],
  https://zdubbzattmom.wordp

# load the vocab

In [71]:
# Load the vocabulary of title words, one entry per line.
# NOTE(review): absolute, user-specific path — consider moving it to the config.
titleVocab = []
# The vocabulary contains non-ASCII symbols, so decode explicitly as UTF-8
# instead of relying on the platform's default encoding.
with open('/home/yuting/PycharmProjects/data/title_vocab.txt', 'r', encoding='utf-8') as file:
    # Iterating the handle reads line by line without building a second list.
    for line in file:
        titleVocab.append(line.strip('\n'))

bp(titleVocab,2)

[ !, #, ..., •, … ]


In [72]:
# Number of training users that were loaded (capped by config['maxUsers'] when set).
len(trainUsers.keys())

30

# build user history

In [73]:
# Build, for every training user, a one-element list holding the query string
# made from a sample of the titles in that user's history.
MAX_HISTORY_TITLES = 15  # number of history titles that feed one user's query
# Seeded generator so that re-running the notebook samples the same titles
# (same seed as the shuffles of the news lists).
historyRng = random.Random(0)
# Set copy of the vocabulary: membership tests are constant time instead of a list scan.
titleVocabSet = set(titleVocab)

trainUserQuery = dict()
pbar = ProgressBar(len(trainUsers.keys()), logger=logger, message="build user history")

for usr in trainUsers.keys():
    # Sort the urls first so the sample does not depend on the iteration order
    # of the underlying container (assumes urls are plain strings — TODO confirm).
    history = sorted(trainUsers[usr])
    if len(history) < MAX_HISTORY_TITLES:
        query = history
    else:
        query = historyRng.sample(history, MAX_HISTORY_TITLES)

    # Concatenate the tokens of the selected titles; urls whose title is None are skipped.
    temp = []
    for url in query:
        tokens = urlTitles[url]
        if tokens is not None:
            temp.extend(tokens)

    # Keep only the tokens known to the title vocabulary.
    tempCompact = [token for token in temp if token in titleVocabSet]

    # Remove duplicates while keeping first-seen order (a plain set() would order
    # the words differently from run to run), then detokenize.
    queryCompact = detokenize(list(dict.fromkeys(tempCompact)))

    # Stored as a one-element list so the ranking cell can replicate it per candidate.
    queryFinal = [queryCompact]
    trainUserQuery[usr] = queryFinal

    pbar.tic()

build user history   3% [                    ]
build user history  10% [==                  ] (1.53s left)
build user history  20% [====                ] (1.359s left)


# rankings

In [74]:
def computeSimilarity(p, h, y,
                      checkpointPath='/home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt'):
    """Run the trained DSSM graph on a batch and return its logits.

    Args:
        p: value fed to the model's `p` placeholder (the query side).
        h: value fed to the model's `h` placeholder (the candidate side).
        y: value fed to the model's `y` placeholder. The logits are the only
            output fetched, so the caller passes a dummy array here.
        checkpointPath: checkpoint to restore. Defaults to the path that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The value of `model.logits` for the batch.

    Note:
        The graph is rebuilt and the checkpoint restored on every call.
    """
    tf.reset_default_graph()
    model = Graph()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # restore() itself assigns every variable of the graph, so running the
        # global initializer beforehand is not needed.
        saver.restore(sess, checkpointPath)
        # Only the logits are used by callers; loss and accuracy are no longer fetched.
        logit = sess.run(model.logits,
                         feed_dict={model.p: p,
                                    model.h: h,
                                    model.y: y,
                                    model.keep_prob: 1})  # keep_prob 1: no dropout at inference
    return logit
#logit = computeSimilarity(p,h,y)

In [75]:
# Rank every candidate list of every user by the model's logits.
pbar = ProgressBar(len(trainUsers.keys()), logger=logger, message="testing")
rankings = dict()
for usr in trainUsers.keys():
    usrRankings = []

    # The loop variable is deliberately NOT called `candidates`: that name holds
    # the candidates mapping unpacked from evalData and must not be overwritten.
    for candidateUrls in evalData['candidates'][usr]:
        candidateUrls = list(candidateUrls)
        # One (query, candidate) pair per url; sized from the list itself rather
        # than a hard-coded 1000.
        batchSize = len(candidateUrls)

        # The user's query string, repeated once per candidate (p side).
        query = trainUserQuery[usr] * batchSize
        # The candidate titles as strings (h side).
        # NOTE(review): urlTitles[url] can be None — confirm detokenize accepts that.
        h_temp = [detokenize(urlTitles[url]) for url in candidateUrls]

        p, h = char_index(query, h_temp)
        # Dummy labels: only needed to fill the placeholder.
        y = np.zeros(batchSize)

        logit = computeSimilarity(p, h, y)
        # Score each candidate with column 1 of the logits and sort best-first.
        scoresCandidates = zip(candidateUrls, list(logit[:, 1]))
        ranking = sortBy(scoresCandidates, index=1, desc=True)
        ranking = [e[0] for e in ranking]
        usrRankings.append(ranking)
    rankings[usr] = usrRankings

    pbar.tic()

INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing   3% [                    ]


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing  10% [==                  ] (23m 28.5s left)


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt


testing  20% [====                ] (20m 52.4s left)


INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt
INFO:tensorflow:Restoring parameters from /home/yuting/PycharmProjects/Twinews/twinews/yfnotebooks/dssm/output/dssm_99.ckpt




# Add rankings

In [77]:
# Presumably validates the produced rankings against the candidate lists — TODO confirm in twinews.
checkRankings(rankings,evalData['candidates'],maxUsers=None)

In [78]:
# Register the rankings under modelName together with the config that produced them.
addRanking(modelName, rankings, config, logger=logger)