In [4]:
import logging
# Route INFO-level messages (e.g. gensim load/train progress) to the notebook output,
# prefixed with a timestamp and the level name.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

In [5]:
import warnings
# Silence DeprecationWarning for the whole session so library deprecation notices
# do not clutter the cell outputs.
warnings.filterwarnings(
    action="ignore",
    category=DeprecationWarning,
)

In [6]:
import numpy as np
import pandas as pd
from ast import literal_eval
from gensim import corpora, models, similarities
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import stop_words
from gensim.parsing.porter import PorterStemmer
import time
import re
import json, os, pickle
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.gensim

2018-12-01 19:22:26,369 : INFO : 'pattern' package found; tag filters are available for English


In [9]:
from TextCleaner import TextCleaner

In [22]:
class SubjectTrainer():
    """
    Train (or load) an LDA topic model for subjects and index topics and videos with it.

    Typical use: call `train_topic_model` or `load_topic_model` to obtain the model,
    then `index_topic_and_video` to compute and persist the topic vectors, the video
    vectors and the video-to-topic cosine similarity matrix under `folder`.
    """
    def __init__(self, folder="./subject_models", cl_folder="../text_cleaner_models_with_subjects/"):
        """
        Create the model folder if needed and load the text cleaner.

        folder: directory where the model and all index files are written / read
        cl_folder: directory handed to TextCleaner
        """
        # init model folder; only "already exists" is tolerated, any other failure
        # (e.g. permissions) surfaces here instead of at the first save
        self.folder = folder
        os.makedirs(folder, exist_ok=True)

        # load string cleaner
        self.cl = TextCleaner(folder=cl_folder)

        # init index model variables
        self.model = None
        self.dictionary = None
        # number of LDA topics; set by train_topic_model / load_topic_model
        self.vec_len = None

        # topic index: topic_name -> topic_index
        self.topic_index = {}

        # indexed topic: topic_index -> topic_name
        self.indexed_topic = []

        # topic vec: row topic_index -> topic model vector
        self.topic_vec = None

        # video index: video_name -> video_index
        self.video_index = {}

        # indexed video: video_index -> video_name
        self.indexed_video = []

        # video topic vec: placeholder, replaced by a (videos x topics) similarity array
        self.video_topic_vec = {}

        # video vec: placeholder, replaced by a (videos x vec_len) array
        self.video_vec = {}

    def _path(self, name):
        """Return the path of file `name` inside the model folder."""
        return os.path.join(self.folder, name)

    def _require_model(self):
        """Fail early with a clear message when no model has been trained or loaded."""
        if (self.model is None or self.dictionary is None
                or getattr(self, "vec_len", None) is None):
            raise RuntimeError("no topic model available: call train_topic_model() "
                               "or load_topic_model() first")

    def train_topic_model(self, cleaned_docs, vec_len=150, random_state=None):
        """
        Train the LDA topic model and save the model and its dictionary to `folder`.

        cleaned_docs: iterable of already cleaned, whitespace-tokenizable strings
        vec_len: number of LDA topics (length of every topic model vector)
        random_state: optional seed forwarded to LdaModel for reproducible training;
            None keeps the previous (unseeded) behaviour
        """
        print("start training topic model")

        # init topic model vector length
        self.vec_len = vec_len

        # tokenize once so that one-shot iterables (generators) are not exhausted
        # before the bag-of-words pass
        tokenized_docs = [doc.split() for doc in cleaned_docs]

        # get dictionary
        self.dictionary = corpora.Dictionary(tokenized_docs)

        # drop tokens found in fewer than 5 documents or in more than half of them
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)
        self.dictionary.compactify()
        print("dicionary collected")

        # prepare topic model
        bows = [self.dictionary.doc2bow(tokens) for tokens in tokenized_docs]
        self.model = models.LdaModel(bows, self.vec_len, id2word=self.dictionary, chunksize=2000, passes=50,
                                     iterations=100, alpha="auto", eta="auto", eval_every=80000,
                                     random_state=random_state)
        print("topic model generated")

        # save to file
        self.dictionary.save(self._path("dictionary"))
        self.model.save(self._path("model"))

    def index_topic_and_video(self, topics, videos, existing_video_index=None):
        """
        Index all documents to get self: topic_index, indexed_topic, topic_vec,
        video_index, indexed_video, video_vec and video_topic_vec, and save them.

        topics: dictionary of topic name and topic texts
        videos: dictionary of video name and video subtitles
        existing_video_index: optional dict with keys "video_index" and "indexed_video";
            when given, that ordering is reused instead of building a new one

        Raises KeyError when `existing_video_index` lacks a video found in `videos`.
        """
        self._require_model()

        if existing_video_index is None:
            # rebuild from scratch so that a repeated call does not append duplicates
            self.indexed_video = list(videos)
            self.video_index = {name: i for i, name in enumerate(self.indexed_video)}
        else:  # load existing video index
            self.video_index = existing_video_index["video_index"]
            self.indexed_video = existing_video_index["indexed_video"]
            unknown = [v for v in videos if v not in self.video_index]
            if unknown:
                # same exception type as the lookup below would raise, but raised
                # before the (slow) indexing starts
                raise KeyError("videos missing from existing_video_index: %r" % (unknown[:5],))
        v_counter = len(self.indexed_video)

        # create topic_index and indexed_topic (rebuilt on every call)
        self.indexed_topic = list(topics)
        self.topic_index = {name: i for i, name in enumerate(self.indexed_topic)}
        t_counter = len(self.indexed_topic)

        # create topic_vec
        self.topic_vec = np.zeros((t_counter, self.vec_len))
        for t in topics:
            t_vec = self.index_text(topics[t], norm=None, thresh=30)
            self.topic_vec[self.topic_index[t], :] = t_vec

        # create video_vec; rows of indexed videos absent from `videos` stay zero
        self.video_vec = np.zeros((v_counter, self.vec_len))
        for v in videos:
            v_vec = self.index_text(videos[v], norm=None, thresh=1)
            self.video_vec[self.video_index[v], :] = v_vec

        # create video_topic_vec, preserve the original similarity score
        self.video_topic_vec = self.norm_dot(self.video_vec, self.topic_vec)

        # save self. topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        np.save(self._path("topic_vec"), self.topic_vec)
        np.save(self._path("video_vec"), self.video_vec)
        np.save(self._path("video_topic_vec"), self.video_topic_vec)
        with open(self._path("topic_index.pkl"), 'wb') as f:
            pickle.dump(self.topic_index, f)
        with open(self._path("indexed_topic.pkl"), 'wb') as f:
            pickle.dump(self.indexed_topic, f)
        with open(self._path("video_index.pkl"), 'wb') as f:
            pickle.dump(self.video_index, f)
        with open(self._path("indexed_video.pkl"), 'wb') as f:
            pickle.dump(self.indexed_video, f)

    def load_topic_model(self):
        """
        load topic models if no training is needed
        """
        self.dictionary = corpora.Dictionary.load(self._path("dictionary"))
        self.model = models.LdaModel.load(self._path("model"))
        self.vec_len = self.model.num_topics

    def load_topic_video_model(self):
        """
        load topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        """
        self.topic_vec = np.load(self._path("topic_vec.npy"))
        self.video_vec = np.load(self._path("video_vec.npy"))
        self.video_topic_vec = np.load(self._path("video_topic_vec.npy"))
        # NOTE: pickle.load executes arbitrary code; only load folders this class wrote
        with open(self._path("topic_index.pkl"), 'rb') as f:
            self.topic_index = pickle.load(f)
        with open(self._path("indexed_topic.pkl"), 'rb') as f:
            self.indexed_topic = pickle.load(f)
        with open(self._path("video_index.pkl"), 'rb') as f:
            self.video_index = pickle.load(f)
        with open(self._path("indexed_video.pkl"), 'rb') as f:
            self.indexed_video = pickle.load(f)

    def index_text(self, text, norm=None, thresh=0.05):
        """
        Index a given string with the topic model.

        text: raw string, cleaned with the text cleaner before inference
        norm: None to return the thresholded vector as is, otherwise the `ord`
            passed to np.linalg.norm to normalize it
        thresh: value subtracted from every component; components that end up
            negative are set to zero
        Returns a 1-d array with one component per topic.
        """
        self._require_model()

        # clean the text
        cleaned_text = self.cl.clean(text)

        # get topic model index: first element of inference() output, first (only) document
        # NOTE(review): per the gensim docs inference() returns gamma, which is not a
        # normalized probability -- presumably why callers pass thresh values above 1
        gamma = self.model.inference([self.dictionary.doc2bow(cleaned_text.split())])[0][0]

        # shift by thresh and zero out everything that falls below it
        model_vec = np.maximum(gamma - thresh, 0)

        # return zero array if no significant topic possibility
        # (also avoids dividing by a zero norm below)
        if norm is None or model_vec.sum() == 0:
            return model_vec

        return model_vec / np.linalg.norm(model_vec, ord=norm)

    def norm_dot(self, a, b):
        """
        Cosine similarity between every row of `a` and every row of `b`.

        a: 2-d array (n x d), b: 2-d array (m x d)
        Returns an (n x m) array; an all-zero row gets similarity 0 to everything.
        """
        # get per row norm for each array
        norm_a = np.linalg.norm(a, ord=2, axis=1).reshape(-1, 1)
        norm_b = np.linalg.norm(b, ord=2, axis=1).reshape(-1, 1)

        # a zero norm means the row is all zeros: divide by 1 instead so the row stays
        # zero without triggering 0/0 warnings
        norm_a = np.where(norm_a == 0, 1.0, norm_a)
        norm_b = np.where(norm_b == 0, 1.0, norm_b)

        # get similarity by doting normalized array rows
        sim = np.dot(a / norm_a, (b / norm_b).T)

        # kept from the original: non-finite inputs still map to finite numbers
        sim = np.nan_to_num(sim, copy=False)

        return sim

In [23]:
class InterestTrainer():
    """
    Train (or load) an LDA topic model for interests and index topics and videos with it.

    Typical use: call `train_topic_model` or `load_topic_model` to obtain the model,
    then `index_topic_and_video` to compute and persist the topic vectors, the video
    vectors and the video-to-topic cosine similarity matrix under `folder`.
    """
    def __init__(self, folder="./asym_150_both", cl_folder="../text_cleaner_models_without_subjects/"):
        """
        Create the model folder if needed and load the text cleaner.

        folder: directory where the model and all index files are written / read
        cl_folder: directory handed to TextCleaner
        """
        # init model folder; only "already exists" is tolerated, any other failure
        # (e.g. permissions) surfaces here instead of at the first save
        self.folder = folder
        os.makedirs(folder, exist_ok=True)

        # load string cleaner
        self.cl = TextCleaner(folder=cl_folder)

        # init index model variables
        self.model = None
        self.dictionary = None
        # number of LDA topics; set by train_topic_model / load_topic_model
        self.vec_len = None

        # topic index: topic_name -> topic_index
        self.topic_index = {}

        # indexed topic: topic_index -> topic_name
        self.indexed_topic = []

        # topic vec: row topic_index -> topic model vector
        self.topic_vec = None

        # video index: video_name -> video_index
        self.video_index = {}

        # indexed video: video_index -> video_name
        self.indexed_video = []

        # video topic vec: placeholder, replaced by a (videos x topics) similarity array
        self.video_topic_vec = {}

        # video vec: placeholder, replaced by a (videos x vec_len) array
        self.video_vec = {}

    def _path(self, name):
        """Return the path of file `name` inside the model folder."""
        return os.path.join(self.folder, name)

    def _require_model(self):
        """Fail early with a clear message when no model has been trained or loaded."""
        if (self.model is None or self.dictionary is None
                or getattr(self, "vec_len", None) is None):
            raise RuntimeError("no topic model available: call train_topic_model() "
                               "or load_topic_model() first")

    def train_topic_model(self, cleaned_docs, vec_len=300, random_state=None):
        """
        Train the LDA topic model and save the model and its dictionary to `folder`.

        cleaned_docs: iterable of already cleaned, whitespace-tokenizable strings
        vec_len: number of LDA topics (length of every topic model vector)
        random_state: optional seed forwarded to LdaModel for reproducible training;
            None keeps the previous (unseeded) behaviour
        """
        print("start training topic model")

        # init topic model vector length
        self.vec_len = vec_len

        # tokenize once so that one-shot iterables (generators) are not exhausted
        # before the bag-of-words pass
        tokenized_docs = [doc.split() for doc in cleaned_docs]

        # get dictionary
        self.dictionary = corpora.Dictionary(tokenized_docs)

        # drop tokens found in fewer than 10 documents or in more than 35% of them
        self.dictionary.filter_extremes(no_below=10, no_above=0.35)
        self.dictionary.compactify()
        print("dicionary collected")

        # prepare topic model
        bows = [self.dictionary.doc2bow(tokens) for tokens in tokenized_docs]
        self.model = models.LdaModel(bows, self.vec_len, id2word=self.dictionary, chunksize=4000, passes=100,
                                     iterations=100, alpha="auto", eta="auto", eval_every=80000,
                                     random_state=random_state)
        print("topic model generated")

        # save to file
        self.dictionary.save(self._path("dictionary"))
        self.model.save(self._path("model"))

    def index_topic_and_video(self, topics, videos, existing_video_index=None):
        """
        Index all documents to get self: topic_index, indexed_topic, topic_vec,
        video_index, indexed_video, video_vec and video_topic_vec, and save them.

        topics: dictionary of topic name and topic texts
        videos: dictionary of video name and video subtitles
        existing_video_index: optional dict with keys "video_index" and "indexed_video";
            when given, that ordering is reused instead of building a new one

        Raises KeyError when `existing_video_index` lacks a video found in `videos`.
        """
        self._require_model()

        if existing_video_index is None:
            # rebuild from scratch so that a repeated call does not append duplicates
            self.indexed_video = list(videos)
            self.video_index = {name: i for i, name in enumerate(self.indexed_video)}
        else:  # load existing video index
            self.video_index = existing_video_index["video_index"]
            self.indexed_video = existing_video_index["indexed_video"]
            unknown = [v for v in videos if v not in self.video_index]
            if unknown:
                # same exception type as the lookup below would raise, but raised
                # before the (slow) indexing starts
                raise KeyError("videos missing from existing_video_index: %r" % (unknown[:5],))
        # the original read the undefined bare name `indexed_video` here (NameError)
        v_counter = len(self.indexed_video)

        # create topic_index and indexed_topic (rebuilt on every call)
        self.indexed_topic = list(topics)
        self.topic_index = {name: i for i, name in enumerate(self.indexed_topic)}
        t_counter = len(self.indexed_topic)

        # create topic_vec
        self.topic_vec = np.zeros((t_counter, self.vec_len))
        for t in topics:
            t_vec = self.index_text(topics[t], norm=None, thresh=5)
            self.topic_vec[self.topic_index[t], :] = t_vec

        # create video_vec; rows of indexed videos absent from `videos` stay zero
        self.video_vec = np.zeros((v_counter, self.vec_len))
        for v in videos:
            v_vec = self.index_text(videos[v], norm=None, thresh=1)
            self.video_vec[self.video_index[v], :] = v_vec

        # create video_topic_vec, preserve the original similarity score
        self.video_topic_vec = self.norm_dot(self.video_vec, self.topic_vec)

        # save self. topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        np.save(self._path("topic_vec"), self.topic_vec)
        np.save(self._path("video_vec"), self.video_vec)
        np.save(self._path("video_topic_vec"), self.video_topic_vec)
        with open(self._path("topic_index.pkl"), 'wb') as f:
            pickle.dump(self.topic_index, f)
        with open(self._path("indexed_topic.pkl"), 'wb') as f:
            pickle.dump(self.indexed_topic, f)
        with open(self._path("video_index.pkl"), 'wb') as f:
            pickle.dump(self.video_index, f)
        with open(self._path("indexed_video.pkl"), 'wb') as f:
            pickle.dump(self.indexed_video, f)

    def load_topic_model(self):
        """
        load topic models if no training is needed
        """
        self.dictionary = corpora.Dictionary.load(self._path("dictionary"))
        self.model = models.LdaModel.load(self._path("model"))
        self.vec_len = self.model.num_topics

    def load_topic_video_model(self):
        """
        load topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        """
        self.topic_vec = np.load(self._path("topic_vec.npy"))
        self.video_vec = np.load(self._path("video_vec.npy"))
        self.video_topic_vec = np.load(self._path("video_topic_vec.npy"))
        # NOTE: pickle.load executes arbitrary code; only load folders this class wrote
        with open(self._path("topic_index.pkl"), 'rb') as f:
            self.topic_index = pickle.load(f)
        with open(self._path("indexed_topic.pkl"), 'rb') as f:
            self.indexed_topic = pickle.load(f)
        with open(self._path("video_index.pkl"), 'rb') as f:
            self.video_index = pickle.load(f)
        with open(self._path("indexed_video.pkl"), 'rb') as f:
            self.indexed_video = pickle.load(f)

    def index_text(self, text, norm=None, thresh=0.05):
        """
        Index a given string with the topic model.

        text: raw string, cleaned with the text cleaner before inference
        norm: None to return the thresholded vector as is, otherwise the `ord`
            passed to np.linalg.norm to normalize it
        thresh: value subtracted from every component; components that end up
            negative are set to zero
        Returns a 1-d array with one component per topic.
        """
        self._require_model()

        # clean the text
        cleaned_text = self.cl.clean(text)

        # get topic model index: first element of inference() output, first (only) document
        # NOTE(review): per the gensim docs inference() returns gamma, which is not a
        # normalized probability -- presumably why callers pass thresh values above 1
        gamma = self.model.inference([self.dictionary.doc2bow(cleaned_text.split())])[0][0]

        # shift by thresh and zero out everything that falls below it
        model_vec = np.maximum(gamma - thresh, 0)

        # return zero array if no significant topic possibility
        # (also avoids dividing by a zero norm below)
        if norm is None or model_vec.sum() == 0:
            return model_vec

        return model_vec / np.linalg.norm(model_vec, ord=norm)

    def norm_dot(self, a, b):
        """
        Cosine similarity between every row of `a` and every row of `b`.

        a: 2-d array (n x d), b: 2-d array (m x d)
        Returns an (n x m) array; an all-zero row gets similarity 0 to everything.
        """
        # get per row norm for each array
        norm_a = np.linalg.norm(a, ord=2, axis=1).reshape(-1, 1)
        norm_b = np.linalg.norm(b, ord=2, axis=1).reshape(-1, 1)

        # a zero norm means the row is all zeros: divide by 1 instead so the row stays
        # zero without triggering 0/0 warnings
        norm_a = np.where(norm_a == 0, 1.0, norm_a)
        norm_b = np.where(norm_b == 0, 1.0, norm_b)

        # get similarity by doting normalized array rows
        sim = np.dot(a / norm_a, (b / norm_b).T)

        # kept from the original: non-finite inputs still map to finite numbers
        sim = np.nan_to_num(sim, copy=False)

        return sim

# Index the subject model so that it shares the video index of the interest model

In [24]:
from anytree import Node, RenderTree
from anytree.importer import DictImporter
importer = DictImporter()

# load subjects as topics: the subject tree and the text attached to each node name
with open("../text_resource/tree.json", "r") as f:
    tree = importer.import_(json.load(f))
with open("../text_resource/node2text.json", "r") as f:
    node2text = json.load(f)

leaf_nodes = []
def get_leaf_node(node):
    """Append every leaf at or below `node` to the global `leaf_nodes`, depth first."""
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_leaf:
            leaf_nodes.append(current)
            continue
        # push children reversed so they are popped in their original order
        pending.extend(reversed(current.children))
get_leaf_node(tree)

# only leaves become topics: leaf name -> text of that leaf
topics = {}
topics.update((leaf.name, node2text[leaf.name]) for leaf in leaf_nodes)
    
# prepare documents of videos: subtitle file path -> cleaned subtitle text
with open("../text_resource/cleaned_subtitles.json", "r") as f:
    o_videos = json.load(f)

# clean up names: turn each file path into a (title, video id) key
videos = {}
for ov in o_videos:
    # drop the directory part of the path
    v = re.sub(r"(.*/)", "", ov)
    # drop the literal ".en.json" suffix; the dots are escaped and the pattern is
    # anchored so that it cannot match characters inside the title
    v = re.sub(r"\.en\.json$", "", v)
    # the last 11 characters become the second key element and everything before the
    # separator character in front of them the first
    # NOTE(review): presumably "<title><sep><11-char YouTube id>" -- confirm against the data
    videos[(v[:-12], v[-11:])] = o_videos[ov]

In [25]:
# load the already trained and indexed interest model from its default folder
interest_t = InterestTrainer()
interest_t.load_topic_model()
interest_t.load_topic_video_model()

# expose its video ordering so that another trainer can reuse the same row indices
existing_video_index = {
    "indexed_video": interest_t.indexed_video,
    "video_index": interest_t.video_index,
}

2018-12-01 19:30:15,080 : INFO : loading Phraser object from ../text_cleaner_models_without_subjects//bigram
2018-12-01 19:30:15,092 : INFO : loaded ../text_cleaner_models_without_subjects//bigram
2018-12-01 19:30:15,094 : INFO : loading Phraser object from ../text_cleaner_models_without_subjects//trigram
2018-12-01 19:30:15,103 : INFO : loaded ../text_cleaner_models_without_subjects//trigram
2018-12-01 19:30:15,105 : INFO : loading Dictionary object from ./asym_150_both/dictionary
2018-12-01 19:30:15,117 : INFO : loaded ./asym_150_both/dictionary
2018-12-01 19:30:15,120 : INFO : loading LdaModel object from ./asym_150_both/model
2018-12-01 19:30:15,123 : INFO : loading expElogbeta from ./asym_150_both/model.expElogbeta.npy with mmap=None
2018-12-01 19:30:15,136 : INFO : setting ignored attribute id2word to None
2018-12-01 19:30:15,139 : INFO : setting ignored attribute state to None
2018-12-01 19:30:15,142 : INFO : setting ignored attribute dispatcher to None
2018-12-01 19:30:15,144 :

In [26]:
# load the trained subject model and index topics and videos with it, reusing the
# video ordering taken from the interest model
subject_t = SubjectTrainer()
subject_t.load_topic_model()
subject_t.index_topic_and_video(
    topics=topics,
    videos=videos,
    existing_video_index=existing_video_index,
)

2018-12-01 19:30:15,790 : INFO : loading Phraser object from ../text_cleaner_models_with_subjects//bigram
2018-12-01 19:30:15,799 : INFO : loaded ../text_cleaner_models_with_subjects//bigram
2018-12-01 19:30:15,801 : INFO : loading Phraser object from ../text_cleaner_models_with_subjects//trigram
2018-12-01 19:30:15,808 : INFO : loaded ../text_cleaner_models_with_subjects//trigram
2018-12-01 19:30:15,810 : INFO : loading Dictionary object from ./subject_models/dictionary
2018-12-01 19:30:15,814 : INFO : loaded ./subject_models/dictionary
2018-12-01 19:30:15,816 : INFO : loading LdaModel object from ./subject_models/model
2018-12-01 19:30:15,818 : INFO : loading expElogbeta from ./subject_models/model.expElogbeta.npy with mmap=None
2018-12-01 19:30:15,821 : INFO : setting ignored attribute id2word to None
2018-12-01 19:30:15,823 : INFO : setting ignored attribute dispatcher to None
2018-12-01 19:30:15,825 : INFO : setting ignored attribute state to None
2018-12-01 19:30:15,826 : INFO : 

In [27]:
# sanity check: both trainers hold the same video ordering, so row i of either
# model's video arrays refers to the same video
subject_t.indexed_video == interest_t.indexed_video

True

In [31]:
class InterestVectorizer():
    """
    Load a trained interest topic model together with its topic / video index and
    score videos against an interest vector.
    """
    def __init__(self, folder="./asym_150_both", cl_folder="../text_cleaner_models_without_subjects",
                 existing_video_index=None
                ):
        """
        Initialize and load all models

        folder: directory holding the dictionary, the model and the index files
        cl_folder: directory handed to TextCleaner
        existing_video_index: optional dict with keys "video_index" and "indexed_video";
            when given it is used instead of the pickled video index in `folder`
        """
        # init model folder
        self.folder = folder

        # load topic models if no training is needed
        self.dictionary = corpora.Dictionary.load(self.folder + "/dictionary")
        self.model = models.LdaModel.load(self.folder + "/model")
        self.vec_len = self.model.num_topics

        # load topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        self.topic_vec = np.load(self.folder + "/topic_vec.npy")
        self.video_vec = np.load(self.folder + "/video_vec.npy")
        self.video_topic_vec = np.load(self.folder + "/video_topic_vec.npy")
        # transposed view: one row per topic, one column per video
        self.topic_video_vec = self.video_topic_vec.T
        # NOTE: pickle.load executes arbitrary code; only load trusted model folders
        with open(self.folder + "/topic_index.pkl", 'rb') as f:
            self.topic_index = pickle.load(f)
        with open(self.folder + "/indexed_topic.pkl", 'rb') as f:
            self.indexed_topic = pickle.load(f)
        if existing_video_index is None:
            with open(self.folder + "/video_index.pkl", 'rb') as f:
                self.video_index = pickle.load(f)
            with open(self.folder + "/indexed_video.pkl", 'rb') as f:
                self.indexed_video = pickle.load(f)
        else:
            self.video_index = existing_video_index["video_index"]
            self.indexed_video = existing_video_index["indexed_video"]

        # load string cleaner
        self.cl = TextCleaner(folder=cl_folder)

    def score_video_based_on_interest_vector(self, interest_vec, thresh=0.05):
        """
        With the input interest vector, return a score for all the videos.

        interest_vec: 1-d array with one weight per interest topic (one per column
            of video_topic_vec)
        thresh: currently unused; kept so existing callers keep working
        Returns a 1-d array with one score per video.

        The interest vector is scaled to sum to 1 before use. video_topic_vec itself
        is deliberately not normalized.
        """
        interest_vec = np.asarray(interest_vec)
        total = interest_vec.sum()
        if total == 0:
            # no interest registered: weight every topic equally. The original read
            # `normed_interest_vector` before assigning it (UnboundLocalError).
            normed_interest_vector = np.full(interest_vec.shape, 1.0 / max(interest_vec.size, 1))
        else:
            normed_interest_vector = interest_vec / total
        video_score = np.dot(self.video_topic_vec, normed_interest_vector).reshape(-1)

        return video_score

    def index_text(self, text, norm=1, thresh=0.05):
        """
        Index a given string with the topic model.

        text: raw string, cleaned with the text cleaner before inference
        norm: None to return the thresholded vector as is, otherwise the `ord`
            passed to np.linalg.norm to normalize it
        thresh: value subtracted from every component; components that end up
            negative are set to zero
        Returns a 1-d array with one component per topic.
        """
        # clean the text
        cleaned_text = self.cl.clean(text)

        # get topic model index: first element of inference() output, first (only) document
        model_vec = self.model.inference([self.dictionary.doc2bow(cleaned_text.split())])[0][0]

        # shift by thresh and zero out everything that falls below it
        model_vec -= thresh
        model_vec[model_vec<0] = 0

        # return zero array if no significant topic possibility
        # (also avoids dividing by a zero norm below)
        if norm is None or model_vec.sum() == 0:
            return model_vec

        return model_vec / np.linalg.norm(model_vec, ord=norm)

In [32]:
class SubjectVectorizer():
    """
    Load a trained subject topic model together with its topic / video index and
    score videos against a list of subject topics.
    """
    def __init__(self, folder="./subject_models", cl_folder="../text_cleaner_models_with_subjects/",
                 existing_video_index=None
                ):
        """
        Initialize and load all models

        folder: directory holding the dictionary, the model and the index files
        cl_folder: directory handed to TextCleaner
        existing_video_index: optional dict with keys "video_index" and "indexed_video";
            when given it is used instead of the pickled video index in `folder`
        """
        # init model folder
        self.folder = folder

        # load topic models if no training is needed
        self.dictionary = corpora.Dictionary.load(self.folder + "/dictionary")
        self.model = models.LdaModel.load(self.folder + "/model")
        self.vec_len = self.model.num_topics

        # load topic_index, indexed_topic, topic_vec, video_index, indexed_video, video_vec, video_topic_vec
        self.topic_vec = np.load(self.folder + "/topic_vec.npy")
        self.video_vec = np.load(self.folder + "/video_vec.npy")
        self.video_topic_vec = np.load(self.folder + "/video_topic_vec.npy")
        # transposed view: one row per topic, one column per video
        self.topic_video_vec = self.video_topic_vec.T
        # NOTE: pickle.load executes arbitrary code; only load trusted model folders
        with open(self.folder + "/topic_index.pkl", 'rb') as f:
            self.topic_index = pickle.load(f)
        with open(self.folder + "/indexed_topic.pkl", 'rb') as f:
            self.indexed_topic = pickle.load(f)
        if existing_video_index is None:
            with open(self.folder + "/video_index.pkl", 'rb') as f:
                self.video_index = pickle.load(f)
            with open(self.folder + "/indexed_video.pkl", 'rb') as f:
                self.indexed_video = pickle.load(f)
        else:
            self.video_index = existing_video_index["video_index"]
            self.indexed_video = existing_video_index["indexed_video"]

        # load string cleaner
        self.cl = TextCleaner(folder=cl_folder)

    def score_video_based_on_topic(self, topics, thresh=0.6):
        """
        With the input topics, return a score for all the videos.

        topics: iterable of topic names (keys of topic_index)
        thresh: similarities below this value count as 0
        Returns a 1-d array with one score per video: the mean over the given topics
        of the thresholded video/topic similarity. An empty `topics` gives all zeros.

        Raises KeyError for a topic name missing from topic_index.
        """
        topics = list(topics)
        video_score_sum = np.zeros(len(self.indexed_video))

        # no topic requested: nothing matches. The original divided 0 by 0 here and
        # returned an array of NaN.
        if not topics:
            return video_score_sum

        for t in topics:
            ti = self.topic_index[t]
            row = self.topic_video_vec[ti]
            # np.where builds a new array, so the stored similarities are not modified
            video_score_sum += np.where(row < thresh, 0, row)
        video_score_sum[video_score_sum<thresh] = 0

        return video_score_sum / len(topics)

In [66]:
class VideoVectorizer():
    """
    Combine the interest and the subject vectorizer to rank videos.
    """
    def __init__(self, folder="."):
        """
        Build both vectorizers from sub-folders of `folder`.
        """
        # model and text cleaner folders of each vectorizer
        interest_dir = folder + "/asym_150_both"
        interest_cleaner_dir = folder + "/text_cleaner_models_without_subjects"
        self.interest = InterestVectorizer(folder=interest_dir, cl_folder=interest_cleaner_dir)

        subject_dir = folder + "/subject_models"
        subject_cleaner_dir = folder + "/text_cleaner_models_with_subjects"
        self.subject = SubjectVectorizer(folder=subject_dir, cl_folder=subject_cleaner_dir)

        # common video index: names come from the interest side, positions from the
        # subject side
        self.indexed_video = self.interest.indexed_video
        self.video_index = self.subject.video_index

    def update_interest_vector(self, prev_interest_vec, prev_video, update_constant=0.1):
        """
        Move the interest vector a fraction `update_constant` of the way towards the
        interest row of `prev_video` and return the result.
        """
        row = self.video_index[prev_video]
        prev_video_interest_vec = self.interest.video_topic_vec[row]

        return update_constant * (prev_video_interest_vec - prev_interest_vec) + prev_interest_vec

    def get_ranked_video(self, subjects, interest_vec, subject_weight=0.8, subject_mask_value=0.1, thresh=0):
        """
        Return (video, score) pairs sorted by descending score, cut at the first
        score below `thresh`.

        The score is a weighted blend of subject and interest score; it is multiplied
        by 1 where the subject score is positive and by `subject_mask_value` where
        the subject score is exactly 0.
        """
        # per-video scores from both models
        by_interest = self.interest.score_video_based_on_interest_vector(interest_vec)
        by_subject = self.subject.score_video_based_on_topic(subjects)

        blended = subject_weight * by_subject + (1 - subject_weight) * by_interest

        # mask derived from the subject score
        mask = by_subject.copy()
        matched = mask > 0
        unmatched = mask == 0
        mask[matched] = 1
        mask[unmatched] = subject_mask_value
        blended *= mask

        # positions ordered by descending score (stable for equal scores)
        order = sorted(range(len(blended)), key=lambda position: blended[position], reverse=True)

        ranked_video = []
        for position in order:
            score = blended[position]
            if score < thresh:
                break
            ranked_video.append((self.indexed_video[position], score))
        return ranked_video

# Testing the `VideoVectorizer` Python package

In [1]:
from VideoVectorizer import *

In [2]:
# build the combined vectorizer with its default arguments; the log output of this
# cell shows models being loaded from ./asym_150_both and ./text_cleaner_models_without_subjects
vv = VideoVectorizer()

2018-12-01 22:09:03,089 : INFO : loading Dictionary object from ./asym_150_both/dictionary
2018-12-01 22:09:03,110 : INFO : loaded ./asym_150_both/dictionary
2018-12-01 22:09:03,114 : INFO : loading LdaModel object from ./asym_150_both/model
2018-12-01 22:09:03,118 : INFO : loading expElogbeta from ./asym_150_both/model.expElogbeta.npy with mmap=None
2018-12-01 22:09:03,134 : INFO : setting ignored attribute id2word to None
2018-12-01 22:09:03,137 : INFO : setting ignored attribute state to None
2018-12-01 22:09:03,140 : INFO : setting ignored attribute dispatcher to None
2018-12-01 22:09:03,142 : INFO : loaded ./asym_150_both/model
2018-12-01 22:09:03,145 : INFO : loading LdaModel object from ./asym_150_both/model.state
2018-12-01 22:09:03,285 : INFO : loaded ./asym_150_both/model.state
2018-12-01 22:09:03,345 : INFO : loading Phraser object from ./text_cleaner_models_without_subjects/bigram
2018-12-01 22:09:03,359 : INFO : loaded ./text_cleaner_models_without_subjects/bigram
2018-12-

# get video ranks

In [17]:
# restrict the search to a single subject (position 10 of the subject topic list)
topics = vv.subject.indexed_topic[10:11]
print("topics are: ", topics)

# one-hot interest vector: all weight on the interest topic at position `index`
index = 30
interest_matrix = vv.interest.video_topic_vec
interest_vec = np.zeros(interest_matrix.shape[1], dtype=interest_matrix.dtype)
interest_vec[index] = 1
print("interest is: ", vv.interest.indexed_topic[index])

# rank the videos with equal weight on subject and interest, cut below a score of 0.1
rank = vv.get_ranked_video(topics, interest_vec, subject_weight=0.5, thresh=0.1)
rank

topics are:  ['sub_Gravitational fields']
interest is:  Visiting and Travel


[(('Why Are Astronauts Weightless', 'iQOHRKKNNLQ'), 0.4768162680938836),
 (('Our Definition For “Moon” Is Broken (Collab. w_ MinutePhysics)',
   'pAI1N96t8Vk'),
  0.4753106636848026),
 (('What Is a Field - Instant Egghead #42', '7BK166SL-ig'),
  0.47235847019149185),
 (("Best Film on Newton's Third Law. Ever.", '8bTdMmNZm2M'),
  0.46854150718677434),
 (('Why Does the Moon Orbit Earth', 'zN6kCa6xi9k'), 0.46307715482972467),
 (('Is There Gravity In Space', 'd57C2drB_wc'), 0.45150551371147835),
 (('The Tides explained in ten seconds', 'mVJEi-PkkaY'), 0.4459317506965165),
 (('Does Earth Have A Second Moon', 'rmQepa1qnI0'), 0.44552773522367384),
 (('How Big is the Moon MM#1', 'Tqt9hZcWhJM'), 0.43568792215630253),
 (('Is There Poop on the Moon ft. Smarter Every Day', 'QNP8wy3S_kY'),
  0.43518315250254885),
 (('LONELY.', '_QPcclYWOr4'), 0.42679270621174176),
 (('How Texting Can Ruin Relationships', 'DzaU-TinoZQ'), 0.4250347792168338),
 (('Calculating Gravitational Attraction', 'SN1Q5ru2fI0'),

# update interest vector

In [4]:
# update the interest vector using the given previous video; the returned array is
# only displayed here, `interest_vec` itself is not reassigned
vv.update_interest_vector(interest_vec, prev_video=('What Your Drink Says About Your Politics', 'rdoUojbCfjk'))

array([9.86522330e-01, 1.37180339e-04, 2.96458314e-04, 3.27611181e-05,
       1.28036671e-03, 1.11712140e-03, 7.34916968e-05, 6.82541462e-06,
       1.21089875e-03, 3.62656553e-02, 1.62016043e-05, 1.47018327e-04,
       3.93463939e-04, 5.36163756e-03, 1.79165231e-03, 8.27099208e-03,
       1.03945717e-03, 2.25911788e-03, 2.09346629e-02, 5.71284230e-04,
       0.00000000e+00, 3.58201516e-04, 7.93990969e-03, 4.23049602e-03,
       7.47967202e-04, 9.20865848e-04, 1.73541606e-03, 7.11531135e-02,
       9.45783171e-05, 1.70179438e-05, 2.23952506e-03, 2.83625824e-03,
       2.01359949e-03, 8.92987120e-04, 7.67562046e-04, 8.76020840e-02,
       1.20735937e-02, 5.26675576e-05, 0.00000000e+00, 7.61999294e-03,
       2.12843647e-03, 5.78043329e-04, 2.35864996e-03, 2.32065223e-02,
       2.30981782e-03, 6.72781318e-03, 1.59164327e-03, 1.62465830e-03,
       9.76286465e-04, 1.61203363e-03, 1.23127857e-02, 6.11358770e-05,
       0.00000000e+00, 4.65810833e-03, 5.86727402e-05, 1.66928883e-02,
      