In [None]:
import pandas as pd
import numpy as np
import yaml
from pigeon import annotate
import pprint
import os
import json
from datetime import datetime
from gensim.models import Word2Vec
from src.utils.big_query_client import BigQueryClient
from src.utils.epoch_logger import EpochLogger
from src.utils.miscellaneous import load_pickled_content_id_list
from src.utils.related_links_csv_exporter import RelatedLinksCsvExporter
from src.utils.related_links_json_exporter import RelatedLinksJsonExporter
from src.utils.related_links_predictor import RelatedLinksPredictor
from src.utils.related_links_confidence_filter import RelatedLinksConfidenceFilter
from src.utils.date_helper import DateHelper
from src.utils.miscellaneous import read_config_yaml
from src.utils.big_query_client import BigQueryClient
from src.utils.date_helper import DateHelper
from scipy.special import softmax


### Changes for AB test:
    - Weighted
    - Add taxons and departments and document_collections
    - Prune network (remove transitions that have <1% probability of occurring)
    - Increase dimensions (128)
    - Increase batchsize (50)
    - Threshold set at 80 rather than the fiddly logic (check how many wouldn't have any links in this scenario)
    - Train on aws sagemaker notebook - related-links-large
    
### Todo
    - Taxons and department link extraction only implemented in get_all_links notebook (not the main src of the app)
    - Network pruning not implemented
    - Threshold change only implemented in notebook `run-and-check-link-predictions` (not in src!)

In [None]:
def get_content_id_to_base_path_mapper(path):
    """
    Load the content_id to base_path mapping stored as a JSON document.
    :param path: location of the JSON mapping file
    :return: the deserialised JSON content of the file
    """
    # JSON is UTF-8 encoded; be explicit instead of relying on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as content_id_to_base_path_mapping_file:
        return json.load(content_id_to_base_path_mapping_file)


def get_content_ids_to_page_views_mapper(df):
    """
    Build a {content_id: value} lookup from a BigQuery results dataframe.
    :param df: dataframe with a 'content_id' column; the first remaining column supplies the values
    :return: dict keyed by content_id
    """
    # Index by content_id and transpose, so each content_id becomes a column and
    # each remaining original column becomes one row.
    by_content_id = df.set_index('content_id')
    rows_as_dicts = by_content_id.T.to_dict('records')
    # The first row corresponds to the first non-index column of the input.
    return rows_as_dicts[0]


In [None]:
"""yesterday = DateHelper.get_datetime_for_yesterday()
three_weeks_ago = DateHelper.get_datetime_for_days_ago(40)

bq_client = BigQueryClient()
query_path = '../src/models/query_eligible_source_content_ids.sql'
all_content_ids_and_views_df = bq_client.query_date_range(query_path, three_weeks_ago, yesterday)
all_content_ids_and_views_df.to_csv('../data/tmp/all_page_views.csv', index=False)
"""

In [None]:
# Load the cached page-view export from disk.
all_content_ids_and_views_df = pd.read_csv('../data/tmp/all_pages_views.csv')

In [None]:
# Narrow the frame to the content id and page-hit columns.
all_content_ids_and_views_df = all_content_ids_and_views_df[['content_id','page_hits']]

In [None]:
# Rename the two columns to target_content_id / hits.
# NOTE(review): a later cell re-reads the CSV into this same variable, which discards
# this rename — confirm whether it is still needed.
all_content_ids_and_views_df.columns = ['target_content_id','hits']

In [None]:
# Load the content_id -> base_path mapping from a path relative to the working directory.
# NOTE(review): content_id_base_mapping_path is set to a '../data/tmp/' location in the
# configuration cell and later reassigned to this bare file name — confirm which file is intended.
content_ids = get_content_id_to_base_path_mapper('content_id_base_path_mapping.json')

In [None]:
# Directories and name stems for this experiment.
DATA_DIR = '../data/tmp/'
MODEL_DIR = '../models/t/'
model_name = 'lite_weighted_n2v'
predictions1 = 'n2vweightedtest'

# Inputs: the trained node2vec model and the content_id -> base_path mapping.
node2vec_model_file_path = os.path.join(MODEL_DIR, model_name)
content_id_base_mapping_path = '../data/tmp/content_id_base_path_mapping.json'

# Output stem: <prefix><YYYYMMDD>suggested_related_links, without a file extension.
related_links_path = os.path.join(
    DATA_DIR,
    f"{predictions1}{datetime.today():%Y%m%d}suggested_related_links",
)


In [None]:
# Load the saved model from node2vec_model_file_path using gensim's Word2Vec loader.
trained_model = Word2Vec.load(node2vec_model_file_path)



In [None]:
# Echo the data directory as the cell output.
DATA_DIR

In [None]:
# Just run on a sample of eligible bits of content.

# Pickled list of content_ids that links may be generated from.
eligible_source_content_ids = load_pickled_content_id_list(os.path.join(DATA_DIR,
                                                                        "eligible_source_content_ids.pkl"))

# Pickled list of content_ids that links may point to.
eligible_target_content_ids = load_pickled_content_id_list(os.path.join(DATA_DIR,
                                                                        "eligible_target_content_ids.pkl"))

# Convert to a set: the predictor tests every candidate target for membership in this collection.
eligible_target_content_ids = set(eligible_target_content_ids)





In [None]:
# Drop duplicate source ids; going through a set does not preserve the original order.
eligible_source_content_ids = list(set(eligible_source_content_ids))
# Show how many distinct source ids remain.
len(eligible_source_content_ids)

In [None]:
# Re-read the page-view export, which restores the original 'content_id' / 'page_hits'
# column names; get_content_ids_to_page_views_mapper indexes on 'content_id'.
all_content_ids_and_views_df = pd.read_csv('../data/tmp/all_pages_views.csv')
all_content_ids_and_views_df = all_content_ids_and_views_df[['content_id','page_hits']]

In [None]:
# Show the number of source ids again as the cell output.
len(eligible_source_content_ids)

In [None]:
from tqdm import tqdm
import pandas as pd
import multiprocessing
from multiprocessing import cpu_count
import numpy as np
import logging.config
import os
from collections import ChainMap

class RelatedLinksPredictor:
    """
    Uses a node2vec model to create a dict of source_content_ids and their predicted target_content_ids.

    This notebook-local definition replaces the RelatedLinksPredictor imported from
    src.utils.related_links_predictor earlier in the notebook.

    :param source_content_ids: list of content_ids we can link from
    :param target_content_ids: collection of content_ids we can link to
    :param model: node2vec model where model.wv.vocab.keys() are content_ids
    :param num_links: maximum number of links to recommend (optional); forwarded to the worker function
    """

    def __init__(self, source_content_ids, target_content_ids, model, num_links=5):
        self.model = model
        # Sources unknown to the model are dropped up front.
        self.eligible_source_content_ids = self._get_eligible_content_ids(source_content_ids)
        self.eligible_target_content_ids = target_content_ids
        self.num_links = num_links

    def predict_all_related_links(self, num_workers=cpu_count()):
        """
        Split the eligible source content_ids into num_workers chunks, process each chunk in a
        worker process and merge the per-chunk results.
        :param num_workers: number of worker processes and of chunks; the default is the CPU
            count evaluated once, when the class is defined
        :return: dict keyed by source content_id, merged from the dicts returned by the workers
        """
        params = [
            (content_id_chunk, self.eligible_target_content_ids, self.model, self.num_links)
            for content_id_chunk in self._split_content_ids(self.eligible_source_content_ids, num_workers)
        ]

        # The context manager releases the worker processes even when a worker raises.
        with multiprocessing.Pool(processes=num_workers) as pool:
            results = pool.starmap(_predict_related_links_for_content_ids, params)

        return dict(ChainMap(*results))

    def _get_eligible_content_ids(self, source_content_ids):
        """
        Filter eligible content_ids to only the ones included in the trained model's vocabulary
        :param source_content_ids: candidate source content_ids
        :return: list of the candidates present in the vocabulary, in their original order
        """
        vocab = self.model.wv.vocab  # look the vocabulary up once instead of once per id

        return [
            content_id for content_id in tqdm(
                source_content_ids, desc="eligible_content_ids"
            ) if content_id in vocab
        ]

    def _split_content_ids(self, content_ids, chunks):
        """
        Split content_ids into `chunks` parts of near-equal size; parts may be empty when
        there are fewer ids than chunks.
        :param content_ids: sequence of content_ids
        :param chunks: number of parts to produce
        :return: list of numpy arrays as returned by np.array_split
        """
        return np.array_split(content_ids, chunks)



def _predict_related_links_for_content_ids(source_content_ids, eligible_target_content_ids, model,
                                            num_links, probability_threshold=0.8):
    """
    Gets the top `num_links` most-probable eligible target_content_ids for every source_content_id.
    Target_content_ids are dropped if:
        - The target_content_id is not listed in eligible_target_content_ids
        - The predicted probability between source and target is not above probability_threshold
        - The link is not in the top `num_links` (highest probabilities) for that source_id
    Only the 100 most similar items reported by the model are considered per source.
    NOTE(review): nothing here removes a link from a source to itself; this presumably relies
    on most_similar not returning the query key — confirm.

    :param source_content_ids: iterable of content_ids to compute links for
    :param eligible_target_content_ids: collection of content_ids that may be linked to
    :param model: trained model exposing model.wv.most_similar
    :param num_links: maximum number of links kept per source content_id
    :param probability_threshold: links must score strictly above this value (default 0.8)
    :return: dict mapping each source content_id to a list of [target_content_id, probability]
        pairs, best first; the list is empty when no candidate qualifies
    """

    related_links = {}

    print(f"Computing related links for {len(source_content_ids)} content_ids, worker id: {os.getpid()}")

    for content_id in tqdm(source_content_ids, desc="getting related links"):
        # stick to this approach because actually interacting with the most_similar generator is
        # super slow. Dump everything to a dataframe, then filter and save list values
        # Naming the columns in the constructor keeps an empty result from raising.
        potential_related_links = pd.DataFrame(model.wv.most_similar(content_id, topn=100),
                                               columns=['target_content_id', 'probability'])

        is_eligible = potential_related_links['target_content_id'].isin(eligible_target_content_ids)
        is_confident = potential_related_links['probability'] > probability_threshold
        potential_related_links = potential_related_links[is_eligible & is_confident]

        # When nothing survives the filters this evaluates to an empty list.
        top_links = potential_related_links.sort_values('probability', ascending=False).head(num_links)
        related_links[content_id] = top_links[['target_content_id', 'probability']].values.tolist()

    return related_links


In [None]:

# Build the predictor; its constructor keeps only the source ids present in the model's vocabulary.
related_links_predictor = RelatedLinksPredictor(eligible_source_content_ids, eligible_target_content_ids,trained_model)


In [None]:
# NOTE(review): this replaces the '../data/tmp/' mapping path set in the configuration cell
# with a file in the working directory — confirm which location is intended.
content_id_base_mapping_path = 'content_id_base_path_mapping.json'

In [None]:
# Compute links for every eligible source id (one worker process per CPU by default).
related_links = related_links_predictor.predict_all_related_links()

# Write the raw predictions to '<related_links_path>.json'.
json_exporter = RelatedLinksJsonExporter(related_links)
json_exporter.export(f'{related_links_path}.json')


# The CSV exporter is additionally given the content_id -> base_path mapping and the
# content_id -> page views lookup.
csv_exporter = RelatedLinksCsvExporter(related_links,
                                       get_content_id_to_base_path_mapper(content_id_base_mapping_path),
                                       get_content_ids_to_page_views_mapper(all_content_ids_and_views_df))

# The output uses a .tsv extension; later cells read it back with sep='\t'.
csv_exporter.export(f'{related_links_path}.tsv')

In [None]:
# Translate the id-keyed predictions into {source page path: [target page paths]}.
pagepath_related_links = {}
for source_id, scored_links in related_links.items():
    # Each entry is a [target_content_id, probability] pair; only the id is needed.
    # splitext strips any file extension from the mapped base path.
    target_paths = [
        os.path.splitext(content_ids[scored_link[0]])[0]
        for scored_link in scored_links
    ]
    source_path, _extension = os.path.splitext(content_ids[source_id])
    pagepath_related_links[source_path] = target_paths
    

In [None]:
# Show how many source page paths have an entry.
len(pagepath_related_links)

In [None]:
# Persist the page-path mapping as YAML in the working directory.
with open(r'random_related_links_all.yml', 'w') as yaml_out:
    yaml.dump(pagepath_related_links, yaml_out)

In [None]:
# Load 'recent_related_links.csv' for comparison and show its dimensions.
recent = pd.read_csv('recent_related_links.csv')
recent.shape

In [None]:
# One row per source_content_id, with its destination base paths collected into a list.
recent = (
    recent[['source_content_id', 'destination_base_path']]
    .groupby(['source_content_id'])
    .agg(list)
    .reset_index()
)
                

In [None]:
# Read back the tab-separated export written earlier and display it.
weighted = pd.read_csv(related_links_path +'.tsv',sep='\t')
weighted

In [None]:
# Reload the export and collapse it to one row per source page: the target base paths and
# target content ids of each source are gathered into lists.
weighted = pd.read_csv(related_links_path +'.tsv',sep='\t')
# NOTE(review): pandas groupby excludes rows whose group key is NaN by default, so sources
# without a source_base_path or source_page_views value would vanish here — confirm that is acceptable.
weighted = weighted[['source_content_id',
                     'source_base_path',
                     'target_base_path',
                     'target_content_id',
                     'source_page_views']].groupby(['source_content_id',
                                                    'source_base_path',
                                                    'source_page_views']).aggregate(list).reset_index()

# Replace missing values with the string 'none'.
# NOTE(review): the fillna call looks redundant after the replace — confirm before removing either.
weighted = weighted.replace(np.nan, 'none')
weighted = weighted.fillna('none')

In [None]:
# Widen the column display limit to 400 characters (e.g. for the list-valued columns).
pd.set_option("max_colwidth", 400)

# Display the sources ordered by ascending page views.
weighted.sort_values('source_page_views')

In [None]:
# Left join: keep every row of `weighted` and attach the `recent` columns where the
# source_content_id matches.
combined = weighted.merge(recent, how='left', on='source_content_id')
# Sources without a match in `recent` get the string 'none' instead of NaN.
combined = combined.replace(np.nan, 'none')

In [None]:
# Save the side-by-side comparison to 'test.csv' in the working directory, without the index.
combined.to_csv('test.csv', index=False)

In [None]:
# NOTE(review): this repeats the JSON export already performed right after prediction,
# writing to the same path — confirm whether the second write is needed.
json_exporter.export(f'{related_links_path}.json')