In [11]:
import pandas as pd
import numpy as np
import os
import json 
from gensim.models import Word2Vec
import sys
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), "src"))
from utils.miscellaneous import read_query, load_pickled_content_id_list

In [17]:
# The project root is taken to be the parent of the current working directory;
# the data and model folders are resolved relative to it.
PROJECT_ROOT = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
MODELS_DIR = os.path.join(PROJECT_ROOT, "models")
DATA_DIR

'/Users/felisialoukou/Documents/govuk-related-links-recommender/data'

In [13]:
# Both id lists are read from DATA_DIR/tmp, source ids first, then target ids.
# NOTE(review): the helper name suggests these files are unpickled - only load trusted files.
eligible_source_content_ids, eligible_target_content_ids = (
    load_pickled_content_id_list(os.path.join(DATA_DIR, "tmp", pickle_name))
    for pickle_name in ("eligible_source_content_ids.pkl",
                        "eligible_target_content_ids.pkl")
)

In [15]:
# Preview: first ten eligible source ids and first two eligible target ids.
(eligible_source_content_ids[:10], eligible_target_content_ids[:2])

(['22a305c8-2bbd-4caa-8a46-2df5d51089e1',
  'da78bdae-887e-4352-ab46-c230833f18ce',
  '7b7d2e12-1e3c-4733-90c0-920ae4b8ec6f',
  '1b921ea8-0fa6-4429-a0fe-428b17f10a5a',
  'd0cadb7b-2bbc-4e7a-8913-2db7faa3c30d',
  'fa869c33-4d32-4977-9056-97b544811367',
  '9638d234-a20e-4042-998b-0ac82da2aee0',
  '887311be-799a-45f8-805f-9bdafb57cb23',
  '1d7ceda0-1727-4ef6-8613-aadbf4a4ca2c',
  '7da5e3f4-2da0-46fb-9b7f-dcb253b43e88'],
 ['ddda6dc8-e9de-49db-bbd1-97e3d0bc1e6f',
  'db63bcc6-b847-4448-9738-27ecf5052ada'])

In [56]:
def _get_related_links_for_df(df, model, eligible_target_content_ids):
    """
    most probable (5 max) and eligible target_content_ids are extracted for each source_content_id
    in the input df's column called 'content_id'
    :param df: pandas DataFrame with a column named 'content_id'
    :param model: node2vec model where mode.wv.vocab.keys() are content_ids
    :param eligible_target_content_ids: list of eligible target_content_ids
    :return: pandas Series where each element is a pandas DataFrame
    """
    return df['source_content_id'].apply(
        get_related_links_for_a_source_content_id,
        model=model,
        eligible_target_content_ids=eligible_target_content_ids,
        output_type="df")



def get_related_links_for_a_source_content_id(source_content_id, model, eligible_target_content_ids,
                                              probability_threshold=0.46, output_type="list"):
    """
    Finds up to 5 eligible related links for a single source_content_id.

    The 1000 nearest neighbours of source_content_id are requested from model.wv.most_similar, reduced to
    eligible targets, and only rows whose probability is strictly greater than probability_threshold are kept.
    :param source_content_id: content_id to look up; passed as-is to model.wv.most_similar
    :param model: object exposing wv.most_similar(key, topn=...) that yields (content_id, score) pairs
    :param eligible_target_content_ids: list of eligible target_content_ids
    :param probability_threshold: exclusive lower bound on 'probability'
    :param output_type: "list" for a list of target_content_ids, "df" for a pandas DataFrame with columns
        'target_content_id', 'probability' and 'source_content_id'
    :return: list or pandas DataFrame (depending on output_type) with at most 5 entries
    :raises ValueError: if output_type is neither "df" nor "list"
    """
    # Fail early with a clear message; previously an unknown value surfaced as an UnboundLocalError.
    if output_type not in ("df", "list"):
        raise ValueError('output_type must be "df" or "list", got {!r}'.format(output_type))

    # Naming the columns in the constructor also copes with an empty neighbour list.
    potential_related_links = pd.DataFrame(model.wv.most_similar(source_content_id, topn=1000),
                                           columns=['target_content_id', 'probability'])
    potential_related_links = potential_related_links.sort_values('probability', ascending=False)
    potential_related_links = only_include_eligible_target_content_ids(potential_related_links,
                                                                       eligible_target_content_ids)

    above_threshold = potential_related_links['probability'] > probability_threshold
    # assign() returns a new frame, so no column is written onto a filtered view.
    top_related_links = (potential_related_links[above_threshold]
                         .head(5)
                         .assign(source_content_id=source_content_id))

    if output_type == "df":
        return top_related_links
    return top_related_links['target_content_id'].values.tolist()

def only_include_eligible_target_content_ids(df_target_prop, eligible_target_links):
    """
    Keeps only the rows of a pandas DataFrame whose 'target_content_id' is an eligible target; every other row
    is dropped. The original index labels of the kept rows are preserved.
    :param df_target_prop: pandas DataFrame which includes a column named 'target_content_id'
    :param eligible_target_links: list of eligible content_ids; rows whose target_content_id is in it are kept
    :return: pandas DataFrame restricted to eligible target_content_ids
    """
    is_eligible = df_target_prop['target_content_id'].isin(eligible_target_links)
    return df_target_prop[is_eligible]

In [57]:
# Single-row frame holding one sample source content id to look up.
df = pd.DataFrame({'source_content_id': ['5fdd5f20-7631-11e4-a3cb-005056011aef']})

In [58]:
# Load the saved model from the models folder (presumably node2vec embeddings stored via gensim Word2Vec).
n2v_model_path = os.path.join(MODELS_DIR, "n2v.model")
trained_model = Word2Vec.load(n2v_model_path)

In [62]:
# Run the per-row lookup on the sample frame and inspect the type of the first result.
df1 = _get_related_links_for_df(df=df,
                                model=trained_model,
                                eligible_target_content_ids=eligible_target_content_ids)
type(df1.iloc[0])

pandas.core.frame.DataFrame

In [24]:
# Take the second eligible source content id as a sample and display it.
source = eligible_source_content_ids[1]
source

'da78bdae-887e-4352-ab46-c230833f18ce'

In [36]:
# Peek at the first five keys of the model vocabulary (the keys are content ids).
# NOTE(review): `wv.vocab` is the gensim 3.x API; gensim 4 exposes `wv.key_to_index` instead - confirm the pinned version.
list(trained_model.wv.vocab)[0:5]

['5b9a09bb-edc4-4645-8a5b-e0feb1bab892',
 '5e0ec66e-7631-11e4-a3cb-005056011aef',
 'd9f3ece7-028d-49eb-b3de-5cc746a37092',
 'efbc9e97-2bad-4dfe-9f45-1118ec2c69af',
 'c8fff813-5088-45b9-b737-ef70578d4701']

In [None]:
import operator

In [67]:
# Mapping keyed by content_id; its values are used as base paths when the results table is built.
# The encoding is given explicitly because open() otherwise falls back to a platform-dependent default.
with open(os.path.join(DATA_DIR, 'tmp', 'content_id_base_path_mapping.json'), 'r', encoding='utf-8') as f:
    base_cid = json.load(f)

In [78]:
# For each sample source id, keep at most 5 eligible targets whose similarity is >= the threshold.
# NOTE(review): this cell uses `>=` while get_related_links_for_a_source_content_id uses `>` - confirm which is intended.
probability_threshold = 0.46
# Set membership avoids rescanning the whole eligible list for each of the 1000 candidates.
eligible_target_lookup = set(eligible_target_content_ids)
dict_list = {}
for cid in ['5e0ec66e-7631-11e4-a3cb-005056011aef', '5b9a09bb-edc4-4645-8a5b-e0feb1bab892']:
    results = {target_id: probability
               for target_id, probability in trained_model.wv.most_similar(cid, topn=1000)
               if target_id in eligible_target_lookup and probability >= probability_threshold}
    # Highest probability first; a lambda key keeps this cell independent of the separate `import operator` cell.
    dict_list[cid] = sorted(results.items(), key=lambda item: item[1], reverse=True)[0:5]
   

5e0ec66e-7631-11e4-a3cb-005056011aef
these are the result {'f20fbdbd-dcc1-4f00-8cf4-93d8a890dae3': 0.4649709463119507}
5b9a09bb-edc4-4645-8a5b-e0feb1bab892
these are the result {'be6e4ef9-2854-443a-94ef-b31a62dcece8': 0.5146117210388184}


In [77]:
# One row per (source, target) pair, with each id resolved through the base_cid mapping.
row_list = []
for source_id, targets in dict_list.items():
    for target_id, probability in targets:
        row_list.append({'source_id': source_id,
                         'source_bp': base_cid[source_id],
                         'target_id': target_id,
                         'target_bp': base_cid[target_id],
                         'probability': probability})
df_results = pd.DataFrame(row_list)
df_results

Unnamed: 0,probability,source_bp,source_id,target_bp,target_id
0,0.464971,/government/publications/national-film-and-tel...,5e0ec66e-7631-11e4-a3cb-005056011aef,/government/publications/gcse-9-to-1-subject-l...,f20fbdbd-dcc1-4f00-8cf4-93d8a890dae3
1,0.514612,/guidance/forensic-toxicology-tests,5b9a09bb-edc4-4645-8a5b-e0feb1bab892,/government/publications/hmt-workforce-managem...,be6e4ef9-2854-443a-94ef-b31a62dcece8


In [80]:
# Show the (target_content_id, probability) pairs kept for the first sample source id.
dict_list['5e0ec66e-7631-11e4-a3cb-005056011aef']

('f20fbdbd-dcc1-4f00-8cf4-93d8a890dae3', 0.4649709463119507)

In [82]:
# source content id -> list of related target content ids (probabilities dropped).
# Deliberately not named `json`: that would rebind the imported json module and break json.load on re-run.
related_links_by_source = {source_id: [target_id for target_id, _ in targets]
                           for source_id, targets in dict_list.items()}
related_links_by_source

{'5e0ec66e-7631-11e4-a3cb-005056011aef': ['f20fbdbd-dcc1-4f00-8cf4-93d8a890dae3'],
 '5b9a09bb-edc4-4645-8a5b-e0feb1bab892': ['be6e4ef9-2854-443a-94ef-b31a62dcece8']}