In [None]:
import json
import logging.config
import os
import warnings
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import pymongo
from tqdm import tqdm
import pickle
from bs4 import BeautifulSoup
from src.utils.miscellaneous import read_config_yaml
from src.utils import text_preprocessing as tp



### Changes for AB test:
    - Weighted
    - Add taxons and departments and document_collections
    - Prune network (remove transitions that have <1% probability of occurring)
    - Increase dimensions (128)
    - Increase batchsize (50)
    - Threshold set at 80 rather than the fiddly logic (check how many pages wouldn't have links in this scenario)
    - Train on aws sagemaker notebook - related-links-large
    
### Todo
    - Taxons and department link extraction only implemented in this notebook (not the main src of the app)
    - Network pruning not implemented
    - Threshold change only implemented in notebook `run-and-check-link-predictions` (not in src!)

In [None]:
# Connect to the MongoDB server on localhost, port 27017
mongo_client = pymongo.MongoClient('mongodb://localhost:27017/')
# TODO check this is consistent with naming of restored db in AWS
content_store_db = mongo_client["content_store"]
# Collection that the link queries in this notebook are run against
content_store_collection = content_store_db["content_items"]


In [None]:


# Silence UserWarnings raised from the bs4 module
warnings.filterwarnings(action='ignore', category=UserWarning, module='bs4')

# Register tqdm with pandas so `progress_apply` is available on Series/DataFrames
tqdm.pandas()


# Link-type label -> key name of that link type.
# The values match the `expanded_links.<key>` names requested in LINKS_PROJECTION.
# NOTE(review): "collection" and "documents" both map to "documents" — confirm this is intended.
KEYS_FOR_LINK_TYPES = {
    "related": "ordered_related_items",
    "collection": "documents",
    "taxons": "taxons",
    "documents": "documents",
    # was misspelled "document_colletions", which matches no projected key
    "document_collections": "document_collections",
    "organisations": "organisations",
}

# Exclusion settings loaded from YAML config files via read_config_yaml.
# Neither constant is referenced in the visible part of this notebook.
# NOTE(review): presumably content that should not be linked from / linked to — verify against the YAML files.
EXCLUDED_SOURCE_CONTENT = read_config_yaml("source_exclusions_that_are_not_linked_from.yml")
EXCLUDED_TARGET_CONTENT = read_config_yaml("target_exclusions_that_are_not_linked_to.yml")

# MongoDB projection: for every expanded link type, request the base_path and content_id of each
# linked item, plus the content_id of the page itself.
LINKS_PROJECTION = {
    f"expanded_links.{link_type}.{field}": 1
    for link_type in (
        "ordered_related_items",
        "taxons",
        "documents",
        "document_collections",
        "organisations",
    )
    for field in ("base_path", "content_id")
}
LINKS_PROJECTION["content_id"] = 1


# MongoDB projection: the `details` fields that are fetched for each page, plus the page's content_id.
TEXT_PROJECTION = dict.fromkeys(
    (
        "details.body",
        "details.brand",  # no documents found?
        "details.documents",
        "details.final_outcome_detail",
        "details.final_outcome_documents",
        "details.government",
        "details.headers",
        "details.introduction",
        "details.introductory_paragraph",
        "details.licence_overview",
        "details.licence_short_description",
        "details.logo",
        "details.metadata",
        "details.more_information",
        "details.need_to_know",
        "details.other_ways_to_apply",
        "details.summary",
        "details.ways_to_respond",
        "details.what_you_need_to_know",
        "details.will_continue_on",
        "details.parts",
        "details.collection_groups",
        "details.transaction_start_link",
        "content_id",
    ),
    1,
)

# MongoDB filter used by every query in this notebook: only items whose phase is "live"
FILTER_BASIC = {
    "$and": [
        {"phase": "live"},
    ],
}

# Column names for an output DataFrame (not referenced in the visible part of this notebook)
OUTPUT_DF_COLUMNS = [
    'destination_base_path',
    'destination_content_id',
    'source_base_path',
    'source_content_id',
]


def get_key_links_df(mongodb_collection):
    """
    Queries a MongoDB collection for the expanded links listed in LINKS_PROJECTION and returns them with one
        link per row
    :param mongodb_collection: collection to query (filtered with FILTER_BASIC)
    :return: pandas DataFrame with columns ['source_base_path', 'source_content_id', 'link_type',
        'destination_content_id', 'destination_base_path']
    """
    # query the collection that was passed in, rather than the notebook-level global collection
    link_list = list(mongodb_collection.find(FILTER_BASIC, LINKS_PROJECTION))
    df = json_normalize(link_list)

    # long format: one row per (page, flattened link column); `variable` holds the column name
    # (e.g. 'expanded_links.taxons') and `value` the list of linked items
    df = df.melt(id_vars=['_id', 'content_id'])
    # drop rows with missing values, i.e. pages that have no links for that column
    df = df.dropna()
    # one row per linked item
    df = reshape_df_explode_list_column(df, 'value')

    df['destination_content_id'] = df['value'].map(lambda link: link['content_id'])
    df['destination_base_path'] = df['value'].map(lambda link: link['base_path'])
    # the link type is the second dot-separated part of the flattened column name
    df['link_type'] = df['variable'].map(lambda name: name.split('.')[1])

    df = df.rename(columns={'_id': 'source_base_path', 'content_id': 'source_content_id'})
    return df[['source_base_path', 'source_content_id', 'link_type', 'destination_content_id',
               'destination_base_path']]

def get_page_text_df(mongodb_collection):
    """
    Queries a MongoDB collection, get specific fields from details using TEXT_PROJECTION, converts this cursor to a
        DataFrame, with all details fields in one list column
    :param mongodb_collection: collection to query (filtered with FILTER_BASIC)
    :return: pandas DataFrame with: _id (base_path), content_id, and all_details list column
    """
    text_list = list(mongodb_collection.find(FILTER_BASIC, TEXT_PROJECTION))
    df = json_normalize(text_list)
    # concatenate text from every column except the two identifier columns into a list -> so we get a list of
    # all the details fields that we queried.
    # Columns are selected by name rather than by position: the previous positional slice (2:-1) also dropped
    # the last details column and relied on the identifier columns coming first.
    id_columns = ['_id', 'content_id']
    detail_columns = [col for col in df.columns if col not in id_columns]
    df['all_details'] = df[detail_columns].values.tolist()
    logging.info(f' df with details text has columns={list(df.columns)} and shape={df.shape}')
    return df[['_id', 'content_id', 'all_details']]


def reshape_df_explode_list_column(wide_df, list_column):
    """
    Bit like a melt, we have a list column in a DataFrame, and we repeat all other columns for each item in the list
    Rows whose list is empty produce no output rows; an empty DataFrame produces an empty DataFrame.
    TODO: would be nice to bump pandas and call DataFrame.explode, but it breaks other stuff
    :param wide_df: pandas DataFrame with a list column (every value must have a length, so no NaNs)
    :param list_column: list column name
    :return: DataFrame with one row per item in the list_column, same column order as wide_df and a fresh
        0..n-1 index
    """
    # number of items in each row's list; built as an integer array explicitly so that np.repeat also
    # accepts it when the DataFrame is empty
    lengths = np.fromiter(
        (len(items) for items in wide_df[list_column]),
        dtype=np.intp,
        count=len(wide_df))

    # repeat all columns except list_column as many times as the list is long for that row
    exploded = pd.DataFrame({
        col: np.repeat(wide_df[col].values, lengths)
        for col in wide_df.columns.difference([list_column])
    })

    # flatten all lists into one flat list; a plain Python flatten (rather than np.concatenate) copes with
    # zero rows and does not coerce mixed-type items to a common type
    flat_items = [item for items in wide_df[list_column] for item in items]

    # add the flattened items as list_column and restore the original column order
    return exploded.assign(**{list_column: flat_items})[wide_df.columns.tolist()]

def extract_embedded_links_df(page_text_df, base_path_to_content_id_mapping):
    """
    Takes a dataframe with  a list column (all_details), returns a dataframe with one in-page (embedded) link per row
    The input DataFrame is left unmodified.
    :param page_text_df: pandas DataFrame with columns _id, content_id and a list column (all_details)
    :param base_path_to_content_id_mapping: Python dictionary {page_path: content_id}
    :return:  pandas DataFrame  of embedded links with columns ['source_base_path', 'source_content_id',
        'destination_base_path','destination_content_id', 'link_type']
    """
    # keep the extracted links in their own Series instead of writing a new column onto the caller's DataFrame
    embedded_links = page_text_df['all_details'].progress_apply(tp.extract_links_from_content_details)
    logging.info('have applied extract_links_from_content_details to page_text_df')

    embedded_links_df = page_text_df[['_id', 'content_id']].assign(embedded_links=embedded_links)
    logging.info(f'shape of df with link list (wide before melt)={embedded_links_df.shape}')

    embedded_links_df = reshape_df_explode_list_column(embedded_links_df, 'embedded_links')
    logging.info(f'shape of df after melt (each link in its own row)={embedded_links_df.shape}')

    embedded_links_df['embedded_links'] = embedded_links_df['embedded_links'].apply(tp.clean_page_path)
    # links whose cleaned path is not a key of the mapping get a null destination_content_id
    embedded_links_df['destination_content_id'] = embedded_links_df['embedded_links'].map(
        base_path_to_content_id_mapping)
    logging.info('mapping of page_path to content_id has completed')

    embedded_links_df = embedded_links_df.rename(
        columns={
            '_id': 'source_base_path',
            'content_id': 'source_content_id',
            'embedded_links': 'destination_base_path'})

    embedded_links_df['link_type'] = 'embedded_link'
    return embedded_links_df


def get_structural_edges_df(mongodb_collection, page_path_content_id_mapping):
    """
    Builds the structural edges for all items in the mongodb collection: the links returned by
        get_key_links_df plus the embedded (in-page) links, keeping only rows that have a destination content ID
    :param mongodb_collection: collection passed on to get_key_links_df and get_page_text_df
    :param page_path_content_id_mapping: Python dictionary {page_path: content_id}
    :return: pandas DataFrame with columns ['source_base_path', 'source_content_id', 'destination_base_path',
                                 'destination_content_id', 'link_type']
    """
    links_from_projection = get_key_links_df(mongodb_collection)

    links_from_page_text = extract_embedded_links_df(
        get_page_text_df(mongodb_collection),
        page_path_content_id_mapping)
    logging.info(f'embedded links dataframe shape {links_from_page_text.shape}')

    # stack both kinds of link into one frame
    all_links = pd.concat(
        [links_from_projection, links_from_page_text],
        axis=0,
        sort=True,
        ignore_index=True)
    logging.info(f'structural edges dataframe shape {all_links.shape}')

    # the network is built on content_ids, so links without a destination content ID are dropped
    has_destination = all_links['destination_content_id'].notnull()
    structural_edges_df = all_links[has_destination]
    logging.info(
        f'structural edges dataframe shape f after dropping null destination_content_ids={structural_edges_df.shape}')
    return structural_edges_df



In [None]:
# Fetch the links DataFrame (one link per row) for the content store collection
df = get_key_links_df(content_store_collection)

In [None]:
# Display the links DataFrame as the cell output
df

In [None]:
# Load the saved structural edges and keep only the rows whose link_type is 'embedded_link'
sn = pd.read_csv('../data/tmp/structural_edges.csv').loc[
    lambda edges: edges['link_type'] == 'embedded_link'
]
sn

In [None]:
# NOTE(review): `c` is not defined in any visible cell and `f` is only loaded in a later cell, so this
# cell depends on notebook execution order — confirm where `c` comes from.
# Give every edge in `c` a fixed weight of 50
c['weight'] = 50

all_edges = pd.concat([c, f], ignore_index=True, sort=True)

# Deduplicate edges, summing structural and functional edge weights.
# Only the `weight` column is aggregated, so the other (non-numeric) columns are never summed.
all_edges = (
    all_edges
    .groupby(['source_content_id', 'destination_content_id'], as_index=False)['weight']
    .sum()
    .reset_index(drop=True)
)
all_edges.to_csv('metanetwork.csv', index=False)

In [None]:
# Load the saved functional edges; `f` is concatenated with `c` in the metanetwork cell
f = pd.read_csv('../data/tmp/functional_edges.csv')

In [None]:
# NOTE(review): writes `c` to the same 'metanetwork.csv' path that the combined edges are saved to,
# so running this cell afterwards overwrites that file — confirm this is intended.
c.to_csv('metanetwork.csv',index=False)