In [1]:
import pymongo
import os
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

import yaml
from collections import Counter
from datetime import datetime
import sys
SRC = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "src")
sys.path.append(SRC)
from content_api.details_utils import extract_from_details, cs_extract_text, cs_extract_links, extract_html_links

from tqdm import tqdm
from bs4 import BeautifulSoup


In [2]:
### Get dirs

In [3]:
DATA_DIR = os.getenv("DATA_DIR")
config = os.path.join(SRC, "config")
black_list_path = os.path.join(config, "document_types_excluded_from_the_topic_taxonomy.yml")

In [4]:
with open(black_list_path, 'r') as stream:
    blacklisted_content_page = sorted(yaml.load(stream)['document_types'])
blacklisted_content_page[0:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [5]:
### Get database running locally

In [8]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [9]:
print(myclient.list_database_names())

['admin', 'config', 'content_items', 'local']


In [10]:
mydb = myclient["content_items"]
mycol = mydb["content_items"]

In [11]:
## set up filters and projections for MongoDB queries

In [12]:
# whitelisted_doctypes_filter = {"$elemMatch": { "document_type": {"$nin":blacklisted_content_page}, "phase":"live"}}
whitelisted_doctypes_filter = {"document_type": {"$nin": blacklisted_content_page}}

filter_ = { "$and": [{"$or": [{"expanded_links.ordered_related_items": {"$exists": True}}, {"expanded_links.documents": {"$exists": True}}]},
                    { "document_type": {"$nin": blacklisted_content_page}},
                    { "phase": "live"}]}


filter_related_links = { "$and": [{"expanded_links.ordered_related_items": {"$exists": True}},
                    { "document_type": {"$nin": blacklisted_content_page}},
                    { "phase": "live"}]}

filter_collection_links = { "$and": [{"expanded_links.documents": {"$exists": True}},
                    { "document_type": {"$nin": blacklisted_content_page}},
                    { "phase": "live"}]}

In [13]:
keep_projection = { 
    "expanded_links.ordered_related_items.base_path": 1,
    "expanded_links.documents.base_path": 1,
    "expanded_links.ordered_related_items.content_id": 1,
    "expanded_links.documents.content_id": 1,
    "content_id": 1}

In [14]:
### Get related_links and collection_links

In [15]:
related_links_json = list(mycol.find(filter_related_links, keep_projection))

In [16]:
collection_links_json = list(mycol.find(filter_collection_links, keep_projection))

In [17]:
related_links_df = json_normalize(related_links_json, 
                    record_path=[['expanded_links', 'ordered_related_items']], 
                    meta=['_id', 'content_id'],
                                  meta_prefix='source_'
                    )

In [18]:
related_links_df.columns = ['destination_base_path', 'destination_content_id', 'source_base_path', 'source_content_id']
related_links_df['link_type'] = 'related_link'

In [19]:
collection_links_df = json_normalize(collection_links_json, 
                    record_path=[['expanded_links', 'documents']], 
                    meta=['_id', 'content_id'], 
                                  meta_prefix='source_'
                    )

In [20]:
collection_links_df.columns = ['destination_base_path', 'destination_content_id', 'source_base_path', 'source_content_id'] 
collection_links_df['link_type'] = 'collection_link'

In [21]:
### Get basepath to content ID map

In [22]:
base_path_content_id_cursor = mycol.find({ "$and": [
                    { "content_id": {"$exists": True}},
                    { "phase": "live"}]}, 
                                         {"content_id": 1})

In [23]:
base_path_to_content_id_lookup_dict = {item['_id']:item['content_id'] for item in base_path_content_id_cursor}

In [24]:
### Get embedded links

In [25]:
filter_basic = { "$and": [
                    { "document_type": {"$nin": blacklisted_content_page}},
                    { "phase": "live"}]}

In [26]:
text_projection = { 
    "details.body": 1,
    "details.brand": 1, # no documents found?
    "details.documents": 1,
    "details.final_outcome_detail": 1,
    "details.final_outcome_documents": 1,
    "details.government": 1,
    "details.headers": 1,
    "details.introduction": 1,
    "details.introductory_paragraph": 1,
    "details.licence_overview": 1,
    "details.licence_short_description": 1,
    "details.logo": 1,
    "details.metadata": 1,
    "details.more_information": 1,
    "details.need_to_know": 1,
    "details.other_ways_to_apply": 1,
    "details.summary": 1,
    "details.ways_to_respond": 1,
    "details.what_you_need_to_know": 1,
    "details.will_continue_on": 1,
    "details.parts": 1,
    "details.collection_groups": 1,
    "details.transaction_start_link": 1,
    "content_id": 1}

In [27]:
text_cursor = mycol.find(filter_basic, text_projection)

In [28]:
text_list = list(text_cursor)

In [29]:
text_df = json_normalize(text_list)

In [30]:
text_df['all_details'] = text_df.iloc[:, 2:-1].values.tolist()

In [31]:
def is_html(text):
    try:
        return bool(BeautifulSoup(text, "html.parser").find())
    except:
        pass

In [32]:
def extract_html(cell_contents, links = []):
    
    if type(cell_contents) == list:
        [extract_html(item, links) for item in cell_contents] 
    
    elif type(cell_contents) == dict:
        extract_html(list(cell_contents.values()), links)
    
    else:
        if is_html(cell_contents):
            links.extend(extract_html_links(cell_contents))
        
    return links
    

In [33]:
tqdm.pandas(desc="extract href links")
text_df['embedded_links'] = text_df.progress_apply(
         lambda row: extract_html(
             cell_contents = row['all_details'],
             links=[]),
         axis=1)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

In [48]:
embedded_links_df = text_df[['_id', 'content_id', 'embedded_links']]

In [49]:
embedded_links_df.head()

Unnamed: 0,_id,content_id,embedded_links
0,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,"[/education-maintenance-allowance-ema, /childr..."
1,/30-hours-free-childcare,ddda6dc8-e9de-49db-bbd1-97e3d0bc1e6f,[/help-with-childcare-costs/what-counts-approv...
2,/aaib-reports/1-1971-g-atek-and-g-ateh-15-augu...,ed760821-bf95-408b-9824-f6efccd1b505,[]
3,/aaib-reports/1-1972-g-apdn-3-july-1970,652b75bf-2f9a-42c5-be9f-b31218ff7fbe,[]
4,/aaib-reports/1-1973-ph-moa-3-june-1971,bd9737e8-44fe-4928-985c-803b5fa7ad9f,[]


In [50]:
embedded_links_df.shape

(360921, 3)

In [51]:
lst_col = 'embedded_links'

embedded_links_df = pd.DataFrame({
          col:np.repeat(embedded_links_df[col].values, embedded_links_df[lst_col].str.len())
          for col in embedded_links_df.columns.difference([lst_col])
      }).assign(**{lst_col:np.concatenate(embedded_links_df[lst_col].values)})[embedded_links_df.columns.tolist()] 

In [52]:
embedded_links_df.shape

(638034, 3)

In [39]:
def keep_first_part_of_basepath(basepath):
    return(os.path.split(basepath))[0]

In [40]:
embedded_links_df['first_part_path'] = embedded_links_df['embedded_links'].apply(keep_first_part_of_basepath)
embedded_links_df['first_part_path2'] = embedded_links_df['first_part_path'].apply(keep_first_part_of_basepath)

In [41]:
embedded_links_df['destination_content_id'] = embedded_links_df['embedded_links'].map(base_path_to_content_id_lookup_dict)
embedded_links_df['destination_content_id2'] = embedded_links_df['first_part_path'].map(base_path_to_content_id_lookup_dict)
embedded_links_df['destination_content_id3'] = embedded_links_df['first_part_path2'].map(base_path_to_content_id_lookup_dict)
embedded_links_df['final'] = embedded_links_df['destination_content_id'].fillna(embedded_links_df['destination_content_id2'])
embedded_links_df['final'] = embedded_links_df['final'].fillna(embedded_links_df['destination_content_id3'])

In [42]:
embedded_links_df.drop(['destination_content_id',
       'first_part_path', 'destination_content_id2', 'first_part_path2', 'destination_content_id3'], axis=1, inplace=True)

In [43]:
embedded_links_df.columns = ['source_base_path', 'source_content_id', 'destination_base_path', 'destination_content_id']

In [44]:
embedded_links_df['link_type'] = 'embedded_link'

In [53]:
embedded_links_df.head()

Unnamed: 0,_id,content_id,embedded_links
0,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,/education-maintenance-allowance-ema
1,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,/children-with-special-educational-needs/extra...
2,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,/1619-bursary-fund/eligibility
3,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,/children-with-special-educational-needs/extra...
4,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,/government/publications/advice-for-young-peop...


In [46]:
all_links = pd.concat([related_links_df, collection_links_df, embedded_links_df], axis=0, sort=True)

In [55]:
all_links.shape

(710403, 5)