# Set up

In [20]:
import pymongo
import os
import pandas as pd
import yaml
from collections import Counter
from datetime import datetime
import sys
SRC = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "src")
sys.path.append(SRC)

from content_api.details_utils import extract_html_links

## get DIRs of where things are

In [3]:
DATA_DIR = os.getenv("DATA_DIR")
config = os.path.join(SRC, "config")
black_list_path = os.path.join(config, "document_types_excluded_from_the_topic_taxonomy.yml")

## get blacklisted document types

In [7]:
with open(black_list_path, 'r') as stream:
    blacklisted_content_page = sorted(yaml.load(stream)['document_types'])

  


## Lists of:
- fields we should keep from the content data JSON e.g. document_type
- links types we care about e.g. ordered_related_items
- what keys we want for each link ['base_path', 'content_id']

In [8]:
keep_columns =[
    '_id', 
    #                 'access_limited', 
    #                 'analytics_identifier', 
    'content_id',
    'content_purpose_document_supertype', 
    'content_purpose_subgroup',
    'content_purpose_supergroup', 
    # 'created_at', 
    'description', 
    'details',
    'document_type', 
    'email_document_supertype', 
    # 'expanded_links',
    'first_published_at', 
    #                 'format', 
    'government_document_supertype',
    #                 'links', 
    'locale', 
    'navigation_document_supertype', 
    #                 'need_ids',
    #                 'payload_version', 
    'phase', 
    'public_updated_at', 
    'publishing_app',
    #                 'publishing_request_id', 
    'publishing_scheduled_at', 
    #                 'redirects',
    'rendering_app', 
    #                 'routes', 
    #                 'scheduled_publishing_delay_seconds',
    #                 'schema_name', 
    'search_user_need_document_supertype', 
    'title',
    'updated_at', 
    'user_journey_document_supertype' 
    #                 'withdrawn_notice'
    ]

In [9]:
links_keep = [
    'organisations',
    'primary_publishing_organisation',
    'taxons',
    #  'finder',
    #  'available_translations',
    'mainstream_browse_pages',
    # 'parent',
    'part_of_step_navs',
    'ordered_related_items',
    #  'meets_user_needs',
    'topics',
    'ordered_related_items_overrides',
    'pages_part_of_step_nav',
    'pages_related_to_step_nav',
    'related_to_step_navs',
    # 'children',
    'document_collections',
    #  'lead_organisations',
    #  'world_locations',
    #  'worldwide_organisations',
    #  'supporting_organisations',
    #  'worldwide_priorities',
    # 'original_primary_publishing_organisation',
    'documents',
    'policy_areas',
    # 'topical_events',
    #  'suggested_ordered_related_items',
    'related_policies',
    # 'ministers',
    # 'people',
    # 'roles',
    #  'field_of_operation'
    ]

In [10]:
keep_keys = [
    # 'analytics_identifier', 
    # 'api_path', 
    'base_path', 
    'content_id', 
    # 'description', 
    # 'document_type', 
    # 'locale', 
    # 'schema_name', 
    # 'title', 
    # 'withdrawn', 
    # 'details', 
    # 'links'
    ]

# Query database

In [11]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
# specify the DB and collection within the DB that we are interested in
mydb = myclient["content_store"]
mycol = mydb["content_items"]

In [12]:
def handle_expanded_links(content_links, row_dict):
    for key,value in content_links.items():
        if key in links_keep:
            row_dict[key] = []
            for item in value:
                row = {}
                for k in keep_keys:
                    if k in item.keys():
                        row[k] = item[k]
                row_dict[key].append(row)

## get info from the keep_columns for all content items that are live, and aren't of a document type in the blacklist
If there is an expanded_links key, use handle_expanded_links to get these links out
return a df

In [16]:
mydoc = mycol.find({ "$and": [
                    { "document_type": {
                        "$not" : { "$in": blacklisted_content_page}}},
                    { "phase": "live"}]})
print("Started:",datetime.now().strftime("%H:%M:%S"))
rowlist = []
for i,item in enumerate(mydoc):
    if i < 50000:
        row = {key:value for key,value in item.items() if key in keep_columns}
#         row['body'] = extract_from_details(item['details'], "text")
#         row['embedded_links'] = extract_from_details(item['details'], "links")
        if "expanded_links" in item.keys():
            handle_expanded_links(item["expanded_links"], row)    
        rowlist.append(row)
    else:
        break
    if i % 10000==0:
        print(i,datetime.now().strftime("%H:%M:%S"))
print("Ended:",datetime.now().strftime("%H:%M:%S"))
df = pd.DataFrame(rowlist)
df.shape

Started: 17:15:53
0 17:15:53
10000 17:15:56
20000 17:16:06
30000 17:16:11
40000 17:16:15
Ended: 17:16:20


(50000, 34)

In [18]:
df.columns

Index(['_id', 'content_id', 'content_purpose_document_supertype',
       'content_purpose_subgroup', 'content_purpose_supergroup', 'description',
       'details', 'document_collections', 'document_type',
       'email_document_supertype', 'first_published_at',
       'government_document_supertype', 'locale', 'mainstream_browse_pages',
       'navigation_document_supertype', 'ordered_related_items',
       'ordered_related_items_overrides', 'organisations',
       'pages_part_of_step_nav', 'pages_related_to_step_nav',
       'part_of_step_navs', 'phase', 'primary_publishing_organisation',
       'public_updated_at', 'publishing_app', 'publishing_scheduled_at',
       'related_to_step_navs', 'rendering_app',
       'search_user_need_document_supertype', 'taxons', 'title', 'topics',
       'updated_at', 'user_journey_document_supertype'],
      dtype='object')

In [19]:
df.head()

Unnamed: 0,_id,content_id,content_purpose_document_supertype,content_purpose_subgroup,content_purpose_supergroup,description,details,document_collections,document_type,email_document_supertype,...,publishing_app,publishing_scheduled_at,related_to_step_navs,rendering_app,search_user_need_document_supertype,taxons,title,topics,updated_at,user_journey_document_supertype
0,/30-hours-free-childcare,ddda6dc8-e9de-49db-bbd1-97e3d0bc1e6f,guidance,transactions,services,{'value': 'Who is eligible for 30 hours free c...,"{'body': [{'content_type': 'text/govspeak', 'c...",,answer,other,...,publisher,NaT,,government-frontend,core,,30 hours free childcare,,2019-03-06 09:32:27.771,thing
1,/aaib-reports/aaib-investigation-to-skyranger-...,b88fe96f-57bd-4302-ae7f-82fb3c88f0ee,reports,incidents,transparency,"{'value': 'Lost control during taxiing, Newtow...",{'metadata': {'date_of_occurrence': '2014-07-0...,,aaib_report,other,...,specialist-publisher,NaT,,government-frontend,government,[{'base_path': '/transport/air-accidents-and-s...,"AAIB investigation to Skyranger 912(2), G-CCCK\t",,2019-03-06 12:07:13.338,thing
2,/aaib-reports/aaib-investigation-to-skyranger-...,01f0d046-2642-4723-a996-9f7c32d4129f,reports,incidents,transparency,{'value': 'Engine failure after take-off follo...,{'metadata': {'date_of_occurrence': '2015-06-2...,,aaib_report,other,...,specialist-publisher,NaT,,government-frontend,government,[{'base_path': '/transport/air-accidents-and-s...,"AAIB investigation to Skyranger 912(2), G-JBUL",,2019-03-06 13:13:54.143,thing
3,/aaib-reports/aaib-investigation-to-skyranger-...,2c18962c-57d5-4ea2-b1f4-7093059946bc,reports,incidents,transparency,"{'value': 'Heavy landing and propeller strike,...",{'metadata': {'date_of_occurrence': '2015-01-2...,,aaib_report,other,...,specialist-publisher,NaT,,government-frontend,government,[{'base_path': '/transport/air-accidents-and-s...,"AAIB investigation to Skyranger J2.2(3), G-CBXS",,2019-03-06 11:34:38.358,thing
4,/aaib-reports/aaib-investigation-to-skyranger-...,15c8690c-0089-4d53-8601-17081a7963cf,reports,incidents,transparency,"{'value': 'Aircraft departed runway, Brookfiel...",{'metadata': {'date_of_occurrence': '2014-09-1...,,aaib_report,other,...,specialist-publisher,NaT,,government-frontend,government,[{'base_path': '/transport/air-accidents-and-s...,"AAIB investigation to Skyranger Swift 912S(1),...",,2019-03-06 12:27:23.287,thing


In [45]:
LOOK = ['title', 'body']
CHILD_KEYS = ['title', 'description']
DETAILS_SECTIONS = ['body', 'brand', 'documents', 'final_outcome_detail', 'final_outcome_documents',
                    'government', 'headers', 'introduction', 'introductory_paragraph',
                    'licence_overview', 'licence_short_description', 'logo', 'metadata', 'more_information',
                    'need_to_know',
                    'other_ways_to_apply', 'summary', 'ways_to_respond', 'what_you_need_to_know', 'will_continue_on',
                    'parts',
                    'collection_groups']

def cs_extract_links(details):
    """
    Generic implementation to extract text or links from the details entry in a content store item. Details contains
    what is shown in the main body of a page. This is catered to content pages.
    :param details: A nested json dictionary-like structure
    :param function_type: extract texts or links
    :return: the aggregated text or links
    """

    links = []

    for key, dict_list in sorted(details.items()):
        if key in DETAILS_SECTIONS:
            print("thru if", key)
            if isinstance(dict_list, list):
                print('list')
                print(dict_list)
                for dict_i in dict_list:
                    # print("dict_i", dict_i)
#                     why are we looking only at dict_i['content_type'] == "text/html"? what about when there are no conteent types?
                    if 'content_type' in dict_i.keys() and dict_i['content_type'] == "text/html":
                        # print("get content")
                        links.extend(extract_html_links(dict_i['content']))
                    elif any(l in dict_i.keys() for l in LOOK):
                        for l in LOOK:
                            print(l)
                            if l in dict_i.keys():
                                for dict_j in dict_i[l]:
                                    print(dict_j)
                                    if 'content_type' in dict_j.keys() and dict_j['content_type'] == "text/html":
                                        links.extend(extract_html_links(dict_j['content']))
            else:
                print(dict_list)
                links.extend(extract_html_links(dict_list))
    if "transaction_start_link" in details.keys():
        links.append(details["transaction_start_link"])

    return links

In [48]:
# sorted(df.details.iloc[13].items())

In [47]:
cs_extract_links(df.details.iloc[13])

thru if body
list
[{'content_type': 'text/govspeak', 'content': '## Summary:\r\nFollowing engine shutdown due to a failed fuel pump, the pilot made a forced landing during which the landing gear collapsed.\r\n\r\n### Download report:\r\n[embed:attachments:inline:b0cb96e1-a605-470a-bf8e-45927bf2358a] \r\n\r\n### Download glossary of abbreviations:\r\n[Glossary of abbreviations](https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/433812/Glossary_of_abbreviations.pdf)\r\n'}, {'content_type': 'text/html', 'content': '<h2 id="summary">Summary:</h2>\n<p>Following engine shutdown due to a failed fuel pump, the pilot made a forced landing during which the landing gear collapsed.</p>\n\n<h3 id="download-report">Download report:</h3>\n<p><span class="attachment-inline"><a href="https://assets.publishing.service.gov.uk/media/59b0024ee5274a180e3705e3/Stolp_Starduster_Too_SA300_G-JIII_09-17.pdf">Stolp Starduster Too SA300, G-JIII 09-17</a></span></p>\n\n<h3 id="download-glossa

['/government/uploads/system/uploads/attachment_data/file/433812/Glossary_of_abbreviations.pdf']

In [58]:
cs_extract_links(df.details.iloc[5510])

thru if body
list
[{'content_type': 'text/html', 'content': '<h2 id="fokker-f28-mark-0070-ph-kzb">Fokker F.28 Mark 0070, PH-KZB</h2>\n\n<h3 id="summary">Summary:</h3>\n\n<p>The aircraft was taxied onto Stand 214C (centre) using the right engine with the left engine shut down and the APU running. As soon as the right engine was shut down, there was a strong smell of electrical burning and smoke began to accumulate on the flight deck. Following a brief discussion with the Cabin Service Supervisor, the aircraft was evacuated. An engineering investigation identified that the emergency inverter cooling fan capacitor, was completely burnt out. This had released smoke and fumes into the flight deck area. </p>\n\n<h3 id="download-report">Download report:</h3>\n\n<p><a rel="external" href="https://assets.digital.cabinet-office.gov.uk/media/5422f81be5274a13170006ed/Fokker_F_28_Mark_0070__PH-KZB_05-09.pdf">Fokker F 28 Mark 0070, PH-KZB 05-09.pdf</a> (535.27 kb) </p>\n'}, {'content_type': 'text/go

[]

In [59]:
mydoc1 = mycol.find({ "$and": [
                    { "content_id": "457fe71a-0ce9-46d5-b22a-0426cdd99ebc"},
                    { "phase": "live"}]})


In [61]:
contentdata1 = mydoc1.next()

In [63]:
contentdata1

{'_id': '/government/news/brexit-britain-will-have-the-worlds-best-maritime-industry',
 'description': {'value': 'Announces plans to shape, promote and collaborate with the maritime sector, double apprenticeships and launch a shipbuilding renaissance.'},
 'content_purpose_document_supertype': 'news',
 'content_purpose_subgroup': 'news',
 'content_purpose_supergroup': 'news_and_communications',
 'email_document_supertype': 'announcements',
 'government_document_supertype': 'news-stories',
 'navigation_document_supertype': 'other',
 'search_user_need_document_supertype': 'government',
 'user_journey_document_supertype': 'thing',
 'locale': 'en',
 'details': {'body': '<div class="govspeak"><p>Brexit Britain will be the best country in the world to do maritime business thanks to more trade opportunities, more jobs and more investment in new technologies, Transport Secretary Chris Grayling said today (11 September 2017).</p>\n\n<p>The government has committed to working with industry to del

In [67]:
cs_extract_links(contentdata1['details'])

thru if body
<div class="govspeak"><p>Brexit Britain will be the best country in the world to do maritime business thanks to more trade opportunities, more jobs and more investment in new technologies, Transport Secretary Chris Grayling said today (11 September 2017).</p>

<p>The government has committed to working with industry to deliver an ambitious export plan for when we leave the European Union.</p>

<p>This will maximise new trade opportunities and significantly grow the <abbr title="United Kingdom">UK</abbr>’s maritime sector to become a global front runner in future innovation and technology.</p>

<p>The government will achieve this by:</p>

<ul>
  <li>creating a plan to shape and promote the maritime industry up to 2050</li>
  <li>collaborating with industry partners to significantly grow the sector by seizing new trade opportunities</li>
  <li>calling on maritime employers to double the number of apprenticeships they offer</li>
  <li>launching a British shipbuilding ‘renaiss

['/government/publications/national-shipbuilding-strategy',
 '/government/publications/maritime-growth-study-report',
 '/guidance/support-for-maritime-training-smart']