In [1]:
import pymongo
import os
import pandas as pd
import yaml
from collections import Counter
from datetime import datetime
import sys
# Make the project's src/ directory importable: it is the "src" folder under
# the grandparent of the current working directory.
_two_levels_up = os.path.dirname(os.path.dirname(os.getcwd()))
SRC = os.path.join(_two_levels_up, "src")
sys.path.append(SRC)
from content_api.details_utils import extract_from_details, cs_extract_text, cs_extract_links

In [2]:
### Resolve the data and config directories

In [3]:
# Data directory is taken from the environment (None when DATA_DIR is unset).
DATA_DIR = os.environ.get("DATA_DIR")

# Config files sit in <SRC>/config; the blacklist of document types is one of them.
config = os.path.join(SRC, "config")
black_list_path = os.path.join(
    config,
    "document_types_excluded_from_the_topic_taxonomy.yml",
)

In [4]:
### Connect to the MongoDB instance running locally

In [5]:
# Client for the MongoDB server listening on localhost, port 27017.
mongo_uri = "mongodb://localhost:27017/"
myclient = pymongo.MongoClient(mongo_uri)

In [6]:
# Sanity check: show which databases the server exposes.
database_names = myclient.list_database_names()
print(database_names)

['admin', 'config', 'content_store', 'local']


In [7]:
# Handles on the "content_store" database and its "content_items" collection.
mydb = myclient.get_database("content_store")
mycol = mydb.get_collection("content_items")

In [8]:
# Load the sorted list of document types stored under the 'document_types' key
# of the blacklist YAML file.
# yaml.safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader is deprecated (and rejected by PyYAML >= 6) and can construct
# arbitrary Python objects from the file.
with open(black_list_path, 'r') as stream:
    blacklisted_content_page = sorted(yaml.safe_load(stream)['document_types'])
# Peek at the first few entries.
blacklisted_content_page[0:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [9]:
# Top-level content-item fields that are copied into each DataFrame row.
keep_columns = [
    "_id",
    "content_id",
    "content_purpose_document_supertype",
    "content_purpose_subgroup",
    "content_purpose_supergroup",
    "description",
    "details",
    "document_type",
    "email_document_supertype",
    "first_published_at",
    "government_document_supertype",
    "locale",
    "navigation_document_supertype",
    "phase",
    "public_updated_at",
    "publishing_app",
    "publishing_scheduled_at",
    "rendering_app",
    "search_user_need_document_supertype",
    "title",
    "updated_at",
    "user_journey_document_supertype",
]
# Fields currently left out of the selection (kept for reference):
#   access_limited, analytics_identifier, created_at, expanded_links, format,
#   links, need_ids, payload_version, publishing_request_id, redirects, routes,
#   scheduled_publishing_delay_seconds, schema_name, withdrawn_notice

In [10]:
# Link types from a document's expanded links that are copied into each row.
links_keep = [
    "organisations",
    "primary_publishing_organisation",
    "taxons",
    "mainstream_browse_pages",
    "part_of_step_navs",
    "ordered_related_items",
    "topics",
    "ordered_related_items_overrides",
    "pages_part_of_step_nav",
    "pages_related_to_step_nav",
    "related_to_step_navs",
    "document_collections",
    "documents",
    "policy_areas",
    "related_policies",
]
# Link types currently left out of the selection (kept for reference):
#   finder, available_translations, parent, meets_user_needs, children,
#   lead_organisations, world_locations, worldwide_organisations,
#   supporting_organisations, worldwide_priorities,
#   original_primary_publishing_organisation, topical_events,
#   suggested_ordered_related_items, ministers, people, roles,
#   field_of_operation

In [11]:
# Keys retained from every linked item when expanded links are flattened.
keep_keys = ["base_path", "content_id"]
# Keys currently left out of the selection (kept for reference):
#   analytics_identifier, api_path, description, document_type, locale,
#   schema_name, title, withdrawn, details, links

In [12]:
def handle_expanded_links(content_links, row_dict, links_to_keep=None, keys_to_keep=None):
    """Copy the wanted link types of a document into ``row_dict``.

    For every link type in ``content_links`` that is wanted, ``row_dict`` gets
    an entry under that link type (overwriting any existing one) holding a
    list with one dict per linked item. Each of those dicts contains only the
    wanted keys that the linked item actually has.

    Parameters
    ----------
    content_links : dict
        Mapping of link type to an iterable of linked-item dicts.
    row_dict : dict
        The row being assembled; it is mutated in place.
    links_to_keep : iterable of str, optional
        Link types to copy. When omitted, the module-level ``links_keep`` is
        used, which matches the original two-argument behaviour.
    keys_to_keep : iterable of str, optional
        Keys to retain from each linked item, in output order. When omitted,
        the module-level ``keep_keys`` is used.

    Returns
    -------
    None
        The result is written into ``row_dict``.
    """
    # Fall back to the notebook-level selections only when no override is given.
    wanted_links = set(links_keep if links_to_keep is None else links_to_keep)
    wanted_keys = list(keep_keys if keys_to_keep is None else keys_to_keep)

    for link_type, linked_items in content_links.items():
        if link_type not in wanted_links:
            continue
        row_dict[link_type] = [
            {key: linked[key] for key in wanted_keys if key in linked}
            for linked in linked_items
        ]

In [13]:
# Upper bound on the number of documents loaded, and how many documents pass
# between two progress timestamps.
MAX_DOCUMENTS = 50000
PROGRESS_EVERY = 10000

# Documents in the "live" phase whose document_type is not blacklisted.
# The cap is applied on the cursor itself, so iteration simply ends after
# MAX_DOCUMENTS documents and no manual counter check / break is needed.
mydoc = mycol.find({ "$and": [
                    { "document_type": {"$not" : { "$in": blacklisted_content_page}}},
                    { "phase": "live"}]}).limit(MAX_DOCUMENTS)

# Build the membership set once instead of scanning the list for every field.
wanted_columns = set(keep_columns)

print("Started:",datetime.now().strftime("%H:%M:%S"))
rowlist = []
for i,item in enumerate(mydoc):
    # Keep only the selected top-level fields of the document.
    row = {key:value for key,value in item.items() if key in wanted_columns}
    # Expanded links are flattened into one column per kept link type.
    if "expanded_links" in item:
        handle_expanded_links(item["expanded_links"], row)
    rowlist.append(row)
    if i % PROGRESS_EVERY == 0:
        print(i,datetime.now().strftime("%H:%M:%S"))
print("Ended:",datetime.now().strftime("%H:%M:%S"))

# One DataFrame row per document; built in a single call from the collected dicts.
df = pd.DataFrame(rowlist)
df.shape

Started: 18:26:31
0 18:26:31
10000 18:26:33
20000 18:26:38
30000 18:26:42
40000 18:26:45
Ended: 18:26:48


(50000, 36)

In [14]:
# Inspect the raw 'details' value of the first row.
df["details"].iloc[0]

{'metadata': {'date_of_occurrence': '1974-10-08',
  'aircraft_category': ['sport-aviation-and-balloons'],
  'report_type': 'formal-report',
  'location': 'Saltley Trading Estate, Birmingham',
  'aircraft_type': 'Hot Air Balloon',
  'registration': 'G-BCCG',
  'bulk_published': True,
  'document_type': 'aaib_report'},
 'change_history': [{'note': 'First published.',
   'public_timestamp': '2014-12-10T17:21:54.000Z'}],
 'body': [{'content_type': 'text/html',
   'content': '<h2 id="report-no-31976-hot-air-balloon-g-bccg-report-on-the-accident-at-saltley-trading-estate-birmingham-on-8-october-1974">Report No: 3/1976. Hot Air Balloon, G-BCCG. Report on the accident at Saltley Trading Estate, Birmingham, on 8 October 1974</h2>\n\n<h3 id="download-report">Download report:</h3>\n\n<p><a rel="external" href="https://assets.digital.cabinet-office.gov.uk/media/5422f4b9ed915d1374000501/3-1976_G-BCCG.pdf">3-1976 G-BCCG.pdf</a> (2,784.51 kb) </p>\n\n<h2 id="report-appendices">Report Appendices</h2>\

In [15]:
# Keys of the first entry in the first row's details['body'] list.
first_body_entry = df["details"].iloc[0]["body"][0]
first_body_entry.keys()

dict_keys(['content_type', 'content'])

In [16]:
target = "parts"
# Locate the first details dict that has a `target` entry, print the keys of
# each of its items (and the item's 'body' when present), then stop.
for det in df["details"].values:
    if target not in det:
        continue
    for part in det[target]:
        print(part.keys())
        if "body" in part:
            print(part['body'],"\n")
    break

dict_keys(['title', 'slug', 'body'])
[{'content_type': 'text/govspeak', 'content': "You could get a bursary to help with education-related costs if you’re aged 16 to 19 and: \r\n\r\n+ studying at a publicly funded school or college in England - not a university  \r\n+ on a training course, including unpaid work experience  \r\n\r\nA publicly funded school is one that doesn't charge you for attending it.\r\n\r\n^There's a different [scheme in Wales, Scotland and Northern Ireland](/education-maintenance-allowance-ema).^ \r\n\r\n##If you're 19 and over\r\n\r\nYou could also get a bursary if you either:  \r\n\r\n* are continuing on a course you started aged 16 to 18 (known as being a ’19+ continuer’)  \r\n* have an [Education, Health and Care Plan (EHCP)](/children-with-special-educational-needs/extra-SEN-help)  \r\n\r\n##What a bursary is for\r\nA bursary is money that you, or your education or training provider, can use to pay for things like: \r\n\r\n+ clothing, books and other equipmen

In [17]:
# Count, across all rows, how often each top-level key occurs in 'details'.
Counter(key for details in df["details"].values for key in details)

Counter({'metadata': 47107,
         'change_history': 48573,
         'body': 48939,
         'max_cache_time': 47096,
         'attachments': 45330,
         'headers': 15830,
         'external_related_links': 1179,
         'parts': 358,
         'temporary_update_type': 36718,
         'hide_chapter_navigation': 198,
         'variants': 56,
         'introductory_paragraph': 185,
         'start_button_text': 202,
         'will_continue_on': 334,
         'transaction_start_link': 185,
         'more_information': 238,
         'other_ways_to_apply': 37,
         'department_analytics_profile': 156,
         'hidden_search_terms': 14,
         'licence_identifier': 166,
         'continuation_link': 163,
         'licence_short_description': 166,
         'licence_overview': 166,
         'lgsl_code': 65,
         'lgil_code': 65,
         'service_tiers': 65,
         'introduction': 78,
         'need_to_know': 65,
         'step_by_step_nav': 19,
         'what_you_need_to_kn

In [18]:
# Print each entry of the first row's details['headers'], one per line.
first_headers = df["details"].iloc[0]["headers"]
for header in first_headers:
    print(header)

{'text': 'Report No: 3/1976. Hot Air Balloon, G-BCCG. Report on the accident at Saltley Trading Estate, Birmingham, on 8 October 1974', 'level': 2, 'id': 'report-no-31976-hot-air-balloon-g-bccg-report-on-the-accident-at-saltley-trading-estate-birmingham-on-8-october-1974', 'headers': [{'text': 'Download report:', 'level': 3, 'id': 'download-report'}]}
{'text': 'Report Appendices', 'level': 2, 'id': 'report-appendices'}


In [19]:
# Inspect the 'metadata' entry of the first row's details.
first_details = df["details"].iloc[0]
first_details["metadata"]

{'date_of_occurrence': '1974-10-08',
 'aircraft_category': ['sport-aviation-and-balloons'],
 'report_type': 'formal-report',
 'location': 'Saltley Trading Estate, Birmingham',
 'aircraft_type': 'Hot Air Balloon',
 'registration': 'G-BCCG',
 'bulk_published': True,
 'document_type': 'aaib_report'}

In [20]:
# Try the project's text-extraction helper on the details of the row at position 1000.
sample_details = df["details"].iloc[1000]
cs_extract_text(sample_details)

'Summary: The pilot reported that the aircraft was being landed on asphalt Runway 24 when the accident occurred. The weather was fine with a surface wind from 190° at 14 kt and the runway surface was damp. The aircraft bounced after a firm touchdown and the pilot applied full power with the intention of flying a go-around. However the aircraft rolled to the left and its wing struck the runway. The aircraft deviated to the left and landed on the grass beyond the runway edge. It continued across taxiway ‘B’ before coming to a rest on the grass beyond 380 m from the runway threshold. Neither occupant was injured but the aircraft sustained damage to its left wing landing gear and propeller. The surface of the taxiway was also damaged principally through propeller strikes and failing landing gear components. Download report: Cirrus SR20 G-VGAG 02-15'

In [21]:
# Try the project's link-extraction helper on the details of the row at position 1010.
sample_details = df["details"].iloc[1010]
cs_extract_links(sample_details)

['/government/uploads/system/uploads/attachment_data/file/433812/Glossary_of_abbreviations.pdf']

In [22]:
target = "transaction_start_link"
# Print the positional index of the first details dict that contains `target`,
# then stop searching.
for i, det in enumerate(df["details"].values):
    if target not in det:
        continue
    print(i)
    break

10554


In [23]:
# Details of the row at position 10554.
# NOTE(review): 10554 is the index printed by the 'transaction_start_link'
# search in this run; it will point at a different document if the extract changes.
dets = df["details"].iloc[10554]

In [24]:
# Show the raw 'transaction_start_link' value stored in `dets`.
dets['transaction_start_link']

'https://www.add-driving-licence-check-code.service.gov.uk/digital/hold-licence'

In [25]:
# Run the project's link-extraction helper on `dets`.
cs_extract_links(dets)

['/view-driving-licence',
 '/call-charges',
 'https://www.add-driving-licence-check-code.service.gov.uk/digital/hold-licence']

In [26]:
# Run the project's text-extraction helper on `dets`.
# NOTE(review): the "not a dict, not a list" line in the output looks like a
# debug print coming from inside the helper — confirm and consider removing it there.
cs_extract_text(dets)

not a dict, not a list


'Share your driving record (such as vehicles you can drive) with a car hire company or employer using your mobile phone. You can only use this service on a mobile phone. You’ll get a button on your home screen if you have an Android phone or a pass in your Apple Wallet if you have an iPhone. With this you can share your driving record using a: ‘check code’ scannable QR code The codes expire after 21 days and can only be used once. There’s a different way to share your driving record if you’re using a laptop or desktop computer. You can only use this service if your licence was issued in England Wales or Scotland. It’s a criminal offence to obtain someone else’s personal information without their permission. If you’re having trouble using this service online you can get help by phone. DVLA helpline Telephone: 0300 083 0013 Monday to Friday 8am to 7pm Saturday 8am to 2pm Find out about call charges'

In [27]:
# Add a 'body' column holding the text extracted from every row's details.
# NOTE(review): this call floods the cell output with "not a dict, not a list"
# lines, apparently printed by cs_extract_text — confirm and silence it in the helper.
df["body"] = df["details"].map(cs_extract_text)

not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict,

not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict,

not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict,

not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict,

not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict, not a list
not a dict,

In [28]:
# Spot-check the extracted body text of the first row.
df['body'].iloc[0]

'Report No: 3/1976. Hot Air Balloon G-BCCG. Report on the accident at Saltley Trading Estate Birmingham on 8 October 1974 Download report: 3-1976 G-BCCG.pdf (2 784.51 kb) Report Appendices To view appendices click on link below: 3/1976 Hot Air Balloon G-BCCG Appendices (899.01 kb)'