In [1]:
import pymongo
import os
import pandas as pd
import yaml
from collections import Counter
from datetime import datetime
import sys
# Locate the project's `src` directory, which sits next to the grandparent of
# the current working directory, and add it to sys.path so the project-local
# import below can be resolved.
SRC = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "src")
sys.path.append(SRC)
from content_api.details_utils import extract_from_details

In [2]:
### Get directories

In [3]:
# Data directory is taken from the environment (None when DATA_DIR is unset).
DATA_DIR = os.environ.get("DATA_DIR")

# Project configuration folder, and the YAML file inside it that lists the
# document types excluded from the topic taxonomy.
config = os.path.join(SRC, "config")
black_list_path = os.path.join(
    SRC, "config", "document_types_excluded_from_the_topic_taxonomy.yml"
)

In [4]:
### Connect to the database running locally

In [5]:
# Connect to the MongoDB server on localhost, port 27017.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [6]:
# List the databases on the local server to confirm `content_store` is there.
print(myclient.list_database_names())

['admin', 'config', 'content_store', 'local']


In [7]:
# Select the `content_items` collection of the `content_store` database.
mydb = myclient["content_store"]
mycol = mydb["content_items"]

In [8]:
# Load the sorted list of document types excluded from the topic taxonomy.
# yaml.safe_load is used instead of yaml.load: calling yaml.load without an
# explicit Loader is deprecated (a TypeError on PyYAML >= 6) and the default
# loader of older versions can construct arbitrary Python objects.
with open(black_list_path, 'r') as stream:
    blacklisted_content_page = sorted(yaml.safe_load(stream)['document_types'])
blacklisted_content_page[0:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [None]:
# Pull a sample of live content items whose document type is not excluded
# from the topic taxonomy, to explore which columns are available.
# The cap is applied by the cursor (`limit`) rather than by counting and
# breaking out of the loop, so no extra document is fetched past the cap.
SAMPLE_SIZE = 10000

mydoc = mycol.find({ "$and": [
                    { "document_type": {"$not" : { "$in": blacklisted_content_page}}},
                    { "phase": "live"}]}).limit(SAMPLE_SIZE)
print("Started:",datetime.now().strftime("%H:%M:%S"))
rowlist = list(mydoc)
print("Ended:",datetime.now().strftime("%H:%M:%S"))
df = pd.DataFrame(rowlist)
df.shape

In [187]:
# Preview the first two rows of the sampled frame.
df.head(2)

Unnamed: 0,_id,access_limited,analytics_identifier,content_id,content_purpose_document_supertype,content_purpose_subgroup,content_purpose_supergroup,created_at,description,details,...,redirects,rendering_app,routes,scheduled_publishing_delay_seconds,schema_name,search_user_need_document_supertype,title,updated_at,user_journey_document_supertype,withdrawn_notice
0,/aaib-reports/3-1976-hot-air-balloon-g-bccg-8-...,{},,9c575137-efe4-4ac1-8042-ad0010e80dc5,reports,incidents,transparency,NaT,{'value': ''},{'metadata': {'date_of_occurrence': '1974-10-0...,...,[],government-frontend,[{'path': '/aaib-reports/3-1976-hot-air-balloo...,,specialist_document,government,"3/1976 Hot Air Balloon, G-BCCG, 8 October 1974",2019-03-06 11:22:35.976,thing,{}
1,/aaib-reports/3-1979-piper-pa24-comanche-180-g...,{},,18bdf4b5-8425-4375-9d1e-02fbf8483e9d,reports,incidents,transparency,NaT,{'value': ''},{'metadata': {'date_of_occurrence': '1978-12-1...,...,[],government-frontend,[{'path': '/aaib-reports/3-1979-piper-pa24-com...,,specialist_document,government,"3/1979 Piper PA24 Comanche 180, G-ARSC, 15 Dec...",2019-03-02 01:21:18.322,thing,{}


In [173]:
# Sanity check: the query filters on phase == "live", so that should be the
# only value present.
df.phase.value_counts()

live    100000
Name: phase, dtype: int64

In [175]:
# Inspect a single row (the third) to see the value held in each column.
df.iloc[2]

_id                                           /aaib-reports/3-1982-maule-m-5-235c-g-love
access_limited                                                                        {}
analytics_identifier                                                                 NaN
content_id                                          b7353bf2-bbde-4705-8e8d-edf2e1f108d9
content_purpose_document_supertype                                               reports
content_purpose_subgroup                                                       incidents
content_purpose_supergroup                                                  transparency
created_at                                                                           NaT
description                            {'value': 'This occurred on 4 September 1981, ...
details                                {'metadata': {'date_of_occurrence': '1981-09-0...
document_type                                                                aaib_report
email_document_supert

In [226]:
# Top-level content item fields that are carried over into each output row.
# Deliberately left out: access_limited, analytics_identifier, created_at,
# expanded_links, format, links, need_ids, payload_version,
# publishing_request_id, redirects, routes,
# scheduled_publishing_delay_seconds, schema_name, withdrawn_notice.
keep_columns = [
    '_id',
    'content_id',
    'content_purpose_document_supertype',
    'content_purpose_subgroup',
    'content_purpose_supergroup',
    'description',
    'details',
    'document_type',
    'email_document_supertype',
    'first_published_at',
    'government_document_supertype',
    'locale',
    'navigation_document_supertype',
    'phase',
    'public_updated_at',
    'publishing_app',
    'publishing_scheduled_at',
    'rendering_app',
    'search_user_need_document_supertype',
    'title',
    'updated_at',
    'user_journey_document_supertype',
]

In [176]:
# Number of rows and columns in the sampled frame.
df.shape

(100000, 36)

In [177]:
# Tally, over all rows, how often each link type appears as a key of the
# row's `expanded_links` mapping, then peek at the first ten link types seen.
link_counts = Counter(
    link_type
    for links in df.expanded_links.values
    for link_type in links
)
link_keys = list(link_counts)
link_keys[:10]

['organisations',
 'primary_publishing_organisation',
 'taxons',
 'finder',
 'available_translations',
 'mainstream_browse_pages',
 'parent',
 'part_of_step_navs',
 'ordered_related_items',
 'meets_user_needs']

In [178]:
# Link types from `expanded_links` that are kept on each output row.
# Deliberately left out: finder, available_translations, parent,
# meets_user_needs, children, lead_organisations, world_locations,
# worldwide_organisations, supporting_organisations, worldwide_priorities,
# original_primary_publishing_organisation, topical_events,
# suggested_ordered_related_items, ministers, people, roles,
# field_of_operation.
links_keep = [
    'organisations',
    'primary_publishing_organisation',
    'taxons',
    'mainstream_browse_pages',
    'part_of_step_navs',
    'ordered_related_items',
    'topics',
    'ordered_related_items_overrides',
    'pages_part_of_step_nav',
    'pages_related_to_step_nav',
    'related_to_step_navs',
    'document_collections',
    'documents',
    'policy_areas',
    'related_policies',
]

In [185]:
# Inspect which fields each linked taxon of the first row carries.
[item.keys() for item in df.iloc[0].expanded_links['taxons']]

[dict_keys(['analytics_identifier', 'api_path', 'base_path', 'content_id', 'description', 'document_type', 'locale', 'public_updated_at', 'schema_name', 'title', 'withdrawn', 'details', 'phase', 'links'])]

In [200]:
# Fields retained from every linked item inside `expanded_links`.
# Deliberately left out: analytics_identifier, api_path, description,
# document_type, locale, schema_name, title, withdrawn, details, links.
keep_keys = [
    'base_path',
    'content_id',
]

In [188]:
# Discard the exploratory sample frame (presumably to free memory); `df` is
# rebuilt from a fresh query in a later cell.
del df

In [212]:
def handle_expanded_links(content_links, row_dict):
    """Copy a trimmed view of the wanted link types into ``row_dict``.

    For every link type in ``content_links`` that is listed in the
    notebook-level ``links_keep``, ``row_dict[link_type]`` is set to a list
    with one dict per linked item, holding only the fields named in the
    notebook-level ``keep_keys`` that the item actually has.

    Args:
        content_links: mapping of link type -> iterable of linked-item dicts.
        row_dict: dict that is updated in place with the trimmed links.

    Returns:
        None; the result is written into ``row_dict``.
    """
    for link_type, linked_items in content_links.items():
        if link_type not in links_keep:
            continue
        row_dict[link_type] = [
            {field: linked[field] for field in keep_keys if field in linked}
            for linked in linked_items
        ]

In [None]:
# Build the working frame: for each live content item whose document type is
# not excluded from the topic taxonomy, keep only the columns in
# `keep_columns` plus a trimmed copy of the wanted `expanded_links` entries.
# The cap is applied by the cursor (`limit`) rather than by counting and
# breaking out of the loop, so no extra document is fetched past the cap.
MAX_ITEMS = 90000

mydoc = mycol.find({ "$and": [
                    { "document_type": {"$not" : { "$in": blacklisted_content_page}}},
                    { "phase": "live"}]}).limit(MAX_ITEMS)
print("Started:",datetime.now().strftime("%H:%M:%S"))
rowlist = []
for item in mydoc:
    row = {key:value for key,value in item.items() if key in keep_columns}
    # Not applied yet; `details` is kept as the raw dict:
    #     row['details'] = extract_from_details(item['details'])
    if "expanded_links" in item:
        # Adds one list per kept link type directly onto `row`.
        handle_expanded_links(item["expanded_links"], row)
    rowlist.append(row)
print("Ended:",datetime.now().strftime("%H:%M:%S"))
df = pd.DataFrame(rowlist)
df.shape

Started: 15:40:21


In [214]:
# Check the trimmed taxon links stored on the first row.
df.iloc[0].taxons

[{'base_path': '/transport/air-accidents-and-serious-incidents',
  'content_id': '951ece54-c6df-4fbc-aa18-1bc629815fe2',
  'title': 'Air accidents and serious incidents'}]

In [None]:
# Count how often each top-level key occurs across the `details` payloads.
# The rows keep `details` as the raw dict from the content store, so iterating
# a payload already yields its keys (strings); calling .keys() on those keys
# would raise AttributeError. Non-dict values (e.g. NaN for a row without
# details) are skipped.
Counter(key for det in df.details if isinstance(det, dict) for key in det)