# 002_Sentence-extraction

Purpose:
A stratified sampling run across GOV.UK produced a sample of base paths. This notebook extracts the sentences from the pages at those base paths.

1. Connect to mongodb container
2. Get base paths
3. Filter db by base paths
4. Extract only text


In [None]:
import json
import pprint
import random
from datetime import datetime
from typing import Dict, List

import matplotlib.pyplot as plt
import pandas as pd
import pymongo

#scraping
import requests
from bs4 import BeautifulSoup
from bson import ObjectId, json_util
from clumper import Clumper
from dotenv import load_dotenv  # pip install python-dotenv
from pandas.io.json import json_normalize

# make sure a .env file exists in the same directory, with a line like this:
# KG_PWD=<insert password here>
load_dotenv()
pd.set_option('display.max_columns', None)

### 1. Connect to mongodb container

Get the database running locally, following the instructions in this README.

In [None]:
# Connect to the locally running MongoDB instance. The URI below uses port
# 27017; change it if your container exposes a different port.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [None]:
# Sanity check: print the names of the databases visible through this client.
print(myclient.list_database_names())

In [None]:
# Select the "content_store" database and its "content_items" collection;
# `mycol` is the handle the query cells and get_mongo_data() use.
mydb = myclient["content_store"]
mycol = mydb["content_items"]

The first parameter of the `find()` method is the filter that all returned records must match (leave it empty to get all records). The second argument is a projection, which selects the fields to return from each matching document: a value of 1 (i.e. True) means "include this field".

In [None]:
# Fetch one known page by its _id (the base path) and return only the nested
# details.body.content field.
find_record = mycol.find_one({"_id": "/30-hours-free-childcare"}, projection={'details.body.content':1})
# Alternative kept for reference: project the whole `details` sub-document.
#find_record = mycol.find_one({"_id": "/30-hours-free-childcare"}, projection={'details':1})
find_record

### 2. Get base paths

In [None]:
# `os` is not imported in the notebook's import cell, so import it here.
import os

# Root directory of the stratified-sampling output, read from the environment
# (load_dotenv() in the import cell also loads it from a local .env file).
DIR_SRC_STRATA = os.environ.get('DIR_SRC_STRATA')
if DIR_SRC_STRATA is None:
    # Fail with a clear message instead of the TypeError that
    # os.path.join(None, ...) would raise.
    raise RuntimeError(
        "Environment variable DIR_SRC_STRATA is not set; "
        "define it in the environment or in the .env file."
    )

# Load the stratified random sample of base paths.
base_paths_df = pd.read_csv(os.path.join(DIR_SRC_STRATA, 'data/schemas_stratified_random_sample.csv'))

In [None]:
# Display the sampled base paths table.
base_paths_df

A little bit of EDA...

In [None]:
# Bar chart of the number of sampled rows per schema_name.
base_paths_df['schema_name'].value_counts().plot(kind='bar')

In [None]:
# Number of sampled rows per document_type.
base_paths_df['document_type'].value_counts()

In [None]:
# Number of sampled rows per schema_strata_name.
base_paths_df['schema_strata_name'].value_counts()

In [None]:
# Count the distinct base paths in the sample.
base_paths_df['base_path'].nunique()

In [None]:
# Plain Python list of the sampled base paths; preview the first five.
base_paths_list = list(base_paths_df['base_path'])
base_paths_list[:5]

### 3. Filter db by base paths

* 'details, licence_overview, content' x5
* 'details, introductory_paragraph, content' x3
* 'details, body, content' x2
* 'details, body'

In [None]:
# Pick one sampled base path at random and show which one was chosen.
random_base_path = random.choice(base_paths_list)
print(random_base_path)

# Query by _id with no projection, so whole documents are returned.
records = mycol.find({"_id": random_base_path})

In [None]:
# Pretty-print every document yielded by `records`.
for record in records:
  pprint.pprint(record)

In [None]:
def get_mongo_data(base_path):
    """Fetch the body content of one content item from MongoDB.

    Looks up the document whose ``_id`` equals ``base_path`` in the
    module-level ``mycol`` collection, projecting only
    ``details.body.content``.

    Args:
        base_path: Base path of the page, e.g. ``'/30-hours-free-childcare'``.

    Returns:
        The matching document as a dict, or ``None`` when no document has
        that ``_id``.
    """
    # find_one() replaces the previous "loop over find() and keep the last
    # record" pattern, which raised UnboundLocalError when nothing matched.
    return mycol.find_one({"_id": base_path}, projection={'details.body.content': 1})

In [None]:
def mongo_to_dataframe(mongo_data):
    """Flatten a (possibly nested) MongoDB document into a pandas dataframe.

    The document is first round-tripped through ``bson.json_util`` so that
    BSON-specific values are serialised to plain JSON types, then un-nested
    with ``pd.json_normalize`` (nested keys become dot-separated columns).

    Args:
        mongo_data: A document (dict) or list of documents, or ``None``.

    Returns:
        A ``pandas.DataFrame`` with the flattened data. An empty dataframe
        is returned when ``mongo_data`` is ``None``.
    """
    if mongo_data is None:
        # Nothing to flatten; json_normalize would raise on None.
        return pd.DataFrame()

    sanitized = json.loads(json_util.dumps(mongo_data))
    # pd.json_normalize is the public replacement for the deprecated
    # pandas.io.json.json_normalize and already returns a DataFrame.
    return pd.json_normalize(sanitized)

In [None]:
# Try the lookup on a single sampled page (index 12 looks like an arbitrary pick).
record = get_mongo_data(base_path=base_paths_list[12])

In [None]:
# Flatten that record into a dataframe to inspect the result.
mongo_to_dataframe(record)

In [None]:
# One flattened dataframe per sampled base path, in sample order.
df_list = [
    mongo_to_dataframe(get_mongo_data(base_path=path))
    for path in base_paths_list
]

In [None]:
# Stack the per-page dataframes into a single table.
# NOTE(review): each input frame keeps its own index, so index labels may
# repeat; pass ignore_index=True if a unique index is needed.
master_df = pd.concat(df_list)

In [None]:
# Preview the first rows of the combined table.
master_df.head()

In [None]:
# Total number of rows, then how many of them have no value in 'details.body'.
# NOTE(review): the second line raises KeyError if no record produced a
# 'details.body' column.
print(master_df.shape[0])
print(master_df['details.body'].isna().sum())

### 4. Scrape GOV.UK content (using the content API)

In [None]:
# Pages to fetch through the content API: the same sampled base paths.
TARGET_PAGES = base_paths_list

In [None]:
def get_content_api(page_api_url: str, timeout: float = 30) -> requests.models.Response:
    """Send a GET request to a content API URL and return the raw response.

    Args:
        page_api_url: Full URL of the API endpoint to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely, so one unresponsive
            page could stall a whole scraping run.

    Returns:
        The ``requests`` response object. The HTTP status is not checked
        here; callers decide how to handle error responses.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    return requests.get(page_api_url, timeout=timeout)

In [None]:
def _html_to_text(html: str) -> str:
    """Convert an HTML fragment to plain text, keeping sentence structure.

    Before the tags are stripped, punctuation is inserted so that the text
    still reads as sentences:
    - ":" replaces a closing h2/h3/h4 tag, joining a heading to its text;
    - "." replaces the "</li>" + newline + "</ul>" that ends a bullet list;
    - ";" replaces every other "</li>" followed by a newline;
    - remaining newlines become spaces.
    """
    # Order matters: the end-of-list pattern must be replaced before the
    # generic "</li>\n" pattern, which would otherwise consume it.
    punctuated = (
        html.replace("</h2>", ":")
        .replace("</h3>", ":")
        .replace("</h4>", ":")
        .replace('</li>\n</ul>', '.')
        .replace('</li>\n', ';')
        .replace('\n', ' ')
    )
    return BeautifulSoup(punctuated, "html.parser").get_text()


def get_text_parts(api_content_json: requests.models.Response) -> Dict[str, object]:
    """Extract the text components of a page fetched from the content API.

    Extracts:
    - the main title
    - the main body (if any)
    - the chapter headings (if any)
    - the chapter sections (if any)

    HTML markup is stripped from the body and the sections (link text is
    kept, the tags are dropped), with punctuation added to preserve
    paragraph and sentence structure (see ``_html_to_text``).

    Args:
        api_content_json: Response object whose ``.json()`` payload holds
            ``title`` and, optionally, ``details.body`` and ``details.parts``.

    Returns:
        A dictionary
        ``{"title": str, "main_text": str, "sections": list[str], "headings": list[str]}``.
        ``main_text`` is ``""`` when the payload has no ``details.body``;
        ``sections`` and ``headings`` are both empty when ``details.parts``
        is missing or any part lacks a ``body`` or ``title`` key.

    Raises:
        KeyError: If the payload has no ``title``.
    """
    # Parse the JSON payload once instead of on every field access.
    content = api_content_json.json()

    main_title = content['title']

    try:
        main_body = _html_to_text(content['details']['body'])
    except KeyError:
        main_body = ""

    try:
        parts = content['details']['parts']
        body_sections = [_html_to_text(part['body']) for part in parts]
        body_headings = [part['title'] for part in parts]
    except KeyError:
        # Keep the two lists aligned: if either cannot be built, drop both.
        body_sections = []
        body_headings = []

    return {
        "title": main_title,
        "main_text": main_body,
        "sections": body_sections,
        "headings": body_headings,
    }

In [None]:
def get_structured_content(list_target_pages) -> Dict[str, dict]:
    """Fetch and parse the text components of several gov.uk pages.

    Each page path (format: '/universal-credit') is appended to the
    gov.uk content API root, fetched with ``get_content_api`` and parsed
    with ``get_text_parts`` (page title, main body text, section texts,
    headings).

    Returns a dictionary mapping each page path to its dictionary of text
    components.
    """
    api_root = "https://www.gov.uk/api/content"
    return {
        path: get_text_parts(get_content_api(api_root + path))
        for path in list_target_pages
    }

In [None]:
# Fetch and parse every target page (one API request per page).
results_dict = get_structured_content(TARGET_PAGES)

In [None]:
# Inspect the extracted text components.
results_dict

### 5. Using MongoDB again...

In [None]:
# Re-create the MongoDB client (same local URI as in section 1).
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [None]:
# Sanity check: print the names of the databases visible through this client.
print(myclient.list_database_names())

In [None]:
# Re-select the "content_store" database and its "content_items" collection.
mydb = myclient["content_store"]
mycol = mydb["content_items"]

In [None]:
# document_type values to exclude in the extraction query below; an empty
# list excludes nothing.
blacklisted_content_page = []

In [None]:
# Top-level fields of a content item that are copied into each output row.
# Every entry needs its own trailing comma: adjacent string literals without
# one are silently concatenated into a single (wrong) column name.
keep_columns = [
    '_id',
    'access_limited',
    'analytics_identifier',
    'content_id',
    'content_purpose_document_supertype',
    'content_purpose_subgroup',
    'content_purpose_supergroup',
    'created_at',
    'description',
    'details',
    'document_type',
    'email_document_supertype',
    'expanded_links',
    'first_published_at',
    'format',
    'government_document_supertype',
    'links',
    'locale',
    'navigation_document_supertype',
    'need_ids',
    'payload_version',
    'phase',
    'public_updated_at',
    'publishing_app',
    'publishing_request_id',
    'publishing_scheduled_at',
    'redirects',
    'rendering_app',
    'routes',
    'scheduled_publishing_delay_seconds',
    'schema_name',
    'search_user_need_document_supertype',
    'title',
    'updated_at',
    'user_journey_document_supertype',
    'withdrawn_notice',
]

In [None]:
# Link types under "expanded_links" that handle_expanded_links() copies
# into the output row.
links_keep = [
    'organisations',
    'primary_publishing_organisation',
    'taxons',
    'finder',
    'available_translations',
    'mainstream_browse_pages',
    # 'parent',  (left disabled, as in the original list)
    'part_of_step_navs',
    'ordered_related_items',
    'meets_user_needs',
    'topics',
    'ordered_related_items_overrides',
    'pages_part_of_step_nav',
    'pages_related_to_step_nav',
    'related_to_step_navs',
    'children',
    'document_collections',
    'lead_organisations',
    'world_locations',
    'worldwide_organisations',
    'supporting_organisations',
    'worldwide_priorities',
    'original_primary_publishing_organisation',
    'documents',
    'policy_areas',
    'topical_events',
    'suggested_ordered_related_items',
    'related_policies',
    'ministers',
    'people',
    'roles',
    'field_of_operation',
]

In [None]:
# Fields copied from each linked item by handle_expanded_links().
keep_keys = [
    'analytics_identifier',
    'api_path',
    'base_path',
    'content_id',
    'description',
    'document_type',
    'locale',
    'schema_name',
    'title',
    'withdrawn',
    'details',
    'links',
]

In [None]:
def handle_expanded_links(content_links, row_dict):
    """Copy a trimmed version of the wanted link groups into ``row_dict``.

    For every link type in ``content_links`` that is listed in
    ``links_keep``, ``row_dict[link_type]`` is set to a list with one dict
    per linked item, holding only the fields named in ``keep_keys`` that
    the item actually has. Other link types are ignored.

    ``row_dict`` is modified in place; nothing is returned.
    """
    for link_type, linked_items in content_links.items():
        if link_type not in links_keep:
            continue
        row_dict[link_type] = [
            {field: linked[field] for field in keep_keys if field in linked.keys()}
            for linked in linked_items
        ]

In [None]:
# Query every "live" content item whose document_type is not blacklisted.
mydoc = mycol.find({ "$and": [
    { "document_type": {"$not" : { "$in": blacklisted_content_page}}},
    { "phase": "live"}]})

print("Started:", datetime.now().strftime("%H:%M:%S"))

rowlist = []
for i, item in enumerate(mydoc):
    # Only the first 50,000 documents are processed.
    if i >= 50000:
        break

    # Keep the whitelisted top-level fields only.
    row = {key: item[key] for key in item if key in keep_columns}
#     row['body'] = extract_from_details(item['details'], "text")
#     row['embedded_links'] = extract_from_details(item['details'], "links")

    # Add the trimmed link groups as extra keys of the row.
    if "expanded_links" in item:
        handle_expanded_links(item["expanded_links"], row)
    rowlist.append(row)

    # Progress marker every 10,000 documents.
    if i % 10000 == 0:
        print(i, datetime.now().strftime("%H:%M:%S"))

print("Ended:", datetime.now().strftime("%H:%M:%S"))

df = pd.DataFrame(rowlist)
df.shape

In [None]:
# Show the row at position 1 of the extracted table, then a separator line.
print(df.iloc[1])
print('*'*20)