# Create topology sparse matrix

This approach does not rely on the existing knowledge graph. A functional graph based on page hit session data is created, before further filtering of the graph, to find a list of pages related to the economic recovery whole user journey (WUJ). 

The first step is to identify seed0 and seed1 pages. Seed0 pages must be pre-defined and manually entered as `SEED0_PAGES`. Then, a topology sparse matrix is created which represents how each seed0 page is connected to other pages via hyperlinks. Adapted from: https://github.com/alphagov/govuk-intent-detector/blob/main/notebooks/generate_topology_matrix.ipynb

ASSUMPTIONS: 
- We are removing links to cross-domain services / external domain
- We attach anchor to the main url (/business#content)
- We are keeping the step-by-step ID, and search parameters
- We are only keeping self-loops where a page contains an explicit link to itself 
- We are not accounting for the fact that any page can be reloaded -> if we want this, add 1's to whole diagonal
- If we want neither of the two options above, we need to set the whole diagonal to 0 AND remove self-loops from the user journey data 
- `SEED0_PAGES` are defined as `/topic/further-education-skills` and `/browse/working/finding-job`. These were chosen as they are topic and browse pages, which therefore link to many similar pages. This analysis assumes these pages are important pages in the economic recovery whole user journey. `SEED1_PAGES` are reliant on `SEED0_PAGES`, therefore this analysis is dependent on what the `SEED0_PAGES` are.
- Footer pages are manually defined and removed from the final list of `SEED1_PAGES`. This is because they occur on every page, regardless of the Whole User Journey. 

OUTPUT: 
- Saves `{date}_govuk_topology_matrix.pickle` in `../data/interim`
- A list of `SEED1_PAGES` is saved as `seed1_economic_recovery.csv` in the interim data folder 

REQUIREMENTS:
- No scripts need to be run before this script
- Imports module `make_topology_matrix.py` located in `../src/make_data`

In [1]:
import os
import re
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from datetime import datetime
from urllib import request

In [2]:
from bs4 import BeautifulSoup, SoupStrainer
from bs4.element import Doctype

In [3]:
from src.make_data.make_topology_matrix import (clean_url, combine_anchor2url, process_page_links, 
                                                create_topology_matrix_pd)

## Set up folders

In [4]:
# Read both folder locations from the environment before touching the file system. Subscripting
# `os.environ` (rather than calling `os.getenv`) makes an unset variable fail straight away with a
# `KeyError` naming that variable, instead of a `TypeError` raised from `Path(None, ...)`, and means
# no timestamped folder is left behind when the configuration is incomplete

# Define the raw data folder where HTML pages will be stored
DIR_DATA_RAW = os.environ["DIR_DATA_RAW"]

# Output folder
DIR_DATA_INTERIM = Path(os.environ["DIR_DATA_INTERIM"])

# Create a new folder with the current timestamp - content may change, so it's good to keep a local record
# `mkdir` is called without `exist_ok`, so re-using an existing timestamped folder raises `FileExistsError`
HTML_DATETIME = datetime.now().strftime("%Y%m%d_%H%M%S")
DIR_HTML = Path(DIR_DATA_RAW, "html", HTML_DATETIME)
DIR_HTML.mkdir(parents=True)

# Define the output file path; its name carries the same timestamp as the HTML folder
OUTPUT_FILE = DIR_DATA_INTERIM.joinpath(f"{HTML_DATETIME}_govuk_topology_matrix.pickle")

In [11]:
# Location configured for external data, read from the environment (`None` if the variable is unset)
DIR_DATA_EXTERNAL = os.getenv("DIR_DATA_EXTERNAL")

# Manually curated seed0 pages, given as GOV.UK paths starting with "/"
SEED0_PAGES = [
    "/topic/further-education-skills",
    "/browse/working/finding-job",
]

## Get GOVUK html pages for seed0 pages

In [12]:
# Initialise an empty dictionary to store the HTML pages, keyed by page path
html_pages = {}

# Iterate over the requested GOV.UK pages
for page in tqdm(SEED0_PAGES):

    # Raise an error if the page doesn't start with a "/" - prevents the request from hanging
    if not page.startswith("/"):
        raise ValueError(f"Pages must start with '/': {page}")

    # Download the HTML page and decode it as UTF-8
    with request.urlopen(f"https://www.gov.uk{page}") as hp:
        html_page = hp.read().decode("utf8")

    # Check if there is an anchor heading in the page URL; if there is one, only keep the HTML from the
    # anchored element onwards
    anchor_heading = re.match(r".*#(?P<anchor>[^/]+)$", page, flags=re.DOTALL)
    if anchor_heading:
        anchor_id = anchor_heading.group("anchor")

        # NOTE(review): no parser is named here, so BeautifulSoup picks whichever one is installed; the
        # serialised element must appear verbatim in the raw HTML for the cut below to succeed
        heading_string = str(BeautifulSoup(html_page).find(id=anchor_id))

        # Cut at the FIRST occurrence only and keep everything after it. `split(...)[1]` would silently
        # drop the rest of the page if the same markup appeared a second time
        _, heading_found, html_after_heading = html_page.partition(heading_string)

        # Fail with an explicit message when the anchored element cannot be located in the raw HTML
        # (e.g. no element has that id), rather than with a bare `IndexError`
        if not heading_found:
            raise ValueError(f"Could not locate anchor '#{anchor_id}' in the HTML of page: {page}")
        html_page = heading_string + html_after_heading

    # Write `html_page` out, replacing "/" with "__", as "/" is not a valid file name, and also save it in a
    # dictionary for further analysis. The encoding is given explicitly so the file matches the UTF-8 text
    # decoded above, whatever the platform's default encoding is
    with open(Path(DIR_HTML, f"{page.replace('/', '__')}.html"), "w", encoding="utf8") as f:
        f.write(html_page)
    html_pages[page] = html_page

100%|██████████| 2/2 [00:00<00:00,  8.89it/s]


## Ingest the html pages 

In [13]:
# Collect the hyperlinks embedded in every downloaded page, keyed by page path
# The HTML <a> tag defines a hyperlink. It has the following syntax: <a href="url">link text</a>
page_links = {}

# Walk through each stored page together with its HTML text
for page_path, raw_html in tqdm(html_pages.items()):

    # Parse only the <a> elements of the page
    anchors_only = BeautifulSoup(raw_html, parse_only=SoupStrainer('a'))

    # Record the `href` of every top-level node that is not a doctype declaration; `.get` yields `None`
    # for an <a> tag that has no `href` attribute
    hrefs = []
    for node in anchors_only:
        if isinstance(node, Doctype):
            continue
        hrefs.append(node.get('href'))
    page_links[page_path] = hrefs

100%|██████████| 2/2 [00:00<00:00, 17.63it/s]


## Generate topology matrix

In [14]:
# Process hyperlinks embedded in each page
# NOTE(review): `process_page_links` is imported from `src.make_data.make_topology_matrix`; the rules it
# applies are not visible here - presumably those listed under ASSUMPTIONS at the top, confirm against the module
page_links_proc = process_page_links(page_links)

In [15]:
# Generate directed topology matrix, as a pandas.DataFrame
# Note that we keep source urls and destination urls
# NOTE(review): the column labels are later used as the list of seed1 pages, so the columns presumably
# hold the destination urls - confirm against `create_topology_matrix_pd`
topology_matrix_df = create_topology_matrix_pd(page_links=page_links_proc)

## Save matrix, and list of seed1 pages (with footer pages removed)

In [47]:
# Save matrix as a pickle file at the timestamped path held in `OUTPUT_FILE`
# NOTE: pickle files should only ever be loaded back from a trusted source
topology_matrix_df.to_pickle(path=OUTPUT_FILE)

In [48]:
# Create df of seed1 pages: one row per column label of the topology matrix
df = pd.DataFrame(topology_matrix_df.columns.tolist(), columns=["seed1_page"])

# Remove footer pages - these are manually listed, as they are linked from every page regardless of the
# whole user journey
footer_pages = [
    '/browse/disabilities',
    '/browse/housing-local-services',
    '/help',
    '/browse/tax',
    '/browse/childcare-parenting',
    '/',
    '/browse/employing-people',
    '/browse/environment-countryside',
    '/government/organisations/government-digital-service',
    '/help/terms-conditions',
    '/browse/benefits',
    '/help/cookies',
    '/browse/births-deaths-marriages',
    '/browse/abroad',
    '/coronavirus',
    '/contact',
    '/transition',
    '/government/how-government-works',
    '/browse/education',
    '/browse/justice',
    '/browse/citizenship',
    '/cymraeg',
    '/browse/working',
    '/browse/business',
    '/help/accessibility-statement',
    '/world',
    '/browse/visas-immigration',
    '/browse/driving',
    '/help/privacy-notice',
    '/government/organisations',
]

# Keep only the rows whose page is not one of the footer pages (exact string match)
df = df[~df["seed1_page"].isin(footer_pages)]

# Save df to csv in the interim data folder. `DIR_DATA_INTERIM` is used instead of a hard-coded relative
# path so that the file lands next to the topology matrix pickle, whatever the current working directory is
df.to_csv(DIR_DATA_INTERIM.joinpath("seed1_economic_recovery.csv"), index=False)