# Test Internal Links

Scan HTML pages to confirm that all internal links are working.

In [1]:
import os
import pandas as pd
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
pd.options.display.max_colwidth = 100

In [2]:
# Locate the generated site relative to the notebook's working directory.
# The notebook must therefore be run from a sibling directory of `docs/`.
here = Path.cwd()
html_root = here.joinpath('../docs/').resolve()

In [3]:
# Every HTML page under the site root, in sorted order.
html_files = sorted(html_root.rglob('*.html'))

# The Mapbox label-point CSVs also carry links, so they are scanned too.
mapbox_csvs = (
    '../uploads/to-mapbox-label-points-2012-data.csv',
    '../uploads/to-mapbox-label-points-2022-data.csv',
)
html_files.extend((html_root / rel_path).resolve() for rel_path in mapbox_csvs)

In [4]:
# Collect every <a href="..."> found in the scanned files as one
# (destination, source) record per link. Records are accumulated in a plain
# list and turned into a DataFrame once at the end; concatenating inside the
# loop copies the whole frame on every iteration.
link_records = []

for h in tqdm(html_files):

    if h.suffix == '.html':
        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale.
        # NOTE(review): assumes the generated pages are UTF-8 - confirm.
        html_text = h.read_text(encoding='utf-8')
        link_source = h

    elif h.suffix == '.csv':
        temp_df = pd.read_csv(h)
        # Join the raw cell values rather than going through to_string(),
        # which may truncate long cells (display.max_colwidth) on older
        # pandas versions and would then cut links off.
        html_text = '\n'.join(temp_df['map_display_box'].dropna().astype(str))
        # Links found in a CSV are attributed to the site's index page, so
        # relative destinations are resolved against index.html's directory.
        link_source = Path(html_root / 'index.html')

    else:
        raise Exception(f'Unsupported file type: {h}')

    soup = BeautifulSoup(html_text, features='html.parser')

    # href=True skips <a> tags that have no href attribute (e.g. named
    # anchors); a missing destination cannot be classified or resolved later.
    for anchor in soup.find_all('a', href=True):
        link_records.append((anchor['href'], link_source))

# One row per link; the columns exist even when no links were found.
df = pd.DataFrame(link_records, columns=['destination', 'source'])

HBox(children=(FloatProgress(value=0.0, max=2073.0), HTML(value='')))




In [6]:
# A destination is local only when it is present and is neither an absolute
# URL (anything starting with a scheme such as http:, https:, mailto:, tel:)
# nor a protocol-relative URL (//host/...).
# str.match anchors at the start of the string, so a local path that merely
# contains "http" somewhere in its name is still treated as local.
# na=False keeps the mask strictly boolean when a destination is missing.
has_destination = df.destination.notna()
is_remote = df.destination.str.match(r'(?:[A-Za-z][A-Za-z0-9+.\-]*:|//)', na=False)

df['is_local'] = has_destination & ~is_remote
df_local = df[df.is_local].copy()

In [7]:
def _resolve_local_link(source, destination):
    """Return the absolute filesystem path that a local href points to.

    Parameters
    ----------
    source : path-like
        File in which the link was found; relative links are resolved
        against its parent directory.
    destination : str
        The raw href value.

    Returns
    -------
    pathlib.Path
        Resolved path of the linked file. A link consisting only of a
        fragment and/or query (e.g. '#top') resolves to `source` itself.
    """
    # '#fragment' and '?query' address content within a page, not a different
    # file, so they must not take part in the filesystem lookup.
    path_part = destination.split('#', 1)[0].split('?', 1)[0]
    if not path_part:
        return Path(source).resolve()
    return (Path(source).parent / Path(path_part)).resolve()


df_local['destination_resolved'] = [
    _resolve_local_link(src, dest)
    for src, dest in zip(df_local['source'], df_local['destination'])
]

In [8]:
# Number of local links found (one row per link occurrence)
df_local.shape[0]

25607

In [9]:
# Number of distinct raw href values among the local links
df_local['destination'].nunique(dropna=False)
# TODO: find orphan HTML pages (pages that nothing else links to)

4265

In [10]:
# Number of distinct resolved destination paths (different relative hrefs
# can resolve to the same path)
df_local['destination_resolved'].nunique(dropna=False)

2070

In [11]:
# Number of distinct files that contain at least one local link
df_local['source'].nunique(dropna=False)

2070

In [12]:
# Check each link target on disk. The paths were already resolved into
# `destination_resolved`, so reuse them instead of resolving every link a
# second time; this also keeps the two columns guaranteed to agree.
df_local['exists'] = (
    df_local['destination_resolved']
    .map(lambda target: target.exists())
    .astype(bool)  # keep a boolean dtype even when there are no rows
)

In [13]:
# links_to_check = df[df.is_local].groupby('destination').size()
# links_to_check

In [14]:
# A link is broken when it is local and its target is missing on disk.
# Bracket access is used for the `exists` column to make clear it is a column.
target_missing = ~df_local['exists']
df_local['is_broken'] = df_local['is_local'] & target_missing

In [15]:
# Bare file name of the page containing each link, for easier filtering
df_local['source_filename'] = [Path(src).name for src in df_local['source']]

In [16]:
# Number of broken local links
df_local['is_broken'].sum()

0

In [17]:
# Files that contain at least one broken local link
df_local.loc[df_local['is_broken'], 'source'].unique().tolist()

[]

In [18]:
# Link targets that are not themselves a link source, i.e. resolved
# destinations that do not appear in the `source` column.
# The set is built once so each membership test is a constant-time lookup
# instead of recomputing the unique sources for every destination.
known_sources = set(df_local['source'].unique())
[target for target in df_local['destination_resolved'].unique() if target not in known_sources]

[]

In [19]:
# Orphan candidates: link sources that no local link resolves to.
# The set is built once so each membership test is a constant-time lookup
# instead of recomputing the unique destinations for every source.
linked_targets = set(df_local['destination_resolved'].unique())
[page for page in df_local['source'].unique() if page not in linked_targets]

[]

In [20]:
# df_local[df_local.destination_resolved == Path('/Users/devin/Projects/openanc/uploads')].iloc[0].squeeze()

In [21]:
# sorted(list(df_local[df_local.is_broken].destination.unique()))

In [22]:
# df_local[df_local['source_filename'] == '1A.html']

## External Links

In [23]:
# df[~df.is_local].destination.unique()

In [24]:
# Path('/Users/devin/Projects/openanc/uploads/to-mapbox-label-points-2012-data.csv').suffix

In [25]:
# df_local[df_local['source'].str.contains('.csv')]

In [26]:
# Path(html_root / 'index.html')

In [27]:
# Show the table of local links (pandas elides the middle rows of a long frame).
# NOTE(review): the rendered output embeds absolute local paths; consider
# displaying a small sample or clearing the output before sharing.
df_local

Unnamed: 0,destination,source,is_local,destination_resolved,exists,is_broken,source_filename
15,index.html,/Users/devin/Projects/openanc/docs/about.html,True,/Users/devin/Projects/openanc/docs/index.html,True,False,about.html
16,list.html,/Users/devin/Projects/openanc/docs/about.html,True,/Users/devin/Projects/openanc/docs/list.html,True,False,about.html
17,people/index.html,/Users/devin/Projects/openanc/docs/about.html,True,/Users/devin/Projects/openanc/docs/people/index.html,True,False,about.html
18,about.html,/Users/devin/Projects/openanc/docs/about.html,True,/Users/devin/Projects/openanc/docs/about.html,True,False,about.html
20,index.html,/Users/devin/Projects/openanc/docs/about.html,True,/Users/devin/Projects/openanc/docs/index.html,True,False,about.html
...,...,...,...,...,...,...,...
30808,map_2022/ancs/districts/8F01.html,/Users/devin/Projects/openanc/docs/index.html,True,/Users/devin/Projects/openanc/docs/map_2022/ancs/districts/8F01.html,True,False,index.html
30809,map_2022/ancs/districts/8F02.html,/Users/devin/Projects/openanc/docs/index.html,True,/Users/devin/Projects/openanc/docs/map_2022/ancs/districts/8F02.html,True,False,index.html
30810,map_2022/ancs/districts/8F03.html,/Users/devin/Projects/openanc/docs/index.html,True,/Users/devin/Projects/openanc/docs/map_2022/ancs/districts/8F03.html,True,False,index.html
30811,map_2022/ancs/districts/8F04.html,/Users/devin/Projects/openanc/docs/index.html,True,/Users/devin/Projects/openanc/docs/map_2022/ancs/districts/8F04.html,True,False,index.html
