# Investigate: Non-English Attachments on English pages
This notebook explores GOV.UK pages whose attachments are not in English but are being marked as English, the default choice.

This is part of the Accessibility work to ensure compliance with WCAG. These attachments currently fail WCAG because the screen-reading software that visually impaired people use to read GOV.UK pages will suggest these attachments are in English, so the person will download one when the attachment is not actually in English. The consequence is that they will then have to download another attachment, so the page is less accessible.

## Approach
The approach this notebook takes is to identify a column in the pre-processed content store that contains the attachment. We define this by looking at the *attachment* element of the HTML code and the title relating to it. Generally, there are two directions that we can then take:
1. Detect language of attachment via its title
     + Is easiest method
     + Less reliable because names of attachments are typically short plus there are abbreviations. Language detection works less effectively when it has less language to scan. Just like how humans cannot accurately guess the language of text if they do not have much text to go by.
1. Detect language of contents of attachment
     + Harder as you need to read the attachments in bulk
     + All sorts of different attachments such as `.pdf`, `.doc`, `.csv`, `.html` so need a variety of ways to read the contents
     + More accurate as will be working with extra text
     
We discard Option (2.) because it would be really slow to download all the attachments and read their contents.

In [None]:
import os
import time

import pandas as pd
import numpy as np
import ast
import json

from pandarallel import pandarallel
import multiprocessing

from bs4 import BeautifulSoup

from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException

# display multiple outputs in same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
def extract_publishing_organisation(content_item, key, index = 0):
    """ Extracts one value from every entry stored under a key of a dictionary masquerading as a string

    The string is parsed with `ast.literal_eval`, the entries stored under `key`
    are iterated, and element `index` of each entry is collected.

    :param content_item: A string that's in the format of a dictionary
    :param key: The name of the key you want to extract the associated value from
    :param index: The index of the specific value to take from each entry stored under the key
    :return: a list with one extracted value per entry (empty if the key is absent),
        or `[np.nan]` if `content_item` cannot be parsed into a dictionary
    """
    try:
        # convert to dictionary
        parsed = ast.literal_eval(content_item)
    except (ValueError, SyntaxError):
        # missing values (NaN is not a string) and malformed literals end up here
        return [np.nan]

    # a valid literal that is not a dictionary has no keys to look up
    if not isinstance(parsed, dict):
        return [np.nan]

    # extract value of key entered from dictionary
    return [entry[index] for entry in parsed.get(key, [])]

In [None]:
def extract_title(text):
    """Extracts all the attachment titles listed in a dictionary masquerading as a string

    :param text: A string that's in the format of a dictionary, read for its top-level `attachments` entry
    :return: list of the `title` of every entry under `attachments`; an empty list
        when there is no top-level `attachments` entry or it holds nothing

    """
    details = ast.literal_eval(text)
    # `attachments` may be absent or None at the top level; treat both as "no attachments"
    attachments = details.get('attachments') or []

    return [attachment['title'] for attachment in attachments]


def extract_attachment_titles(html):
    """Collects the attachment titles found in a piece of HTML

    A title is the text of an `h2` element with class `title` that sits inside
    a `div` element with class `attachment-details`.

    :param html: String of the HTML code to search
    :return: list of the title texts, in the order they are found

    """
    # parse once, then walk every attachment container and the headings inside it
    parsed = BeautifulSoup(html, 'html.parser')

    return [
        heading.get_text()
        for container in parsed.find_all('div', class_ = 'attachment-details')
        for heading in container.find_all('h2', class_ = 'title')
    ]

In [None]:
def func_detectlangs(text):
    """Detects language of a text, returning a missing value instead of raising when detection is not possible

    :param text: A string to detect the language of
    :return: The result of `detect_langs` for the text, or `np.nan` if the input is
        not a string or langdetect raises a `LangDetectException`

    """
    # missing values (NaN/None) are not text, so there is nothing to detect
    if not isinstance(text, str):
        return np.nan

    try:
        return detect_langs(text)
    except LangDetectException:
        return np.nan

## Load Data
The data used in this notebook is all the content on GOV.UK that existed on 6th August 2020.

Due to the sheer size of the data, need to pre-specify column headings and which columns are dates to make the import process:
- Work
- Work relatively quickly

In [None]:
# every column is read as a plain object, apart from the two boolean flags;
# the key order doubles as the column order handed to `names` below
dict_header = dict.fromkeys(['base_path', 'content_id', 'title', 'description',
                             'publishing_app', 'document_type', 'details', 'text',
                             'organisations', 'taxons', 'step_by_steps', 'details_parts',
                             'first_published_at', 'public_updated_at', 'updated_at',
                             'finder', 'facet_values', 'facet_groups',
                             'has_brexit_no_deal_notice', 'withdrawn',
                             'withdrawn_at', 'withdrawn_explanation'],
                            object)
dict_header.update(has_brexit_no_deal_notice=bool, withdrawn=bool)

# columns that pandas should try to parse as dates
list_header_date = ['first_published_at', 'public_updated_at', 'updated_at', 'withdrawn_at']

# load the tab-separated, gzip-compressed content store, replacing its header row with our names
df = pd.read_csv('../data/preprocessed_content_store_200820.csv.gz',
                 sep='\t',
                 encoding='utf-8',
                 compression='gzip',
                 header=0,
                 names=list(dict_header),
                 dtype=dict_header,
                 parse_dates=list_header_date)

In [None]:
# report holding the language detected for each page's content; all columns read as objects
dict_header = dict.fromkeys(['base_path', 'text', 'text_languages', 'detected_as_english'], object)

df_lang_detect = pd.read_csv('../data/non_english_docs_report.csv',
                             header=0,
                             names=list(dict_header),
                             dtype=dict_header)

We need to combine the data where we detected the language of the page content, `df_lang_detect`, with the data that holds the attachment details, `df`.

In [None]:
# keep every page from the content store and attach its language-detection columns where available
df = pd.merge(left=df[['base_path', 'publishing_app', 'document_type', 'details', 'organisations']],
              right=df_lang_detect,
              on='base_path',
              how='left')

## Extract for non-English pages
Here, we are manipulating the data for the non-English pages segment of this work.

Further detail in Trello card [here](https://trello.com/c/TkxAtsZD).

In [None]:
# attach page metadata to every row of the language-detection report
df_lang_detect = pd.merge(left=df[['base_path', 'publishing_app', 'document_type', 'organisations']],
                          right=df_lang_detect,
                          on='base_path',
                          how='right')

# remove unnecessary rows of:
# - those that are detected to have English
# - those that have NaN in the `text` column
df_lang_detect = (df_lang_detect
                  .query('detected_as_english == "False"')
                  .dropna(subset = ['text'], axis = 'index'))

# extract the name (element 1 of each entry) of the primary publishing organisation
df_lang_detect['primary_publishing_organisation'] = df_lang_detect['organisations'].apply(
    extract_publishing_organisation,
    key = 'primary_publishing_organisation',
    index = 1)

# select only relevant columns
df_lang_detect = df_lang_detect.loc[:, ['base_path', 'primary_publishing_organisation', 'publishing_app', 'document_type', 'text', 'text_languages']]

In [None]:
# save the non-English page report
# (no `index = False` here, so pandas' default of also writing the index applies)
df_lang_detect.to_csv('../data/non_english_page_report.csv')

In [None]:
# drop the names that are no longer needed: the page report has been saved
# and the import helpers are only used while loading the data
del df_lang_detect
del dict_header, list_header_date

## Preprocessing
Need to extract organisation titles so analysis can be conducted to spot patterns.

In [None]:
%%time
# name (element 1 of each entry) of the primary publishing organisation for every page
df['organisation_name'] = df['organisations'].apply(
    extract_publishing_organisation,
    key = 'primary_publishing_organisation',
    index = 1)

To find webpages with attachments on, we assume the following (based on a few case examples):
1. They have a non-empty list in the `'attachments': [...]` element

This is not perfect though: we still pick up pages that don't have any attachments in them, probably because their `details` contain an empty list, `'attachments': []`.

In [None]:
# have attachments in `details` column, under 'attachments'
# (raw string so that `\[` reaches the regex engine untouched instead of being
# an invalid Python string escape; rows with missing `details` count as no match)
df['details_attachment_exists'] = df['details'].str.contains(r"'attachments': \[", na = False)
df_attachment = df.loc[df['details_attachment_exists']].copy()

In [None]:
# preview five reproducibly sampled pages, showing only the columns of interest
df_attachment.sample(n = 5, random_state = 42)[['base_path', 'organisation_name', 'publishing_app', 'document_type', 'details', 'text', 'text_languages']]

***

## Extracting link titles
Let's extract the file names from the urls so that we can start detecting the language. Will do this in two main stages:
1. Extract the urls from the HTML code
1. Extract the file names and extensions from the urls

Some example webpages to test are:
- [MMR](https://www.gov.uk/government/publications/measles-mumps-and-rubella-lab-confirmed-cases-in-england-2019)
- [Dart Charge Bulletin](https://www.gov.uk/government/publications/dart-charge-bulletin-3-advice-for-foreign-hgv-drivers)
- [Tribunal decisions](https://www.gov.uk/employment-tribunal-decisions/miss-r-youd-v-elton-community-centre-2404942-2017)

Compare methods... From the sample extracted as a .csv below, `df_attachment['attachment_title_dict']` is more comprehensive than `df_attachment['attachment_title_html']`. In particular, no entries are missing from the former that are present in the latter, whereas some entries are missing from the latter that are present in the former.

In [None]:
%%time
# titles read from the `attachments` entry of each page's `details`
df_attachment['attachment_title_dict'] = df_attachment['details'].map(extract_title)

In [None]:
%%time
# titles read from the attachment markup found in each page's `details`
df_attachment['attachment_title_html'] = df_attachment['details'].map(extract_attachment_titles)

In [None]:
# keep only pages whose list of extracted titles is non-empty
df_attachment = df_attachment.loc[df_attachment['attachment_title_dict'].map(len) > 0]

In [None]:
# for inspection
# (the sample size is capped at the number of rows, as sampling more rows than
# exist without replacement raises a ValueError)
df_attachment.sample(n = min(1000, len(df_attachment)), random_state = 42).to_csv('../data/sample_attachments.csv')

## Language Detection
Let's apply language detection on our attachment titles now.

In [None]:
# leave one core free, but never ask for fewer than one worker (single-core machines)
n_cores = max(1, multiprocessing.cpu_count() - 1)
pandarallel.initialize(nb_workers = n_cores, progress_bar=True, use_memory_fs=False)

In [None]:
# smaller cut of the data, with every attachment title moved onto its own row
df_extract = (
    df_attachment
    .loc[:, ['base_path', 'publishing_app', 'organisation_name', 'document_type', 'text', 'text_languages', 'attachment_title_dict']]
    .copy()
    .explode('attachment_title_dict')
)

In [None]:
%%time
# detect the language of every attachment title, using pandarallel's `parallel_apply`
# (initialised in the cell above) rather than a plain `apply`
df_extract['attachment_title_lang'] = df_extract['attachment_title_dict'].parallel_apply(func_detectlangs)

Performs poorly with abbreviations and short sentences, which makes sense.

Next step is to identify those pages where the language that the content is in, `text_languages`, does not match the language of the attachment title, `attachment_title_lang`.
- This is more general compared to those pages that are English but have attachments with a non-English title.

In [None]:
%%time
# shorter column names for the saved outputs
df_extract = df_extract.rename(columns = dict(text_languages = 'text_lang',
                                              attachment_title_dict = 'attachment_title'))

# save the results so we don't have to rerun language detection
# (note, csv does not retain nested structures such as those in `attachment_title_lang`,
# hence the pickle alongside it)
df_extract.to_pickle('../data/df_attachment.pkl')
df_extract.to_csv('../data/df_attachment.csv', index = False)

***

## Formatting

Now focus on a subset of columns for the data and compare the languages identified in the text with the languages identified from the attachment titles.

We will need to do some transformation to:
1. for the text and attachment titles, isolate the language code from the confidence scores *e.g. [en: 0.9956]*
1. for the attachment titles, *explode* it out so that for each page, each associated row will be one of the attachment titles and the corresponding language
1. step 2. allows us to directly compare the predominant language identified for the page text with that related to the attachment title 

In [None]:
# widen displayed columns so long titles are not truncated, then order rows by page
pd.set_option('display.max_colwidth', 1000)
df_extract = df_extract.sort_values(by = 'base_path')

In [None]:
# display the data, now sorted by `base_path`, for a visual check
df_extract

In [None]:
# inspect the attachment titles and detected languages of one example page
df_extract.loc[
    df_extract['base_path'] == "/government/publications/foi-responses-published-by-mod-week-commencing-23-november-2015",
    ['base_path', 'attachment_title', 'attachment_title_lang'],
]

In [None]:
# peek at one example: element 0 of every item in the 11th row of `attachment_title_lang`
# NOTE(review): the nested list is built for every row before `[10]` is taken, so this
# assumes every row holds an iterable of subscriptable items — TODO confirm, since
# `func_detectlangs` returns NaN when detection fails
[[sublists[0] for sublists in lists] for lists in df_extract['attachment_title_lang']][10]

In [None]:
# better way is to convert to list and extract first element but harder
# (the page language column is called `text_lang` here: `text_languages` was renamed
# before the data was saved, so the old name would raise a KeyError)
df_extract['text_lang_main'] = df_extract['text_lang'].str[1:3]
# NOTE(review): assumes every row of `attachment_title_lang` is an iterable of
# subscriptable items — TODO confirm, `func_detectlangs` returns NaN on failure
df_extract['attachment_title_lang_main'] = [[sublists[0] for sublists in lists] for lists in df_extract['attachment_title_lang']]

# what we want to do next is see if the language in `text_lang` is in this extracted list
df_extract[['text_lang', 'text_lang_main', 'attachment_title_lang_main']]

In [None]:
# extract first two characters to get language
# NOTE(review): `attachment_title_lang_main` holds lists, so their string form starts
# with the opening bracket; `[0:2]` therefore presumably does not yield a bare language
# code (compare the `[1:3]` slice used for `text_lang_main`) — TODO confirm the offset
df_extract['attachment_title_lang_main_txt'] = df_extract['attachment_title_lang_main'].astype(str).str[0:2]
# show the two columns that are compared in the `check` step
df_extract[['text_lang_main', 'attachment_title_lang_main_txt']]

In [None]:
# narrow the displayed column width again
pd.options.display.max_colwidth = 100
# NOTE(review): `df_unpivot_title` is not assigned anywhere in the visible notebook —
# presumably a leftover from an earlier version; confirm which DataFrame is meant
df_unpivot_title['attachment_title_dict'].head(10)

In [None]:
# NOTE(review): `df_output` is only assigned in the `check` cell further down, so this
# cell relies on that one having been run first
# drop rows without page text, keep the report columns and save the report
df_output = df_output.dropna(subset = ['text'])
df_output = df_output[['base_path', 'organisation_name', 'publishing_app', 'document_type', 'text_lang', 'attachment_title', 'attachment_title_lang']]
df_output.to_csv('../data/non_english_attachment_report.csv', index = False)

In [None]:
# see if text language is same as attachment title language
df_extract['check'] = df_extract['text_lang_main'] == df_extract['attachment_title_lang_main_txt']

# prepare data output to save as .csv
# (`text_lang` and `attachment_title` already carry their final names: they were renamed
# before the data was saved, so selecting the old names would raise a KeyError)
df_output = df_extract.query('check == False').copy()
df_output = df_output[['base_path', 'organisation_name', 'publishing_app', 'document_type', 'text', 'text_lang', 'attachment_title', 'attachment_title_lang_main']]
df_output = df_output.rename(columns = {'attachment_title_lang_main': 'attachment_title_lang'})
df_output.sort_values(by = ['base_path'])

In [None]:
# save the report sorted by page
# NOTE(review): this writes to the same path as the `dropna` cell above, so whichever
# of the two cells runs last decides the file's contents
df_output.sort_values(by = ['base_path']).to_csv('../data/non_english_attachment_report.csv', index = False)