In [None]:
!pip install pandas
!pip install requests
!pip install pdfplumber

In [None]:
import pandas as pd

## Data download 

You need to download the `company_document_issue_responses.csv` file from https://www.business-humanrights.org/en/from-us/modern-slavery-statements/ by clicking the **Download company response documents comparison data** button.

Once the CSV file is downloaded, you can fetch each document (in PDF, PNG or JPG format) from the URL given in its **Document Cached URL** column.


In [None]:
# Load the comparison CSV (expected in the working directory) and preview
# the column that holds the URL of each cached document.
df_company = pd.read_csv('company_document_issue_responses.csv')
df_company['Document Cached URL'].head()

You can iterate over the rows of the dataframe and download the corresponding document from the URL provided in each row.

In [None]:
import requests

def download_file(url, local_file_location, timeout=60):
    """Download ``url`` to ``local_file_location`` unless that file already exists.

    Parameters
    ----------
    url : str
        Address of the document to fetch (redirects are followed).
    local_file_location : str
        Path of the file to write. Missing parent folders are created.
    timeout : float or None, optional
        Seconds to wait for the server before ``requests`` gives up.
        ``None`` waits indefinitely.

    Notes
    -----
    The HTTP status code is not checked: whatever body the server returns is
    written to disk as-is. Network errors raised by ``requests`` propagate to
    the caller.
    """
    # Local import: the notebook only imports `os` in a later cell, so relying
    # on the global name breaks a fresh "Restart & Run All".
    import os

    # Check the cache first so an already-downloaded file costs no request.
    if os.path.isfile(local_file_location):
        return

    r = requests.get(url, allow_redirects=True, timeout=timeout)

    parent_dir = os.path.dirname(local_file_location)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(local_file_location, 'wb') as f:
        f.write(r.content)

In [None]:
# Folder (relative to the working directory) that holds the downloaded documents;
# local file names are built by joining it with the last segment of each URL.
LOCAL_FILE_FOLDER = 'data/'

In [None]:
# `join` is only imported in a later cell, so use os.path.join here to keep
# the notebook runnable top-to-bottom on a fresh kernel.
import os

os.makedirs(LOCAL_FILE_FOLDER, exist_ok=True)

for i, (_, row) in enumerate(df_company.iterrows(), start=1):
    url = row['Document Cached URL']
    # Missing URLs are read by pandas as NaN (a float): nothing to download.
    if isinstance(url, str):
        # use the last part of the url as the local file name
        file_name = os.path.join(LOCAL_FILE_FOLDER, url.split('/')[-1])
        try:
            download_file(url, file_name)
        except (requests.RequestException, OSError) as exc:
            # Best effort: report the failure and keep going with the other rows.
            print('failed', url, repr(exc))
    print(i)

## Extract data from PDF and HTML

The following code provides an example of extracting text from PDF files using a Python library called pdfplumber; however, you can use any library you prefer.

You will need to do your own research if you wish to extract text from image file formats such as PNG or JPG. The basic idea is to run OCR (optical character recognition) on the image file and then extract the text from the OCR output.

Note: there are scanned PDF files that should be treated as image files. 

In [None]:
import os
import logging
import jsonpickle
import pathlib
import urllib.parse
from bs4 import BeautifulSoup
from bs4.element import Comment
from boilerpy3 import extractors

# Take the root logger (getLogger() without a name) and let INFO-level
# records and above through.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def accepted_tags(element):
    """Tell whether a parsed HTML element should be kept for text extraction.

    An element is accepted when its own tag name, or the tag name of its
    direct parent, is one of the accepted names below.

    Parameters
    ----------
    element : object exposing ``name`` and ``parent`` attributes
        Typically a BeautifulSoup tag.

    Returns
    -------
    bool
        True if the element or its parent is an accepted tag, False otherwise
        (including when the element has no parent).
    """
    # 'a href' and 'href' are kept from the original list for compatibility;
    # they are attribute spellings rather than regular HTML tag names.
    TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'section', 'a', 'a href', 'href', 'ul')

    if element.name in TAGS:
        return True

    # Every remaining case of the original chain returned False, so the only
    # other way to be accepted is through the parent's tag name.
    parent = element.parent
    return parent is not None and parent.name in TAGS

def processing_extracted_text(input_slice):
    """Turn a sequence of parsed HTML elements into a de-duplicated list of text lines.

    Rules applied to each element, in order:

    * ``<a>`` elements contribute ``"<link text> <href>"`` only when they have
      an ``href`` attribute and the word "slavery" appears in their markup;
      they never contribute plain text.
    * ``<p>`` elements carrying the CSS class ``copyright`` are skipped.
    * ``<p>``/``<span>`` directly inside an ``<a>``, and ``h2``-``h4`` headings
      directly inside a ``<nav>``, are skipped.
    * Elements whose text mentions "email", or whose markup contains a link
      (``a href``), are skipped.
    * Everything else contributes its text with whitespace runs collapsed to
      single spaces.

    Parameters
    ----------
    input_slice : iterable
        Elements exposing ``name``, ``parent``, ``text``, ``get(attr)`` and a
        ``str()`` form that is their markup (e.g. BeautifulSoup tags).

    Returns
    -------
    list of str
        Unique lines in first-seen order.
    """
    heading_tags = ('h2', 'h3', 'h4')
    list_sentences = []
    seen = set()  # O(1) duplicate check; the list keeps first-seen order

    def _add(sentence):
        if sentence not in seen:
            seen.add(sentence)
            list_sentences.append(sentence)

    for text_element in input_slice:
        name = text_element.name
        parent = text_element.parent
        parent_name = parent.name if parent is not None else None

        if name == 'a':
            # .get() avoids the KeyError the original raised when "href" only
            # appeared in the link text or in another attribute.
            href = text_element.get('href')
            if href is not None and 'slavery' in str(text_element).lower():
                link_sentence = text_element.text + " " + href
                # Strip the whole sentence: the original stripped only the
                # href, which made this emptiness check always pass.
                if link_sentence.strip():
                    _add(link_sentence)
            # Links never contribute plain text.
            continue

        if name == 'p':
            # BeautifulSoup returns multi-valued attributes such as `class`
            # as a list, so comparing it to the string 'copyright' never matched.
            css_classes = text_element.get('class') or []
            if isinstance(css_classes, str):
                css_classes = css_classes.split()
            if 'copyright' in css_classes:
                continue

        if parent_name == 'a' and name in ('p', 'span'):
            continue
        if parent_name == 'nav' and name in heading_tags:
            continue
        if "email" in text_element.text.lower():
            continue
        if "a href" in str(text_element):
            # The element wraps a link; skip its text.
            continue

        _add(' '.join(text_element.text.split()))

    return list_sentences


def filter_tags(body):
    """Extract the readable text of an HTML document as newline-separated lines.

    The markup is parsed with BeautifulSoup, reduced to the elements accepted
    by ``accepted_tags`` (duplicates removed, order preserved) and passed to
    ``processing_extracted_text``. If an ``h1``/``h2`` heading mentioning
    "slavery" exists, only the elements from the first such heading onwards
    are processed; otherwise the whole document is.

    Parameters
    ----------
    body : str
        HTML markup.

    Returns
    -------
    str
        The extracted lines joined with ``"\\n"`` (empty when nothing is kept).
    """
    soup = BeautifulSoup(body, "html.parser")
    texts = soup.find_all(name=['p', 'a', 'h1', 'h2', 'h3', 'h4', 'li', 'ul'])
    accepted_headers = ('h1', 'h2')
    # dict.fromkeys drops duplicate elements while keeping document order.
    extracted_data = list(dict.fromkeys(filter(None, filter(accepted_tags, texts))))

    # Index of the first heading that mentions slavery; 0 (whole document)
    # when there is none. The element list is processed exactly once; the
    # original re-processed the full list for every element that preceded
    # the heading.
    start = next(
        (i for i, text_element in enumerate(extracted_data)
         if text_element.name in accepted_headers
         and 'slavery' in str(text_element).lower()),
        0,
    )
    list_sentences = processing_extracted_text(extracted_data[start:])

    return "\n".join(list_sentences)


def extract_filtered_data(html_file):
    """Read an HTML file and return its cleaned text, or a placeholder message.

    The file is read as UTF-8 (undecodable bytes are replaced), reduced to
    text lines by ``filter_tags``, and every line of two words or fewer is
    dropped.

    Parameters
    ----------
    html_file : str
        Path of the HTML file to process.

    Returns
    -------
    str
        * ``"No Modern Slavery statement found"`` when the remaining text
          contains one of the known "page not found" phrases;
        * ``"Nothing to return"`` when no text remains;
        * otherwise the remaining lines joined with ``"\\n"``.
    """
    list_invalid_pages = ["page was not found", "url was not found", "page can’t be found","page isn't real", 
    "page is not real", "page not found", "error 404", "couldn't find the page"]

    with open(html_file, encoding='utf8', errors='replace') as f:
        file_contents = f.read()

    # Build a new list instead of calling .remove() while iterating: removing
    # in place skips the element after each removal, so consecutive short
    # lines used to survive.
    kept_sentences = [
        sentence
        for sentence in filter_tags(file_contents).split("\n")
        if len(sentence.split()) > 2
    ]
    final_content = "\n".join(kept_sentences)

    lowered_content = final_content.lower()
    if any(element in lowered_content for element in list_invalid_pages):
        return "No Modern Slavery statement found"

    if len(final_content) == 0:
        return "Nothing to return"

    return final_content

In [None]:
from os import listdir
from os.path import isfile, join
import pdfplumber

# Extract the text of every downloaded document and collect one record per
# company row. Rows are gathered in a list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
extracted_rows = []
for _, row in df_company.iterrows():
    url = row['Document Cached URL']
    # Missing URLs are read by pandas as NaN (a float): there is no file to parse.
    if not isinstance(url, str):
        continue

    file_name = join(LOCAL_FILE_FOLDER, url.split('/')[-1])
    if not os.path.isfile(file_name):
        continue

    all_text = ''
    lower_name = file_name.lower()  # accept upper-case extensions such as .PDF too
    try:
        if lower_name.endswith('pdf'):
            with pdfplumber.open(file_name) as pdf:
                for pdf_page in pdf.pages:
                    single_page_text = pdf_page.extract_text()
                    # Pages without a text layer yield no text and are skipped.
                    if single_page_text:
                        all_text = all_text + '\n' + single_page_text
        elif lower_name.endswith('html'):
            all_text = extract_filtered_data(file_name)
    except Exception as exc:
        # Best effort: report the problematic file and keep whatever text was
        # extracted before the failure, without hiding KeyboardInterrupt.
        print(file_name, repr(exc))

    row['extracted_text'] = all_text
    extracted_rows.append(row)

df_data = pd.DataFrame(extracted_rows)


In [None]:
# (number of rows, number of columns) of the extracted dataset.
df_data.shape

In [None]:
# Save the extracted dataset as an Excel workbook in the working directory.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) for this — confirm it is installed.
df_data.to_excel('data.xlsx')

In [None]:
# Display the rows at positions 200 to 499 of the extracted dataset.
df_data[200:500]