In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
from tqdm import tqdm
import numpy as np
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymupdf

## Get all law case links

In [None]:
# Search terms submitted to the LIIofIndia case search, one request per term.
# British and American spellings are listed separately because each term is
# sent to the search endpoint verbatim.
queries = ['forced labour',
            'forced labor',
            'coerced labor',
            'coerced labour',
            'labor exploitation',
            'labour exploitation',
            'labour bondage',
            'labor bondage',
            'involuntary servitude',
            # the original literal ended with an invisible zero-width space,
            # which would have been sent to the server as part of the query
            'human trafficking',
            'child labor',
            'child labour',
            'modern slavery',
            'slavery']

# Request headers sent with every HTTP call (browser-like User-Agent and Accept).
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8'
          }

# Site root; file paths found on case pages are appended to it when downloading.
domain = 'http://www.liiofindia.org/'

# Search URL template; `{}` is replaced by the query with spaces turned into `+`.
query_base = 'http://www.liiofindia.org/cgi-bin/sinosrch.cgi?method=boolean&query={}&meta=%2Fliiofindia&lii=LIIofIndia&mask_path=in%2Fcases'

In [None]:
def get_law_urls(soup):
    """
    Get all law case urls from a search result page.

    Parameters
    ----------
    soup : parsed search page (BeautifulSoup object)
        The first ``<ol>`` element is taken as the result list and each of
        its ``<li>`` elements as one search hit.

    Returns
    -------
    list of dict or None
        ``None`` when the page has no ``<ol>`` (empty search response).
        Otherwise one dict per hit whose title looks like a case name
        (contains ``v``/``vs`` surrounded by whitespace), with the keys
        ``url``, ``title``, ``year`` (smallest 4-digit number found in the
        title, or ``None``) and ``database`` (``None`` when the hit has no
        ``<small>`` element).
    """
    law_cases_list = soup.find('ol')

    # some queries return empty response
    if law_cases_list is None:
        return None

    law_cases_url = []
    for case in law_cases_list.find_all('li'):
        # Reset per hit so an entry without a link or database label can
        # never inherit the values of the previous entry.
        url_case = title = year = database = None

        paragraph = case.p
        if paragraph is None:
            continue

        for item in paragraph.contents:
            if item.name == 'a':
                url_case = item['href']
                # str() detaches the text from the parse tree
                title = str(item.contents[0])
                years = [int(y) for y in re.findall(r'\d{4}', title)]
                year = min(years) if years else None
            elif item.name == 'small':
                database = str(item.contents[1].contents[0])

        # hit without a link: nothing to record
        if title is None:
            continue

        # check if it's a law case ("X v. Y", "X vs Y", ...)
        if re.search(r'\s[vV][sS]?\.?\s', title):
            law_cases_url.append({'url': url_case,
                                'title': title,
                                'year': year,
                                'database': database})
    return law_cases_url

In [None]:
# Run every search query, follow its pagination links and gather all case
# records into `law_cases` (one dict per hit, tagged with the query used).
law_cases = []
for query in queries:
    query_cases = []
    search_url = query_base.format(query.replace(' ', '+'))
    soup = BeautifulSoup(requests.get(search_url, headers=header).content, "html.parser")

    # pagination links: any href carrying an `offset=<n>0` parameter
    search_pages = []
    for link in soup.find_all('a'):
        if link.has_attr('href') and re.match(r'.*offset=\d+0.*', link['href']):
            search_pages.append(link['href'])
    # De-duplicate and turn the relative links into absolute URLs.
    # NOTE(review): links that already contain the site root are dropped by
    # this filter rather than kept as-is - confirm that is intended.
    search_pages = ['http://www.liiofindia.org' + page for page in list(set(search_pages)) if 'http://www.liiofindia.org' not in page]

    # search links in first page
    urls = get_law_urls(soup)
    if urls is not None:
        query_cases += urls

        # search in offset pages
        for page in search_pages:
            soup = BeautifulSoup(requests.get(page, headers=header).content, "html.parser")
            # get_law_urls returns None for a page without a result list;
            # `+= None` would raise TypeError, so treat it as "no hits"
            page_urls = get_law_urls(soup)
            if page_urls is not None:
                query_cases += page_urls

        # update query value
        for case in query_cases:
            case['query'] = query

        # add to law cases list
        law_cases += query_cases

    print(f'{query}: {len(query_cases)}')

In [None]:
# Turn the collected case records into a DataFrame and persist them to
# search.csv (without the index column).
df = pd.DataFrame(law_cases)
df.to_csv('search.csv', index=False)

In [None]:
# Preview the first rows of the search results.
df.head()

## Extract page content

In [None]:
# Reload the search results written by the crawl step.
df = pd.read_csv('search.csv')
# Placeholder column for the scraped page text. It is created with object
# dtype because it is filled with strings later; assigning strings into a
# float64 NaN column is deprecated in recent pandas versions.
df['content'] = pd.Series(np.nan, index=df.index, dtype=object)

In [None]:
def get_content(soup):
    """
    Extract the document text from a parsed case page.

    The text is taken from the first ``<p>`` element that follows the
    page's first ``<h2>``, with one line per text node. The site footer
    (context links, copyright/disclaimer links and the ``URL:`` line) is
    removed.

    Parameters
    ----------
    soup : parsed case page (BeautifulSoup object)

    Returns
    -------
    str
        The extracted text, or ``''`` when the page has no ``<h2>`` or no
        ``<p>`` after it.
    """
    title = soup.h2
    if title is None:
        return ''

    document = title.find_next('p')
    if document is None:
        return ''

    text = document.get_text(strip=True, separator='\n')

    # remove footnote
    return re.sub(r'\[\nContext\n\] \[\nHide Context\n\]\n(?:CommonLII|LIIofIndia):\nCopyright Policy\n\|\nDisclaimers\n\|\nPrivacy Policy\n\|\nFeedback\nURL:\s.*', '', text)

In [None]:
# Fetch every case page and store its text in the `content` column.
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
    # rows that already have content are skipped, so the cell can be re-run
    if not pd.isna(row.content):
        continue
    sleep(0.01)
    soup = BeautifulSoup(requests.get(row.url, headers=header).content, "html.parser")

    # Check if it is an embedded pdf. `.get` is used instead of indexing so
    # an <object> without a `type` attribute does not raise KeyError.
    obj = soup.find('object')
    if obj is not None and obj.get('type') == 'application/pdf':
        # marked with an empty string; these documents are handled by the
        # PDF download step
        df.loc[idx, 'content'] = ''
        continue

    df.loc[idx, 'content'] = get_content(soup)

In [None]:
# Persist the search results together with the scraped page text.
df.to_csv('lii_india_modern_slavery.csv', index=False)

In [None]:
# Number of rows whose content is the empty string (pages flagged as
# embedded PDFs by the scraping loop).
len(df.loc[df['content']==''])

## Download Embedded PDFs and extract their content

In [None]:
# Reload the scraped data; empty `content` strings come back as NaN.
df = pd.read_csv('lii_india_modern_slavery.csv')
# Rows still lacking text are the candidates for PDF download. Only the
# `content` column is checked: testing every column would also select rows
# that merely lack a `year` or `database`, and their already-scraped text
# would later be overwritten.
pdf_df = df[df['content'].isnull()]
pdf_df.shape

In [None]:
last_downloaded = -1 # checkpoint: index of the last row processed by the download loop
data_path = './data/' # download path
downloaded_files = [] # list of downloaded files with corresponding rows

# make sure the download directory exists before listing / writing into it
os.makedirs(data_path, exist_ok=True)
# files already present from a previous run (os.listdir returns bare file names)
prev_downloaded = os.listdir(data_path)

In [None]:
def _download_pdf(file_path):
    """
    Download the PDF at `file_path` (a path on the site) into `data_path`.

    The local file name is the path with `/` replaced by `_` and the
    `in_cases_` prefix removed. Files listed in `prev_downloaded` are not
    fetched again.

    Returns the local file name, or -1 when the server answers with an
    error status (so that no error page is stored as a PDF).
    """
    filename = file_path.replace('/', '_').strip('_').replace('in_cases_', '')
    if filename not in prev_downloaded:
        sleep(0.01)
        # join root and path with exactly one slash
        response = requests.get(domain.rstrip('/') + '/' + file_path.lstrip('/'), headers=header) # get file from database
        if not response.ok:
            return -1
        with open(data_path+filename, 'wb') as f:
            f.write(response.content)
    return filename


# For every row without text, locate the PDF version of the document,
# download it and record `(row index, file name)`; -1 marks "no file found".
for idx, row in tqdm(pdf_df.iterrows(), total=pdf_df.shape[0]):
    # resume after the last row handled in a previous (interrupted) run
    if idx <= last_downloaded:
        continue
    sleep(0.01)
    soup = BeautifulSoup(requests.get(row.url, headers=header).content, "html.parser")

    filename = -1
    obj = soup.find('object')
    if obj is not None and obj.get('type') == 'application/pdf':
        # embedded PDF: the <object>'s `data` attribute is the file location
        pdf_path = obj.get('data')
        if pdf_path:
            filename = _download_pdf(pdf_path)
    elif 'There is no available HTML version of this document' in soup.get_text():
        # no HTML version: same path as the page, with the extension swapped
        if match := re.search(r'(in\/cases\/.*\.html)', row.url):
            # replace only the trailing extension, not every "html" in the path
            filename = _download_pdf(match.group(1)[:-len('html')] + 'pdf')

    # every processed row gets exactly one entry
    downloaded_files.append((idx, filename))

    last_downloaded = idx

In [None]:
# Number of rows processed by the download loop (including those recorded as -1).
print(len(downloaded_files))

In [None]:
# Extract the text of every downloaded PDF, in the same order as
# `downloaded_files`, so that `idx_list[k]` and `doc_content[k]` belong together.
idx_list = [idx for idx, _ in downloaded_files]
doc_content = []
for idx, file in downloaded_files:
    if file == -1:
        # no file was found for this row
        doc_content.append('')
    else:
        # context manager closes the document (and its file handle) after reading
        with pymupdf.open(data_path+file) as doc:
            content = ' '.join([page.get_text() for page in doc])
        doc_content.append(content)

In [None]:
# Write the extracted PDF text back into the rows it was downloaded for
# (idx_list and doc_content are aligned position by position).
df.loc[idx_list, 'content'] = doc_content

In [None]:
# Keep only the first row for each distinct `content` value.
# NOTE(review): rows with empty content all share the same value, so only
# the first of them survives this step - confirm that is intended.
df = df.drop_duplicates('content')

In [None]:
# Export the final dataset as CSV (without the index) and as an Excel sheet.
# NOTE(review): the Excel export does not pass index=False, unlike the CSV
# export - confirm whether the index column is wanted in the .xlsx file.
df.to_csv('lii_india_modern_slavery.csv', index=False)
df.to_excel('lii_india_modern_slavery.xlsx', sheet_name='lii_india_modern_slavery', engine='xlsxwriter')