In [None]:
import json
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import glob
import concurrent.futures
from selenium.webdriver.chrome.options import Options

In [None]:
# Glob pattern selecting every JSON file directly inside the ``Data`` folder
# (a relative path, so it is resolved against the current working directory).
data_folder_path = 'Data/*.json'

# Materialise the lazy glob iterator into a list of matching file paths.
json_files = list(glob.iglob(data_folder_path))

In [None]:
def extract_year_from_filename(filename):
    """Return the first all-digit, underscore-separated token of a file name.

    Only the final path component is inspected, and its extension is ignored,
    so ``Data/cases_2015.json`` and ``Data\\2015_cases.json`` both yield 2015.

    Args:
        filename: Path to a file; ``/`` and ``\\`` are both accepted as
            directory separators.

    Returns:
        The token converted to ``int``, or ``None`` when no underscore-separated
        token of the file name consists solely of digits.
    """
    # Treat both separator styles alike so the directory part never leaks into
    # the first token (e.g. "Data/2015" is not a digit string).
    base_name = filename.replace('\\', '/').rsplit('/', 1)[-1]

    # Drop the extension so a trailing token such as "2015.json" becomes "2015".
    stem = base_name.rsplit('.', 1)[0] if '.' in base_name else base_name

    for part in stem.split('_'):
        if part.isdigit():
            return int(part)
    return None

def _year_sort_key(path):
    """Sort key for a data file: (has a year, year, path).

    ``extract_year_from_filename`` may return ``None``; putting ``None`` and
    ``int`` values in the same tuple position makes the sort raise TypeError.
    The leading boolean keeps every compared position type-consistent, and with
    ``reverse=True`` it places files without a year after all dated files.
    """
    year = extract_year_from_filename(path)
    has_year = year is not None
    return (has_year, year if has_year else 0, path)


# Newest year first; ties (and year-less files) are ordered by path, descending.
json_files.sort(key=_year_sort_key, reverse=True)

In [None]:
def initialize_headless_webdriver():
    """Build and return a Chrome WebDriver configured with headless flags.

    The browser is started with the ``--headless`` and ``--no-sandbox``
    command-line arguments. The caller owns the returned driver and is
    responsible for quitting it.
    """
    options = Options()
    for flag in ("--headless", "--no-sandbox"):
        options.add_argument(flag)
    return webdriver.Chrome(options=options)

In [None]:
# Seconds each explicit wait may block before raising TimeoutException.
_WAIT_TIMEOUT_SECONDS = 20
# Page size used to turn the total record count into a number of pages.
_REFERRED_ROWS_PER_PAGE = 15


def _scrape_detail_tables(wait, case):
    """Copy two-column key/value rows from the detail tables into ``case``.

    Waits for the ``table-dataview`` container; the wait's exception (e.g.
    TimeoutException) is deliberately left to propagate so the caller can
    abandon the page.
    """
    tables_div = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'table-dataview')))
    tables = tables_div.find_elements(By.TAG_NAME, 'table')
    if not tables:
        print("No tables found on the page.")
        return

    for table in tables:
        for row in table.find_elements(By.TAG_NAME, 'tr'):
            cells = row.find_elements(By.TAG_NAME, 'td')
            if len(cells) != 2:
                continue  # only plain "label | value" rows are of interest
            key = cells[0].text.strip()
            value = cells[1].text.strip()
            if key and value:
                case[key] = value
            else:
                print("Missing data in table row.")


def _scrape_case_referred(driver, wait):
    """Collect the rows of the paginated "Case referred" table.

    Best-effort: any error is reported via ``print`` and the rows gathered so
    far are returned, so the result may be partial or empty.
    """
    referred_rows = []
    try:
        total_records_label = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".totalrecords")))
        # The number after the last "-" in the label is used as the record count.
        total_records = int(total_records_label.text.split('-')[-1])
        # Ceiling division: a partially filled last page still counts as a page.
        total_pages = (total_records + _REFERRED_ROWS_PER_PAGE - 1) // _REFERRED_ROWS_PER_PAGE
        print(f"Total pages: {total_pages}")

        current_page = 1
        while current_page <= total_pages:
            case_referred_table = wait.until(EC.presence_of_element_located((By.XPATH, "//h5[contains(text(), 'Case referred')]/following-sibling::div//table")))
            # [1:] skips the first <tr> of the table.
            for row in case_referred_table.find_elements(By.TAG_NAME, 'tr')[1:]:
                cells = row.find_elements(By.TAG_NAME, 'td')
                if len(cells) >= 4:
                    judgment_name = cells[3].text
                    links = cells[3].find_elements(By.TAG_NAME, 'a')
                    judgment_link = links[0].get_attribute('href') if links else None
                    referred_rows.append({
                        "Scr Citation": cells[1].text,
                        "Judicial Consideration": cells[2].text,
                        "Judgment Name": judgment_name,
                        "Judgment Link": judgment_link,
                    })

            current_page += 1
            if current_page > total_pages:
                break

            try:
                next_page_link = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, f"ul.pagination a.page-link[data-page_number='{current_page}']")))
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", next_page_link)
                # The old link going stale is the signal that the page changed.
                wait.until(EC.staleness_of(next_page_link))
            except TimeoutException:
                print(f"Failed to load page {current_page}")
                break

    except NoSuchElementException:
        print("Pagination element not found, possibly reached the last page or there's an issue with the selector.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return referred_rows


def _non_blank_texts(elements):
    """Return the ``text`` of every element whose text is not blank."""
    return [element.text for element in elements if element.text.strip() != ""]


def _scrape_headnote(driver, wait):
    """Expand the "read more" section and return the headnote paragraphs.

    Returns the paragraph texts joined by single spaces, or a placeholder
    string when the section cannot be found or does not expand in time.
    """
    try:
        read_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".read-more__link")))
        # Scroll so the button sits around the vertical middle of the viewport.
        driver.execute_script("window.scrollTo(0, arguments[0].getBoundingClientRect().top + window.scrollY - (window.innerHeight / 2));", read_more_button)
        try:
            read_more_button.click()
        except Exception:
            # Native click failed; fall back to a JavaScript click.
            driver.execute_script("arguments[0].click();", read_more_button)

        wait.until(lambda drv: "is-expanded" in drv.find_element(By.CSS_SELECTOR, ".read-more.js-read-more").get_attribute("class"))
        headnote_element = driver.find_element(By.CSS_SELECTOR, ".view-keyword h5 + div")
        return ' '.join(p.text for p in headnote_element.find_elements(By.TAG_NAME, 'p'))
    except TimeoutException:
        return "Not available or loading timed out"
    except NoSuchElementException:
        return "Not available"


def process_file(file_path):
    """Enrich every case in a JSON file with data scraped from its case page.

    The file is expected to hold a list of dicts, each with ``'Case Link'`` and
    ``'Case Name'`` keys. For each case the page is opened and the detail
    tables, "Case referred" rows, acts, keywords and headnote are added to the
    dict. The whole list is written back to ``file_path`` after every case, so
    progress survives an interrupted run.

    Args:
        file_path: Path of the JSON file to read and overwrite in place.

    Raises:
        KeyError: If a case lacks ``'Case Link'`` or ``'Case Name'``.
        Other exceptions from reading/parsing the file or starting the browser
        propagate unchanged. Errors while scraping a single page are printed
        and do not stop the remaining cases.
    """
    print(f"Processing {file_path}...")
    with open(file_path, 'r') as file:
        data = json.load(file)

    driver = initialize_headless_webdriver()
    try:
        # One wait object is enough: it only stores the driver and the timeout.
        wait = WebDriverWait(driver, _WAIT_TIMEOUT_SECONDS)

        for case in data:
            case_link = case['Case Link']
            print(f"Opening URL for case: {case['Case Name']}")

            try:
                # Inside the try so a single failed page load is reported and
                # skipped instead of aborting the rest of the file.
                driver.get(case_link)

                _scrape_detail_tables(wait, case)
                case["Case referred"] = _scrape_case_referred(driver, wait)

                # find_elements returns an empty list when nothing matches, so
                # a missing section simply yields an empty list here.
                case["Acts"] = _non_blank_texts(
                    driver.find_elements(By.CSS_SELECTOR, ".view-keyword.act li"))
                case["Keyword"] = _non_blank_texts(
                    driver.find_elements(By.CSS_SELECTOR, ".view-keyword h5 + .view-key > ul > li"))

                case["HeadNote"] = _scrape_headnote(driver, wait)

            except Exception as e:
                print(f"An error occurred while processing the page: {e}")
            finally:
                # Checkpoint after every case, whether or not scraping succeeded.
                with open(file_path, 'w') as file:
                    json.dump(data, file, indent=4)
                print(f"Original JSON data updated and saved to {file_path}.")

        print("Completed", file_path)
    finally:
        # Always shut the browser down, even when an exception escapes above;
        # otherwise the Chrome process would be left running.
        driver.quit()

In [None]:
def main(start_year=2010, end_year=2017):
    """Select the data files whose filename year is in range and report the count.

    Args:
        start_year: First year (inclusive) to keep. Defaults to 2010.
        end_year: Last year (inclusive) to keep. Defaults to 2017.

    Files for which ``extract_year_from_filename`` returns ``None`` are skipped;
    comparing ``None`` against the integer bounds would raise TypeError.
    """
    valid_files = []
    for file_path in json_files:
        year = extract_year_from_filename(file_path)
        if year is not None and start_year <= year <= end_year:
            valid_files.append(file_path)
    print(f"Found {len(valid_files)} valid files")

    # NOTE(review): the processing step below is commented out, so this function
    # currently only reports the number of selected files. Confirm whether it
    # was disabled on purpose before re-enabling it.
    # with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    #     futures = [executor.submit(process_file, file_path) for file_path in valid_files]

    #     for future in concurrent.futures.as_completed(futures):
    #         try:
    #             result = future.result()
    #         except Exception as exc:
    #             print(f'Generated an exception: {exc}')

In [None]:
# Run the notebook's entry point with its default arguments.
main()