In [None]:
! pip install -q requests beautifulsoup4 selenium

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Scrape the case list shown on the landing page into `cases_details`
# (one dict per <li> inside every <ul class="linking-section">).

# Set up the Chrome driver
driver = webdriver.Chrome()

# URL of the website you want to interact with
url = 'https://digiscr.sci.gov.in/'

cases_details = []

# Everything that talks to the browser runs inside try/finally so the Chrome
# process is always shut down, even when a lookup raises part-way through.
try:
    # Navigate to the webpage
    driver.get(url)

    # Wait for the dynamic content to load.
    # Adjust the wait time according to your internet speed and website's response time.
    # NOTE: the implicit wait also applies to every find_elements() call below, so a
    # field that is genuinely absent costs up to this many seconds before falling back.
    driver.implicitly_wait(60)

    # Find the <div> element with class 'content-div'
    content_div = driver.find_element(By.CLASS_NAME, 'content-div')

    # The first <h2> within this <div> is stored as the 'volume' of every record.
    h2_within_div = content_div.find_element(By.TAG_NAME, 'h2')
    volume_title = h2_within_div.text  # read once; it is the same for every row

    # Find the <ul> element(s) with the class 'linking-section'
    ul_elements = driver.find_elements(By.CLASS_NAME, 'linking-section')

    # Iterate through the found <ul> elements
    for ul in ul_elements:
        li_elements = ul.find_elements(By.TAG_NAME, 'li')  # all <li> elements within this <ul>

        for li in li_elements:
            # Case name and link. This anchor is treated as mandatory:
            # find_element raises NoSuchElementException if it is missing.
            case_link_element = li.find_element(By.CSS_SELECTOR, 'a.active')
            case_name = case_link_element.text
            case_link = case_link_element.get_attribute('href')

            # Citation (optional). find_element() never returns None -- it raises --
            # so use find_elements() and fall back when the list is empty.
            citation_elements = li.find_elements(By.CSS_SELECTOR, '.cititaion span')
            citation = citation_elements[0].text if citation_elements else "No citation available"

            # Case type is the first '.civil p' element, date the second.
            date_and_case_type_elements = li.find_elements(By.CSS_SELECTOR, '.civil p')
            case_type = date_and_case_type_elements[0].text if len(date_and_case_type_elements) > 0 else "No case type available"
            date = date_and_case_type_elements[1].text if len(date_and_case_type_elements) > 1 else "No date available"

            # Judge names, joined into one comma-separated string
            judge_names_elements = li.find_elements(By.CSS_SELECTOR, '.entryjudgment span')
            judge_names = ', '.join([judge.text for judge in judge_names_elements])

            # PDF link (optional), same find_elements() fallback pattern as the citation
            pdf_link_elements = li.find_elements(By.CSS_SELECTOR, 'a[href*="pdf_viewer"]')
            pdf_link = pdf_link_elements[0].get_attribute('href') if pdf_link_elements else "No PDF link available"

            # Record the extracted information
            cases_details.append({
                'volume': volume_title,
                'Case Name': case_name,
                'Case Link': case_link,
                'Citation': citation,
                'Case Type': case_type,
                'Date': date,
                'Judge Names': judge_names,
                'PDF Link': pdf_link
            })
finally:
    # Close the driver
    driver.quit()

for case in cases_details:
    print(case)


In [None]:
import json

# Save the scraped cases to "<volume>.json"; nothing is written when the list is empty.
if len(cases_details) > 0:
    # Derive the file name from the first record's volume label:
    # ", " and " " become "_", and periods are dropped.
    first_volume = cases_details[0]['volume']
    volume_name = first_volume.replace(', ', '_')
    volume_name = volume_name.replace(' ', '_')
    volume_name = volume_name.replace('.', '')
    file_name = volume_name + ".json"

    # Serialize the whole list with 4-space indentation and write it out.
    with open(file_name, mode='w') as json_file:
        json_file.write(json.dumps(cases_details, indent=4))

    print("Data saved to " + file_name)


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

# Scrape the case list for 'Part II' (dropdown value '2') into `cases_details`.

# Set up the Chrome driver
driver = webdriver.Chrome()

# URL of the website to scrape
url = 'https://digiscr.sci.gov.in/'

# Volume label stored with every record scraped by this cell.
volume_label = '2024, Volume 1, Part II'

cases_details = []

# Everything that talks to the browser runs inside try/finally so the Chrome
# process is always shut down, even when a wait times out or a lookup raises.
try:
    # Open the URL
    driver.get(url)

    # Wait for the dynamic content to load, specifically the dropdown with id 'partno'
    wait = WebDriverWait(driver, 30)  # Wait up to 30 seconds
    partno_dropdown = wait.until(EC.presence_of_element_located((By.ID, "partno")))

    # Move to the dropdown and click it before selecting
    ActionChains(driver).move_to_element(partno_dropdown).click(partno_dropdown).perform()

    # Select 'Part II' from the dropdown
    select = Select(partno_dropdown)
    select.select_by_value('2')  # Using value attribute to avoid text discrepancies

    # Wait for the AJAX call to complete and the records to be updated.
    # Guard against jQuery being undefined so the script does not raise a
    # JavaScript error; in that case there is nothing to wait for here.
    wait.until(lambda d: d.execute_script('return window.jQuery ? jQuery.active : 0') == 0)

    # Wait for the record-count element to become visible.
    # You may need to adjust the selector based on the actual content and structure.
    records_span = wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div.records span"))
    )

    # Extract and print the text from the <span> element
    total_records_text = records_span.text
    print(total_records_text)

    # Locate the first <ul> with class 'linking-section' and count its <li> elements
    ul_element = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "linking-section"))
    )
    li_elements = ul_element.find_elements(By.TAG_NAME, 'li')
    total_li_count = len(li_elements)
    print(f"Total <li> elements within <ul class='linking-section'>: {total_li_count}")

    for li in li_elements:
        # Case name and link. This anchor is treated as mandatory:
        # find_element raises NoSuchElementException if it is missing.
        case_link_element = li.find_element(By.CSS_SELECTOR, 'a.active')
        case_name = case_link_element.text
        case_link = case_link_element.get_attribute('href')

        # Citation (optional). find_element() never returns None -- it raises --
        # so use find_elements() and fall back when the list is empty.
        citation_elements = li.find_elements(By.CSS_SELECTOR, '.cititaion span')
        citation = citation_elements[0].text if citation_elements else "No citation available"

        # Case type is the first '.civil p' element, date the second.
        date_and_case_type_elements = li.find_elements(By.CSS_SELECTOR, '.civil p')
        case_type = date_and_case_type_elements[0].text if len(date_and_case_type_elements) > 0 else "No case type available"
        date = date_and_case_type_elements[1].text if len(date_and_case_type_elements) > 1 else "No date available"

        # Judge names, joined into one comma-separated string
        judge_names_elements = li.find_elements(By.CSS_SELECTOR, '.entryjudgment span')
        judge_names = ', '.join([judge.text for judge in judge_names_elements])

        # PDF link (optional), same find_elements() fallback pattern as the citation
        pdf_link_elements = li.find_elements(By.CSS_SELECTOR, 'a[href*="pdf_viewer"]')
        pdf_link = pdf_link_elements[0].get_attribute('href') if pdf_link_elements else "No PDF link available"

        # Record the extracted information
        cases_details.append({
            'volume': volume_label,
            'Case Name': case_name,
            'Case Link': case_link,
            'Citation': citation,
            'Case Type': case_type,
            'Date': date,
            'Judge Names': judge_names,
            'PDF Link': pdf_link
        })
finally:
    # Close the browser when done
    driver.quit()

for case in cases_details:
    print(case)


In [None]:
import json

# Write the collected cases to a JSON file named after their volume.
# An empty `cases_details` list produces no file and no message.
if len(cases_details) != 0:
    # File name = first record's volume label with ", " and " " turned into "_"
    # and "." removed, plus the ".json" extension.
    raw_volume = cases_details[0]['volume']
    underscored = raw_volume.replace(', ', '_').replace(' ', '_')
    volume_name = underscored.replace('.', '')
    file_name = "{}.json".format(volume_name)

    # Dump the list of case dicts with 4-space indentation.
    with open(file_name, 'w') as json_file:
        json.dump(cases_details, fp=json_file, indent=4)

    print("Data saved to {}".format(file_name))
