In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import requests
import os
import time
from urllib.parse import urljoin, urlparse
import re

def setup_driver(headless=True, download_dir=None):
    """Create a Chrome WebDriver configured for unattended PDF downloads.

    Args:
        headless: When true, Chrome is started with ``--headless``.
        download_dir: Directory Chrome should save downloads into. Defaults
            to ``supreme_court_judgements`` under the current working
            directory (the value this function always used before).

    Returns:
        A started ``webdriver.Chrome`` instance. The caller owns it and is
        responsible for calling ``quit()``.
    """
    if download_dir is None:
        download_dir = os.path.join(os.getcwd(), "supreme_court_judgements")
    # The original always passed an absolute path; keep that for caller-supplied ones.
    download_dir = os.path.abspath(download_dir)

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

    # Important: allow downloads without prompting and save PDFs instead of
    # opening them in the built-in viewer.
    prefs = {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "plugins.always_open_pdf_externally": True,
        "profile.default_content_setting_values.automatic_downloads": 1
    }
    chrome_options.add_experimental_option("prefs", prefs)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        # execute_script() only patches the document that is loaded right now
        # (the initial blank page), so the override is lost on the first
        # navigation. Registering the script through CDP makes Chrome run it
        # at the start of every new document instead.
        driver.execute_cdp_cmd(
            "Page.addScriptToEvaluateOnNewDocument",
            {"source": "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"},
        )
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        driver.quit()
        raise

    return driver

def analyze_page_structure(driver, url):
    """Scan every anchor on the current page and classify its target.

    Args:
        driver: WebDriver whose currently loaded page is inspected.
        url: Address of the page. Accepted for interface compatibility; the
            function does not read it.

    Returns:
        A ``(pdf_links, other_links)`` tuple. ``pdf_links`` is a list of
        dicts with ``url``, ``text`` and ``element`` for anchors whose href
        contains ``.pdf``. ``other_links`` is a list of dicts with ``url``
        and ``text`` for anchors whose text or href mentions a
        judgement-related keyword.
    """
    print("üîç Analyzing page structure...")

    keywords = ('judgement', 'judgment', 'case', 'download', 'view')
    pdf_links = []
    other_links = []

    # Get all links on the page
    for link in driver.find_elements(By.TAG_NAME, "a"):
        try:
            href = link.get_attribute('href')
            text = link.text.strip()
        except Exception:
            # Best effort: skip anchors the driver can no longer read
            # (presumably stale elements) without swallowing
            # KeyboardInterrupt/SystemExit the way a bare ``except`` would.
            continue

        if not href:
            continue

        if '.pdf' in href.lower():
            pdf_links.append({
                'url': href,
                'text': text or 'No text',
                'element': link
            })
        elif any(word in (text + href).lower() for word in keywords):
            # Not a PDF itself, but it might lead to a judgement page.
            other_links.append({
                'url': href,
                'text': text or 'No text'
            })

    print(f"‚úÖ Found {len(pdf_links)} direct PDF links")
    print(f"‚úÖ Found {len(other_links)} other potential judgement links")

    # Show first few PDF links
    if pdf_links:
        print("\nüìã Direct PDF links found:")
        for i, pdf in enumerate(pdf_links[:5], 1):
            print(f"  {i}. {pdf['text'][:50]}... ‚Üí {pdf['url'][:80]}...")

    return pdf_links, other_links

def extract_judgement_details(driver):
    """Find the repeated page elements that look like judgement entries.

    A fixed list of CSS selectors is tried in order. The first selector that
    matches more than two elements wins; a short preview of its first three
    matches is printed for debugging.

    Args:
        driver: WebDriver whose currently loaded page is inspected.

    Returns:
        The list of elements matched by the winning selector, or an empty
        list when no selector matched more than two elements.
    """
    print("\nüîç Looking for judgement items...")

    # Common selectors for judgement containers
    selectors_to_try = [
        "div.judgement", "div.judgment", "div.item", "article",
        "div.card", "li.judgement", "div.list-item", "tr",
        "div.post", "div.entry", "div.content-item"
    ]

    judgements = []

    for selector in selectors_to_try:
        try:
            elements = driver.find_elements(By.CSS_SELECTOR, selector)
            if elements and len(elements) > 2:  # More than 2 suggests it's the right selector
                print(f"‚úÖ Found {len(elements)} elements with selector: '{selector}'")

                # Try to extract info from first few
                for i, elem in enumerate(elements[:3]):
                    try:
                        # Read the text once; each access is a driver call.
                        elem_text = elem.text
                        text = elem_text[:200] if elem_text else "No text"
                        print(f"  Sample {i+1}: {text[:100]}...")

                        # Look for PDF links within this element
                        pdf_links_in_elem = elem.find_elements(By.CSS_SELECTOR, "a[href*='.pdf']")
                        if pdf_links_in_elem:
                            print(f"    Contains {len(pdf_links_in_elem)} PDF link(s)")

                    except Exception:
                        # The preview is diagnostic only; skip unreadable elements.
                        continue

                judgements = elements
                break

        except Exception:
            # A selector the driver rejects or cannot evaluate is just
            # skipped. ``Exception`` (not a bare except) keeps Ctrl-C working.
            continue

    return judgements

def get_pdf_links_from_analysis(driver, url):
    """Collect downloadable PDF descriptors from the current page.

    Direct PDF anchors on the page are used when present. If judgement
    containers exist but no direct PDFs were found, the first three
    "potential judgement" links are opened in a temporary tab and the first
    two PDF anchors on each are collected instead.

    Args:
        driver: WebDriver with the listing page already loaded.
        url: Address of the listing page (forwarded to the page analysis).

    Returns:
        A list of dicts with the keys ``url``, ``filename`` and ``title``.
    """

    # Method 1: Direct PDF links on page
    direct_pdfs, other_links = analyze_page_structure(driver, url)

    # Method 2: Extract from judgement containers
    judgements = extract_judgement_details(driver)

    # Add direct PDFs
    pdf_list = [
        {
            'url': pdf['url'],
            'filename': create_filename_from_url(pdf['url'], pdf['text']),
            'title': pdf['text'][:100]
        }
        for pdf in direct_pdfs
    ]

    # If we found judgement containers but no PDFs, they might be on detail pages
    if judgements and not pdf_list:
        print("\n‚ö†Ô∏è Found judgement containers but no direct PDFs.")
        print("PDFs might be on individual case pages.")
        print(f"Found {len(other_links)} potential case detail links")

        # Remember the listing tab explicitly instead of assuming it is
        # window_handles[0] and the new tab is window_handles[1].
        main_handle = driver.current_window_handle

        # Check first few detail links for PDFs
        for i, link_info in enumerate(other_links[:3]):
            print(f"\nChecking detail page {i+1}: {link_info['text'][:50]}...")

            handles_before = set(driver.window_handles)
            try:
                # Open the detail page in a fresh tab
                driver.execute_script("window.open('');")
                new_handles = [h for h in driver.window_handles if h not in handles_before]
                driver.switch_to.window(new_handles[-1])
                driver.get(link_info['url'])
                time.sleep(2)

                # Look for PDFs on detail page
                detail_pdfs = driver.find_elements(By.CSS_SELECTOR, "a[href*='.pdf']")
                if detail_pdfs:
                    print(f"  ‚úÖ Found {len(detail_pdfs)} PDF(s) on detail page")
                    for pdf_elem in detail_pdfs[:2]:  # Take first 2
                        pdf_url = pdf_elem.get_attribute('href')
                        if not pdf_url:
                            # No usable target; a None href would crash filename creation.
                            continue
                        pdf_text = pdf_elem.text.strip() or link_info['text']

                        pdf_list.append({
                            'url': pdf_url,
                            'filename': create_filename_from_url(pdf_url, pdf_text),
                            'title': pdf_text[:100]
                        })

            except Exception as e:
                print(f"  Error checking detail page: {e}")
            finally:
                # Close only the tabs this iteration opened and always return
                # to the listing tab, on success and on failure alike.
                try:
                    for handle in driver.window_handles:
                        if handle not in handles_before:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(main_handle)
                except Exception as cleanup_error:
                    print(f"  Error closing detail tab: {cleanup_error}")

    return pdf_list

def create_filename_from_url(url, title):
    """Build a filesystem-safe ``.pdf`` filename from a URL and a link title.

    The name is the last path segment of ``url`` (query string and fragment
    ignored, percent-escapes decoded), optionally prefixed with a cleaned-up
    version of ``title``.

    Args:
        url: Address of the PDF.
        title: Human-readable link text; may be empty or ``None``.

    Returns:
        A filename of at most 150 characters that always ends in ``.pdf``.
    """
    # Local import: the module header only imports urljoin/urlparse.
    from urllib.parse import unquote, urlparse

    max_length = 150

    # Use only the path component so "?query" and "#fragment" never leak into
    # the name, and decode escapes so "%20" does not become "_20".
    last_segment = urlparse(url).path.rsplit('/', 1)[-1]
    basename = unquote(last_segment) or 'document'

    # If it doesn't end with .pdf, add it
    if not basename.lower().endswith('.pdf'):
        basename += '.pdf'

    # Clean the filename
    basename = re.sub(r'[^\w\.\-]', '_', basename)

    # If title is meaningful, use it
    clean_title = re.sub(r'[^\w\s\-]', '', (title or '')[:50])
    clean_title = re.sub(r'\s+', '_', clean_title.strip())

    if len(clean_title) > 5:
        filename = f"{clean_title}_{basename}"
    else:
        filename = basename

    # Limit length, trimming the stem so the extension survives.
    if len(filename) > max_length:
        stem, ext = os.path.splitext(filename)
        filename = stem[:max_length - len(ext)] + ext

    return filename

def download_pdf_selenium(driver, pdf_url, download_folder):
    """Download a PDF through the browser session and confirm it arrived.

    The URL is opened in a temporary tab so the browser's own session is
    used. Success is detected by watching ``download_folder`` for a new
    ``.pdf`` file.

    Args:
        driver: Active WebDriver.
        pdf_url: Address of the PDF to fetch.
        download_folder: Directory to watch for the new file. It must be the
            directory the browser is configured to download into.

    Returns:
        ``True`` when a new, finished ``.pdf`` file appeared in
        ``download_folder`` within the wait window, otherwise ``False``.
    """
    wait_seconds = 10   # Upper bound on how long to wait for the file.
    poll_interval = 1

    def snapshot():
        """Return the set of names currently in the download folder."""
        if os.path.isdir(download_folder):
            return set(os.listdir(download_folder))
        return set()

    main_handle = None
    handles_before = set()
    try:
        print(f"‚¨áÔ∏è  Attempting Selenium download: {pdf_url[:80]}...")

        # Take the baseline BEFORE triggering the download; a snapshot taken
        # afterwards may already contain the new file and hide it.
        files_before = snapshot()

        main_handle = driver.current_window_handle
        handles_before = set(driver.window_handles)

        # Open PDF in new tab
        driver.execute_script("window.open('');")
        new_handles = [h for h in driver.window_handles if h not in handles_before]
        driver.switch_to.window(new_handles[-1])

        # Navigate to PDF
        driver.get(pdf_url)
        print("  ‚è≥ PDF should be downloading...")

        deadline = time.monotonic() + wait_seconds
        while time.monotonic() < deadline:
            time.sleep(poll_interval)
            new_files = snapshot() - files_before

            # Chrome writes in-progress downloads with a ".crdownload"
            # suffix; keep waiting until it has been renamed.
            if any(name.lower().endswith('.crdownload') for name in new_files):
                continue

            for file in sorted(new_files):
                if file.lower().endswith('.pdf'):
                    filepath = os.path.join(download_folder, file)
                    size = os.path.getsize(filepath) // 1024 if os.path.exists(filepath) else 0
                    print(f"  ‚úÖ Downloaded: {file} ({size} KB)")
                    return True

        print("  ‚ö†Ô∏è PDF may not have downloaded automatically")
        return False

    except Exception as e:
        print(f"  ‚ùå Selenium download failed: {str(e)[:100]}")
        return False
    finally:
        # Ensure we're back to the main window. Cleanup failures must not
        # mask the result of the download attempt.
        if main_handle is not None:
            try:
                for handle in driver.window_handles:
                    if handle not in handles_before:
                        driver.switch_to.window(handle)
                        driver.close()
                driver.switch_to.window(main_handle)
            except Exception:
                pass

def download_pdf_direct(pdf_info, download_folder):
    """Download one PDF with ``requests`` and save it under a unique name.

    Args:
        pdf_info: Dict with at least the keys ``url`` and ``filename``.
        download_folder: Target directory; created when missing.

    Returns:
        ``True`` when the file was written, ``False`` when the response was
        not a PDF or any error occurred (errors are printed, not raised).
    """
    partial_path = None  # Set once we start writing, so failures can clean up.
    try:
        # Send the PDF's own origin as Referer rather than a hard-coded host
        # that need not match the site being scraped.
        origin = urlparse(pdf_info['url'])
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/pdf, */*',
            'Referer': f"{origin.scheme}://{origin.netloc}/"
        }

        print(f"‚¨áÔ∏è  Direct download: {pdf_info['filename']}")

        # The context manager releases the streamed connection on every path.
        with requests.get(pdf_info['url'], headers=headers, stream=True, timeout=30) as response:
            response.raise_for_status()

            # Peek at the first chunk only, so the magic-number check does not
            # pull the whole body into memory.
            chunks = response.iter_content(chunk_size=8192)
            first_chunk = next(chunks, b'')

            # Check if it's a PDF (by declared type, else by file signature)
            content_type = response.headers.get('content-type', '').lower()
            if 'pdf' not in content_type and not first_chunk.startswith(b'%PDF'):
                print(f"  ‚ö†Ô∏è Not a PDF file")
                return False

            # Ensure download folder exists
            os.makedirs(download_folder, exist_ok=True)

            # Make filename unique
            name, ext = os.path.splitext(pdf_info['filename'])
            filepath = os.path.join(download_folder, pdf_info['filename'])
            counter = 1
            while os.path.exists(filepath):
                filepath = os.path.join(download_folder, f"{name}_{counter}{ext}")
                counter += 1

            # Save file
            partial_path = filepath
            with open(filepath, 'wb') as f:
                f.write(first_chunk)
                for chunk in chunks:
                    if chunk:
                        f.write(chunk)

        size = os.path.getsize(filepath) // 1024
        print(f"  ‚úÖ Success ({size} KB)")
        return True

    except Exception as e:
        print(f"  ‚ùå Direct download failed: {str(e)[:100]}")
        # Do not leave a truncated file behind that looks like a real PDF.
        if partial_path is not None:
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return False

def main():
    """Run the scraper end to end.

    Opens the judgments page in a visible browser, collects PDF links, tries
    a direct HTTP download for each and falls back to a browser download,
    then prints a summary. The browser is always closed on exit.
    """
    TARGET_URL = "https://sacgb.gov.pk/Judgments.html"
    DOWNLOAD_FOLDER = "supreme_court_judgements"
    DELAY = 3  # Seconds to pause between downloads.

    print("=" * 70)
    print("üèõÔ∏è  SUPREME COURT PAKISTAN - SMART PDF DOWNLOADER")
    print("=" * 70)

    # Create download folder
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

    # Start with NON-headless to see what's happening
    print("\nüöÄ Starting browser (visible mode to debug)...")
    driver = setup_driver(headless=False)  # Changed to False for debugging

    try:
        # Load the page
        print(f"\nüåê Loading: {TARGET_URL}")
        driver.get(TARGET_URL)
        time.sleep(5)

        # Analyze page and get PDF links
        pdf_list = get_pdf_links_from_analysis(driver, TARGET_URL)

        if not pdf_list:
            print("\n‚ùå No PDFs found. Possible reasons:")
            print("   1. PDFs are behind login or not publicly accessible")
            print("   2. PDFs are loaded via different mechanism")
            print("   3. The page structure has changed")
            print("\nüí° Check the browser window that opened to see the actual page.")
            input("Press Enter after inspecting the page...")
            return

        print(f"\nüì• Ready to download {len(pdf_list)} PDF(s)")
        print("-" * 70)

        # Try downloading
        downloaded = 0
        for i, pdf_info in enumerate(pdf_list, 1):
            print(f"\n[{i}/{len(pdf_list)}] {pdf_info['title'][:60]}...")

            # First try direct download
            if download_pdf_direct(pdf_info, DOWNLOAD_FOLDER):
                downloaded += 1
            else:
                # If direct fails, try selenium download
                print("  ‚ö†Ô∏è Direct download failed, trying Selenium method...")
                if download_pdf_selenium(driver, pdf_info['url'], DOWNLOAD_FOLDER):
                    downloaded += 1

            # Delay between downloads
            if i < len(pdf_list):
                print(f"   ‚è≥ Waiting {DELAY} seconds...")
                time.sleep(DELAY)

        # Summary
        print("\n" + "=" * 70)
        print("üìä FINAL SUMMARY")
        print("=" * 70)
        print(f"‚úÖ Successfully downloaded: {downloaded}/{len(pdf_list)}")
        print(f"üìÅ Location: {os.path.abspath(DOWNLOAD_FOLDER)}")

        # List downloaded files
        if os.path.exists(DOWNLOAD_FOLDER):
            pdf_files = [f for f in os.listdir(DOWNLOAD_FOLDER) if f.lower().endswith('.pdf')]
            if pdf_files:
                print(f"\nüìã Downloaded {len(pdf_files)} PDF file(s):")
                for file in pdf_files[:10]:
                    size = os.path.getsize(os.path.join(DOWNLOAD_FOLDER, file)) // 1024
                    print(f"   ‚Ä¢ {file[:60]}... ({size} KB)")

        print("\nüí° Tips if downloads failed:")
        print("   1. Check if PDFs require clicking 'Download' button")
        print("   2. PDFs might be behind additional pages")
        print("   3. Try manual download to understand the flow")

    except Exception as e:
        print(f"\n‚ùå Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        try:
            input("\nPress Enter to close browser...")
        except EOFError:
            # No interactive stdin (e.g. piped or scheduled run): nothing to
            # wait for, just proceed to shutdown.
            pass
        finally:
            # Runs even if the prompt is interrupted (Ctrl-C), so the browser
            # process is never leaked.
            driver.quit()
        print("üéâ Process completed!")

if __name__ == "__main__":
    # Check/install dependencies.
    # NOTE(review): bs4 is only probed here and not used by the code visible
    # in this file, and selenium/requests/webdriver_manager are imported at
    # the top of the file, so a missing one of those fails before this guard
    # is reached - confirm whether this probe is still wanted.
    try:
        from bs4 import BeautifulSoup  # noqa: F401 - import probe only
    except ImportError:
        print("Installing required packages...")
        import subprocess
        import sys
        # Invoke pip through the running interpreter so packages land in the
        # environment this script executes in; a bare "pip" on PATH may
        # belong to a different Python installation.
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "beautifulsoup4", "selenium", "webdriver-manager", "requests",
        ])

    main()

🏛️  SUPREME COURT PAKISTAN - SMART PDF DOWNLOADER

🚀 Starting browser (visible mode to debug)...

🌐 Loading: https://sacgb.gov.pk/Judgments.html
🔍 Analyzing page structure...
✅ Found 1396 direct PDF links
✅ Found 10 other potential judgement links

📋 Direct PDF links found:
  1. CPLA No.75/2019
Prov. Govt. of GB Vs. Saadat Khan... → https://sacgb.gov.pk/Judgments/judgements-2021/1.%20judgment%20of%20Saadat%20Kha...
  2. CPLA No. 98/2020
Provincial Govt. through Chief Se... → https://sacgb.gov.pk/Judgments/judgements-2021/2.%20final%20Judgment%20of%20Akht...
  3. CPLA No.138/2020
Prov. Government GB through Chief... → https://sacgb.gov.pk/Judgments/judgements-2021/3.%20judgement%20of%20Naveed%20En...
  4. CPLA No.51/2018
Govt. of GB through Chief Secretar... → https://sacgb.gov.pk/Judgments/judgements-2021/4.%20final%20judgment%20of%20shah...
  5. CPLA U/O No.152/2019
Govt. of GB through Chief Sec... → https://sacgb.gov.pk/Judgments/judgements-2021/5.%20fi