# CEQR API Test

Search CEQR (City Environmental Quality Review) projects by borough, block, and lot.

**No browser needed** - fully automated with Python.


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

print("‚úÖ Ready")


‚úÖ Ready


## Parse Results Function

Function to parse CEQR search results and extract detail page links.


In [7]:
def parse_ceqr_results(response):
    """Extract CEQR results table from HTML response, including detail page links."""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find results table
    table = soup.find('table', {'id': lambda x: x and 'grdSearchResults' in x})
    
    if not table:
        # Try finding by content
        tables = soup.find_all('table')
        for t in tables:
            if 'CEQR Number' in t.get_text() or 'Project Name' in t.get_text():
                table = t
                break
    
    if not table:
        print("‚ö†Ô∏è  No results table found")
        return None
    
    # Extract rows
    rows = table.find_all('tr')
    if not rows:
        return None
    
    # Get headers
    headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
    
    # Get data with detail links
    data = []
    detail_links = []
    
    for row in rows[1:]:
        cells = row.find_all(['td', 'th'])
        if cells:
            row_data = [cell.get_text(strip=True) for cell in cells]
            if any(cell.strip() for cell in row_data):
                data.append(row_data)
                
                # Extract detail page link
                detail_link = row.find('a', {'id': lambda x: x and 'hlnkOpenDetails' in x})
                if detail_link and detail_link.get('href'):
                    full_url = f"https://a002-ceqraccess.nyc.gov/ceqr/{detail_link['href']}"
                    detail_links.append(full_url)
                else:
                    detail_links.append("")
    
    if not data:
        return None
    
    # Create DataFrame with detail links column
    df = pd.DataFrame(data, columns=headers[:len(data[0])])
    df['Detail Page'] = detail_links
    
    return df

print("‚úÖ Parser loaded (with detail page links)")


‚úÖ Parser loaded (with detail page links)


## Search Functions

### BBL Parser & Search

Search by BBL (10-digit Borough-Block-Lot number)


In [8]:
def search_ceqr_by_bbl(bbl):
    """
    Search CEQR database by BBL (Borough-Block-Lot).
    
    Args:
        bbl: 10-digit BBL number (string or int)
             Format: BBBBBLLLL where B=borough (1-5), BBBBB=block, LLLL=lot
    
    Returns: DataFrame with results or None
    """
    # Convert to string and pad if needed
    bbl_str = str(bbl).zfill(10)
    
    if len(bbl_str) != 10:
        print(f"‚ùå Invalid BBL: {bbl} (must be 10 digits)")
        return None
    
    # Parse BBL
    boro_code = bbl_str[0]
    block = bbl_str[1:6].lstrip('0') or '0'  # Remove leading zeros
    lot = bbl_str[6:10].lstrip('0') or '0'   # Remove leading zeros
    
    # Map borough code to name
    boro_map = {
        '1': 'Manhattan',
        '2': 'Bronx', 
        '3': 'Brooklyn',
        '4': 'Queens',
        '5': 'Staten Island'
    }
    
    borough = boro_map.get(boro_code)
    if not borough:
        print(f"‚ùå Invalid borough code: {boro_code}")
        return None
    
    print(f"üìç BBL {bbl} ‚Üí {borough}, Block {block}, Lot {lot}")
    
    # Search and parse
    success, result = search_ceqr(borough, block, lot)
    
    if success:
        df = parse_ceqr_results(result)
        return df
    else:
        print(f"‚ùå Search failed: {result}")
        return None


def search_ceqr(borough, block, lot=""):
    """
    Search CEQR database by borough, block, and lot.
    
    Returns: tuple (success: bool, response or error message)
    """
    url = "https://a002-ceqraccess.nyc.gov/ceqr/"
    session = requests.Session()
    
    # Step 1: GET initial page to get VIEWSTATE
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    
    try:
        print(f"üîç Searching: {borough}, Block {block}" + (f", Lot {lot}" if lot else ""))
        
        # GET the page
        init_resp = session.get(url, headers=headers, timeout=30)
        if init_resp.status_code != 200:
            return False, f"Failed to load page: {init_resp.status_code}"
        
        # Extract VIEWSTATE fields
        soup = BeautifulSoup(init_resp.text, 'html.parser')
        viewstate = soup.find('input', {'id': '__VIEWSTATE'})
        viewstate_gen = soup.find('input', {'id': '__VIEWSTATEGENERATOR'})
        eventval = soup.find('input', {'id': '__EVENTVALIDATION'})
        
        if not viewstate:
            return False, "Could not find VIEWSTATE"
        
        print("‚úÖ Got session")
        
        # Step 2: POST search
        form_data = {
            "__LASTFOCUS": "",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": viewstate['value'],
            "__VIEWSTATEGENERATOR": viewstate_gen['value'] if viewstate_gen else "F2CE38DF",
            "__SCROLLPOSITIONX": "0",
            "__SCROLLPOSITIONY": "0",
            "__VIEWSTATEENCRYPTED": "",
            "__EVENTVALIDATION": eventval['value'] if eventval else "",
            "ctl00$MainContent$txtKeyword": "",
            "ctl00$MainContent$ddlLeadAgency": "XYU@2!",
            "ctl00$MainContent$txtCeqrNumber": "",
            "ctl00$MainContent$txtProjectName": "",
            "ctl00$MainContent$ddlCommunityDistrict": "XYU@2!",
            "ctl00$MainContent$ddlBorough": borough,
            "ctl00$MainContent$txtBlock": block,
            "ctl00$MainContent$txtLot": lot,
            "ctl00$MainContent$btnSearch": " Search"
        }
        
        post_headers = {
            **headers,
            "Content-Type": "application/x-www-form-urlencoded",
            "Origin": "https://a002-ceqraccess.nyc.gov",
            "Referer": url,
            "Cache-Control": "max-age=0"
        }
        
        response = session.post(url, headers=post_headers, data=form_data, timeout=30)
        
        if response.status_code != 200:
            return False, f"Search failed: {response.status_code}"
        
        # Check for results
        if 'grdSearchResults' in response.text or 'Search Results' in response.text:
            print("‚úÖ Got results")
            return True, response
        elif 'Error' in response.text or 'Unhandled' in response.text:
            return False, "Server error"
        else:
            print("‚ö†Ô∏è  No results found")
            return True, response
            
    except Exception as e:
        return False, f"Error: {str(e)}"

print("‚úÖ Function loaded")


‚úÖ Function loaded


## Parse Results


## Detail Page Scraping

Functions to navigate to detail pages, extract text content, and find PDF links.


In [9]:
def scrape_detail_page(detail_url, session=None):
    """
    Navigate to a CEQR detail page and extract text content and PDF links.
    
    Args:
        detail_url: Full URL to the detail page
        session: Optional requests.Session object for maintaining cookies
    
    Returns:
        dict with keys:
            - 'url': The detail page URL
            - 'text': Full text content from the page
            - 'pdf_links': List of PDF URLs found on the page
            - 'success': Boolean indicating if scraping succeeded
            - 'error': Error message if failed
    """
    if session is None:
        session = requests.Session()
    
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
    }
    
    result = {
        'url': detail_url,
        'text': '',
        'pdf_links': [],
        'success': False,
        'error': None
    }
    
    try:
        print(f"üåê Fetching: {detail_url[:80]}...")
        
        response = session.get(detail_url, headers=headers, timeout=30)
        
        if response.status_code != 200:
            result['error'] = f"HTTP {response.status_code}"
            return result
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract all text content (removing script and style tags)
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get main content - try to find the main content area
        main_content = soup.find('div', {'id': lambda x: x and ('MainContent' in x or 'content' in x.lower())})
        if not main_content:
            main_content = soup.find('body')
        
        if main_content:
            # Get all text, preserving some structure
            text = main_content.get_text(separator='\n', strip=True)
            result['text'] = text
        else:
            result['text'] = soup.get_text(separator='\n', strip=True)
        
        # Find all PDF links on the detail page
        pdf_links = []
        
        # Helper function to normalize URLs
        def normalize_url(href, base_url):
            """Convert relative URLs to absolute URLs."""
            if href.startswith('http'):
                return href
            elif href.startswith('/'):
                # Absolute path on same domain
                domain = '/'.join(base_url.split('/')[:3])
                return domain + href
            else:
                # Relative path
                base = '/'.join(base_url.split('/')[:-1])
                return base + '/' + href
        
        # Look for all links that might be PDFs
        # CEQR detail pages often have PDFs in tables or file sections
        for link in soup.find_all('a', href=True):
            href = link['href']
            link_text = link.get_text(strip=True).lower()
            
            # Check if it's a direct PDF link
            is_pdf = (
                href.endswith('.pdf') or 
                '.pdf' in href.lower() or
                # Check for handler URLs (ProjectFile.ashx) - these serve PDFs
                'projectfile.ashx' in href.lower() or
                'filehandler.ashx' in href.lower() or
                'handlers/projectfile.ashx' in href.lower() or
                # Check for file= parameter (common in handler URLs)
                'file=' in href.lower()
            )
            
            # Also check link text for PDF indicators (KB/MB file sizes are common)
            has_pdf_indicator = (
                'pdf' in link_text or
                'kb' in link_text or  # File size indicator (e.g., "156.5KB")
                'mb' in link_text or
                link_text.endswith('.pdf') or
                # Check parent elements for file-related text
                (link.parent and ('file' in link.parent.get_text(strip=True).lower() or 
                                  'document' in link.parent.get_text(strip=True).lower()))
            )
            
            # Include if it's clearly a PDF link or has PDF indicators with handler/file params
            if is_pdf or (has_pdf_indicator and ('handler' in href.lower() or 'file=' in href.lower() or 'ashx' in href.lower())):
                normalized_url = normalize_url(href, detail_url)
                pdf_links.append(normalized_url)
        
        # Also check for direct PDF links in iframes or embedded content
        for iframe in soup.find_all('iframe', src=True):
            src = iframe['src']
            if '.pdf' in src.lower() or 'projectfile.ashx' in src.lower():
                normalized_url = normalize_url(src, detail_url)
                pdf_links.append(normalized_url)
        
        # Look for PDF links in specific CEQR sections (Files section, Documents section, etc.)
        # Check for tables or divs that might contain file listings
        file_sections = soup.find_all(['table', 'div'], class_=lambda x: x and (
            'file' in x.lower() or 
            'document' in x.lower() or
            'pdf' in x.lower()
        ))
        
        for section in file_sections:
            for link in section.find_all('a', href=True):
                href = link['href']
                if ('projectfile.ashx' in href.lower() or 
                    'file=' in href.lower() or 
                    '.pdf' in href.lower()):
                    normalized_url = normalize_url(href, detail_url)
                    if normalized_url not in pdf_links:
                        pdf_links.append(normalized_url)
        
        # Remove duplicates while preserving order
        seen = set()
        unique_pdf_links = []
        for link in pdf_links:
            if link not in seen:
                seen.add(link)
                unique_pdf_links.append(link)
        
        result['pdf_links'] = unique_pdf_links
        result['success'] = True
        
        print(f"‚úÖ Scraped: {len(result['text'])} chars, {len(result['pdf_links'])} PDFs")
        
    except Exception as e:
        result['error'] = str(e)
        print(f"‚ùå Error scraping {detail_url}: {e}")
    
    return result


def scrape_all_detail_pages(df, detail_column='Detail Page', session=None):
    """
    Scrape all detail pages from a DataFrame.
    
    Args:
        df: DataFrame with detail page URLs
        detail_column: Name of column containing detail page URLs
        session: Optional requests.Session object
    
    Returns:
        DataFrame with added columns:
            - 'detail_text': Full text from detail page
            - 'pdf_links': List of PDF URLs (as string, comma-separated)
            - 'pdf_count': Number of PDFs found
            - 'scrape_success': Boolean indicating if scraping succeeded
    """
    if session is None:
        session = requests.Session()
    
    if detail_column not in df.columns:
        print(f"‚ùå Column '{detail_column}' not found in DataFrame")
        return df
    
    # Create new columns
    df = df.copy()
    df['detail_text'] = ''
    df['pdf_links'] = ''
    df['pdf_count'] = 0
    df['scrape_success'] = False
    
    print(f"\nüìÑ Scraping {len(df)} detail pages...\n")
    
    for idx, row in df.iterrows():
        detail_url = row[detail_column]
        
        if not detail_url or pd.isna(detail_url) or detail_url == '':
            continue
        
        result = scrape_detail_page(detail_url, session)
        
        df.at[idx, 'detail_text'] = result['text']
        df.at[idx, 'pdf_links'] = ', '.join(result['pdf_links'])
        df.at[idx, 'pdf_count'] = len(result['pdf_links'])
        df.at[idx, 'scrape_success'] = result['success']
        
        # Small delay to be respectful
        import time
        time.sleep(0.5)
    
    successful = df['scrape_success'].sum()
    total_pdfs = df['pdf_count'].sum()
    
    print(f"\n‚úÖ Completed: {successful}/{len(df)} pages scraped, {total_pdfs} PDFs found")
    
    return df


print("‚úÖ Detail page scraping functions loaded")


‚úÖ Detail page scraping functions loaded


### Download PDFs

Function to download PDF files from the extracted links.


In [10]:
import os
import hashlib
from urllib.parse import urlparse, parse_qs

def download_pdf(pdf_url, output_dir='pdfs', session=None, filename=None):
    """
    Download a PDF file from a URL.
    
    Args:
        pdf_url: URL of the PDF to download
        output_dir: Directory to save PDFs (default: 'pdfs')
        session: Optional requests.Session object
        filename: Optional custom filename (otherwise extracted from URL)
    
    Returns:
        dict with keys:
            - 'success': Boolean
            - 'filepath': Path to saved file (if successful)
            - 'error': Error message (if failed)
            - 'skipped': Boolean indicating if file was skipped (already existed)
    """
    if session is None:
        session = requests.Session()
    
    result = {
        'success': False,
        'filepath': None,
        'error': None,
        'skipped': False
    }
    
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Get filename from URL if not provided
        if not filename:
            parsed = urlparse(pdf_url)
            
            # For handler URLs (ProjectFile.ashx), try to extract filename from query params
            if 'projectfile.ashx' in pdf_url.lower() or 'filehandler.ashx' in pdf_url.lower():
                # Try to get filename from query parameters
                query_params = parse_qs(parsed.query)
                
                # Check if there's a 'file' parameter that might contain filename info
                if 'file' in query_params:
                    # The file parameter is often base64 encoded, so we'll use a hash
                    url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:12]
                    filename = f"ceqr_file_{url_hash}.pdf"
                else:
                    # Use URL hash as fallback
                    url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:12]
                    filename = f"ceqr_file_{url_hash}.pdf"
            else:
                # For regular URLs, extract from path
                filename = os.path.basename(parsed.path)
                if not filename or not filename.endswith('.pdf'):
                    # Generate filename from URL
                    filename = pdf_url.split('/')[-1].split('?')[0]
                    if not filename or not filename.endswith('.pdf'):
                        # Use URL hash as fallback
                        url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:12]
                        filename = f"ceqr_file_{url_hash}.pdf"
        
        filepath = os.path.join(output_dir, filename)
        
        # Skip if file already exists
        if os.path.exists(filepath):
            print(f"‚è≠Ô∏è  Skipping (exists): {filename}")
            result['success'] = True
            result['filepath'] = filepath
            result['skipped'] = True
            return result
        
        print(f"‚¨áÔ∏è  Downloading: {filename}")
        
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
            "Accept": "application/pdf,*/*"
        }
        
        response = session.get(pdf_url, headers=headers, timeout=60, stream=True)
        
        if response.status_code != 200:
            result['error'] = f"HTTP {response.status_code}"
            return result
        
        # Check if it's actually a PDF
        content_type = response.headers.get('Content-Type', '')
        if 'pdf' not in content_type.lower() and not pdf_url.lower().endswith('.pdf'):
            print(f"‚ö†Ô∏è  Warning: Content-Type is {content_type}, not PDF")
        
        # Save file
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        
        file_size = os.path.getsize(filepath)
        print(f"‚úÖ Saved: {filename} ({file_size:,} bytes)")
        
        result['success'] = True
        result['filepath'] = filepath
        
    except Exception as e:
        result['error'] = str(e)
        print(f"‚ùå Error downloading {pdf_url}: {e}")
    
    return result


def download_all_pdfs(df, pdf_links_column='pdf_links', output_dir='pdfs', session=None):
    """
    Download all PDFs from a DataFrame.
    
    Args:
        df: DataFrame with PDF links
        pdf_links_column: Column name containing PDF links (comma-separated string)
        output_dir: Directory to save PDFs
        session: Optional requests.Session object
    
    Returns:
        dict with download statistics
    """
    if session is None:
        session = requests.Session()
    
    if pdf_links_column not in df.columns:
        print(f"‚ùå Column '{pdf_links_column}' not found in DataFrame")
        return None
    
    stats = {
        'total_pdfs': 0,
        'downloaded': 0,
        'skipped': 0,
        'failed': 0,
        'errors': []
    }
    
    print(f"\n‚¨áÔ∏è  Downloading PDFs to '{output_dir}'...\n")
    
    for idx, row in df.iterrows():
        pdf_links_str = row[pdf_links_column]
        
        if not pdf_links_str or pd.isna(pdf_links_str) or pdf_links_str == '':
            continue
        
        # Parse comma-separated links
        pdf_urls = [url.strip() for url in str(pdf_links_str).split(',') if url.strip()]
        
        for pdf_url in pdf_urls:
            stats['total_pdfs'] += 1
            
            result = download_pdf(pdf_url, output_dir, session)
            
            if result['success']:
                if result.get('skipped', False):
                    stats['skipped'] += 1
                else:
                    stats['downloaded'] += 1
            else:
                stats['failed'] += 1
                stats['errors'].append(f"{pdf_url}: {result.get('error', 'Unknown error')}")
            
            # Small delay between downloads
            import time
            time.sleep(0.5)
    
    print(f"\nüìä Download Summary:")
    print(f"   Total PDFs: {stats['total_pdfs']}")
    print(f"   Downloaded: {stats['downloaded']}")
    print(f"   Skipped (already exists): {stats['skipped']}")
    print(f"   Failed: {stats['failed']}")
    
    if stats['errors']:
        print(f"\n‚ùå Errors:")
        for error in stats['errors'][:10]:  # Show first 10 errors
            print(f"   {error}")
        if len(stats['errors']) > 10:
            print(f"   ... and {len(stats['errors']) - 10} more errors")
    
    return stats


print("‚úÖ PDF download functions loaded")


‚úÖ PDF download functions loaded


## Complete Workflow: Search ‚Üí Scrape Detail Pages ‚Üí Extract PDFs ‚Üí Download

The workflow is:
1. **Search** for CEQR projects (by BBL or borough/block/lot)
2. **Navigate to detail pages** from search results
3. **Scrape detail pages** to extract text content and **find PDF links on those detail pages**
4. **Download PDFs** that were found on the detail pages


In [11]:
# Complete workflow example: Search ‚Üí Scrape Detail Pages ‚Üí Download PDFs
# This demonstrates that PDFs are extracted FROM the detail pages

# Step 1: Search for CEQR projects
print("="*100)
print("STEP 1: Searching for CEQR projects")
print("="*100)
example_bbl = "3014890011"  # Change this to your BBL
df = search_ceqr_by_bbl(example_bbl)

if df is not None and len(df) > 0:
    print(f"\n‚úÖ Found {len(df)} projects")
    print(f"   Detail page URLs will be extracted from search results\n")
    
    # Step 2: Scrape detail pages to extract PDFs
    print("="*100)
    print("STEP 2: Scraping detail pages to extract PDF links")
    print("="*100)
    print("   (This visits each detail page and finds PDF links on those pages)\n")
    
    session = requests.Session()
    df = scrape_all_detail_pages(df, detail_column='Detail Page', session=session)
    
    # Step 3: Show what was found
    print("\n" + "="*100)
    print("STEP 3: Summary of PDFs found on detail pages")
    print("="*100)
    
    total_pdfs = df['pdf_count'].sum()
    projects_with_pdfs = (df['pdf_count'] > 0).sum()
    
    print(f"\nüìä Results:")
    print(f"   Projects with PDFs: {projects_with_pdfs}/{len(df)}")
    print(f"   Total PDFs found on detail pages: {total_pdfs}")
    
    if total_pdfs > 0:
        print(f"\n‚úÖ PDFs were successfully extracted from detail pages!")
        print(f"   Ready to download in next step")
    else:
        print(f"\n‚ö†Ô∏è  No PDFs found on detail pages")
        
else:
    print("‚ùå No projects found. Try a different BBL.")


STEP 1: Searching for CEQR projects
üìç BBL 3014890011 ‚Üí Brooklyn, Block 1489, Lot 11
üîç Searching: Brooklyn, Block 1489, Lot 11
‚úÖ Got session
‚úÖ Got results

‚úÖ Found 2 projects
   Detail page URLs will be extracted from search results

STEP 2: Scraping detail pages to extract PDF links
   (This visits each detail page and finds PDF links on those pages)


üìÑ Scraping 2 detail pages...

üåê Fetching: https://a002-ceqraccess.nyc.gov/ceqr/Details?data=MDZIUEQwMDFL0&signature=bc0454...
‚úÖ Scraped: 781 chars, 3 PDFs
üåê Fetching: https://a002-ceqraccess.nyc.gov/ceqr/Details?data=MTlIUEQwNTdL0&signature=8a44e7...
‚úÖ Scraped: 340 chars, 5 PDFs

‚úÖ Completed: 2/2 pages scraped, 8 PDFs found

STEP 3: Summary of PDFs found on detail pages

üìä Results:
   Projects with PDFs: 2/2
   Total PDFs found on detail pages: 8

‚úÖ PDFs were successfully extracted from detail pages!
   Ready to download in next step


## Test Detail Page Scraping

Test scraping detail pages and extracting PDF links.


In [12]:
# Test scraping detail pages from the previous search results
# This extracts PDF links FROM the detail pages
if 'df' in locals() and df is not None and 'Detail Page' in df.columns:
    # Create a session to maintain cookies
    session = requests.Session()
    
    print("="*100)
    print("STEP 1: Scraping detail pages to extract PDF links")
    print("="*100)
    print(f"Found {len(df)} detail pages to scrape\n")
    
    # Scrape all detail pages (this visits each detail page and extracts PDFs)
    df_with_details = scrape_all_detail_pages(df, detail_column='Detail Page', session=session)
    
    # Display results
    print("\n" + "="*100)
    print("SCRAPED DETAIL PAGES - PDFs EXTRACTED FROM DETAIL PAGES")
    print("="*100)
    
    for idx, row in df_with_details.iterrows():
        print(f"\nüìÑ CEQR Number: {row.get('CEQR Number', 'N/A')}")
        print(f"   Detail Page URL: {row['Detail Page']}")
        print(f"   Scrape Success: {row['scrape_success']}")
        print(f"   Text length: {len(row['detail_text'])} characters")
        print(f"   PDFs found on detail page: {row['pdf_count']}")
        
        if row['pdf_count'] > 0:
            print(f"   PDF links extracted from detail page:")
            for pdf_link in row['pdf_links'].split(', '):
                if pdf_link:
                    # Show shortened version for display
                    if len(pdf_link) > 100:
                        print(f"      - {pdf_link[:100]}...")
                    else:
                        print(f"      - {pdf_link}")
        else:
            print(f"   ‚ö†Ô∏è  No PDFs found on this detail page")
        
        # Show first 300 chars of text
        if row['detail_text']:
            preview = row['detail_text'][:300].replace('\n', ' ')
            print(f"\n   Page text preview: {preview}...")
        
        print("-" * 100)
    
    # Update df variable
    df = df_with_details
    
    # Summary
    total_pdfs = df['pdf_count'].sum()
    successful_scrapes = df['scrape_success'].sum()
    print(f"\nüìä Summary:")
    print(f"   Detail pages scraped: {successful_scrapes}/{len(df)}")
    print(f"   Total PDFs found across all detail pages: {total_pdfs}")
    
else:
    print("‚ö†Ô∏è  No DataFrame with detail pages found. Run a search first.")
    print("   Example: df = search_ceqr_by_bbl('3014890011')")


STEP 1: Scraping detail pages to extract PDF links
Found 2 detail pages to scrape


üìÑ Scraping 2 detail pages...

üåê Fetching: https://a002-ceqraccess.nyc.gov/ceqr/Details?data=MDZIUEQwMDFL0&signature=bc0454...
‚úÖ Scraped: 781 chars, 3 PDFs
üåê Fetching: https://a002-ceqraccess.nyc.gov/ceqr/Details?data=MTlIUEQwNTdL0&signature=8a44e7...
‚úÖ Scraped: 340 chars, 5 PDFs

‚úÖ Completed: 2/2 pages scraped, 8 PDFs found

SCRAPED DETAIL PAGES - PDFs EXTRACTED FROM DETAIL PAGES

üìÑ CEQR Number: 06HPD001K
   Detail Page URL: https://a002-ceqraccess.nyc.gov/ceqr/Details?data=MDZIUEQwMDFL0&signature=bc04541d153279ec253c32419792a4bd91839cff
   Scrape Success: True
   Text length: 781 characters
   PDFs found on detail page: 3
   PDF links extracted from detail page:
      - https://a002-ceqraccess.nyc.gov/ceqr/../Handlers/ProjectFile.ashx?file=MjAwNlwwNkhQRDAwMUtcbGVhZF9hZ...
      - https://a002-ceqraccess.nyc.gov/ceqr/../Handlers/ProjectFile.ashx?file=MjAwNlwwNkhQRDAwMUtcZGV0X3NpZ...
  

## Download PDFs

Download all PDFs found on the detail pages.


In [13]:
# Download all PDFs that were extracted from detail pages
if 'df' in locals() and df is not None and 'pdf_links' in df.columns:
    # Create a session to maintain cookies
    session = requests.Session()
    
    print("="*100)
    print("STEP 2: Downloading PDFs extracted from detail pages")
    print("="*100)
    
    # Count total PDFs to download
    total_pdfs_to_download = 0
    for idx, row in df.iterrows():
        if row['pdf_links'] and pd.notna(row['pdf_links']):
            pdf_list = [url.strip() for url in str(row['pdf_links']).split(',') if url.strip()]
            total_pdfs_to_download += len(pdf_list)
    
    print(f"Found {total_pdfs_to_download} PDFs to download (extracted from detail pages)\n")
    
    # Download PDFs
    stats = download_all_pdfs(df, pdf_links_column='pdf_links', output_dir='pdfs', session=session)
    
    if stats:
        print(f"\n‚úÖ Download complete!")
        print(f"   PDFs saved to: pdfs/")
    else:
        print(f"\n‚ö†Ô∏è  No PDFs were downloaded")
else:
    print("‚ö†Ô∏è  No PDF links found. You need to:")
    print("   1. Run a search: df = search_ceqr_by_bbl('3014890011')")
    print("   2. Scrape detail pages: df = scrape_all_detail_pages(df, ...)")
    print("   3. Then download PDFs: download_all_pdfs(df, ...)")


STEP 2: Downloading PDFs extracted from detail pages
Found 8 PDFs to download (extracted from detail pages)


‚¨áÔ∏è  Downloading PDFs to 'pdfs'...

‚¨áÔ∏è  Downloading: ceqr_file_34d943cde35a.pdf
‚úÖ Saved: ceqr_file_34d943cde35a.pdf (61,570 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_5d18dad37547.pdf
‚úÖ Saved: ceqr_file_5d18dad37547.pdf (160,270 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_46b26accb27b.pdf
‚úÖ Saved: ceqr_file_46b26accb27b.pdf (5,066,111 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_373912afa8eb.pdf
‚úÖ Saved: ceqr_file_373912afa8eb.pdf (216,552 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_7ad839ae6368.pdf
‚úÖ Saved: ceqr_file_7ad839ae6368.pdf (232,748 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_adc66c7e0352.pdf
‚úÖ Saved: ceqr_file_adc66c7e0352.pdf (237,154 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_c622083017b9.pdf
‚úÖ Saved: ceqr_file_c622083017b9.pdf (42,463,882 bytes)
‚¨áÔ∏è  Downloading: ceqr_file_73e88c173ee7.pdf
‚úÖ Saved: ceqr_file_73e88c173ee7.pdf (40,776,723 bytes)

üìä Download Summary:
   T

In [14]:
def parse_ceqr_results(response):
    """Extract CEQR results table from HTML response, including detail page links."""
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find results table
    table = soup.find('table', {'id': lambda x: x and 'grdSearchResults' in x})
    
    if not table:
        # Try finding by content
        tables = soup.find_all('table')
        for t in tables:
            if 'CEQR Number' in t.get_text() or 'Project Name' in t.get_text():
                table = t
                break
    
    if not table:
        print("‚ö†Ô∏è  No results table found")
        return None
    
    # Extract rows
    rows = table.find_all('tr')
    if not rows:
        return None
    
    # Get headers
    headers = [th.get_text(strip=True) for th in rows[0].find_all(['th', 'td'])]
    
    # Get data with detail links
    data = []
    detail_links = []
    
    for row in rows[1:]:
        cells = row.find_all(['td', 'th'])
        if cells:
            row_data = [cell.get_text(strip=True) for cell in cells]
            if any(cell.strip() for cell in row_data):
                data.append(row_data)
                
                # Extract detail page link
                detail_link = row.find('a', {'id': lambda x: x and 'hlnkOpenDetails' in x})
                if detail_link and detail_link.get('href'):
                    full_url = f"https://a002-ceqraccess.nyc.gov/ceqr/{detail_link['href']}"
                    detail_links.append(full_url)
                else:
                    detail_links.append("")
    
    if not data:
        return None
    
    # Create DataFrame with detail links column
    df = pd.DataFrame(data, columns=headers[:len(data[0])])
    df['Detail Page'] = detail_links
    
    return df

print("‚úÖ Parser loaded (with detail page links)")


‚úÖ Parser loaded (with detail page links)


## Test Searches

### Test 1: Search by BBL


In [15]:
df = search_ceqr_by_bbl("3014890011")

if df is not None:
    print(f"\nüìä Found {len(df)} results\n")
    print("=" * 150)
    print(df.to_string(index=False))
    print("=" * 150)
    
    # Show detail links
    if 'Detail Page' in df.columns:
        print("\nüîó Detail Pages:")
        for idx, link in enumerate(df['Detail Page'], 1):
            print(f"  {idx}. {link}")
else:
    print("No results found")


üìç BBL 3014890011 ‚Üí Brooklyn, Block 1489, Lot 11
üîç Searching: Brooklyn, Block 1489, Lot 11
‚úÖ Got session
‚úÖ Got results

üìä Found 2 results

CEQR Number                                                                                                                                                                                                        Project Name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

### Test 2: Search by Borough/Block/Lot directly


In [16]:
# Alternative: search directly by borough/block/lot
success, result = search_ceqr("Brooklyn", "7061", "27")

if success:
    df = parse_ceqr_results(result)
    if df is not None:
        print(f"‚úÖ Found {len(df)} results using direct search")
    else:
        print("No results")
else:
    print(f"‚ùå Error: {result}")


üîç Searching: Brooklyn, Block 7061, Lot 27
‚úÖ Got session
‚úÖ Got results
‚úÖ Found 2 results using direct search


In [17]:
parse_ceqr_results(result)

Unnamed: 0,CEQR Number,Project Name,Project Description,Detail Page
0,08DME007K,Coney Island RezoningLatest Milestone12/06/201...,The proposed actions would include zoning map ...,https://a002-ceqraccess.nyc.gov/ceqr/Details?d...
1,24HPD040K,Coney Island Taconic Phase 3Latest Milestone08...,The Department of Housing Preservation and Dev...,https://a002-ceqraccess.nyc.gov/ceqr/Details?d...
