In [1]:
import os
import requests
import logging
import pandas as pd
import re
from bs4 import BeautifulSoup

# Configure logging once for the whole script: timestamp, level, message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

# Index page that lists every act, and the prefix prepended to act links
start_url = "https://lawphil.net/statutes/comacts/comacts.html"
base_url = "https://lawphil.net/statutes/comacts/"

# Output folder for the per-act CSV files; created up front if missing
data_dir = "Data/Statutes/Common Wealth Acts"
os.makedirs(data_dir, exist_ok=True)

def fetch_page(url, timeout=30):
    """Fetch the HTML content of a given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). Failures
        are logged rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch page: {url}. Error: {e}")
        return None
    return response.text

def parse_main_page(html):
    """Parse the index page and collect the act links from its table.

    Args:
        html: HTML source of the index page.

    Returns:
        A list of ``(url, title)`` tuples, one for each table row that has
        at least two cells and a link in the first cell. The list is empty
        when the table with id ``s-menu`` is not found.
    """
    # Imported locally so this function does not depend on a change to the
    # file's import block.
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 's-menu'})
    links = []

    if not table:
        logger.error("No table with id 's-menu' found on the page.")
        return links

    # Process rows within the table
    for row in table.find_all('tr')[1:]:  # Skip the header row
        columns = row.find_all('td')
        if len(columns) < 2:
            continue

        number_link = columns[0].find('a')
        # .get() tolerates anchors without an href (e.g. <a name="...">),
        # where subscripting would raise KeyError.
        href = number_link.get('href') if number_link else None
        if not href:
            continue

        # urljoin resolves "../", root-relative and absolute hrefs, which
        # plain string concatenation would turn into broken URLs.
        full_link = urljoin(base_url, href.strip())
        title = columns[1].text.strip()
        links.append((full_link, title))

    logger.info(f"Found {len(links)} links in the table.")
    return links

def parse_act_page(url):
    """Download an act page and extract its number, title and full text.

    The first ``<blockquote>`` on the page is treated as the act body: its
    first paragraph supplies the number, its second the title, and the
    text of the whole blockquote is the data.

    Args:
        url: Address of the act page.

    Returns:
        A ``(number, title, data)`` tuple of strings, or
        ``(None, None, None)`` when the page cannot be fetched, has no
        blockquote, or the blockquote has fewer than two paragraphs.
    """
    html = fetch_page(url)
    if not html:
        return None, None, None

    soup = BeautifulSoup(html, 'html.parser')
    blockquote = soup.find('blockquote')
    if not blockquote:
        logger.warning(f"No blockquote found on page: {url}")
        return None, None, None

    paragraphs = blockquote.find_all('p')
    if len(paragraphs) < 2:
        logger.warning(f"Insufficient paragraphs in blockquote at: {url}")
        return None, None, None

    # Remove square brackets first, then normalise whitespace. Stripping
    # before removing the brackets would leave the padding of "[ X ]"
    # behind, and inner newlines or repeated spaces would survive.
    number = re.sub(r'[\[\]]', '', paragraphs[0].text)
    number = ' '.join(number.split())

    title = paragraphs[1].text.strip()
    data = blockquote.text.strip()

    return number, title, data

def save_to_csv(number, title, data):
    """Save one act as a single-row CSV file in the data directory.

    Args:
        number: Act number; stored in the ``Number`` column and used as
            the basis of the file name.
        title: Act title, stored in the ``Title`` column.
        data: Full text of the act, stored in the ``Data`` column.
    """
    # `number` is scraped text: replace path separators, characters that
    # are not allowed in file names, and control characters, so the file
    # always lands inside data_dir and the write cannot fail on the name.
    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', number)
    file_path = os.path.join(data_dir, f"{safe_name}.csv")
    df = pd.DataFrame([[number, title, data]], columns=["Number", "Title", "Data"])
    df.to_csv(file_path, index=False, encoding='utf-8')
    logger.info(f"Data saved to {file_path}")

def main():
    """Scrape every act linked from the index page and save each to CSV.

    Fetches the index page, collects the act links, then downloads and
    saves the acts one by one. Acts with incomplete data are skipped with
    a warning; the function returns early if the index page cannot be
    retrieved.
    """
    main_page_html = fetch_page(start_url)
    if not main_page_html:
        logger.error("Failed to retrieve main page.")
        return

    act_links = parse_main_page(main_page_html)
    for url, listed_title in act_links:
        logger.info(f"Processing URL: {url}")
        number, page_title, data = parse_act_page(url)
        # Prefer the title found on the act page; fall back to the title
        # from the index table instead of discarding it.
        title = page_title or listed_title
        if number and title and data:
            save_to_csv(number, title, data)
        else:
            logger.warning(f"Skipping due to incomplete data for URL: {url}")

# Start the scrape only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()


2024-11-14 16:12:26,128 - INFO - Found 734 links in the table.
2024-11-14 16:12:26,128 - INFO - Processing URL: https://lawphil.net/comacts/acts/ca1946/ca_733_1946.html
2024-11-14 16:12:34,744 - ERROR - Failed to fetch page: https://lawphil.net/comacts/acts/ca1946/ca_733_1946.html. Error: 404 Client Error: Not Found for url: https://lawphil.net/comacts/acts/ca1946/ca_733_1946.html
2024-11-14 16:12:34,747 - INFO - Processing URL: https://lawphil.net/comacts/acts/ca1946/ca_732_1946.html


KeyboardInterrupt: 