In [9]:
import csv
import logging
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Configure the root logger: INFO and above, each record prefixed with a
# timestamp and its level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# base_url is the prefix that scraped hrefs are appended to; index_url is the
# page that lists the individual constitution links.
base_url = "https://lawphil.net/consti/"
index_url = f"{base_url}constitu.html"

# Directory to save the constitution CSV files (relative to the current
# working directory). It is created at import time; exist_ok=True makes
# re-running a no-op when it already exists.
output_dir = "Data/Constitutions"
os.makedirs(output_dir, exist_ok=True)

# Function to scrape constitution links from index page
def get_constitution_links(timeout=30):
    """Collect the constitution links listed on the index page.

    Args:
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A list of ``(link_text, absolute_url)`` tuples, one for every
        ``<a class="off">`` element on the index page that has an ``href``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    logging.info("Fetching constitution links from index page.")
    # requests has no default timeout; without one an unresponsive server
    # would block this call forever.
    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    links = [
        # urljoin resolves relative hrefs against base_url exactly like the
        # plain concatenation did, but also keeps absolute and root-relative
        # hrefs valid instead of gluing them onto the base.
        (a_tag.text.strip(), urljoin(base_url, a_tag["href"]))
        for a_tag in soup.find_all("a", class_="off")
        if a_tag.has_attr("href")
    ]
    logging.info("Found %d constitution links.", len(links))
    return links

# Function to fetch and parse constitution content
def fetch_constitution_content(url, timeout=30):
    """Download one constitution page and extract its title and text.

    The title is taken from the first ``<b>`` inside the first ``<p>`` of the
    first ``<blockquote>``; if that paragraph has no ``<b>``, the paragraph's
    own text is used, and if there is no ``<p>`` the title is an empty string.
    The content is the text of every ``<blockquote>`` on the page, one per
    line.

    Args:
        url: Address of the constitution page.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        A ``(title, content)`` tuple of strings.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: If the page contains no ``<blockquote>`` element.
    """
    logging.info("Fetching constitution content from %s", url)
    # requests has no default timeout; without one a stalled server would
    # block the whole scraping run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): find_all searches recursively, so if a page nests
    # blockquotes the inner text would appear once per nesting level in the
    # joined content -- confirm against the real pages.
    blockquotes = soup.find_all("blockquote")
    if not blockquotes:
        # Fail with a message that names the page instead of an opaque
        # "list index out of range" from blockquotes[0].
        raise ValueError(f"No <blockquote> content found at {url}")

    title = ""
    first_paragraph = blockquotes[0].find("p")
    if first_paragraph is not None:
        bold_tag = first_paragraph.find("b")
        title_source = bold_tag if bold_tag is not None else first_paragraph
        title = title_source.get_text(strip=True)

    content = "\n".join(
        bq.get_text(separator=" ", strip=True) for bq in blockquotes
    )
    return title, content

# Function to save constitution content to a CSV file
def save_to_csv(constitution_name, title, content):
    """Write one constitution to ``<output_dir>/<constitution_name>.csv``.

    The file holds a header row (``Constitution, Title, Data``) followed by a
    single data row. An existing file with the same name is overwritten.

    Args:
        constitution_name: Display name of the constitution; also used (after
            sanitizing) as the file name.
        title: Title extracted from the page.
        content: Full text of the constitution.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The name is link text scraped from a remote page. Replace path
    # separators and characters that are illegal in file names so it can
    # neither escape output_dir nor make open() fail; ordinary names such as
    # "1987 Constitution" pass through unchanged.
    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", constitution_name).strip(" .")
    if not safe_name:
        safe_name = "untitled"

    file_name = os.path.join(output_dir, f"{safe_name}.csv")
    logging.info("Saving content to %s", file_name)
    # newline='' lets the csv module control line endings, as its docs require.
    with open(file_name, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Constitution", "Title", "Data"])
        # The row keeps the original, unsanitized name.
        writer.writerow([constitution_name, title, content])
    logging.info("Successfully saved %s to CSV.", constitution_name)

# Main function to orchestrate the scraping and saving
def scrape_constitutions():
    """Scrape every constitution linked from the index page and save each to CSV.

    A failure while fetching or saving one constitution is logged and does not
    stop the remaining ones from being processed. A failure while fetching the
    index page itself is not caught here and propagates to the caller.
    """
    links = get_constitution_links()
    for constitution_name, url in links:
        try:
            title, content = fetch_constitution_content(url)
            save_to_csv(constitution_name, title, content)
        except Exception as e:
            # Deliberately broad: this is the per-item boundary that keeps one
            # bad page from aborting the run. logging.exception emits the same
            # ERROR message plus the traceback, so parse failures can be
            # diagnosed from the log.
            logging.exception("Error processing %s: %s", constitution_name, e)

# Run the scraper only when this file is executed as the main module (not
# when imported), logging a message before and after the run.
if __name__ == "__main__":
    logging.info("Starting constitution scraping process.")
    scrape_constitutions()
    logging.info("Constitution scraping process completed.")


2024-11-14 11:48:18,098 - INFO - Starting constitution scraping process.
2024-11-14 11:48:18,099 - INFO - Fetching constitution links from index page.
2024-11-14 11:48:25,364 - INFO - Found 6 constitution links.
2024-11-14 11:48:25,365 - INFO - Fetching constitution content from https://lawphil.net/consti/cons1987.html
2024-11-14 11:48:43,365 - INFO - Saving content to Data/Constitutions/1987 Constitution.csv
2024-11-14 11:48:43,366 - INFO - Successfully saved 1987 Constitution to CSV.
2024-11-14 11:48:43,367 - INFO - Fetching constitution content from https://lawphil.net/consti/cons1973.html
2024-11-14 11:48:50,031 - INFO - Saving content to Data/Constitutions/1973 Constitution.csv
2024-11-14 11:48:50,034 - INFO - Successfully saved 1973 Constitution to CSV.
2024-11-14 11:48:50,035 - INFO - Fetching constitution content from https://lawphil.net/consti/cons1935.html
2024-11-14 11:48:55,901 - INFO - Saving content to Data/Constitutions/1935 Constitution.csv
2024-11-14 11:48:55,904 - INF