In [7]:
import requests
import os
import csv
import logging
from bs4 import BeautifulSoup

# Configure root logging: timestamp, severity, then the message text
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Site root for the constitution pages, and the index page that links to them
base_url = "https://lawphil.net/consti/"
index_url = base_url + "constitu.html"

# Folder that receives one CSV per constitution; created up front if missing
output_dir = "Constitutions"
os.makedirs(output_dir, exist_ok=True)

# Function to scrape constitution links from index page
def get_constitution_links(timeout: float = 30) -> list:
    """Collect the constitution links published on the index page.

    Downloads ``index_url`` and gathers every ``<a class="off">`` element
    that carries an ``href`` attribute.

    Args:
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A list of ``(title, url)`` tuples, where ``title`` is the stripped
        anchor text and ``url`` is the ``href`` resolved against ``base_url``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import urljoin

    logging.info("Fetching constitution links from index page.")
    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    links = [
        # urljoin keeps absolute hrefs intact and resolves relative ones;
        # plain concatenation would yield a malformed URL for the former.
        (a_tag.text.strip(), urljoin(base_url, a_tag["href"]))
        for a_tag in soup.find_all("a", class_="off")
        if "href" in a_tag.attrs
    ]
    logging.info(f"Found {len(links)} constitution links.")
    return links

# Function to fetch and parse constitution content
def fetch_constitution_content(url: str, timeout: float = 30) -> str:
    """Download one constitution page and return its blockquote text.

    Args:
        url: Address of the constitution page to download.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The text of every ``<blockquote>`` element on the page, one element
        per line.  Inside an element the text fragments are stripped and
        joined with single spaces.  An empty string is returned when the page
        has no blockquotes.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    logging.info(f"Fetching constitution content from {url}")
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # NOTE(review): find_all searches recursively, so if the pages nest
    # blockquotes the inner text is emitted twice (once inside its parent,
    # once on its own line) -- verify against the real markup.
    return "\n".join(
        bq.get_text(separator=" ", strip=True)
        for bq in soup.find_all("blockquote")
    )

# Helper that turns a scraped title into a name that is safe to use as a file
def _safe_file_stem(title):
    """Return *title* with characters that are unsafe in file names replaced.

    Path separators, the characters Windows forbids in file names, and ASCII
    control characters become underscores.  Leading/trailing spaces and dots
    are removed, and ``"untitled"`` is returned if nothing is left.
    """
    forbidden = '<>:"/\\|?*'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in title
    ).strip(" .")
    return cleaned or "untitled"


# Function to save constitution content to a CSV file
def save_to_csv(title, content, directory=None):
    """Write one constitution to ``<directory>/<title>.csv``.

    The file has a ``constitution,data`` header followed by one row per
    newline-separated line of *content*; every row repeats *title* in the
    first column.

    Args:
        title: Constitution name.  Written unchanged into the CSV; a sanitised
            copy is used for the file name, because the title comes from
            scraped anchor text and may contain path separators.
        content: Text to store, with lines separated by ``"\\n"``.
        directory: Folder to write into.  Defaults to the module-level
            ``output_dir``.  Created if it does not exist.

    Raises:
        OSError: If the directory or the file cannot be created or written.
    """
    target_dir = output_dir if directory is None else directory
    os.makedirs(target_dir, exist_ok=True)
    file_name = os.path.join(target_dir, f"{_safe_file_stem(title)}.csv")
    logging.info(f"Saving content to {file_name}")
    # newline='' lets the csv module control line endings itself.
    with open(file_name, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        # Write header row
        writer.writerow(["constitution", "data"])
        # Write each paragraph with the title in the "constitution" column
        writer.writerows([title, line] for line in content.split("\n"))
    logging.info(f"Successfully saved {title} to CSV.")


# Entry point for the scrape: walk every indexed constitution and export it
def scrape_constitutions():
    """Fetch each constitution listed on the index page and write it to CSV.

    A failure while fetching or saving one constitution is logged and the
    loop moves on to the next entry.  An error raised while reading the index
    page itself is not caught here.
    """
    for name, page_url in get_constitution_links():
        try:
            save_to_csv(name, fetch_constitution_content(page_url))
        except Exception as err:
            logging.error(f"Error processing {name}: {err}")

# Run the scraper only when this file is executed directly (not on import);
# the two log lines bracket the run so its start and end show up in the log.
if __name__ == "__main__":
    logging.info("Starting constitution scraping process.")
    scrape_constitutions()
    logging.info("Constitution scraping process completed.")


2024-11-14 08:30:26,199 - Starting constitution scraping process.
2024-11-14 08:30:26,200 - Fetching constitution links from index page.
2024-11-14 08:30:26,453 - Found 6 constitution links.
2024-11-14 08:30:26,454 - Fetching constitution content from https://lawphil.net/consti/cons1987.html
2024-11-14 08:30:27,147 - Saving content to Constitutions/1987 Constitution.csv
2024-11-14 08:30:27,148 - Successfully saved 1987 Constitution to CSV.
2024-11-14 08:30:27,148 - Fetching constitution content from https://lawphil.net/consti/cons1973.html
2024-11-14 08:30:27,654 - Saving content to Constitutions/1973 Constitution.csv
2024-11-14 08:30:27,656 - Successfully saved 1973 Constitution to CSV.
2024-11-14 08:30:27,656 - Fetching constitution content from https://lawphil.net/consti/cons1935.html
2024-11-14 08:30:28,058 - Saving content to Constitutions/1935 Constitution.csv
2024-11-14 08:30:28,063 - Successfully saved 1935 Constitution to CSV.
2024-11-14 08:30:28,063 - Fetching constitution co