In [8]:
import requests
import os
import csv
import logging
from bs4 import BeautifulSoup

# Configure root logging: timestamp, severity, then the message text
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Site folder that hosts the constitution pages, and the index page inside it
base_url = "https://lawphil.net/consti/"
index_url = base_url + "constitu.html"

# Folder that receives the generated CSV files; created up front if missing
output_dir = "Constitutions"
os.makedirs(output_dir, exist_ok=True)

# Function to scrape constitution links from index page
def get_constitution_links(timeout=30):
    """Collect a (name, URL) pair for every constitution on the index page.

    Downloads ``index_url``, parses it with BeautifulSoup, and keeps each
    ``<a class="off">`` anchor that carries an ``href`` attribute.

    Args:
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list of ``(link_text, absolute_url)`` tuples, where ``link_text``
        is the anchor's text with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import urljoin

    logging.info("Fetching constitution links from index page.")
    # Without a timeout a stalled server would hang the whole run forever.
    response = requests.get(index_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    links = [
        # urljoin resolves relative, root-relative and absolute hrefs
        # correctly, unlike plain string concatenation with base_url.
        (a_tag.text.strip(), urljoin(index_url, a_tag["href"].strip()))
        for a_tag in soup.find_all("a", class_="off")
        if "href" in a_tag.attrs
    ]
    logging.info(f"Found {len(links)} constitution links.")
    return links

# Function to fetch and parse constitution content
def fetch_constitution_content(url, timeout=30):
    """Download one constitution page and turn its blockquotes into records.

    Args:
        url: Address of the constitution page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        A list with one dict per ``<blockquote>`` found on the page:
        ``"title"`` is the text of the blockquote's first ``<b>`` element
        (or ``"No Title Found"`` when there is none) and ``"data"`` is the
        whole blockquote text, which therefore also contains the title text.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.RequestException: If the request fails or times out.
    """
    logging.info(f"Fetching constitution content from {url}")
    # Without a timeout a stalled server would hang the whole run forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect data from each blockquote
    data = []
    for blockquote in soup.find_all("blockquote"):
        # Extract title from the first <b> element if available
        title_tag = blockquote.find("b")
        title = title_tag.get_text(strip=True) if title_tag else "No Title Found"

        # Full text of the blockquote, with its text fragments joined by spaces
        content = blockquote.get_text(separator=" ", strip=True)
        data.append({"title": title, "data": content})

    return data

# Function to save constitution content to a CSV file
def save_to_csv(constitution_name, data):
    """Write the records of one constitution to a CSV file in ``output_dir``.

    The file has the header ``constitution, title, data`` followed by one
    row per entry. The ``constitution`` column always holds the original,
    unmodified ``constitution_name``; only the file name is sanitised.

    Args:
        constitution_name: Display name of the constitution. It is used to
            derive the file name and is repeated in every row.
        data: Iterable of dicts that each provide ``"title"`` and ``"data"``.

    Raises:
        OSError: If the file cannot be created or written.
        KeyError: If an entry lacks ``"title"`` or ``"data"``.
    """
    # Imported locally so this function carries its own dependency.
    import re

    # The name is scraped link text, so neutralise path separators and other
    # characters that are illegal in file names or could escape output_dir.
    safe_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", constitution_name).strip(" .")
    if not safe_name:
        # Empty or dot-only names would otherwise produce ".csv" or "...csv".
        safe_name = "untitled"

    file_name = os.path.join(output_dir, f"{safe_name}.csv")
    logging.info(f"Saving content to {file_name}")

    # newline='' lets the csv module control line endings itself.
    with open(file_name, mode="w", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["constitution", "title", "data"])
        writer.writerows(
            [constitution_name, entry["title"], entry["data"]] for entry in data
        )

    logging.info(f"Successfully saved {constitution_name} to CSV.")

# Main function to orchestrate the scraping and saving
def scrape_constitutions():
    """Fetch every constitution listed on the index page and save each as CSV.

    A failure while fetching or saving one constitution is logged and the
    loop moves on to the next one. A failure while reading the index page
    itself is not caught here and propagates to the caller.
    """
    for name, page_url in get_constitution_links():
        try:
            save_to_csv(name, fetch_constitution_content(page_url))
        except Exception as err:
            # Best effort: one broken page must not abort the remaining ones.
            logging.error(f"Error processing {name}: {err}")

# Entry point: run the full scrape only when this file is executed directly
# (not when it is imported), logging a message before and after the run.
if __name__ == "__main__":
    logging.info("Starting constitution scraping process.")
    scrape_constitutions()
    logging.info("Constitution scraping process completed.")


2024-11-14 09:27:30,101 - Starting constitution scraping process.
2024-11-14 09:27:30,102 - Fetching constitution links from index page.
2024-11-14 09:27:36,196 - Found 6 constitution links.
2024-11-14 09:27:36,198 - Fetching constitution content from https://lawphil.net/consti/cons1987.html
2024-11-14 09:27:41,469 - Saving content to Constitutions/1987 Constitution.csv
2024-11-14 09:27:41,471 - Successfully saved 1987 Constitution to CSV.
2024-11-14 09:27:41,471 - Fetching constitution content from https://lawphil.net/consti/cons1973.html
2024-11-14 09:27:48,419 - Saving content to Constitutions/1973 Constitution.csv
2024-11-14 09:27:48,424 - Successfully saved 1973 Constitution to CSV.
2024-11-14 09:27:48,424 - Fetching constitution content from https://lawphil.net/consti/cons1935.html
2024-11-14 09:27:54,845 - Saving content to Constitutions/1935 Constitution.csv
2024-11-14 09:27:54,849 - Successfully saved 1935 Constitution to CSV.
2024-11-14 09:27:54,849 - Fetching constitution co