In [3]:
import os
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import re
import logging

# Define base URL and main folder for saving data
BASE_URL = "https://lawphil.net/judjuris/judjuris.html"
MAIN_FOLDER = "Data/Jurisprudence"

# Define delay and maximum retries
REQUEST_DELAY = 0  # seconds between requests
TIMEOUT = 10  # seconds before timing out the request

# Configure logging
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Function to create folders if they do not exist
def create_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)

# Function to sanitize filenames and folder names
def sanitize_filename(name, max_length=100):
    # Replace invalid characters and truncate the name if it's too long
    sanitized_name = re.sub(r'[\\/*?:"<>|]', "_", name)
    return sanitized_name[:max_length]  # Ensure the filename doesn't exceed max_length


# Function to check if a folder already contains data
def is_data_collected(year_folder, month_folder):
    # Check if the folder contains any CSV files (indicating data has been collected)
    if not os.path.exists(month_folder):
        create_folder(month_folder)
    return any(file.endswith(".csv") for file in os.listdir(month_folder))

# Function to scrape data from each case page
def scrape_case_data(case_url, year, month, case_number):
    retries = 3
    while retries > 0:
        try:
            response = requests.get(case_url, timeout=TIMEOUT)
            response.raise_for_status()
            case_soup = BeautifulSoup(response.content, 'html.parser')
            blockquote = case_soup.find("blockquote")

            if blockquote:
                paragraphs = blockquote.find_all("p")
                if len(paragraphs) >= 3:
                    # Extract Case Number (remove square brackets), Title, and full data
                    case_number_text = re.sub(r'\[|\]', '', paragraphs[2].get_text(strip=True))
                    title_text = paragraphs[3].get_text(strip=True)
                    case_data = blockquote.get_text(separator="\n").strip()
                    return case_number_text, title_text, case_data
            return None, None, None
        except requests.RequestException as e:
            retries -= 1
            logger.warning(f"Error fetching case data from {case_url}: {e}. Retrying {retries} more times...")
            time.sleep(REQUEST_DELAY * (2 ** retries))  # Exponential backoff
            
    logger.error(f"Failed to fetch case data after retries: {case_url}")
    return None, None, None

# Function to get all links from a page and retry if request fails
def get_links(url, retry=3):
    while retry > 0:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            return soup
        except requests.RequestException as e:
            logger.warning(f"Error fetching {url}: {e}. Retrying {retry - 1} more times...")
            retry -= 1
            time.sleep(2)  # wait before retry
    logger.error(f"Failed to fetch {url} after retries.")
    return None

# Main function to scrape years, months, and cases
def scrape_jurisprudence():
    # Fetch the base index page
    soup = get_links(BASE_URL)
    if not soup:
        logger.error("Failed to fetch the base page.")
        return
    
    # Create the main directory for storing data
    create_folder(MAIN_FOLDER)
    
    # Find all year links and sort them chronologically
    # Find all year links and filter them to start from 1997
    year_links = sorted([link for link in soup.select("a.off_n1") if int(link.get_text()) >= 1997],key=lambda link: int(link.get_text()))

    
    for year_link in year_links:
        year_text = year_link.get_text()
        year_url = urljoin(BASE_URL, year_link['href'])
        year_folder = os.path.join(MAIN_FOLDER, sanitize_filename(year_text))
        create_folder(year_folder)
        
        logger.info(f"Starting processing for year: {year_text}")
        
        # Fetch the year page
        year_soup = get_links(year_url)
        if not year_soup:
            continue
        
        # Find all month links and sort them from January to December
        month_links = sorted(year_soup.select("a.off"), key=lambda link: link.get_text())
        
        for month_link in month_links:
            month_text = month_link.get_text()
            month_folder = os.path.join(year_folder, sanitize_filename(month_text))
            
            # Check if the data for the year and month has already been collected
            if is_data_collected(year_folder, month_folder):
                logger.info(f"Data for {month_text} {year_text} already collected. Skipping.")
                continue

            logger.info(f"Starting processing for month: {month_text} of year {year_text}")
            
            # Fetch the month page
            month_url = urljoin(year_url, month_link['href'])
            month_soup = get_links(month_url)
            if not month_soup:
                continue
            
            # Find all case links on the month page
            case_links = month_soup.find_all("a")
            
            for case_link in case_links:
                # Skip links that don't have an 'href' attribute or are empty
                if 'href' not in case_link.attrs or not case_link['href']:
                    logger.warning(f"Skipping invalid link in month {month_text}, year {year_text}: {case_link}")
                    continue

                case_text = case_link.get_text()
                case_url = urljoin(month_url, case_link['href'])
                
                # Sanitize the case number text to create a valid file name
                case_file_name = sanitize_filename(case_text) + ".csv"
                csv_file_path = os.path.join(month_folder, case_file_name)
                
                logger.info(f"Processing case number: {case_text} in month {month_text}, year {year_text}")
                
                # Scrape case data
                case_number, title, case_data = scrape_case_data(case_url, year_text, month_text, case_text)
                
                if case_data:
                    # Save case data to a CSV file
                    with open(csv_file_path, mode="w", newline='', encoding="utf-8", errors="replace") as csv_file:
                        writer = csv.writer(csv_file)
                        writer.writerow(["Year", "Month", "Case Number", "Title", "Data"])
                        writer.writerow([year_text, month_text, case_number, title, case_data])
                    
                    logger.info(f"Saved case data for {case_text} in month {month_text}, year {year_text}")
                else:
                    logger.warning(f"Skipping case number: {case_text} as no data was found")

                # Respectful delay to avoid hitting the server too hard
                time.sleep(1)

    logger.info("Completed scraping process.")

# Run the scraping process
scrape_jurisprudence()


[2024-11-15 21:01:46,612] INFO: Starting processing for year: 1997
[2024-11-15 21:01:46,927] INFO: Starting processing for month: April of year 1997
[2024-11-15 21:01:47,267] INFO: Processing case number: G.R. No. 110286 in month April, year 1997
[2024-11-15 21:01:47,625] INFO: Saved case data for G.R. No. 110286 in month April, year 1997
[2024-11-15 21:01:48,632] INFO: Processing case number: G.R. No. 116732 in month April, year 1997
[2024-11-15 21:01:48,977] INFO: Saved case data for G.R. No. 116732 in month April, year 1997
[2024-11-15 21:01:49,985] INFO: Processing case number: A.M. No. MTJ-97-1114 in month April, year 1997
[2024-11-15 21:01:50,263] INFO: Saved case data for A.M. No. MTJ-97-1114 in month April, year 1997
[2024-11-15 21:01:51,270] INFO: Processing case number: G.R. No. 100197 in month April, year 1997
[2024-11-15 21:01:51,705] INFO: Saved case data for G.R. No. 100197 in month April, year 1997
[2024-11-15 21:01:52,713] INFO: Processing case number: G.R. No. 100197 i

OSError: [Errno 63] File name too long: 'Data/Jurisprudence/1997/August/Justice Davide, Jr.\nConcurring OpinionJustice Francisco\nDissenting OpinionJustice Regalado\n\nG.R. No. 118815August 18, 1997\r\n\r\nPeople of the Philippines vs. Anita Melgar — Mercader Y Tongco\r\n\n\nG.R. No. 119252August 18, 1997\r\n\r\nCommissioner of Internal Revenue, et al. vs. Apolinario B. Santos, et al.\r\n\n\nG.R. No. 119288August 18, 1997\r\n\r\nRepublic of the Philippines vs. Court of Appeals, et al.\r\n\n\nG.R. No. 119368August 18, 1997\r\n\r\nPeople of the Philippines vs. Marcelino Erardo\r\n\n\nG.R. No. 119696August 18, 1997\r\n\r\nPeople of the Philippines vs. Razul Guiamil Y Angkat, et al.\r\n\n\nG.R. No. 120256August 18, 1997\r\n\r\nHermito Cabcaban vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 123276August 18, 1997\r\n\r\nMario Tiu, et al. vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 124520August 18, 1997\r\n\r\nNilo Cha, et al. vs. Court of Appeals, et al.\r\n\n\nG.R. No. 95449August 18, 1997\r\n\r\nPhilippine-Singapore Transport Services, Inc. vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 95523August 18, 1997\r\n\r\nReynaldo R. Gonzales vs. Court of Appeals, et al.\r\n\n\nG.R. No. 98107August 18, 1997\r\n\r\nBenjamin C. Juco vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 108611August 20, 1997\r\n\r\nPeople of the Philippines vs. Jose Asto, et al.\r\n\n\nA.M. No. 93-9-1237-RTCAugust 21, 1997\r\n\r\nRe_ Loss of Court Exhibits\r\n\n\nA.M. No. 96-11-402-RTCAugust 21, 1997\r\n\r\nRe_ Report on the Judicial Audit\r\n\n\nA.M. No. 97-2-12-MTCAugust 21, 1997\r\n\r\nRe_ Issuance of Subpoena to Prisoner\r\n\n\nG.R. No. 101829August 21, 1997\r\n\r\nPeople of the Philippines vs. Bonifacio Zamora, et al.\r\n\n\nG.R. No. 102018August 21, 1997\r\n\r\nPeople of the Philippines vs. Jerry Gabayron\r\n\n\nG.R. No. 103959August 21, 1997\r\n\r\nRegalado Santiago, et al. vs. Court of Appeals, et al.\r\n\n\nG.R. No. 108183-85August 21, 1997\r\n\r\nPeople of the Philippines vs. Dione Palomar, et al.\r\n\n\nG.R. No. 110249August 21, 1997\r\n\r\nAlfredo Tano, et al. vs. Salvador P. Socrates, et al.\r\nDissenting OpinionJustice Bellosillo\nConcurring OpinionJustice Mendoza\n\n\nG.R. No. 112513August 21, 1997\r\n\r\nEdgar R. Del Castillo vs. Civil Service Commission, et al.\r\n\n\nG.R. No. 113032August 21, 1997\r\n\r\nWestern Institute of Technology, Inc., et al. vs. Ricardo T. Salas, et al.\r\n\n\nG.R. No. 116294August 21, 1997\r\n\r\nPeople of the Philippines vs. Antonio Chavez Y Estamante, et al.\r\n\n\nG.R. No. 120691August 21, 1997\r\n\r\nBionic Heavy Equipments, Inc., et al. vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 123053August 21, 1997\r\n\r\nPeople of the Philippines vs. Leonardo L. Carizo, et al.\r\n\n\nG.R. No. 123492August 21, 1997\r\n\r\nDanilo A. Yap vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 126749August 21, 1997\r\n\r\nEriberto M. Suson vs. Court of Appeals, et al.\r\n\n\nG.R. No. 127896August 21, 1997\r\n\r\nAdriano A. Arellano, Jr. vs. National Labor Relations Commission, et al.\r\n\n\nG.R. No. 94723August 21, 1997\r\n\r\nKaren E. Salvacion, et al. vs. Central Bank of the Philippines, et al.\r\n\n\nG.R. No. 96176August 21, 1997\r\n\r\nPeople of the Philippines vs. Zenaida Isla\r\n\n\nG.R. Nos. 116602-03August 21, 1997\r\n\r\nCarmelita Sarao vs. Court of Appeals, et al.\r\n\n\nG.R. No. 109578August 27, 1997\r\n\r\nPeople of the Philippines vs. Ronaldo S. Fabro, et al.\r\n\n\nG.R. No. 115581August 29, 1997\r\n\r\nPeople of the Philippines vs. Vacita Latura Jones\r\n\n\nG.R. No. 119332August 29, 1997\r\n\r\nPeople of the Philippines vs. Jack V. Sorrel\r\n\n\nG.R. No. 123581August 29, 1997\r\n\r\nRodrigo B. Bangayan, et al. vs. Court of Appeals, et al.\r\n\n\nG.R. No. 97642August 29, 1997\r\n\r\nAvon Insurance PLC. British Reserve Insurance Co., Ltd., et al. vs. Court of Appeals, et al.\r\n\n\nG.R. Nos. 116744-47August 29, 1997\r\n\r\nPeople of the Philippines vs. Bernardo _Toldo_ Panes, et al.\r\n\nThe Lawphil Project - Arellano Law Foundation, Inc.\n\n.csv'