In [None]:
import os
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import re

# Index page that scrape_jurisprudence() fetches first; year links are
# resolved relative to this URL.
BASE_URL = "https://lawphil.net/judjuris/judjuris.html"
# Root output directory; one sub-folder per year is created beneath it.
MAIN_FOLDER = "Jurisprudence"

# Function to create folders if they do not exist
def create_folder(path):
    """Create directory ``path`` together with any missing parents.

    Calling this for a directory that already exists is a no-op.
    ``exist_ok=True`` replaces the previous check-then-create sequence, so
    there is no window in which the directory can appear between the check
    and the creation.

    Raises:
        FileExistsError: if ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

# Function to log messages
def log(message):
    """Write ``message`` to standard output, prefixed with ``[LOG]``."""
    line = "[LOG] {}".format(message)
    print(line)

# Function to sanitize filenames and folder names
def sanitize_filename(name):
    """Return ``name`` made safe for use as a file or folder name.

    Surrounding whitespace is removed first. Then every character that is
    reserved in file names (``\\ / * ? : " < > |``) and every ASCII control
    character (newline, tab, ...) is replaced with an underscore. Scraped
    link text can contain line breaks, which the reserved-character list
    alone would let through.

    Args:
        name: Raw text, e.g. the visible text of a link.

    Returns:
        The cleaned string. It may be empty if ``name`` was empty or
        consisted only of whitespace.
    """
    return re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name.strip())

# Function to check if a folder already contains data
def is_data_collected(year_folder, month_folder):
    """Report whether ``month_folder`` already holds at least one CSV file.

    Side effect: if ``month_folder`` does not exist yet it is created, so
    the caller can write into it afterwards.

    Args:
        year_folder: Not used by this function; kept for the existing
            call signature.
        month_folder: Directory to inspect (and create when missing).

    Returns:
        True if any directory entry name ends with ``.csv``, else False.
    """
    if not os.path.exists(month_folder):
        os.makedirs(month_folder)

    for entry in os.listdir(month_folder):
        if entry.endswith(".csv"):
            return True
    return False

# Function to scrape data from each case page
def scrape_case_data(case_url, year, month, case_number, timeout=30):
    """Download a case page and return the text of its first ``<blockquote>``.

    Args:
        case_url: Absolute URL of the case page.
        year: Not used here; kept for the existing call signature.
        month: Not used here; kept for the existing call signature.
        case_number: Not used here; kept for the existing call signature.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The blockquote text with child strings joined by newlines and the
        result stripped, or None when the URL is not HTTP(S), the request
        fails, the server answers with an error status, or the page has
        no ``<blockquote>``.
    """
    # Anchors such as "javascript:history.back(1)" cannot be fetched; skip
    # them up front instead of letting the HTTP client raise.
    if not case_url.lower().startswith(("http://", "https://")):
        return None

    try:
        response = requests.get(case_url, timeout=timeout)
        # Treat 4xx/5xx as failures so an error page is never saved as case text.
        response.raise_for_status()
    except requests.RequestException as e:
        log(f"Error fetching case data: {e}")
        return None

    case_soup = BeautifulSoup(response.content, 'html.parser')
    blockquote = case_soup.find("blockquote")
    if blockquote is None:
        return None
    return blockquote.get_text(separator="\n").strip()

# Function to get all links from a page and retry if request fails
def get_links(url, retry=3, timeout=30):
    """Fetch ``url`` and return the parsed page, retrying on request errors.

    Despite its name, this returns the whole parsed document (a
    ``BeautifulSoup`` object), not a list of links.

    Args:
        url: Page to download.
        retry: Maximum number of attempts; values <= 0 make no request.
        timeout: Seconds to wait for the server on each attempt, so a
            stalled connection cannot hang the whole run.

    Returns:
        The parsed page, or None if every attempt failed.
    """
    for attempts_left in range(retry, 0, -1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return BeautifulSoup(response.content, 'html.parser')
        except requests.RequestException as e:
            log(f"Error fetching {url}: {e}")
            # Pause only when another attempt follows; sleeping after the
            # final failure would just delay returning None.
            if attempts_left > 1:
                time.sleep(2)
    return None

# Main function to scrape years, months, and cases
def scrape_jurisprudence():
    """Walk the index page year -> month -> case and save each case as a CSV.

    Output layout: ``MAIN_FOLDER/<year>/<month>/<case text>.csv``. Each file
    holds a header row and one data row (year, month, case number, case text).

    A month is skipped when its folder already contains any CSV file.
    NOTE(review): this means a month interrupted part-way through is treated
    as complete on the next run; delete its folder to re-scrape it.
    """
    # Calendar position keyed by the first three letters of the month name,
    # so months are visited January..December rather than alphabetically.
    month_order = {
        abbr: position
        for position, abbr in enumerate(
            ("jan", "feb", "mar", "apr", "may", "jun",
             "jul", "aug", "sep", "oct", "nov", "dec")
        )
    }

    def month_sort_key(link):
        text = link.get_text().strip()
        # Unrecognised labels sort after the real months, ordered by text.
        return (month_order.get(text[:3].lower(), len(month_order)), text)

    # Fetch the base index page
    soup = get_links(BASE_URL)
    if soup is None:
        log("Failed to fetch the base page.")
        return

    # Create the main directory for storing data
    create_folder(MAIN_FOLDER)

    # Keep only links that have a target and whose text is a number; a stray
    # non-year link would otherwise crash the int() sort key.
    year_links = sorted(
        (
            link for link in soup.select("a.off_n1")
            if link.get('href') and link.get_text().strip().isdecimal()
        ),
        key=lambda link: int(link.get_text()),
    )

    for year_link in year_links:
        year_text = year_link.get_text()
        year_url = urljoin(BASE_URL, year_link['href'])
        year_folder = os.path.join(MAIN_FOLDER, sanitize_filename(year_text))
        create_folder(year_folder)

        log(f"Processing year: {year_text}")

        # Fetch the year page
        year_soup = get_links(year_url)
        if year_soup is None:
            continue

        # Month links in calendar order; links without a target are dropped.
        month_links = sorted(
            (link for link in year_soup.select("a.off") if link.get('href')),
            key=month_sort_key,
        )

        for month_link in month_links:
            month_text = month_link.get_text()
            month_folder = os.path.join(year_folder, sanitize_filename(month_text))

            # Check if the data for the year and month has already been collected
            # (this call also creates month_folder when it is missing).
            if is_data_collected(year_folder, month_folder):
                log(f"Data for {month_text} {year_text} already collected. Skipping.")
                continue

            log(f"Processing month: {month_text} of year {year_text}")

            # Fetch the month page
            month_url = urljoin(year_url, month_link['href'])
            month_soup = get_links(month_url)
            if month_soup is None:
                continue

            # Every anchor on the month page is a candidate case link
            for case_link in month_soup.find_all("a"):
                href = case_link.get('href')
                # Skip links that don't have an 'href' attribute or are empty
                if not href:
                    log(f"Skipping invalid link in month {month_text}, year {year_text}: {case_link}")
                    continue

                case_text = case_link.get_text().strip()
                case_url = urljoin(month_url, href)

                # Navigation anchors (empty text, "javascript:..." targets)
                # are not cases: they would produce a nameless ".csv" file
                # and an unfetchable request.
                if not case_text or not case_url.lower().startswith(("http://", "https://")):
                    log(f"Skipping non-case link in month {month_text}, year {year_text}: {href}")
                    continue

                # Sanitize the case number text to create a valid file name
                case_file_name = sanitize_filename(case_text) + ".csv"
                csv_file_path = os.path.join(month_folder, case_file_name)

                log(f"Processing case number: {case_text} in month {month_text}, year {year_text}")

                # Scrape case data
                case_data = scrape_case_data(case_url, year_text, month_text, case_text)

                if case_data:
                    # Save case data to a CSV file
                    with open(csv_file_path, mode="w", newline='', encoding="utf-8", errors="replace") as csv_file:
                        writer = csv.writer(csv_file)
                        writer.writerow(["Year", "Month", "Case Number", "Case Data"])
                        writer.writerow([year_text, month_text, case_text, case_data])

                    log(f"Saved case data for {case_text} in month {month_text}, year {year_text}")
                else:
                    log(f"Skipping case number: {case_text} as no data was found")

                # Respectful delay to avoid hitting the server too hard
                time.sleep(1)

# Run the scraping process only when executed directly (script or notebook
# cell), so importing this module does not start a long network crawl.
if __name__ == "__main__":
    scrape_jurisprudence()


[LOG] Processing year: 1901
[LOG] Processing month: August of year 1901
[LOG] Skipping invalid link in month August, year 1901: <a name="top"></a>
[LOG] Processing case number: G.R. No. 456 in month August, year 1901
[LOG] Saved case data for G.R. No. 456 in month August, year 1901
[LOG] Processing case number: G.R. No. 17 in month August, year 1901
[LOG] Saved case data for G.R. No. 17 in month August, year 1901
[LOG] Skipping invalid link in month August, year 1901: <a class="vs">vs.</a>
[LOG] Processing case number: G.R. No. 26 in month August, year 1901
[LOG] Saved case data for G.R. No. 26 in month August, year 1901
[LOG] Skipping invalid link in month August, year 1901: <a class="vs">vs.</a>
[LOG] Processing case number: G.R. No. 12 in month August, year 1901
[LOG] Saved case data for G.R. No. 12 in month August, year 1901
[LOG] Processing case number:  in month August, year 1901
[LOG] Error fetching case data: No connection adapters were found for 'javascript:history.back(1)'
[L