In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
# pip cannot install the system binaries this notebook also needs: Poppler
# (used by pdf2image) and Tesseract with the `amh` traineddata (used by pytesseract).
%pip install requests beautifulsoup4 pymupdf pytesseract pdf2image Pillow opencv-python langdetect numpy

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import requests
from bs4 import BeautifulSoup
import fitz  # PyMuPDF
import pytesseract
from pdf2image import convert_from_path
import re
import os
import json
from langdetect import detect
import time
from PIL import Image
import cv2
import numpy as np

# Point pytesseract at the Tesseract binary. The TESSERACT_CMD environment
# variable overrides the default Windows install location. The path is only
# applied when it exists, so on machines without it pytesseract keeps its own
# default command instead of being pointed at a non-existent file.
_tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd


In [2]:

# Configuration
# Seed pages whose <a href> links are scanned for PDF documents.
BASE_URLS = [
    "https://www.lawethiopia.com",  # Replace with actual legal sites
    "https://www.fanabc.com",
    "https://borkena.com/category/law/"
]
DOWNLOAD_DIR = "amharic_legal_pdfs"  # folder that receives downloaded PDFs (and the temporary OCR copy)
OUTPUT_JSON = "amharic_legal_data.json"  # results file; also read at start-up to skip already-processed URLs
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) LegalResearchBot/1.0"  # User-Agent header value for HTTP requests
REQUEST_DELAY = 5  # seconds between requests

# Create directories if they don't exist
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

def preprocess_image(image):
    """Enhance image quality for better OCR results.

    Args:
        image: A PIL image in any mode, or an RGB array-like that
            ``np.array`` can convert.

    Returns:
        PIL.Image.Image: single-channel image after adaptive thresholding
        and non-local-means denoising.
    """
    # cv2.COLOR_RGB2GRAY requires a 3-channel input and raises on grayscale,
    # palette, 1-bit or RGBA pages, so normalise PIL images to RGB first.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    # Convert to numpy array
    img = np.array(image)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Apply adaptive thresholding (Gaussian-weighted 11x11 neighbourhood, C=2)
    thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 11, 2)

    # Denoising (h=30, template window 7, search window 21)
    denoised = cv2.fastNlMeansDenoising(thresh, None, 30, 7, 21)

    return Image.fromarray(denoised)



In [3]:
def extract_text_with_ocr(pdf_path, dpi=300, lang='amh', poppler_path=None):
    """Extract text from scanned PDFs using OCR.

    Pages are rendered into a temporary folder and OCR'd one at a time, so a
    long document never has all of its page images in memory at once.

    Args:
        pdf_path: Path of a PDF file on disk.
        dpi: Rendering resolution passed to pdf2image.
        lang: Tesseract language code used for recognition.
        poppler_path: Optional folder containing the Poppler binaries, for
            systems where they are not on PATH.

    Returns:
        str | None: one whitespace-normalised line per page, each followed by
        a newline, or None when no text was recognised or any step failed
        (the error is printed, not raised).
    """
    import tempfile  # stdlib; imported locally so this cell stays self-contained

    try:
        page_texts = []
        with tempfile.TemporaryDirectory() as render_dir:
            # paths_only=True returns file paths instead of loaded images.
            page_paths = convert_from_path(
                pdf_path,
                dpi=dpi,
                output_folder=render_dir,
                paths_only=True,
                poppler_path=poppler_path,
            )

            for page_path in page_paths:
                # Close the rendered file before OCR so the temporary folder
                # can be removed afterwards (matters on Windows).
                with Image.open(page_path) as page_image:
                    processed_image = preprocess_image(page_image)

                page_text = pytesseract.image_to_string(processed_image, lang=lang)

                # Collapse every run of whitespace so each page becomes one line
                page_texts.append(re.sub(r'\s+', ' ', page_text).strip())

        text = "".join(page + "\n" for page in page_texts)
        return text if text.strip() else None
    except Exception as e:
        print(f"OCR failed for {pdf_path}: {str(e)}")
        return None


In [4]:

def extract_text_from_pdf(pdf_url, timeout=60):
    """Hybrid text extraction - tries digital first, then OCR.

    Args:
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the server before giving up; without it
            a stalled connection would block the run indefinitely.

    Returns:
        str | None: the embedded text when it passes ``is_valid_amharic``,
        otherwise the OCR result (which may be None). None is also returned,
        after printing the error, when the download or parsing fails.
    """
    import tempfile  # stdlib; imported locally so this cell stays self-contained

    try:
        # Download PDF
        response = requests.get(
            pdf_url, headers={'User-Agent': USER_AGENT}, timeout=timeout
        )
        response.raise_for_status()

        # First try digital text extraction
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            digital_text = "".join(page.get_text("text") or "" for page in doc)

        # If we got sufficient Amharic text, return it
        if is_valid_amharic(digital_text):
            return digital_text

        # Otherwise OCR a temporary on-disk copy. A unique name avoids
        # clobbering a real download called "temp.pdf", and the finally block
        # removes the copy even if writing or OCR raises.
        fd, temp_path = tempfile.mkstemp(suffix=".pdf", dir=DOWNLOAD_DIR)
        try:
            with os.fdopen(fd, 'wb') as f:
                f.write(response.content)
            return extract_text_with_ocr(temp_path)
        finally:
            os.remove(temp_path)

    except Exception as e:
        print(f"Error processing {pdf_url}: {str(e)}")
        return None


In [5]:

def is_valid_amharic(text, min_chars=50, min_ratio=0.3):
    """Decide whether ``text`` is substantially Amharic.

    The check is purely script-based. It does not call ``langdetect``: that
    library ships no Amharic profile, so ``detect(text) == 'am'`` could never
    be true, and bilingual documents were rejected as soon as detection
    recognised their Latin-script part.

    Args:
        text: Candidate text; None or an empty string is rejected.
        min_chars: Minimum number of characters from the Ethiopic block
            (U+1200-U+137F) that must be present.
        min_ratio: Minimum share of Ethiopic characters among all letters.
            The default is an adjustable heuristic meant to accept bilingual
            documents while rejecting mostly non-Amharic ones.

    Returns:
        bool: True when the text has at least 100 non-padding characters,
        at least ``min_chars`` Ethiopic characters, and those make up at
        least ``min_ratio`` of its letters.
    """
    if not text or len(text.strip()) < 100:
        return False

    # Check for minimum Amharic characters
    ethiopic_count = len(re.findall(r'[\u1200-\u137F]', text))
    if ethiopic_count < min_chars:
        return False

    # Letters from any other script; digits, punctuation and spaces are ignored
    other_letters = sum(
        1 for ch in text if ch.isalpha() and not '\u1200' <= ch <= '\u137F'
    )
    total = ethiopic_count + other_letters
    if total == 0:  # only reachable when min_chars <= 0
        return False

    return ethiopic_count / total >= min_ratio


In [6]:

def scrape_legal_pdfs():
    """Main scraping function with ethical delays and error handling.

    Fetches every page in ``BASE_URLS``, follows each link whose URL path ends
    in ``.pdf``, extracts its text, and keeps documents that pass
    ``is_valid_amharic``. Accepted PDFs are saved under ``DOWNLOAD_DIR`` and
    the accumulated records are rewritten to ``OUTPUT_JSON`` after each one.

    Returns:
        list[dict]: records loaded from ``OUTPUT_JSON`` plus those added in
        this run.
    """
    from urllib.parse import unquote, urljoin, urlparse  # stdlib; local import

    request_headers = {'User-Agent': USER_AGENT}
    request_timeout = 60  # seconds; keeps a stalled server from hanging the run
    processed_urls = set()
    results = []

    # Load existing results to avoid re-processing
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            results = json.load(f)
            processed_urls = {item['pdf_url'] for item in results}

    # URLs already handled, successfully or not, so a link repeated on a page
    # or across sites is not downloaded again within this run.
    attempted_urls = set(processed_urls)

    for base_url in BASE_URLS:
        try:
            print(f"Processing: {base_url}")
            time.sleep(REQUEST_DELAY)

            response = requests.get(
                base_url, headers=request_headers, timeout=request_timeout
            )
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
        except Exception as e:
            print(f"Error processing {base_url}: {str(e)}")
            continue

        for link in soup.find_all('a', href=True):
            # Normalize URL (urljoin leaves absolute URLs as they are)
            pdf_url = urljoin(base_url, link['href'].strip())
            url_path = urlparse(pdf_url).path

            # Skip non-PDF or already handled; test the path so a query
            # string or fragment after ".pdf" does not hide the document
            if not url_path.lower().endswith('.pdf') or pdf_url in attempted_urls:
                continue
            attempted_urls.add(pdf_url)

            print(f"Found PDF: {pdf_url}")
            time.sleep(REQUEST_DELAY)

            # Errors are handled per document so one bad PDF does not abandon
            # the remaining links of this site.
            try:
                # Extract text
                text = extract_text_from_pdf(pdf_url)
                if not text or not is_valid_amharic(text):
                    continue

                # Save metadata
                filename = os.path.basename(unquote(url_path))
                filepath = os.path.join(DOWNLOAD_DIR, filename)

                # Save the PDF. extract_text_from_pdf returns only text, so the
                # file is fetched again, with the same headers and a status
                # check so an error page is never stored as a PDF.
                pdf_response = requests.get(
                    pdf_url, headers=request_headers, timeout=request_timeout
                )
                pdf_response.raise_for_status()
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                # Add to results
                result = {
                    "source": base_url,
                    "pdf_url": pdf_url,
                    "local_path": filepath,
                    "title": os.path.splitext(filename)[0],
                    "type": "legal_document",
                    "language": "am",
                    "content": text,
                    "date_scraped": time.strftime("%Y-%m-%d")
                }
                results.append(result)
                processed_urls.add(pdf_url)

                # Save incremental results
                with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
                    json.dump(results, f, ensure_ascii=False, indent=2)
            except Exception as e:
                print(f"Error processing {pdf_url}: {str(e)}")
                continue

    return results


In [None]:

if __name__ == "__main__":
    # Verify Tesseract is properly configured
    try:
        pytesseract.get_tesseract_version()
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt through
        print("Error: Tesseract OCR not properly installed or configured")
        print("On Windows: Download Amharic traineddata from https://github.com/tesseract-ocr/tessdata")
        # Raise SystemExit directly: the `exit` helper is meant for
        # interactive sessions, not for stopping a program.
        raise SystemExit(1)

    print("Starting Amharic legal document scraping...")
    final_results = scrape_legal_pdfs()
    print(f"Completed! Found {len(final_results)} valid Amharic legal documents.")

Starting Amharic legal document scraping...
Processing: https://www.lawethiopia.com
Found PDF: https://www.lawethiopia.com/images/cassation/Ethiopia cassation index volume 1-18.pdf
Found PDF: https://www.lawethiopia.com/images/cassation/cassation decisions by volumes/volume 1-3.pdf
Found PDF: https://www.lawethiopia.com/images/cassation/cassation decisions by volumes/volume 4.pdf
OCR failed for amharic_legal_pdfs\temp.pdf: Unable to get page count. Is poppler installed and in PATH?
Found PDF: https://www.lawethiopia.com/images/cassation/cassation decisions by volumes/volume 5.pdf
OCR failed for amharic_legal_pdfs\temp.pdf: Unable to get page count. Is poppler installed and in PATH?
Found PDF: https://www.lawethiopia.com/images/cassation/cassation decisions by volumes/volume 6.pdf
OCR failed for amharic_legal_pdfs\temp.pdf: Unable to get page count. Is poppler installed and in PATH?
Found PDF: https://www.lawethiopia.com/images/cassation/cassation decisions by volumes/volume 7.pdf
OCR f