In [None]:
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Author slugs; each one names an index page under `base_url`.
# ("brahmananda" is intentionally left out of the list.)
variables = """
    agastya
    ananda-tirtha
    annamacharya
    appayya-dikshita
    chanakya
    chandrashekharabharati
    ganapati-muni
    gaudapaada
    jagadisha-shastri
    kalidasa
    krishnanandasarasvati
    bhartrihari
    madhusudanasarasvati
    maheshvaranandasarasvati
    muttusvami-dikshitara
    nrisimhabharatisvami
    pandita-bellamkonda-ramaraya-kavindra
    pushpadanta
    ramana-maharshi
    ramanuja
    sachchidananda-shivabhinava-nrisimhabharati
    sadashivabrahmendra
    samartha-ramadasa
    shankaracharya
    shridhara-venkatesha
    tyagaraja
    vadiraja
    vallabhaachaarya
    valmiki
    vangipuram-narasinhacharya
    varahamihira
    vasudevananda-sarasvati
    vedanta-deshika
    vivekananda
    vyasa
    yogananda
""".split()

# Prefix that the author slug is appended to when building a page URL.
base_url = "https://sanskritdocuments.org/iast/"

# Folder for everything this notebook writes; created on first run.
output_folder = "docs/"
os.makedirs(output_folder, exist_ok=True)

# Browser-like request headers (the site answers 406 without them).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}



In [None]:
# Collect the document-page (.html) links from every author index page.
# The accumulator is created once, outside the loop, so links from all pages
# are kept; re-creating it per page would leave only the last page's links.
html_links = []

for variable in variables:
    url = base_url + variable + "/"
    print(f"Processing URL: {url}")

    try:
        # The timeout keeps one unresponsive page from stalling the whole run.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Take the first '.html' anchor of each list item on the page.
        for ul in soup.find_all("ul"):
            for li in ul.find_all("li"):
                html_tag = li.find("a", href=lambda href: href and href.endswith(".html"))
                if html_tag:
                    # urljoin leaves absolute URLs untouched and resolves both
                    # root-relative ("/x/y.html") and page-relative ("y.html") hrefs.
                    html_links.append(urljoin(url, html_tag["href"]))

    except Exception as e:
        # Best effort: report the failing page and carry on with the next one.
        print(f"Failed to process {url}: {e}")

print("html list prepared")

In [None]:
# Display the html_links list built by the previous cell.
html_links

In [None]:
# Import libraries
# ---
# HTTP requests, HTML parsing and URL joining for the structured
# link-extraction workflow that follows.
# Install the third-party packages with 'pip install requests beautifulsoup4'.
# ---
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os # Added for potential file saving later

print("Libraries imported successfully.")

In [None]:
# This function takes the base URL and a list of page suffixes,
# visits each page, parses the HTML, and extracts links ending in '.html'
# based on the specific structure (<div class='index-content'> -> <ul> -> <li> -> <a>).
# ---
def _scan_index_page(session, page_url, headers):
    """Fetch one index page and return its result record.

    The record is a dict with the keys "status" ("success", "warning" or
    "error"), "message" and "links" (the '.html' hrefs exactly as written in
    the page, without duplicates, in first-seen order).
    """
    try:
        response = session.get(page_url, headers=headers, timeout=15)
        response.raise_for_status()  # Raise for 4xx / 5xx answers

        soup = BeautifulSoup(response.content, 'html.parser')

        index_content_div = soup.find('div', class_='index-content')
        if index_content_div is None:
            print(f"  - Warning: Could not find <div class='index-content'> on {page_url}")
            return {"status": "warning", "message": "index-content div not found", "links": []}

        # Only the first <ul> inside the index div is scanned.
        ul_tag = index_content_div.find('ul')
        if ul_tag is None:
            print(f"  - Warning: Could not find <ul> within <div class='index-content'> on {page_url}")
            return {"status": "warning", "message": "ul tag not found in index-content", "links": []}

        # Recursive search, so <li> items of nested lists are included as well.
        li_tags = ul_tag.find_all('li')
        if not li_tags:
            print(f"  - Warning: Could not find any <li> tags within <ul> on {page_url}")
            return {"status": "warning", "message": "li tags not found in ul", "links": []}

        page_html_links = []
        seen = set()
        for li in li_tags:
            for a in li.find_all('a', href=True):
                href = a['href']
                # An anchor inside a nested <li> is reached once per enclosing
                # <li>; the `seen` set keeps each href only once.
                if href.endswith('.html') and href not in seen:
                    seen.add(href)
                    print(f"  - Found link: {href}")
                    page_html_links.append(href)

        print(f"  - Finished processing {page_url}. Found {len(page_html_links)} '.html' links.")
        return {
            "status": "success",
            "message": f"Found {len(page_html_links)} '.html' links.",
            "links": page_html_links,
        }

    except requests.exceptions.RequestException as e:
        print(f"  - Error fetching page {page_url}: {e}")
        return {"status": "error", "message": f"RequestException: {e}", "links": []}
    except Exception as e:
        print(f"  - An unexpected error occurred while processing {page_url}: {e}")
        return {"status": "error", "message": f"Unexpected error: {e}", "links": []}


def extract_html_links(base_url, page_suffixes):
    """
    Fetch one index page per suffix and collect the '.html' links listed on it.

    For each suffix the page at ``urljoin(base_url, suffix)`` is requested and
    the structure <div class='index-content'> -> <ul> -> <li> -> <a href> is
    scanned for hrefs ending in '.html'.

    Args:
        base_url (str): The base URL of the website
            (e.g., "https://sanskritdocuments.org/"). A trailing slash is
            added when missing.
        page_suffixes (list): Strings appended to ``base_url`` to form the
            page URLs; a leading '/' on a suffix is ignored.

    Returns:
        dict: Maps every visited page URL to a result dict with the keys
            "status" ("success", "warning" or "error"), "message" (str) and
            "links" (list of hrefs as found in the page; empty for warnings
            and errors). Failures are recorded per page, they do not abort
            the run. An empty ``page_suffixes`` yields an empty dict.
    """
    all_extracted_links = {}

    # Ensure the base URL ends with a slash for proper joining
    if not base_url.endswith('/'):
        base_url += '/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    }
    print(f"Starting extraction from base URL: {base_url}")

    if not page_suffixes:
        return all_extracted_links

    # One session for the whole run so connections to the host are reused.
    with requests.Session() as session:
        for suffix in page_suffixes:
            page_url = urljoin(base_url, suffix.lstrip('/'))
            print(f"\nProcessing page: {page_url}")
            all_extracted_links[page_url] = _scan_index_page(session, page_url, headers)

    return all_extracted_links

print("Function 'extract_html_links' defined.")

In [None]:
# Site root that every page suffix is joined onto.
BASE_WEBSITE_URL = "https://sanskritdocuments.org/"

# The author slugs from the `variables` list serve as the page suffixes.
PAGE_SUFFIXES = variables

print(
    "Configuration set:",
    f"Base URL: {BASE_WEBSITE_URL}",
    f"Page Suffixes to process: {len(PAGE_SUFFIXES)}",
    sep="\n",
)

In [None]:
print("\nStarting the extraction process...")

# Run the scraper over every configured page suffix.
extracted_data = extract_html_links(BASE_WEBSITE_URL, PAGE_SUFFIXES)

print("\n--- Extraction Summary ---")
if not extracted_data:
    print("No data was extracted. Check logs or script configuration.")
else:
    successful_extractions = warnings = errors = total_links_found = 0

    for page, result in extracted_data.items():
        print(f"\nPage: {page}")
        print(f"  Status: {result.get('status', 'unknown')}")
        print(f"  Message: {result.get('message', 'N/A')}")

        # Tally the page by status; only successful pages list their links.
        if result.get('status') == 'warning':
            warnings += 1
        elif result.get('status') == 'error':
            errors += 1
        elif result.get('status') == 'success':
            successful_extractions += 1
            links = result.get('links', [])
            total_links_found += len(links)
            if not links:
                print("  - No '.html' links found matching the criteria.")
            else:
                print(f"  Links ({len(links)}):")
                for link in links:
                    print(f"    - {link}")

    print("\n--- Overall Stats ---")
    print(f"Total pages processed: {len(extracted_data)}")
    print(f"Successful extractions: {successful_extractions}")
    print(f"Pages with warnings: {warnings}")
    print(f"Pages with errors: {errors}")
    print(f"Total '.html' links found across all successful pages: {total_links_found}")

In [None]:
# Re-print the summary from the already populated `extracted_data`
# (no pages are fetched in this cell).
print("\n--- Extraction Summary ---")
if extracted_data:
    # Per-page report first ...
    for page, result in extracted_data.items():
        print(f"\nPage: {page}")
        print(f"  Status: {result.get('status', 'unknown')}")
        print(f"  Message: {result.get('message', 'N/A')}")

        if result.get('status') == 'success':
            links = result.get('links', [])
            if links:
                print(f"  Links ({len(links)}):")
                for link in links:
                    print(f"    - {link}")
            else:
                print("  - No '.html' links found matching the criteria.")

    # ... then the totals, derived from the recorded statuses.
    statuses = [record.get('status') for record in extracted_data.values()]
    successful_extractions = statuses.count('success')
    warnings = statuses.count('warning')
    errors = statuses.count('error')
    total_links_found = sum(
        len(record.get('links', []))
        for record in extracted_data.values()
        if record.get('status') == 'success'
    )

    print("\n--- Overall Stats ---")
    print(f"Total pages processed: {len(extracted_data)}")
    print(f"Successful extractions: {successful_extractions}")
    print(f"Pages with warnings: {warnings}")
    print(f"Pages with errors: {errors}")
    print(f"Total '.html' links found across all successful pages: {total_links_found}")

else:
    print("No data was extracted. Check logs or script configuration.")

In [None]:
# Display the raw per-page result records returned by extract_html_links.
extracted_data

In [None]:
# Turn the hrefs gathered per index page into absolute URLs on this site.
# BASE_WEBSITE_URL is reused here instead of re-assigning `base_url`, which the
# first link-collection loop reads with a different value.
internal_links = []
seen_links = set()
site_host = urlparse(BASE_WEBSITE_URL).netloc.lower()

for page_url, data in extracted_data.items():
    if data.get('status') != 'success':
        continue
    for link in data.get('links', []):
        # Relative hrefs are resolved against the site root; absolute ones
        # pass through urljoin unchanged.
        full_link = urljoin(BASE_WEBSITE_URL, link)

        # "Internal" means hosted on this site, whatever the scheme; links to
        # any other host are skipped.
        link_host = urlparse(full_link).netloc.lower()
        if link_host not in (site_host, "www." + site_host):
            continue

        # The same document can be listed on several index pages; keep it once
        # so it is fetched only once later.
        if full_link not in seen_links:
            seen_links.add(full_link)
            internal_links.append(full_link)

print("\n--- Internal Links (prefixed with base URL) ---")
if internal_links:
    for link in internal_links:
        print(link)
    print(f"\nTotal internal links found: {len(internal_links)}")
else:
    print("No internal links found based on the extracted data.")

In [None]:
def get_sanskrit_text(html_url, session=None, request_headers=None, timeout=15):
    """
    Fetches the HTML content from a URL and extracts text within elements
    that have the attribute lang='sa'.

    Args:
        html_url (str): The URL of the HTML page.
        session (requests.Session, optional): Session to send the request
            with, so callers fetching many pages can reuse connections. When
            omitted, a session is created for this call and closed afterwards.
        request_headers (dict, optional): HTTP headers for the request.
            Defaults to the notebook-level ``headers`` dict.
        timeout (float, optional): Request timeout in seconds. Defaults to 15.

    Returns:
        list: A list of strings, where each string is a block of Sanskrit text found.
              Returns an empty list if there's an error or no Sanskrit text is found.
              Errors are printed, not raised.
    """
    sanskrit_texts = []
    owned_session = None  # set only when this call created the session

    try:
        if session is None:
            owned_session = session = requests.Session()
        if request_headers is None:
            request_headers = headers

        response = session.get(html_url, headers=request_headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for element in soup.find_all(attrs={'lang': 'sa'}):
            # get_text() on an outer lang='sa' element already includes the
            # text of any lang='sa' descendants, so those are skipped to avoid
            # returning the same passage twice.
            if element.find_parent(attrs={'lang': 'sa'}) is not None:
                continue

            # Extract the text content and trim whitespace around each piece
            text = element.get_text(separator='\n', strip=True)
            if text:  # Only add if the text is not empty
                sanskrit_texts.append(text)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching {html_url}: {e}")
    except Exception as e:
        print(f"An error occurred while processing {html_url}: {e}")
    finally:
        # Never close a session the caller passed in.
        if owned_session is not None:
            owned_session.close()

    return sanskrit_texts


In [None]:
# Number of internal links queued for scraping.
len(internal_links)

In [None]:
# Scrape every internal link and keep the pages that yield lang='sa' text.
html_links = internal_links

# Maps page URL -> list of text blocks returned by get_sanskrit_text.
all_sanskrit_data = {}

for link in html_links:
    print(f"\nProcessing: {link}")
    sanskrit_data = get_sanskrit_text(link)
    if not sanskrit_data:
        print("No Sanskrit text found on this page (with lang='sa').")
        continue

    print("Found Sanskrit text:")
    for text_block in sanskrit_data:
        print("-" * 20, text_block, sep="\n")
    all_sanskrit_data[link] = sanskrit_data

print("\n--- Summary ---")
print(
    f"Sanskrit text extracted from {len(all_sanskrit_data)} pages."
    if all_sanskrit_data
    else "No Sanskrit text was extracted from any of the provided links."
)

In [None]:
# Number of pages for which at least one lang='sa' text block was stored.
len(all_sanskrit_data)

In [None]:
# Sanity check of the container type (built as a dict keyed by page URL).
type(all_sanskrit_data)

In [None]:
# Display the whole URL -> text-blocks mapping.
# NOTE(review): this prints every scraped text; the output can be very large.
all_sanskrit_data

In [None]:
import re

def extract_sanskrit_shloka(text, threshold=0.7):
    """
    Extracts Sanskrit shlokas from a text by identifying lines that
    predominantly contain Devanagari script.

    A line is kept when the share of its non-whitespace characters that fall
    in the Devanagari Unicode block (U+0900-U+097F) is strictly greater than
    ``threshold``. Blank lines are dropped and kept lines are stripped of
    surrounding whitespace.

    Args:
        text (str): The input text containing Sanskrit shlokas and other content.
        threshold (float, optional): Minimum Devanagari share (exclusive) for a
            line to be kept. Defaults to 0.7.

    Returns:
        str: A string containing the extracted Sanskrit shlokas, with each shloka
             line separated by a newline. Empty string if no line qualifies.
    """
    sanskrit_shlokas = []
    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        devanagari_count = sum(1 for char in line if '\u0900' <= char <= '\u097F')
        # Whitespace is excluded so word spacing does not dilute the ratio.
        total_count = sum(1 for char in line if char.strip())

        if total_count > 0 and (devanagari_count / total_count) > threshold:
            sanskrit_shlokas.append(line)

    return "\n".join(sanskrit_shlokas)

# Try the heuristic on one page: take the first lang='sa' block stored for it
# and print the lines that pass the Devanagari filter.
# NOTE(review): raises KeyError if this URL is not a key of all_sanskrit_data.
text_to_extract = all_sanskrit_data["https://sanskritdocuments.org/doc_devii/apItakuchAmbAstava.html"][0]

extracted_shlokas = extract_sanskrit_shloka(text_to_extract)
print(extracted_shlokas)

In [None]:
# In-memory copy of the sample page's first text block, kept for the JSON round-trip comparison below.
one = all_sanskrit_data["https://sanskritdocuments.org/doc_devii/apItakuchAmbAstava.html"][0]

In [None]:
import json
# Persist the scraped texts. The encoding is given explicitly so the file does
# not depend on the platform's default text encoding.
with open('docs/sans_data.json', "w", encoding="utf-8") as file:
    json.dump(all_sanskrit_data, file)

In [None]:
# Read the saved texts back. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding.
with open('docs/sans_data.json', 'r', encoding="utf-8") as file:
    loaded_dict = json.load(file)

In [None]:
# Same sample text block, this time taken from the data re-loaded from JSON.
two = loaded_dict["https://sanskritdocuments.org/doc_devii/apItakuchAmbAstava.html"][0]

In [None]:
# JSON round-trip check: the re-loaded text must equal the in-memory one.
# A mismatch is reported too, so a failed check is never silent.
if one == two:
    print('yay')
else:
    print('JSON round-trip mismatch: the reloaded text differs from the in-memory text')

In [None]:
from IPython.display import display, Markdown
import re
import json


def contains_english(text):
    """
    Report whether a string holds at least one ASCII letter (a-z, A-Z).

    Args:
        text (str): The string to inspect.

    Returns:
        bool: True when an ASCII letter occurs anywhere in the string,
              False otherwise (including for an empty string).
    """
    return re.search(r'[a-zA-Z]', text) is not None

# Load the all_sanskrit_data from the JSON file.
# Any failure is reported and leaves all_sanskrit_data as an empty dict.
try:
    # Explicit encoding: do not depend on the platform's default text encoding.
    with open('docs/sans_data.json', 'r', encoding='utf-8') as file:
        all_sanskrit_data = json.load(file)
    display(Markdown("**Loaded `all_sanskrit_data` from 'docs/sans_data.json'**"))
except FileNotFoundError:
    display(Markdown("<span style='color:red'>**Error:** 'docs/sans_data.json' not found. Please ensure the file exists.</span>"))
    all_sanskrit_data = {}
except json.JSONDecodeError:
    display(Markdown("<span style='color:red'>**Error:** Could not decode JSON from 'docs/sans_data.json'. The file might be corrupted.</span>"))
    all_sanskrit_data = {}
except Exception as e:
    display(Markdown(f"<span style='color:red'>**Error:** An unexpected error occurred while loading JSON: {e}</span>"))
    all_sanskrit_data = {}

# Run extract_sanskrit_shloka over every stored text; each URL maps to the
# list of its non-empty extraction results.
extracted_shloka_data = {}
if not all_sanskrit_data:
    display(Markdown("<span style='color:orange'>**Warning:** `all_sanskrit_data` is empty. No texts to process.</span>"))
else:
    display(Markdown("\n**Applying `extract_sanskrit_shloka` to all extracted texts...**"))
    for url, texts in all_sanskrit_data.items():
        # A value may be a list of texts or a single string; anything else
        # contributes nothing.
        if isinstance(texts, list):
            candidates = texts
        elif isinstance(texts, str):
            candidates = [texts]
        else:
            candidates = []
        extracted_shloka_data[url] = [
            kept for kept in map(extract_sanskrit_shloka, candidates) if kept
        ]
    display(Markdown("**Extraction complete.**"))

# Save the extracted shloka data to a new JSON file
output_filename = 'docs/sanskrit_shlokas.json'
try:
    # ensure_ascii=False writes raw Devanagari characters, so the file must be
    # opened as UTF-8 explicitly: a platform default such as cp1252 cannot
    # encode them and the dump would fail with UnicodeEncodeError.
    with open(output_filename, "w", encoding="utf-8") as outfile:
        json.dump(extracted_shloka_data, outfile, indent=4, ensure_ascii=False)
    display(Markdown(f"**Extracted Sanskrit shlokas saved to '{output_filename}'**"))
except Exception as e:
    display(Markdown(f"<span style='color:red'>**Error:** Could not save extracted shlokas to '{output_filename}': {e}</span>"))

# Check for English in the extracted shlokas
if extracted_shloka_data:
    display(Markdown("\n**Checking for English words in the extracted shlokas...**"))

    # URL -> list of line-lists, one per shloka that contains an ASCII letter.
    english_found = {}
    for url, shlokas in extracted_shloka_data.items():
        english_in_url = [shloka.split('\n') for shloka in shlokas if contains_english(shloka)]
        if english_in_url:
            english_found[url] = english_in_url

    if english_found:
        display(Markdown("<span style='color:orange'>**Warning:** Potential English words found in the extracted shlokas:</span>"))
        for url, lines_with_english in english_found.items():
            offending_lines = [
                line
                for lines in lines_with_english
                for line in lines
                if contains_english(line)
            ]
            display(Markdown(f"**URL:** `{url}`"))
            # The lines go inside a single fenced block: separate
            # Markdown("```") displays do not wrap print() output.
            display(Markdown("```\n" + "\n".join(offending_lines) + "\n```"))
    else:
        display(Markdown("**No English words found in the extracted shlokas (based on simple alphabet check).**"))
else:
    display(Markdown("<span style='color:orange'>**Warning:** No extracted shlokas to check for English.</span>"))

In [None]:
import re
import json
from IPython.display import display, Markdown

def remove_non_devanagari(text):
    """Drop every character except Devanagari-block ones (U+0900-U+097F), newline, space and a few punctuation marks."""
    kept_extras = "\n ।॥,;:-_()[]"

    def _is_kept(char):
        return '\u0900' <= char <= '\u097F' or char in kept_extras

    return "".join(filter(_is_kept, text))

def contains_english(text):
    """Return True when the string holds at least one ASCII letter (a-z, A-Z), else False."""
    first_letter = re.search(r'[a-zA-Z]', text)
    return first_letter is not None

# Load the extracted shloka data from the JSON file.
# Any failure is reported and leaves extracted_shloka_data as an empty dict.
try:
    # The file holds raw (non-escaped) Devanagari, so it must be decoded as
    # UTF-8 explicitly instead of with the platform's default encoding.
    with open('docs/sanskrit_shlokas.json', 'r', encoding='utf-8') as file:
        extracted_shloka_data = json.load(file)
    display(Markdown("**Loaded `extracted_shloka_data` from 'docs/sanskrit_shlokas.json'**"))
except FileNotFoundError:
    display(Markdown("<span style='color:red'>**Error:** 'docs/sanskrit_shlokas.json' not found. Please ensure the file exists.</span>"))
    extracted_shloka_data = {}
except json.JSONDecodeError:
    display(Markdown("<span style='color:red'>**Error:** Could not decode JSON from 'docs/sanskrit_shlokas.json'. The file might be corrupted.</span>"))
    extracted_shloka_data = {}
except Exception as e:
    display(Markdown(f"<span style='color:red'>**Error:** An unexpected error occurred while loading JSON: {e}</span>"))
    extracted_shloka_data = {}

# Filter every shloka through remove_non_devanagari; each URL maps to its
# cleaned, stripped, non-empty results.
sanskrit_only_data = {}
if not extracted_shloka_data:
    display(Markdown("<span style='color:orange'>**Warning:** `extracted_shloka_data` is empty. No texts to process.</span>"))
else:
    display(Markdown("\n**Strictly removing non-Devanagari characters (keeping newlines)...**"))
    for url, shlokas_list in extracted_shloka_data.items():
        # A value may be a list of shlokas or a single string; anything else
        # contributes nothing.
        if isinstance(shlokas_list, list):
            raw_shlokas = shlokas_list
        elif isinstance(shlokas_list, str):
            raw_shlokas = [shlokas_list]
        else:
            raw_shlokas = []
        stripped = (remove_non_devanagari(raw).strip() for raw in raw_shlokas)
        sanskrit_only_data[url] = [cleaned for cleaned in stripped if cleaned]
    display(Markdown("**Non-Devanagari removal complete (keeping newlines).**"))

# Write the cleaned data as indented, non-escaped UTF-8 JSON.
output_filename = 'docs/sanskrit_only_strict.json'
try:
    with open(output_filename, "w", encoding='utf-8') as outfile:
        outfile.write(json.dumps(sanskrit_only_data, indent=4, ensure_ascii=False))
    display(Markdown(f"**Strictly Sanskrit-only data saved to '{output_filename}'**"))
except Exception as e:
    display(Markdown(f"<span style='color:red'>**Error:** Could not save strictly Sanskrit-only data to '{output_filename}': {e}</span>"))

# Verification (Optional): Check for any remaining English
remaining_english = {}
if sanskrit_only_data:
    display(Markdown("\n**Verifying for any remaining English characters...**"))
    for url, shlokas in sanskrit_only_data.items():
        english_in_url = [shloka for shloka in shlokas if contains_english(shloka)]
        if english_in_url:
            remaining_english[url] = english_in_url

    if remaining_english:
        display(Markdown("<span style='color:orange'>**Warning:** English characters still found after strict removal:</span>"))
        for url, lines_with_english in remaining_english.items():
            display(Markdown(f"**URL:** `{url}`"))
            # At most five offending entries per URL, shown inside a single
            # fenced block: separate Markdown("```") displays do not wrap
            # print() output.
            display(Markdown("```\n" + "\n".join(lines_with_english[:5]) + "\n```"))
    else:
        display(Markdown("**No English characters found after strict removal.**"))
else:
    display(Markdown("<span style='color:orange'>**Warning:** No Sanskrit-only data to verify.</span>"))

In [None]:
import json
from IPython.display import display, Markdown
import os

# Read the cleaned data back from disk. The name starts out as an empty dict
# and is replaced only when the file loads; every failure is reported.
sanskrit_only_data = {}
try:
    with open('docs/sanskrit_only_strict.json', 'r', encoding='utf-8') as file:
        sanskrit_only_data = json.load(file)
    display(Markdown("**Loaded strictly Sanskrit-only data.**"))
except FileNotFoundError:
    sanskrit_only_data = {}
    display(Markdown("<span style='color:red'>**Error:** 'docs/sanskrit_only_strict.json' not found.</span>"))
except json.JSONDecodeError:
    sanskrit_only_data = {}
    display(Markdown("<span style='color:red'>**Error:** Could not decode JSON from 'docs/sanskrit_only_strict.json'.</span>"))
except Exception as e:
    sanskrit_only_data = {}
    display(Markdown(f"<span style='color:red'>**Error:** An unexpected error occurred: {e}</span>"))

# Make sure the target directory exists before anything is written.
output_dir = 'docs/text_file'
os.makedirs(output_dir, exist_ok=True)
display(Markdown(f"**Saving Sanskrit texts to the '{output_dir}' directory.**"))

# One numbered .txt file per text; the counter advances only after a
# successful write.
if not sanskrit_only_data:
    display(Markdown("<span style='color:orange'>**Warning:** `sanskrit_only_data` is empty. No texts to save.</span>"))
else:
    file_counter = 1
    for url, shlokas_list in sanskrit_only_data.items():
        # Normalise the value to (index, text) pairs; a bare string has no
        # entry index, and any other type yields nothing to write.
        if isinstance(shlokas_list, list):
            pending = list(enumerate(shlokas_list))
        elif isinstance(shlokas_list, str):
            pending = [(None, shlokas_list)]
        else:
            pending = []

        for i, shloka in pending:
            filename = os.path.join(output_dir, f"sanskrit_text_{file_counter:04d}.txt")
            try:
                with open(filename, 'w', encoding='utf-8') as outfile:
                    outfile.write(shloka)
                if i is None:
                    display(Markdown(f"Saved: `{filename}` (from URL: `{url}`)"))
                else:
                    display(Markdown(f"Saved: `{filename}` (from URL: `{url}`, entry {i+1})"))
                file_counter += 1
            except Exception as e:
                display(Markdown(f"<span style='color:red'>**Error saving** `{filename}`: {e}</span>"))
    display(Markdown("**Sanskrit text file generation complete.**"))