<a href="https://colab.research.google.com/github/abhishekjkrsna/Gita-Web-Scrapping/blob/main/gita_scrapping_script.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Script to get all the verses of the Bhagavad Gita and store them in JSON files

It uses `requests` and `BeautifulSoup` for web scraping. The results are stored, one JSON file per chapter, in a folder named `gita`.


In [1]:
# Install the scraping dependencies without the usual pip chatter.
!pip install requests bs4 --quiet
# Create the output folder. `mkdir` has no --quiet option; -p makes the
# command succeed silently when the folder already exists.
!mkdir -p gita

# Code

In [12]:
import requests
from bs4 import BeautifulSoup
import json
from requests.exceptions import RequestException
import os

MAX_RETRIES = 3

def _div_text(soup, css_class):
    """Return the stripped text of the first <div> whose class attribute
    equals ``css_class``, or an empty string when no such <div> exists."""
    section = soup.find("div", {"class": css_class})
    return section.text.strip() if section else ""


def process_chapter(chapter):
    """Scrape every verse page of one chapter and dump the result to JSON.

    The chapter index page is fetched first, every anchor whose text starts
    with "TEXT" is collected, and each linked verse page is then fetched and
    parsed. The collected records are written to
    ``gita/chapter_{chapter}.json`` under the current working directory.

    Args:
        chapter: Chapter number; interpolated into the URL and file name.

    Raises:
        Exception: Propagated from ``fetch_with_retries`` when a page cannot
            be fetched.
    """
    print("Processing chapter: ", chapter)
    base_url = "https://vedabase.io"
    # The URL with all the links
    url = f"{base_url}/en/library/bg/{chapter}/"

    response = fetch_with_retries(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Filter for specific anchor tags containing "TEXT #:".
    # `string=` is the current name of the deprecated `text=` argument.
    specific_anchor_tags = soup.find_all(
        "a", string=lambda text: text and text.startswith("TEXT")
    )

    links = []
    # Getting all the links on the page
    for tag in specific_anchor_tags:
        href = tag.get("href")
        if not href:
            # An anchor without an href cannot be fetched; skip it.
            continue
        if not href.startswith("http"):
            href = base_url + href
        links.append(href)
        print("Found link: ", href)

    output = []
    # Going over the links and extracting the text
    for link in links:
        response = fetch_with_retries(link)
        # response.text is already decoded, so no from_encoding is passed
        # (BeautifulSoup ignores it for str input and emits a warning).
        soup = BeautifulSoup(response.text, "html.parser")

        # The title reads like "Bg. <chapter>.<verse>"; keep only the verse part.
        verse_text = _div_text(soup, "r r-title r-verse").replace(
            f"Bg. {chapter}.", ""
        )
        sloka_text = _div_text(soup, "r r-devanagari")
        translation_text = _div_text(soup, "r r-lang-en r-translation")

        # The purport may span several paragraphs; join them with newlines.
        purport_sections = soup.find_all("div", {"class": "r r-lang-en r-paragraph"})
        purport_text = "\n".join(section.text.strip() for section in purport_sections)

        output.append(
            {
                "chapter": chapter,
                "sloka": verse_text,
                "text": sloka_text,
                "translation": translation_text,
                "purport": purport_text,
            }
        )
        print(f"Processed verse: BG {chapter}.{verse_text}")

    print(f"Writing the output File chapter_{chapter}.json")
    out_dir = os.path.join(os.getcwd(), "gita")
    # Make sure the target folder exists even if the setup cell was skipped.
    os.makedirs(out_dir, exist_ok=True)
    # ensure_ascii=False writes raw Devanagari, so the file must be UTF-8
    # regardless of the platform's default encoding.
    out_path = os.path.join(out_dir, f"chapter_{chapter}.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=4, ensure_ascii=False)


def fetch_with_retries(url, timeout=30):
    """GET ``url``, retrying on request failures up to ``MAX_RETRIES`` times.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the scrape forever.

    Returns:
        The ``requests`` response of the first attempt that succeeds with a
        2xx status.

    Raises:
        Exception: When every attempt fails; the last request error is
            attached as the cause.
    """
    import time  # local import keeps this function self-contained

    last_error = None
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for non-2xx status codes
            return response
        except RequestException as e:
            last_error = e
            print(f"Error fetching {url}: {e}. Retrying... ({attempt}/{MAX_RETRIES})")
            if attempt < MAX_RETRIES:
                # Brief, growing pause so a struggling server is not hit
                # again immediately.
                time.sleep(attempt)
    raise Exception(
        f"Failed to fetch {url} after {MAX_RETRIES} retries."
    ) from last_error


if __name__ == "__main__":
    from time import sleep

    # Chapters 1 through last_chapter are scraped, one output file each.
    last_chapter = 18

    for i in range(1, last_chapter + 1):
        process_chapter(i)
        if i == last_chapter:
            # Nothing left to fetch, so no pause is needed.
            print(f"Chapter {i} processed.")
        else:
            # Pause between chapters to avoid hammering the site.
            print(f"Chapter {i} processed... Entering Sleep Mode")
            sleep(15)


  specific_anchor_tags = soup.find_all(


Found link:  https://vedabase.io/en/library/bg/1/1/
Found link:  https://vedabase.io/en/library/bg/1/2/
Found link:  https://vedabase.io/en/library/bg/1/3/
Found link:  https://vedabase.io/en/library/bg/1/4/
Found link:  https://vedabase.io/en/library/bg/1/5/
Found link:  https://vedabase.io/en/library/bg/1/6/
Found link:  https://vedabase.io/en/library/bg/1/7/
Found link:  https://vedabase.io/en/library/bg/1/8/
Found link:  https://vedabase.io/en/library/bg/1/9/
Found link:  https://vedabase.io/en/library/bg/1/10/
Found link:  https://vedabase.io/en/library/bg/1/11/
Found link:  https://vedabase.io/en/library/bg/1/12/
Found link:  https://vedabase.io/en/library/bg/1/13/
Found link:  https://vedabase.io/en/library/bg/1/14/
Found link:  https://vedabase.io/en/library/bg/1/15/
Found link:  https://vedabase.io/en/library/bg/1/16-18/
Found link:  https://vedabase.io/en/library/bg/1/19/
Found link:  https://vedabase.io/en/library/bg/1/20/
Found link:  https://vedabase.io/en/library/bg/1/21-



Processed verse:  1
Processed verse:  2
Processed verse:  3
Processed verse:  4
Processed verse:  5
Processed verse:  6
Processed verse:  7
Processed verse:  8
Processed verse:  9
Processed verse:  10
Processed verse:  11
Processed verse:  12
Processed verse:  13
Processed verse:  14
Processed verse:  15
Processed verse:  16-18
Processed verse:  19
Processed verse:  20
Processed verse:  21-22
Processed verse:  23
Processed verse:  24
Processed verse:  25
Processed verse:  26
Processed verse:  27
Processed verse:  28
Processed verse:  29
Processed verse:  30
Processed verse:  31
Processed verse:  32-35
Processed verse:  36
Processed verse:  37-38
Processed verse:  39
Processed verse:  40
Processed verse:  41
Processed verse:  42
Processed verse:  43
Processed verse:  44
Processed verse:  45
Processed verse:  46
Writing the output File
Found link:  https://vedabase.io/en/library/bg/2/1/
Found link:  https://vedabase.io/en/library/bg/2/2/
Found link:  https://vedabase.io/en/library/bg/2/3