<a href="https://colab.research.google.com/github/amasick/WebScrapper/blob/main/Schools_Details.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Tamil Nadu Schools**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


def scrape_school_details(catbox_div):
    """Extract the summary fields of one school from a listing ``catbox`` div.

    Args:
        catbox_div: Parsed element for one listing entry. It is searched for
            an ``<h2><a>`` holding the school name and a ``<p>`` holding the
            free-text details.

    Returns:
        A dict with the keys ``School Name``, ``Email``, ``Address``,
        ``PIN Code``, ``Management`` and ``Read More Link``; any value that
        cannot be found is ``None``. Returns ``None`` when the entry has no
        details paragraph.
    """
    school_name = catbox_div.find('h2').find('a').text.strip()

    school_details_paragraph = catbox_div.find('p')
    if school_details_paragraph is None:
        print("School details paragraph not found.")
        return None

    # Read the paragraph text once; every pattern below searches this string.
    details_text = school_details_paragraph.text

    def extract(pattern, group=1):
        """Return the stripped regex group, or None when nothing matches."""
        match = re.search(pattern, details_text)
        return match.group(group).strip() if match else None

    # The top-level-domain class is [A-Za-z]: inside a character class a '|'
    # is a literal pipe, not alternation, so it must not be listed.
    email = extract(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', group=0)
    address = extract(r'Address of the school is:(.*?)PIN Code:')
    pin_code = extract(r'PIN Code:\s*([^<.]+)')
    management = extract(r'The school is being managed by\s*([^<]+)\.')

    # An entry without a "read more" anchor yields None instead of crashing.
    link = school_details_paragraph.find('a', class_='link')
    read_more_link = link.get('href') if link is not None else None

    return {
        'School Name': school_name,
        'Email': email,
        'Address': address,
        'PIN Code': pin_code,
        'Management': management,
        'Read More Link': read_more_link
    }



def scrape_all_schools(url):
    """Fetch one listing page and parse every ``catbox`` entry on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of school dicts as produced by ``scrape_school_details``
        (entries that could not be parsed are skipped), or ``None`` when the
        response status is not 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'}

    # Make the GET request with custom headers
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    all_schools_data = []
    for catbox_div in soup.find_all('div', class_='catbox'):
        school_data = scrape_school_details(catbox_div)
        # scrape_school_details returns None for entries it cannot parse;
        # keep those out so the result holds only dict rows.
        if school_data:
            all_schools_data.append(school_data)

    return all_schools_data
def save_to_excel(data, output_file='Outside_India.xlsx'):
    """Write *data* to an Excel file at *output_file* and print the path.

    Args:
        data: Rows to tabulate; handed unchanged to ``pd.DataFrame``.
        output_file: Destination path of the workbook.
    """
    # index=False keeps the DataFrame's row index out of the sheet.
    pd.DataFrame(data).to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    # Each listing page is addressed as <base_url><page number>; this run
    # walks pages 1 through 4 and pools whatever each page returns.
    base_url = "https://www.cbseschool.org/location/outside-india/page/"
    all_schools_data = []

    for page_number in range(1, 5):
        url = base_url + str(page_number)
        page_schools_data = scrape_all_schools(url)

        # A failed or empty page contributes nothing.
        if page_schools_data:
            all_schools_data += page_schools_data

    # Only write the workbook when at least one school was collected.
    if all_schools_data:
        save_to_excel(all_schools_data)



Data saved to Outside_India.xlsx


# **After Read More links**


In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_school_details(url):
    """Download a school's detail page and return its field table as a dict.

    Args:
        url: Address of the school's detail page.

    Returns:
        A dict starting with ``School Name`` followed by one entry per
        ``td.field`` cell (label text -> text of the next ``td``). A value is
        ``None`` when its cell is missing. Returns ``None`` when the response
        status is not 200 or the ``schooldetails`` div is absent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div containing school details
    details_div = soup.find('div', id='schooldetails')
    if details_div is None:
        print("School details not found.")
        return None

    # The school name is the second cell of the heading row; tolerate a page
    # where that row or cell is missing instead of raising.
    heading_row = details_div.find('tr', class_='tableheading')
    heading_cells = heading_row.find_all('td') if heading_row is not None else []
    school_name = heading_cells[1].text.strip() if len(heading_cells) > 1 else None

    details = {'School Name': school_name}
    for field_cell in details_div.find_all('td', class_='field'):
        # The value sits in the td that follows the label cell. It is the
        # lookup result, not the label cell, that may be None.
        value_cell = field_cell.find_next('td')
        field_value = value_cell.text.strip() if value_cell is not None else None
        details[field_cell.text.strip()] = field_value

    return details

if __name__ == "__main__":
    # Scrape a single detail page and show either the parsed record or a
    # failure notice.
    url = "https://www.cbseschool.org/a-a-public-school-tamil-nadu/"
    school_data = scrape_school_details(url)

    print(school_data if school_data else "Failed to scrape school details.")


{'School Name': 'A A Public School', 'Affiliate ID': '1931043', 'Address': '17/8, Andiappan Gramani Street, Royapuram, Chennai', 'PIN Code': '600013', 'Office Phone': '04448595902', 'E-mail': 'aapublicschool@gmail.com', 'Foundation Year': '2017', 'Principal/Head of Institution': 'Mrs. S.gnana Jothi', 'School Status': 'Secondary School', 'Managing Trust/Society/Committee': 'Aladi Aruna Foundation'}


In [None]:
import requests
from bs4 import BeautifulSoup

def get_school_urls(base_url, num_pages):
    """Collect the school links found on listing pages 1..num_pages.

    Args:
        base_url: Listing address; ``/page/<n>/`` is appended for each page.
        num_pages: Number of listing pages to request, starting at page 1.

    Returns:
        A list with the ``href`` of the ``<h2><a>`` link of every ``catbox``
        div, in page order. Pages that do not answer with status 200 are
        reported and skipped.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}
    collected = []

    for page_number in range(1, num_pages + 1):
        url = f"{base_url}/page/{page_number}/"
        response = requests.get(url, headers=headers)

        # Report the failed page and move on to the next one.
        if response.status_code != 200:
            print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Every catbox div carries the school link inside its h2 heading.
        for catbox in soup.find_all('div', class_='catbox'):
            collected.append(catbox.find('h2').find('a')['href'])

    return collected

if __name__ == "__main__":
    # Gather the links from the listing pages and print them one per line.
    base_url = "https://www.cbseschool.org/location/tamil-nadu"
    num_pages = 25

    school_urls = get_school_urls(base_url, num_pages)

    if not school_urls:
        print("No URLs found.")
    else:
        print("List of School URLs:")
        print(*school_urls, sep="\n")


In [None]:
def scrape_all_schools(school_urls):
    """Scrape every URL in *school_urls* and keep the successful results.

    Args:
        school_urls: Iterable of school detail-page addresses.

    Returns:
        A list of the truthy values returned by ``scrape_school_details``,
        in input order.
    """
    # The generator is consumed lazily, so pages are fetched one at a time
    # in the order given.
    scraped = (scrape_school_details(url) for url in school_urls)
    return [record for record in scraped if record]

In [None]:
if __name__ == "__main__":
    # Collect the school links from every listing page, scrape each school's
    # detail page, and print the resulting records.
    base_url = "https://www.cbseschool.org/location/tamil-nadu"
    num_pages = 25

    # Without a browser-like User-Agent the site rejects every listing
    # request with status 406, so the header must accompany each call.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

    school_urls = []
    for page_number in range(1, num_pages + 1):
        url = f"{base_url}/page/{page_number}/"
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            catbox_divs = soup.find_all('div', class_='catbox')
            # The school link is the anchor inside each catbox's h2 heading.
            page_school_urls = [catbox.find('h2').find('a')['href'] for catbox in catbox_divs]
            school_urls.extend(page_school_urls)
        else:
            print(f"Failed to fetch data from {url}. Status code: {response.status_code}")

    all_school_data = scrape_all_schools(school_urls)

    if all_school_data:
        for school_data in all_school_data:
            print(school_data)
    else:
        print("No school data found.")

Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/1/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/2/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/3/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/4/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/5/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/6/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/7/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/8/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/9/. Status code: 406
Failed to fetch data from https://www.cbseschool.org/location/tamil-nadu/page/10/. Status code: 406
Failed to

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_school_urls(base_url, num_pages):
    """Return the school links listed on pages 1..num_pages of *base_url*.

    Args:
        base_url: Listing address; ``/page/<n>/`` is appended per page.
        num_pages: How many listing pages to request, counting from 1.

    Returns:
        A list of ``href`` values taken from the ``<h2><a>`` link of each
        ``catbox`` div. A page whose status is not 200 is reported and
        contributes nothing.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}
    found_urls = []

    for page_number in range(1, num_pages + 1):
        url = f"{base_url}/page/{page_number}/"
        response = requests.get(url, headers=request_headers)
        status = response.status_code

        if status == 200:
            listing = BeautifulSoup(response.text, 'html.parser')
            # One link per catbox, read from the anchor in its h2 heading.
            found_urls += [
                entry.find('h2').find('a')['href']
                for entry in listing.find_all('div', class_='catbox')
            ]
        else:
            print(f"Failed to fetch data from {url}. Status code: {status}")

    return found_urls

def scrape_school_details(url):
    """Download a school's detail page and return its field table as a dict.

    Args:
        url: Address of the school's detail page.

    Returns:
        A dict starting with ``School Name`` followed by one entry per
        ``td.field`` cell (label text -> text of the next ``td``). A value is
        ``None`` when its cell is missing. Returns ``None`` when the response
        status is not 200 or the ``schooldetails`` div is absent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div containing school details
    details_div = soup.find('div', id='schooldetails')
    if details_div is None:
        print("School details not found.")
        return None

    # The school name is the second cell of the heading row; tolerate a page
    # where that row or cell is missing instead of raising.
    heading_row = details_div.find('tr', class_='tableheading')
    heading_cells = heading_row.find_all('td') if heading_row is not None else []
    school_name = heading_cells[1].text.strip() if len(heading_cells) > 1 else None

    details = {'School Name': school_name}
    for field_cell in details_div.find_all('td', class_='field'):
        # The value sits in the td that follows the label cell. It is the
        # lookup result, not the label cell, that may be None.
        value_cell = field_cell.find_next('td')
        field_value = value_cell.text.strip() if value_cell is not None else None
        details[field_cell.text.strip()] = field_value

    return details

def scrape_all_schools(school_urls):
    """Run ``scrape_school_details`` on each URL and drop the failures.

    Args:
        school_urls: Iterable of school detail-page addresses.

    Returns:
        A list holding every truthy scrape result, in the order the URLs
        were given.
    """
    return [
        details
        for details in (scrape_school_details(url) for url in school_urls)
        if details
    ]

def save_to_excel(data, output_file='TN_DetailedSchool_details.xlsx'):
    """Tabulate *data* and store it as an Excel workbook.

    Args:
        data: Rows to write; passed as-is to ``pd.DataFrame``.
        output_file: Path of the workbook to create.
    """
    frame = pd.DataFrame(data)
    # The positional row index is left out of the sheet.
    frame.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    # Pipeline: listing pages -> school links -> detail records -> workbook.
    base_url = "https://www.cbseschool.org/location/tamil-nadu"
    num_pages = 24

    school_urls = get_school_urls(base_url, num_pages)

    if not school_urls:
        print("No URLs found.")
    else:
        all_school_data = scrape_all_schools(school_urls)

        if not all_school_data:
            print("No school data found.")
        else:
            save_to_excel(all_school_data)


Data saved to TN_DetailedSchool_details.xlsx


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_school_urls(base_url, num_pages):
    """Collect the school links found on listing pages 1..num_pages.

    Args:
        base_url: Listing address, with or without a trailing slash;
            ``/page/<n>/`` is appended for each page.
        num_pages: Number of listing pages to request, starting at page 1.

    Returns:
        A list with the ``href`` of the ``<h2><a>`` link of every ``catbox``
        div, in page order. Pages that do not answer with status 200 are
        reported and skipped, as are entries that have no such link.
    """
    all_school_urls = []
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

    # A base URL ending in '/' would otherwise produce '...//page/1/'.
    listing_root = base_url.rstrip('/')

    for page_number in range(1, num_pages + 1):
        url = f"{listing_root}/page/{page_number}/"
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Failed to fetch data from {url}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for catbox in soup.find_all('div', class_='catbox'):
            # Skip an entry lacking the heading link rather than aborting
            # the whole crawl on it.
            heading = catbox.find('h2')
            anchor = heading.find('a') if heading is not None else None
            href = anchor.get('href') if anchor is not None else None
            if href:
                all_school_urls.append(href)

    return all_school_urls

def scrape_school_details(url):
    """Download a school's detail page and return its field table as a dict.

    Args:
        url: Address of the school's detail page.

    Returns:
        A dict starting with ``School Name`` followed by one entry per
        ``td.field`` cell (label text -> text of the next ``td``). A value is
        ``None`` when its cell is missing. Returns ``None`` when the response
        status is not 200 or the ``schooldetails`` div is absent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the div containing school details
    details_div = soup.find('div', id='schooldetails')
    if details_div is None:
        print("School details not found.")
        return None

    # The school name is the second cell of the heading row; tolerate a page
    # where that row or cell is missing instead of raising.
    heading_row = details_div.find('tr', class_='tableheading')
    heading_cells = heading_row.find_all('td') if heading_row is not None else []
    school_name = heading_cells[1].text.strip() if len(heading_cells) > 1 else None

    details = {'School Name': school_name}
    for field_cell in details_div.find_all('td', class_='field'):
        # The value sits in the td that follows the label cell. It is the
        # lookup result, not the label cell, that may be None.
        value_cell = field_cell.find_next('td')
        field_value = value_cell.text.strip() if value_cell is not None else None
        details[field_cell.text.strip()] = field_value

    return details

def scrape_all_schools(school_urls):
    """Scrape each detail page in *school_urls*, discarding empty results.

    Args:
        school_urls: Iterable of school detail-page addresses.

    Returns:
        A list of the truthy ``scrape_school_details`` results, ordered as
        the input.
    """
    results = []
    for page_url in school_urls:
        record = scrape_school_details(page_url)
        # A falsy result means the page could not be scraped; leave it out.
        if not record:
            continue
        results.append(record)
    return results

def save_to_excel(data, output_file='OutsideIndiaSchool_details.xlsx'):
    """Store *data* as an Excel workbook and announce where it was saved.

    Args:
        data: Rows to write; given unchanged to ``pd.DataFrame``.
        output_file: Destination path of the workbook.
    """
    table = pd.DataFrame(data)
    # Omit the DataFrame's row index from the sheet.
    table.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    # Pipeline: listing pages -> school links -> detail records -> workbook.
    base_url = "https://www.cbseschool.org/location/outside-india/"
    num_pages = 5

    school_urls = get_school_urls(base_url, num_pages)

    if not school_urls:
        print("No URLs found.")
    else:
        all_school_data = scrape_all_schools(school_urls)

        if not all_school_data:
            print("No school data found.")
        else:
            save_to_excel(all_school_data)


Data saved to OutsideIndiaSchool_details.xlsx



# **Foreign Schools**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [None]:
def scrape_school_details(li_element):
    """Parse one school entry from a listing ``<li>`` and echo it to stdout.

    Args:
        li_element: Parsed list item; searched for a
            ``div.edu-school-detlist-container`` holding the entry.

    Returns:
        A dict starting with ``School Name`` followed by one entry per
        ``<label>`` (label text -> text of the next
        ``div.edu-school-det-text``). A value is ``None`` when its element is
        missing. Returns ``None`` when the container div is absent.
    """
    # Find the div containing school details
    school_details_div = li_element.find('div', class_='edu-school-detlist-container')
    if school_details_div is None:
        print("School details not found.")
        return None

    # Tolerate an entry without the heading instead of raising on None.
    heading = school_details_div.find('h2', class_='edu-school-det-heading')
    school_name = heading.text.strip() if heading is not None else None
    print("School Name:", school_name)

    details = {'School Name': school_name}
    for label_element in school_details_div.find_all('label'):
        label_text = label_element.text.strip()
        # The value lives in the det-text div after the label. It is the
        # lookup result, not the label, that may be None.
        value_div = label_element.find_next('div', class_='edu-school-det-text')
        detail_text = value_div.text.strip() if value_div is not None else None
        details[label_text] = detail_text
        print(f"{label_text}: {detail_text}")

    return details



In [None]:
def scrape_all_schools(url):
    """Fetch one listing page and parse every school ``<li>`` on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of school dicts as produced by ``scrape_school_details``
        (list items without a school entry are skipped), or ``None`` when
        the response status is not 200 or the ``edu-school-list-wrap`` div
        is absent.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'}

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the parent element containing all the li elements
    parent_element = soup.find('div', class_='edu-school-list-wrap')
    if parent_element is None:
        print("Parent element not found.")
        return None

    all_schools_data = []
    for li_element in parent_element.find_all('li'):
        school_data = scrape_school_details(li_element)
        # scrape_school_details returns None for an <li> that holds no
        # school entry; keep those out so the result holds only dict rows.
        if school_data:
            all_schools_data.append(school_data)

    return all_schools_data



In [None]:
def save_to_excel(data, output_file='all_schools_details133.xlsx'):
    """Write *data* to an Excel workbook at *output_file* and print the path.

    Args:
        data: Rows to tabulate (for example a list of dicts); handed
            unchanged to ``pd.DataFrame``.
        output_file: Destination path of the workbook.
    """
    # Let pandas derive the columns from the records themselves. The earlier
    # ``columns=columns`` argument referred to a name that is never assigned
    # in this function and raised NameError.
    df = pd.DataFrame(data)

    # Save the DataFrame to an Excel file
    df.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    # Only this single listing page is scraped; no pagination loop is run.
    url = "https://www.careerindia.com/cbse-schools-in-foreign-schools-s11.html"
    all_schools_data = scrape_all_schools(url)

    # Write the workbook only when the scrape produced something.
    if all_schools_data:
        save_to_excel(all_schools_data)


Data saved to all_schools_details133.xlsx


In [None]:
def save_to_excel(data, output_file='all_schools_details13.xlsx'):
    """Dump *data* into an Excel workbook and print the destination.

    Args:
        data: Rows to write; passed as-is to ``pd.DataFrame``.
        output_file: Path of the workbook to create.
    """
    # index=False keeps the row index out of the sheet.
    pd.DataFrame(data).to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

In [None]:
if __name__ == "__main__":
    # Scrape the single listing page and save whatever it yields.
    url = "https://www.careerindia.com/cbse-schools-in-foreign-schools-s11.html"

    page_schools_data = scrape_all_schools(url)

    # A failed or empty scrape leaves the collection empty.
    all_schools_data = list(page_schools_data) if page_schools_data else []

    if all_schools_data:
        save_to_excel(all_schools_data)


# **Outside India Schools**

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


def scrape_school_details(catbox_div):
    """Extract the summary fields of one school from a listing ``catbox`` div.

    Args:
        catbox_div: Parsed element for one listing entry. It is searched for
            an ``<h2><a>`` holding the school name and a ``<p>`` holding the
            free-text details.

    Returns:
        A dict with the keys ``School Name``, ``Email``, ``Address``,
        ``PIN Code``, ``Management`` and ``Read More Link``; any value that
        cannot be found is ``None``. Returns ``None`` when the entry has no
        details paragraph.
    """
    school_name = catbox_div.find('h2').find('a').text.strip()

    school_details_paragraph = catbox_div.find('p')
    if school_details_paragraph is None:
        print("School details paragraph not found.")
        return None

    # Read the paragraph text once; every pattern below searches this string.
    details_text = school_details_paragraph.text

    def extract(pattern, group=1):
        """Return the stripped regex group, or None when nothing matches."""
        match = re.search(pattern, details_text)
        return match.group(group).strip() if match else None

    # The top-level-domain class is [A-Za-z]: inside a character class a '|'
    # is a literal pipe, not alternation, so it must not be listed.
    email = extract(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b', group=0)
    address = extract(r'Address of the school is:(.*?)PIN Code:')
    pin_code = extract(r'PIN Code:\s*([^<.]+)')
    management = extract(r'The school is being managed by\s*([^<]+)\.')

    # An entry without a "read more" anchor yields None instead of crashing.
    link = school_details_paragraph.find('a', class_='link')
    read_more_link = link.get('href') if link is not None else None

    return {
        'School Name': school_name,
        'Email': email,
        'Address': address,
        'PIN Code': pin_code,
        'Management': management,
        'Read More Link': read_more_link
    }



def scrape_all_schools(url):
    """Fetch one listing page and parse every ``catbox`` entry on it.

    Args:
        url: Address of the listing page to download.

    Returns:
        A list of school dicts as produced by ``scrape_school_details``
        (entries that could not be parsed are skipped), or ``None`` when the
        response status is not 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'}

    # Make the GET request with custom headers
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    all_schools_data = []
    for catbox_div in soup.find_all('div', class_='catbox'):
        school_data = scrape_school_details(catbox_div)
        # scrape_school_details returns None for entries it cannot parse;
        # keep those out so the result holds only dict rows.
        if school_data:
            all_schools_data.append(school_data)

    return all_schools_data

def save_to_excel(data, output_file='all_schools_details12.xlsx'):
    """Save *data* as an Excel workbook and report the file name.

    Args:
        data: Rows to write; given unchanged to ``pd.DataFrame``.
        output_file: Destination path of the workbook.
    """
    sheet = pd.DataFrame(data)
    # Leave the DataFrame's row index out of the sheet.
    sheet.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    # Walk the paginated listing and pool the schools found on each page.
    base_url = "https://www.cbseschool.org/location/outside-india/"
    all_schools_data = []

    for page_number in range(1, 25):  # requests pages 1 through 24
        # Listing pages live under ".../page/<n>"; appending only the number
        # to the location URL does not address a listing page.
        url = f"{base_url}page/{page_number}"
        page_schools_data = scrape_all_schools(url)

        # A failed or empty page contributes nothing.
        if page_schools_data:
            all_schools_data.extend(page_schools_data)

    if all_schools_data:
        save_to_excel(all_schools_data)





In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def scrape_school_details(li_element):
    """Parse one school entry from a listing ``<li>``.

    Args:
        li_element: Parsed list item; searched for a
            ``div.edu-school-detlist-container`` holding the entry.

    Returns:
        A dict starting with ``School Name`` followed by one entry per
        ``<label>`` (label text without trailing colons -> text of the next
        ``div.edu-school-det-text``). A value is ``None`` when its element is
        missing. Returns ``None`` when the container div is absent.
    """
    # Find the div containing school details
    school_details_div = li_element.find('div', class_='edu-school-detlist-container')
    if school_details_div is None:
        print("School details not found.")
        return None

    # Tolerate an entry without the heading instead of raising on None.
    heading = school_details_div.find('h2', class_='edu-school-det-heading')
    school_name = heading.text.strip() if heading is not None else None

    details = {'School Name': school_name}
    for label_element in school_details_div.find_all('label'):
        # Trailing colons are dropped so the label can serve as a column name.
        label_text = label_element.text.strip().rstrip(':')
        # The value lives in the det-text div after the label. It is the
        # lookup result, not the label, that may be None.
        value_div = label_element.find_next('div', class_='edu-school-det-text')
        details[label_text] = value_div.text.strip() if value_div is not None else None

    return details

def scrape_all_schools(base_url, num_pages):
    """Scrape listing pages 1..num_pages and return every school found.

    Args:
        base_url: Listing address; ``?page=<n>`` is appended for each page.
        num_pages: Number of pages to request, starting at page 1.

    Returns:
        A list of school dicts as produced by ``scrape_school_details``, in
        page order. Pages that fail to load or lack the
        ``edu-school-list-wrap`` div are reported and skipped.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0'}

    all_schools_data = []
    for page_number in range(1, num_pages + 1):
        url = f"{base_url}?page={page_number}"
        response = requests.get(url, headers=headers)

        if response.status_code == 404:
            print(f"Page not found. Status code: {response.status_code} for page {page_number}")
            continue
        if response.status_code != 200:
            # Any other failure (for example 403, 406, 5xx) used to be
            # dropped silently; report it so missing pages are visible.
            print(f"Failed to fetch data. Status code: {response.status_code} for page {page_number}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Find the parent element containing all the li elements
        parent_element = soup.find('div', class_='edu-school-list-wrap')
        if parent_element is None:
            print(f"Parent element not found on page {page_number}.")
            continue

        for li_element in parent_element.find_all('li'):
            school_data = scrape_school_details(li_element)
            # Skip list items that do not hold a school entry.
            if school_data:
                all_schools_data.append(school_data)

    return all_schools_data

def save_to_excel(data, output_file='all_schools_details0.xlsx'):
    """Write a list of record dicts to an Excel workbook.

    Args:
        data: List of dicts, one per row. Records may carry different keys.
        output_file: Destination path of the workbook.
    """
    # Build the column list from every record, in first-seen order. Taking
    # only the first record's keys would drop any field that first appears
    # in a later record, and would raise IndexError on an empty list.
    columns = list(dict.fromkeys(key for record in data for key in record))

    # Create a DataFrame from the data
    df = pd.DataFrame(data, columns=columns)

    # Save the DataFrame to an Excel file
    df.to_excel(output_file, index=False)
    print(f"Data saved to {output_file}")

if __name__ == "__main__":
    base_url = "https://www.careerindia.com/cbse-schools-in-foreign-schools-s11.html"
    num_pages = 24  # number of listing pages to request

    # scrape_all_schools already iterates pages 1..num_pages itself. Calling
    # it once per page number re-fetched pages 1..k on every pass and stored
    # the same schools many times over.
    all_schools_data = scrape_all_schools(base_url, num_pages)

    if all_schools_data:
        save_to_excel(all_schools_data)


Data saved to all_schools_details0.xlsx
