<a href="https://colab.research.google.com/github/aabhashbasnet/CollegeDetails/blob/main/WebScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root; relative links discovered while scraping are prefixed with it
base_url = "https://edusanjal.com/"

# Accumulates one (college name, phone number, email) tuple per scraped college
college_data = []

# First page of the college listing; the path segments select the districts and level
listing_path = "college/district/bhaktapur,kathmandu,lalitpur/level/bachelors/?page=1"
url = base_url + listing_path

# Function to fetch college details for a single college URL
def fetch_college_details(college_url):
    """Fetch one college page and extract its contact details.

    Args:
        college_url: Absolute URL of the college page, or a root-relative
            link (starting with ``/``) that is resolved against ``base_url``.

    Returns:
        A ``(college_name, phone_number, email)`` tuple. ``college_name`` is
        the last path segment of the URL. The other two fall back to
        ``"Phone not found"`` / ``"Email not found"`` when the page has no
        matching entry or the entry lacks the expected nested spans.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the request exceeds the timeout.
    """
    if college_url.startswith('/'):
        college_url = base_url + college_url.lstrip('/')

    # Without a timeout a stalled connection would block this worker forever
    college_response = requests.get(college_url, timeout=30)
    college_soup = BeautifulSoup(college_response.content, 'html.parser')

    phone_number = _extract_contact(college_soup, "Phone", "Phone not found")
    email = _extract_contact(college_soup, "Email", "Email not found")

    # Last non-empty path segment, so the name is correct with or without a trailing slash
    college_name = college_url.rstrip('/').rsplit('/', 1)[-1]
    return college_name, phone_number, email


def _extract_contact(page_soup, title, default):
    """Return the text of the last nested <span> in the <li title=...> entry, or *default*."""
    item = page_soup.find('li', title=title)
    if item is None:
        return default
    outer_span = item.find('span')
    if outer_span is None:
        return default
    inner_spans = outer_span.find_all('span')
    if not inner_spans:
        return default
    return inner_spans[-1].text

# Collect college links by following the listing's "next" link page by page
all_links = []
# Pages already scraped; stops the loop if the "next" anchor resolves to a page
# seen before (the selector below matches an "exact-active" link, which may
# point at the current page -- NOTE(review): confirm against the live markup)
visited_urls = set()
while url and url not in visited_urls:
    visited_urls.add(url)
    # Without a timeout a stalled connection would hang the script indefinitely
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.find_all("a", {"class": "text-xl font-semibold leading-6"})

    for link in links:
        href = link.get('href')
        if href:  # skip anchors that carry the class but no target
            all_links.append(href)

    # Find the 'Next' button or link
    next_page = soup.find("a", {
        "class": "router-link-active router-link-exact-active relative inline-flex items-center px-2 py-2 rounded-r-md border border-gray-300 bg-white text-sm font-medium text-gray-600 hover:bg-gray-50"
    })

    next_href = next_page.get('href') if next_page else None
    # urljoin keeps absolute hrefs as-is and resolves root-relative or
    # query-only hrefs against the current page, avoiding the "//" that plain
    # concatenation with base_url would produce
    url = urljoin(url, next_href) if next_href else None

# Use ThreadPoolExecutor to fetch college details concurrently
with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its link so a failure can be reported against it
    future_to_url = {executor.submit(fetch_college_details, link): link for link in all_links}
    # as_completed yields futures as they finish, so college_data ends up in
    # completion order rather than listing order
    for future in as_completed(future_to_url):
        source_link = future_to_url[future]
        try:
            college_name, phone_number, email = future.result()
        except Exception as exc:
            # result() re-raises whatever the worker raised; report it and move
            # on so one bad page does not discard every other college
            print(f"Failed to fetch details for {source_link}: {exc}")
            continue
        college_data.append((college_name, phone_number, email))

# Report every collected record, one line per college, under a heading
print("Collected College Data:")
for record in college_data:
    college_name, number, email = record
    print(f"College: {college_name} | Phone Number: {number} | Email: {email}")


Collected College Data:
College: kathford-international-college-engineering-and-management | Phone Number: 01-5201241,01-5201911 | Email: admission@kathford.edu.np
College: texas-international-college-management-and-it | Phone Number: 01-4589134, 01-4588627 | Email: inquiry@texascollege.edu.np
College: ist-college | Phone Number: 01-4534350, 01-4534185, 01-4540930, 01-4517393 | Email: info@ist.org.np
College: ace-institute-management | Phone Number: 01-5970178 | Email: ace@ace.edu.np
College: thames-international-college | Phone Number: 01-5971224 | Email: info@thamescollege.edu.np
College: padmashree-international-college | Phone Number: 01-4112252 | 01-4112403 | Email: padmashreecollege@gmail.com
College: nepal-college-information-technology | Phone Number: 01-5186354 | 01-5186358 | 01-5186360 | Email: info@ncit.edu.np
College: silver-mountain-school-hotel-management | Phone Number: 01-4515038, 01-4544086 | Email: info@silvermountain.edu.np
College: liberty-college | Phone Number: 01