# Maharashtra College Scraper
This notebook scrapes data on engineering (B.Tech) colleges in Maharashtra from several websites (Shiksha, College Dunia, and GetMyUni) and also pulls data from the Clueless-Community collegeAPI.

The combined, de-duplicated data is then saved to a JSON file, `maharashtra_colleges.json`, which can be downloaded for use in your project.

In [None]:
!pip install requests beautifulsoup4 tqdm

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import time
import random
from tqdm import tqdm

def scrape_maharashtra_colleges():
    """
    Scrape information about colleges in Maharashtra from multiple sources
    and compile them into a de-duplicated list.

    Each configured website URL is handed to its parser function; the
    results of all parsers and of ``fetch_from_college_api`` are merged,
    de-duplicated by college name, written to ``maharashtra_colleges.json``
    in the current working directory, and returned.

    A failure in any single source (website or API) is printed and skipped
    so that the remaining sources still contribute to the output.

    Returns:
        list: The unique college records, in first-seen order.
    """
    colleges = []

    # Sources to scrape
    sources = [
        {
            "name": "Shiksha",
            "url": "https://www.shiksha.com/b-tech/colleges/b-tech-colleges-maharashtra",
            "parser": parse_shiksha,
        },
        {
            "name": "College Dunia",
            "url": "https://collegedunia.com/btech/maharashtra-colleges",
            "parser": parse_collegedunia,
        },
        {
            "name": "GetMyUni",
            "url": "https://www.getmyuni.com/engineering-colleges-in-maharashtra",
            "parser": parse_getmyuni,
        },
    ]

    # Scrape each source
    for source in sources:
        print(f"Scraping from {source['name']}...")
        try:
            source_colleges = source["parser"](source["url"])
            colleges.extend(source_colleges)
            print(f"Found {len(source_colleges)} colleges from {source['name']}")
        except Exception as e:
            # Best effort: one broken source must not abort the whole run.
            print(f"Error scraping {source['name']}: {str(e)}")

        # Add delay to avoid overloading servers
        time.sleep(random.uniform(2, 5))

    # Fetch data from collegeAPI, guarded the same way as the website
    # sources so an API failure does not discard what was already scraped.
    print("Fetching data from collegeAPI...")
    try:
        api_colleges = fetch_from_college_api()
        colleges.extend(api_colleges)
        print(f"Found {len(api_colleges)} colleges from collegeAPI")
    except Exception as e:
        print(f"Error fetching from collegeAPI: {str(e)}")

    # Remove duplicates based on college name. Names are compared after
    # collapsing whitespace and case-folding, so the same college listed
    # with different spacing/casing by two sources is kept only once.
    unique_colleges = []
    seen_names = set()
    skipped = 0

    for college in colleges:
        name = college.get("name") if isinstance(college, dict) else None
        if not isinstance(name, str) or not name.strip():
            # Records without a usable name cannot be de-duplicated.
            skipped += 1
            continue
        key = " ".join(name.split()).casefold()
        if key not in seen_names:
            seen_names.add(key)
            unique_colleges.append(college)

    if skipped:
        print(f"Skipped {skipped} records without a usable name")
    print(f"Total unique colleges found: {len(unique_colleges)}")

    # Save to JSON file
    with open('maharashtra_colleges.json', 'w', encoding='utf-8') as f:
        json.dump(unique_colleges, f, indent=2, ensure_ascii=False)

    print("College data saved to maharashtra_colleges.json")
    return unique_colleges

def parse_shiksha(url):
    """Placeholder parser for the Shiksha listing page.

    The real parsing logic is not filled in here; whatever ``url`` is
    given, an empty list is returned.
    """
    # TODO: insert the existing Shiksha parsing logic here.
    no_results = []
    return no_results

def parse_collegedunia(url):
    """Placeholder parser for the College Dunia listing page.

    The real parsing logic is not filled in here; whatever ``url`` is
    given, an empty list is returned.
    """
    # TODO: insert the existing College Dunia parsing logic here.
    no_results = []
    return no_results

def parse_getmyuni(url):
    """Placeholder parser for the GetMyUni listing page.

    The real parsing logic is not filled in here; whatever ``url`` is
    given, an empty list is returned.
    """
    # TODO: insert the existing GetMyUni parsing logic here.
    no_results = []
    return no_results

def fetch_from_college_api():
    """Placeholder for the collegeAPI fetch step.

    The real fetching logic is not filled in here; an empty list is
    always returned.
    """
    # TODO: insert the existing collegeAPI fetching logic here.
    no_results = []
    return no_results

# Run the scraper. Besides returning the list of unique colleges, the call
# writes maharashtra_colleges.json to the current working directory.
scrape_maharashtra_colleges()

## Download the Scraped Data
After running the scraper, you can download the `maharashtra_colleges.json` file from the Colab environment.

In [None]:
# This cell assumes a Google Colab runtime, which provides google.colab.
from google.colab import files
# Download the JSON file written by scrape_maharashtra_colleges().
files.download('maharashtra_colleges.json')