In [6]:
import requests
from bs4 import BeautifulSoup

def scrape_links(url):
    """
    Scrape links and their text from a given website using BeautifulSoup.
    
    Args:
    url (str): The URL of the website to scrape.
    
    Returns:
    list: A list of tuples containing unique (link, link_text) pairs found on the page.
    """
    try:
        # Send a GET request to the URL
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all 'a' tags and extract the 'href' attribute and text
        links = [(a['href'], a.text.strip()) for a in soup.find_all('a', href=True)]
        
        # Remove duplicates and return the list of link-text pairs
        return list(set(links))
    
    except requests.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return []
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

# Example usage:
# url = "https://example.com"
# links_and_text = scrape_links(url)
# print(links_and_text)


In [7]:
# Example usage:
url = "https://igod.gov.in/sectors"
links = scrape_links(url)
print(links)

[('https://www.nic.in/', 'National Informatics Centre'), ('https://igod.gov.in/sectors', 'Sectors'), ('https://www.meity.gov.in/', 'Ministry of Electronics & Information Technology'), ('https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tju/organizations', 'Finance, Banking & Insurance'), ('https://igod.gov.in/sector/_xNsIHQBsvhI6u6Q3tfu/organizations', 'Environment & Natural Resources'), ('javascript:void(0);', 'A-'), ('#', 'Bookmark'), ('#main-content', 'Skip to Main Content'), ('https://igod.gov.in/sector/FBNsIHQBsvhI6u6Q3tju/organizations', 'Home affairs & National Security'), ('https://igod.gov.in/sector/EBNsIHQBsvhI6u6Q3tju/organizations', 'Information & Broadcasting'), ('https://igod.gov.in/sector/CRNsIHQBsvhI6u6Q3tju/organizations', 'Transport & Infrastructure'), ('https://igod.gov.in/sector/FRNsIHQBsvhI6u6Q3tju/organizations', 'Governance & Administration'), ('https://www.nic.in/', ''), ('javascript:void(0);', 'A'), ('javascript:void(0);', 'A+'), ('https://igod.gov.in/sector/ChNsIHQB

In [8]:
import pandas as pd
import urllib.parse

# Filter links that end with "organizations" and extract sector name
filtered_links = [(link, text) for link, text in links if urllib.parse.urlparse(link).path.split('/')[-1] == "organizations"]

# Create a DataFrame with the filtered links and sector names
df_organizations = pd.DataFrame(filtered_links, columns=['organization_links', 'sector_name'])

# Display the first few rows of the DataFrame
display(df_organizations)

# Optional: Save the DataFrame to a CSV file
# df_organizations.to_csv('organization_links.csv', index=False)


Unnamed: 0,organization_links,sector_name
0,https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,"Finance, Banking & Insurance"
1,https://igod.gov.in/sector/_xNsIHQBsvhI6u6Q3tf...,Environment & Natural Resources
2,https://igod.gov.in/sector/FBNsIHQBsvhI6u6Q3tj...,Home affairs & National Security
3,https://igod.gov.in/sector/EBNsIHQBsvhI6u6Q3tj...,Information & Broadcasting
4,https://igod.gov.in/sector/CRNsIHQBsvhI6u6Q3tj...,Transport & Infrastructure
5,https://igod.gov.in/sector/FRNsIHQBsvhI6u6Q3tj...,Governance & Administration
6,https://igod.gov.in/sector/ChNsIHQBsvhI6u6Q3tj...,Food & Public Distribution
7,https://igod.gov.in/sector/DRNsIHQBsvhI6u6Q3tj...,Rural Development & Panchayati Raj
8,https://igod.gov.in/sector/ERNsIHQBsvhI6u6Q3tj...,Communications & Information Technology
9,https://igod.gov.in/sector/ABNsIHQBsvhI6u6Q3tj...,Coal & Mine


In [26]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import re

def scrape_organization_page(url):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    service = Service()

    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)
        
        wait = WebDriverWait(driver, 30)
        search_meta = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "search-meta")))
        result_count_text = search_meta.text.strip()
        result_count = int(re.search(r'\d+', result_count_text).group())
        
        links = []
        scroll_pause_time = 1
        last_height = driver.execute_script("return document.body.scrollHeight")
        
        while len(links) < result_count:
            # Scroll down
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            # Wait for new content to load
            time.sleep(scroll_pause_time)
            
            # Get page source and parse with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            
            # Extract links and text
            for a in soup.find_all('a', href=True):
                links.append((a['href'], a.text.strip()))
            
            # Remove duplicates
            links = list(set(links))
            
            # Check if we've reached the bottom of the page
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Try to click "Load More" button if it exists
                try:
                    load_more = driver.find_element(By.XPATH, "//button[contains(text(), 'Load More')]")
                    load_more.click()
                    time.sleep(scroll_pause_time)
                except:
                    # If "Load More" doesn't exist, we've truly reached the end
                    break
            
            last_height = new_height
            
            print(f"Scraped {len(links)} links so far...")
        
        if len(links) < result_count:
            print(f"Warning: Only found {len(links)} links, expected {result_count}")
        
        return links, str(result_count)
    
    except Exception as e:
        print(f"An error occurred while scraping {url}: {str(e)}")
        return [], "0"
    
    finally:
        driver.quit()

# Create a new DataFrame to store the results
results = []

# Iterate through each organization link
for index, row in df_organizations.iterrows():
    org_link = row['organization_links']
    sector = row['sector_name']
    
    print(f"Scraping {sector}...")
    links, count = scrape_organization_page(org_link)
    
    for link, text in links:
        results.append({
            'sector': sector,
            'organization_link': org_link,
            'result_count': count,
            'link': link,
            'text': text
        })

# Create a DataFrame from the results
df_results = pd.DataFrame(results)

# Display the first few rows of the DataFrame
display(df_results)

# Optional: Save the DataFrame to a CSV file
# df_results.to_csv('organization_page_links.csv', index=False)


Scraping Finance, Banking & Insurance...
Scraped 72 links so far...
Scraped 79 links so far...
Scraped 88 links so far...
Scraped 102 links so far...
Scraped 116 links so far...
Scraped 125 links so far...
Scraped 136 links so far...
Scraped 146 links so far...
Scraped 156 links so far...
Scraped 162 links so far...
Scraped 172 links so far...
Scraped 181 links so far...
Scraped 191 links so far...
Scraped 201 links so far...
Scraped 209 links so far...
Scraped 219 links so far...
Scraped 229 links so far...
Scraped 238 links so far...
Scraped 248 links so far...
Scraped 258 links so far...
Scraped 267 links so far...
Scraped 276 links so far...
Scraped 285 links so far...
Scraped 294 links so far...
Scraped 298 links so far...
Scraped 307 links so far...
Scraped 315 links so far...
Scraped 330 links so far...
Scraping Environment & Natural Resources...
Scraped 72 links so far...
Scraped 77 links so far...
Scraped 87 links so far...
Scraped 99 links so far...
Scraped 118 links so far..

Unnamed: 0,sector,organization_link,result_count,link,text
0,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://nhb.org.in,National Housing Bank (NHB)
1,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://notary.gov.in,Notary Portal
2,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://www.gicre.in,General Insurance Corporation of India (GIC)
3,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://odishatax.gov.in,"Commissionerate of CT and GST, Odisha"
4,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://igod.gov.in/website_policies,Website Policies
...,...,...,...,...,...
6684,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://rioneshillong.gov.in,"Regional Inspectorial Organisation, Shillong"
6685,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://beeindia.gov.in,Bureau of Energy Efficiency (BEE)
6686,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://swachhatahiseva.gov.in,
6687,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,#,


In [27]:
# Remove rows with 'igod.gov.in' in the link and remove '#' links
# df_results = df_results[~df_results['link'].str.contains('igod.gov.in', case=False, na=False)]
# df_results = df_results[df_results['link'] != '#']
df_results['result_count'] = df_results['result_count'].astype(int)
# Reset the index after filtering
# df_results.reset_index(drop=True, inplace=True)

# Display the updated DataFrame
display(df_results)

# Optional: Save the updated DataFrame to a CSV file
# df_results.to_csv('filtered_organization_page_links.csv', index=False)



Unnamed: 0,sector,organization_link,result_count,link,text
0,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://nhb.org.in,National Housing Bank (NHB)
1,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://notary.gov.in,Notary Portal
2,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://www.gicre.in,General Insurance Corporation of India (GIC)
3,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://odishatax.gov.in,"Commissionerate of CT and GST, Odisha"
4,"Finance, Banking & Insurance",https://igod.gov.in/sector/CBNsIHQBsvhI6u6Q3tj...,326,https://igod.gov.in/website_policies,Website Policies
...,...,...,...,...,...
6684,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://rioneshillong.gov.in,"Regional Inspectorial Organisation, Shillong"
6685,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://beeindia.gov.in,Bureau of Energy Efficiency (BEE)
6686,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,https://swachhatahiseva.gov.in,
6687,Energy & Power,https://igod.gov.in/sector/ARNsIHQBsvhI6u6Q3tj...,97,#,


In [28]:
df_results.groupby('sector').agg({'result_count': 'max','organization_link': 'count'}).sort_values('result_count', ascending=False)

Unnamed: 0_level_0,result_count,organization_link
sector,Unnamed: 1_level_1,Unnamed: 2_level_1
Education & Training,2117,1788
Governance & Administration,593,603
Commerce & Industry,332,337
"Finance, Banking & Insurance",326,330
Health & Family welfare,307,306
Communications & Information Technology,277,282
"Science, Technology & Research",270,275
Home affairs & National Security,268,271
Agriculture & Cooperation,257,264
Defence,246,198
