# This file is to scrape the scheme website filtered by agriculture and environment. 


## Try to scrape single-website content

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

URL = "https://www.myscheme.gov.in/schemes/e-nam"
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.3'}

response = requests.get(URL, headers=HEADERS)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Locate the outer div using its class attributes
    outer_div = soup.find('div', class_='w-full flex flex-row justify-between items-start mb-5')
    
    if outer_div:
        # Extract state and scheme_name from within the outer div
        state_elem = outer_div.find('h2', class_='text-darkIndigo-900 text-sm')
        scheme_name_elem = outer_div.find('h1', class_='font-bold text-green-600 text-xl sm:text-2xl mt-1')

        # Use a ternary conditional to handle potential None values
        state = state_elem.text.strip() if state_elem else "N/A"
        scheme_name = scheme_name_elem.text.strip() if scheme_name_elem else "N/A"

        # Extract the 'sources' link
        sources_link = soup.find('a', class_='flex flex-row items-center py-4 justify-start hover:underline underline-offset-2')['href']

        # Find all divs with class 'pt-10'
        divs = soup.find_all('div', class_='pt-10')

        # Extracting the id and the content of the div
        data = {'url': URL, 'state': state, 'scheme_name': scheme_name, 'sources': sources_link}  # Initialize with URL, state, scheme_name, and sources_link
        for div in divs:
            div_id = div.get('id')
            if div_id:
                # Exclude content from <div class="mb-2" ...>
                excluded_content = div.find('div', class_='mb-2')
                if excluded_content:
                    excluded_content.extract()  # Remove the unwanted content

                # Extract content from <ol> and <li> elements
                ol_content = div.find('ol')
                if ol_content:
                    data[div_id] = ' '.join([li.text.strip() for li in ol_content.find_all('li')])

        # Convert the data dictionary to a DataFrame
        df = pd.DataFrame([data])

        # Save the data to a CSV file
        df.to_csv("scraped_data.csv", index=False)
        print("Data saved to 'scraped_data.csv'.")

    else:
        print("Couldn't find the outer div element.")

else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


Data saved to 'scraped_data.csv'.


In [14]:
df

Unnamed: 0,url,state,scheme_name,sources,details,eligibility,applicationProcess
0,https://www.myscheme.gov.in/schemes/e-nam,Ministry Of Agriculture and Farmers Welfare,National Agriculture Market,https://enam.gov.in/web/docs/namguidelines.pdf,To integrate markets first at the level of the...,Single trading license to be valid across the ...,Users can register by Clicking http://enam.go...


## Get all the relevant links (schemes)

### First page extract link adn keywords

In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Initialize the Chrome webdriver
driver = webdriver.Chrome()

# Navigate to the website
page_first = "https://www.myscheme.gov.in/search"
driver.get(page_first)

# Wait for the elements to load
wait = WebDriverWait(driver, 20)

# Wait specifically for the first card to appear before continuing
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.mx-auto.rounded-xl.shadow-md')))

# Initialize a dictionary to store URLs as keys and keywords as values
url_keywords_map = {}

# Find all the main card divs on the page
card_divs = driver.find_elements(By.CSS_SELECTOR, 'div.mx-auto.rounded-xl.shadow-md')

for card in card_divs:
    # For each card, extract the URL
    url_element = card.find_element(By.CSS_SELECTOR, 'h2 a')
    url = url_element.get_attribute('href')

    # Extract all the keyword divs
    keyword_divs = card.find_elements(By.CSS_SELECTOR, 'div.text-gray-800.bg-gray-100.rounded')  # Notice the change here
    
    # Extract the text (i.e., the keyword) from each keyword div and store them in a list
    keywords = [div.text for div in keyword_divs]

    # Store the keywords list in our dictionary with the URL as the key
    url_keywords_map[url] = keywords

print(url_keywords_map)

# Close the browser
driver.quit()


{'https://www.myscheme.gov.in/schemes/bjrcy': ['Hostel', 'Student'], 'https://www.myscheme.gov.in/schemes/onorc': ['Migrant Workers', 'Ration Card'], 'https://www.myscheme.gov.in/schemes/nos-sc': ['Degree', 'International Education', 'PhD', 'Post-Graduation', 'Scheduled Caste', 'Scholarship', 'Student', 'Tuition Fees'], 'https://www.myscheme.gov.in/schemes/kvsy': ['Financial Assistance', 'Girl', 'Kanya', 'Marriage', 'Vivah'], 'https://www.myscheme.gov.in/schemes/uky': ['Deprivation Of Liberty', 'Domestic Violence', 'Mental Abuse', 'Physical Abuse', 'Sexual Abuse'], 'https://www.myscheme.gov.in/schemes/pmssu': ['Disabled Welfare', 'Education', 'Financial Assistance', 'Student Finance'], 'https://www.myscheme.gov.in/schemes/spaddap': ['Assistive Devices', 'Differently Abled Persons', 'Disability', 'PwD', 'Social Welfare'], 'https://www.myscheme.gov.in/schemes/famdpwog': ['Financial Asisstance', 'Girl Child', 'Marriage', 'Orphan', 'Widow'], 'https://www.myscheme.gov.in/schemes/jms-11and12

### Try it with all next pages. 

Extracted 10 items from the page.
Clicked to navigate to page 2
Extracted 10 items from the page.
Clicked to navigate to page 3
Extracted 10 items from the page.
Clicked to navigate to page 4
Extracted 10 items from the page.
Clicked to navigate to page 5
Extracted 10 items from the page.
Clicked to navigate to page 6
Extracted 10 items from the page.
Clicked to navigate to page 7
Extracted 10 items from the page.
Clicked to navigate to page 8
Extracted 10 items from the page.
Clicked to navigate to page 9
Extracted 10 items from the page.
Clicked to navigate to page 10
Extracted 10 items from the page.
Clicked to navigate to page 11
Extracted 10 items from the page.
Clicked to navigate to page 12
Extracted 10 items from the page.
Clicked to navigate to page 13
Extracted 10 items from the page.
Clicked to navigate to page 14
Extracted 10 items from the page.
Clicked to navigate to page 15
Extracted 10 items from the page.
Clicked to navigate to page 16
Extracted 10 items from the page.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import ElementClickInterceptedException
import time

page_first = "https://www.myscheme.gov.in/search"

def click_next_page(driver, current_page):
    try:
        # Click on the next page number based on the current_page
        next_page_num = current_page + 1
        next_page_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f"//li[text()='{next_page_num}']")))
        
        # Try scrolling the element into view
        driver.execute_script("arguments[0].scrollIntoView();", next_page_button)
        time.sleep(2)  # Wait a bit for any potential overlay to disappear
        
        # Try JavaScript click if regular click fails
        try:
            next_page_button.click()
        except ElementClickInterceptedException:
            driver.execute_script("arguments[0].click();", next_page_button)
        
        print(f"Clicked to navigate to page {next_page_num}")  # Debugging Output
        return True
    except (NoSuchElementException, TimeoutException):
        print(f"Failed to navigate to page {next_page_num}")  # Debugging Output
        return False
    except ElementClickInterceptedException:
        # You might add more specific handling for this exception here if needed
        print(f"Element was obscured when trying to navigate to page {next_page_num}")
        return False

def extract_content_from_current_page(driver):
    wait = WebDriverWait(driver, 10)
    card_divs = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.mx-auto.rounded-xl.shadow-md')))
    
    url_keywords_map = {}

    for card in card_divs:
        url_element = card.find_element(By.CSS_SELECTOR, 'h2 a')
        url = url_element.get_attribute('href')
        keyword_divs = card.find_elements(By.CSS_SELECTOR, 'div.text-gray-800.bg-gray-100.rounded')
        keywords = [div.text for div in keyword_divs]
        url_keywords_map[url] = keywords

    print(f"Extracted {len(url_keywords_map)} items from the page.")  # Debugging Output

    return url_keywords_map

def main():
    # Initialize WebDriver
    driver = webdriver.Chrome()
    driver.get(page_first)

    current_page = 1
    all_content = []
    
    all_content.append(extract_content_from_current_page(driver))

    # Try navigating using pagination numbers
    while click_next_page(driver, current_page):
        current_page += 1
        # Wait until one of the expected elements of the new page is present
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.mx-auto.rounded-xl.shadow-md')))
        all_content.append(extract_content_from_current_page(driver))

    driver.close()

    for content in all_content:
        print(content)

if __name__ == "__main__":
    main()

### Click the checkbox as filter to find schmes.

In [59]:


driver = webdriver.Chrome()

driver.get("https://myscheme.gov.in/search")

# Wait for page to load
WebDriverWait(driver, 10).until(EC.title_contains("Search Schemes")) 

# Click checkbox by partial text match
checkboxes = driver.find_elements(By.XPATH, "//span[contains(text(),'gri')]")
for checkbox in checkboxes:
    checkbox.click()

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import csv

# Setting up the Chrome driver
driver = webdriver.Chrome()
driver.get("https://www.myscheme.gov.in/search")

# Wait for the label to be present using WebDriverWait
try:
    label_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//span[contains(text(), 'Agri')]")))
    label_element.click()
except (NoSuchElementException, TimeoutException):
    print("Label not found. The checkbox might not exist or the text might be different.")
    driver.quit()
    exit()

# Wait for a moment to ensure data loads
time.sleep(5)

# Extract URLs
schemes = driver.find_elements(By.XPATH, "//div[@class='scheme-box']//h2/a")
urls = [scheme.get_attribute("href") for scheme in schemes]

# Save to CSV
with open('scraped_data.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["URL"])
    for url in urls:
        writer.writerow([url])

# Close the driver
driver.quit()

print("Data saved to 'scraped_data.csv'.")


ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=118.0.5993.70)
Stacktrace:
0   chromedriver                        0x0000000104b5c510 chromedriver + 4310288
1   chromedriver                        0x0000000104b544bc chromedriver + 4277436
2   chromedriver                        0x00000001047879c4 chromedriver + 293316
3   chromedriver                        0x00000001047ce798 chromedriver + 583576
4   chromedriver                        0x00000001047c2eac chromedriver + 536236
5   chromedriver                        0x00000001047c2774 chromedriver + 534388
6   chromedriver                        0x0000000104807e60 chromedriver + 818784
7   chromedriver                        0x00000001047c0fd0 chromedriver + 528336
8   chromedriver                        0x00000001047c1e7c chromedriver + 532092
9   chromedriver                        0x0000000104b22834 chromedriver + 4073524
10  chromedriver                        0x0000000104b267fc chromedriver + 4089852
11  chromedriver                        0x0000000104b26c58 chromedriver + 4090968
12  chromedriver                        0x0000000104b2c8f8 chromedriver + 4114680
13  chromedriver                        0x0000000104b27234 chromedriver + 4092468
14  chromedriver                        0x0000000104b01604 chromedriver + 3937796
15  chromedriver                        0x0000000104b43ee8 chromedriver + 4210408
16  chromedriver                        0x0000000104b44064 chromedriver + 4210788
17  chromedriver                        0x0000000104b54134 chromedriver + 4276532
18  libsystem_pthread.dylib             0x0000000189183034 _pthread_start + 136
19  libsystem_pthread.dylib             0x000000018917de3c thread_start + 8
