In [1]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait,Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
import pandas as pd
import logging
import time
import undetected_chromedriver as uc
from fake_useragent import UserAgent
import re
import warnings
from collections import OrderedDict
from selenium.webdriver.remote.remote_connection import LOGGER
import logging
from selenium.common.exceptions import TimeoutException
import os
from rapidfuzz import process, fuzz
from selenium.webdriver import ChromeOptions,Chrome


In [2]:
warnings.filterwarnings("ignore")
LOGGER.setLevel(logging.ERROR)
logging.getLogger("selenium").setLevel(logging.CRITICAL)
warnings.filterwarnings("ignore", category=UserWarning)
import logging

logging.basicConfig(
    filename="warnings.log",
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

reverse_state_mapping = {
    "ANDAMAN AND NICOBAR ISLANDS": "35",
    "ANDHRA PRADESH": "28",
    "ARUNACHAL PRADESH": "12",
    "ASSAM": "18",
    "BIHAR": "10",
    "CHANDIGARH": "4",
    "CHHATTISGARH": "22",
    "DELHI": "7",
    "GOA": "30",
    "GUJARAT": "24",
    "HARYANA": "6",
    "HIMACHAL PRADESH": "2",
    "JAMMU AND KASHMIR": "1",
    "JHARKHAND": "20",
    "KARNATAKA": "29",
    "KERALA": "32",
    "LADAKH": "37",
    "LAKSHADWEEP": "31",
    "MADHYA PRADESH": "23",
    "MAHARASHTRA": "27",
    "MANIPUR": "14",
    "MEGHALAYA": "17",
    "MIZORAM": "15",
    "NAGALAND": "13",
    "ODISHA": "21",
    "PUDUCHERRY": "34",
    "PUNJAB": "3",
    "RAJASTHAN": "8",
    "SIKKIM": "11",
    "TAMIL NADU": "33",
    "TELANGANA": "36",
    "DADRA AND NAGAR HAVELI AND DAMAN AND DIU": "38",
    "TRIPURA": "16",
    "UTTAR PRADESH": "9",
    "UTTARAKHAND": "5",
    "WEST BENGAL": "19"
}



def scrape_udyam(niccode,state):
    href=f'https://udyamregistration.gov.in/SearchRegDetail.aspx?cod={niccode}&ty=2&si={reverse_state_mapping[state.upper()]}&di=0'  
    return href

def get_href(names, href):
    all_rows = []
    ua = UserAgent()
    random_user_agent = ua.random

    chrome_options = uc.ChromeOptions()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument(f"user-agent={random_user_agent}")
    driver = uc.Chrome(options=chrome_options)

    wait = WebDriverWait(driver, 20)
    try:
        try:
            driver.get(href)
        except TimeoutException:
            print("Page load timed out. Trying to stop loading manually.")
            driver.execute_script("window.stop();")
        try:
            entries_dropdown = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="example1_length"]/label/select')))
            select = Select(entries_dropdown)
            select.select_by_visible_text("All")
        except:
            print("Dropdown not found or not interactable.")

        time.sleep(5)
        wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="example1"]')))
        try:
            table = driver.find_element(By.XPATH, '//*[@id="example1"]')
            rows = table.find_elements(By.TAG_NAME, "tr")
            headers = [th.text.strip() for th in rows[0].find_elements(By.TAG_NAME, "th")]
            headers.append("Business Email")  # Add email column
            data = []
            for row in rows[1:]:
                cols = row.find_elements(By.TAG_NAME, "td")
                row_data = [col.text.strip() for col in cols[:-1]]  # All except the last column
                try:
                    email_input = cols[-1].find_element(By.TAG_NAME, "input")
                    email = email_input.get_attribute("value")
                except:
                    email = ""
                row_data.append(email)
                data.append(dict(zip(headers, row_data)))
            all_rows = data
            print(f"Rows captured: {len(all_rows)}")
        except Exception as e:
            print('Table not found', {e})
    except Exception as e:
        print("Error occurred while processing the table.", e)
    finally:
        driver.quit()

    return all_rows

In [None]:
import pandas as pd
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os
from datetime import datetime

# Setup logging and data directories
backup_dir = "backups"
os.makedirs(backup_dir, exist_ok=True)
backup_file = os.path.join(backup_dir, "running_backup.csv")  # Single backup file

df = pd.read_excel('NIC_2008.xlsx')
nic_codes = df.iloc[:, 0].astype(str)
valid_nic_codes = nic_codes[nic_codes.str.len() == 5].tolist()

test_state = "KARNATAKA"  # Change as needed

all_results = []
failed_nics = []

def scrape_nic(nic):
    href = scrape_udyam(nic, test_state)
    try:
        table_rows = get_href([], href)
        if not table_rows:  # If no rows were found
            failed_nics.append({"NIC": nic, "State": test_state, "Reason": "No data found"})
            return []
    except Exception as e:
        failed_nics.append({"NIC": nic, "State": test_state, "Reason": str(e)})
        logging.error(f"NIC: {nic}, State: {test_state}, Error: {e}")
        return []
    
    for row in table_rows:
        row["NIC_Code"] = nic
        row["State"] = test_state
    return table_rows

with ThreadPoolExecutor(max_workers=3) as executor:
    futures = {executor.submit(scrape_nic, nic): nic for nic in valid_nic_codes}
    for i, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Scraping NICs")):
        nic = futures[future]
        try:
            result = future.result()
            all_results.extend(result)
            
            # Save/overwrite backup after each NIC code
            pd.DataFrame(all_results).to_csv(backup_file, index=False)
            print(f"Saved backup with {len(all_results)} results")
            
        except Exception as e:
            failed_nics.append({"NIC": nic, "State": test_state, "Reason": str(e)})
            logging.error(f"Failed to process NIC {nic}: {str(e)}")

# Save final results with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
pd.DataFrame(all_results).to_csv(f"test_nic_state_companies_fulltable_{timestamp}.csv", index=False)

# Save failed NICs
if failed_nics:
    pd.DataFrame(failed_nics).to_csv(f"failed_nics_{timestamp}.csv", index=False)
    print(f"Number of failed NICs: {len(failed_nics)}")

Scraping NICs:   0%|          | 1/1127 [00:01<31:00,  1.65s/it]

Saved backup with 0 results
Saved backup with 0 results


Scraping NICs:   0%|          | 3/1127 [00:02<14:54,  1.26it/s]

Saved backup with 0 results


Scraping NICs:   0%|          | 4/1127 [00:04<18:31,  1.01it/s]

Saved backup with 0 results
Error occurred while processing the table. ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))


Scraping NICs:   0%|          | 5/1127 [00:16<1:26:55,  4.65s/it]

Saved backup with 0 results
Dropdown not found or not interactable.
Dropdown not found or not interactable.
Error occurred while processing the table. Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=137.0.7151.105)
Stacktrace:
	GetHandleVerifier [0x0x4f3b03+62899]
	GetHandleVerifier [0x0x4f3b44+62964]
	(No symbol) [0x0x3210f3]
	(No symbol) [0x0x2fff59]
	(No symbol) [0x0x394f7e]
	(No symbol) [0x0x3af6a9]
	(No symbol) [0x0x38e306]
	(No symbol) [0x0x35d670]
	(No symbol) [0x0x35e4e4]
	GetHandleVerifier [0x0x754793+2556483]
	GetHandleVerifier [0x0x74fd02+2537394]
	GetHandleVerifier [0x0x51a2fa+220586]
	GetHandleVerifier [0x0x50aae8+157080]
	GetHandleVerifier [0x0x51141d+184013]
	GetHandleVerifier [0x0x4fba68+95512]
	GetHandleVerifier [0x0x4fbc10+95936]
	GetHandleVerifier [0x0x4e6b5a+9738]
	BaseThreadInitThunk [0x0x77a15d49+25]
	RtlInitializeExceptionChain [0x0x77e7d09b+107]
	RtlGetAppContainerNamedObjectPath [0x0x77e7d021+

Scraping NICs:   0%|          | 5/1127 [00:22<1:24:01,  4.49s/it]


Error occurred while processing the table. Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=137.0.7151.105)
Stacktrace:
	GetHandleVerifier [0x0x4f3b03+62899]
	GetHandleVerifier [0x0x4f3b44+62964]
	(No symbol) [0x0x3210f3]
	(No symbol) [0x0x3108c0]
	(No symbol) [0x0x32e87f]
	(No symbol) [0x0x39514c]
	(No symbol) [0x0x3af6a9]
	(No symbol) [0x0x38e306]
	(No symbol) [0x0x35d670]
	(No symbol) [0x0x35e4e4]
	GetHandleVerifier [0x0x754793+2556483]
	GetHandleVerifier [0x0x74fd02+2537394]
	GetHandleVerifier [0x0x51a2fa+220586]
	GetHandleVerifier [0x0x50aae8+157080]
	GetHandleVerifier [0x0x51141d+184013]
	GetHandleVerifier [0x0x4fba68+95512]
	GetHandleVerifier [0x0x4fbc10+95936]
	GetHandleVerifier [0x0x4e6b5a+9738]
	BaseThreadInitThunk [0x0x77a15d49+25]
	RtlInitializeExceptionChain [0x0x77e7d09b+107]
	RtlGetAppContainerNamedObjectPath [0x0x77e7d021+561]



In [None]:
# import pandas as pd

# df = pd.read_excel('NIC_2008.xlsx')
# nic_codes = df.iloc[:, 0].astype(str)
# valid_nic_codes = nic_codes[nic_codes.str.len() == 5].tolist()

# test_state = "TAMIL NADU"  # Change as needed

# all_results = []
# for nic in valid_nic_codes:
#     href = scrape_udyam(nic, test_state)
#     try:
#         table_rows = get_href([], href)
#     except Exception as e:
#         table_rows = [{"Error": f"{e}"}]

#     for row in table_rows:
#         row["NIC_Code"] = nic
#         row["State"] = test_state
#         all_results.append(row)

#         if len(all_results) % 1000 == 0:
#             pd.DataFrame(all_results).to_csv("test_nic_state_companies_backup.csv", index=False)



# pd.DataFrame(all_results).to_csv("test_nic_state_companies_fulltable.csv", index=False)