In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from fake_useragent import UserAgent
import undetected_chromedriver as uc
import pandas as pd
import logging
import time
import csv
import warnings
import re
from rapidfuzz import process, fuzz
from collections import OrderedDict
import os
import gc
import traceback

In [None]:
# Lookup table: upper-cased state / union-territory name -> the numeric code
# (stored as a string) that scrape_udyam() puts in the `si` query parameter.
reverse_state_mapping = {
    state_name: str(state_code)
    for state_name, state_code in (
        ("ANDAMAN AND NICOBAR ISLANDS", 35),
        ("ANDHRA PRADESH", 28),
        ("ARUNACHAL PRADESH", 12),
        ("ASSAM", 18),
        ("BIHAR", 10),
        ("CHANDIGARH", 4),
        ("CHHATTISGARH", 22),
        ("DELHI", 7),
        ("GOA", 30),
        ("GUJARAT", 24),
        ("HARYANA", 6),
        ("HIMACHAL PRADESH", 2),
        ("JAMMU AND KASHMIR", 1),
        ("JHARKHAND", 20),
        ("KARNATAKA", 29),
        ("KERALA", 32),
        ("LADAKH", 37),
        ("LAKSHADWEEP", 31),
        ("MADHYA PRADESH", 23),
        ("MAHARASHTRA", 27),
        ("MANIPUR", 14),
        ("MEGHALAYA", 17),
        ("MIZORAM", 15),
        ("NAGALAND", 13),
        ("ODISHA", 21),
        ("PUDUCHERRY", 34),
        ("PUNJAB", 3),
        ("RAJASTHAN", 8),
        ("SIKKIM", 11),
        ("TAMIL NADU", 33),
        ("TELANGANA", 36),
        ("DADRA AND NAGAR HAVELI AND DAMAN AND DIU", 38),
        ("TRIPURA", 16),
        ("UTTAR PRADESH", 9),
        ("UTTARAKHAND", 5),
        ("WEST BENGAL", 19),
    )
}


def scrape_udyam(niccode, state):
    """Build the Udyam registration search URL for one NIC code in one state.

    Despite its name, this function performs no network access; it only
    formats the URL string.

    Args:
        niccode: NIC code placed in the ``cod`` query parameter.
        state: State or union-territory name. The lookup in
            ``reverse_state_mapping`` ignores letter case as well as leading,
            trailing and repeated whitespace.

    Returns:
        The search-page URL as a string.

    Raises:
        KeyError: If the normalised ``state`` is not a key of
            ``reverse_state_mapping``.
    """
    # Tolerate stray whitespace so that e.g. " Tamil  Nadu " still resolves.
    normalized_state = " ".join(state.split()).upper()
    try:
        state_code = reverse_state_mapping[normalized_state]
    except KeyError:
        # Same exception type as before, but with enough context to fix the input.
        raise KeyError(
            f"Unknown state {state!r}; expected one of the names in reverse_state_mapping"
        ) from None
    return f'https://udyamregistration.gov.in/SearchRegDetail.aspx?cod={niccode}&ty=2&si={state_code}&di=0'

# Generic corporate words that are blanked out before names are compared.
_COMPANY_SUFFIX_PATTERN = re.compile(
    r'\b(LIMITED|LTD|PVT|LLP|INC|CORP|ENTERPRISE|FINANCE|PRIVATE|SERVICES|FINANCIAL|CAPITAL)\b'
)


def _strip_company_suffixes(name):
    """Upper-case ``name`` and remove generic corporate words from it."""
    return _COMPANY_SUFFIX_PATTERN.sub('', name.upper())


def get_href(names, href):
    """Check whether each company name appears in the listing table at ``href``.

    Opens ``href`` in a fresh undetected-chromedriver session, switches the
    page-length dropdown to "All", reads the second cell of every table row
    and fuzzy-matches each requested name against those cell texts using
    ``fuzz.token_set_ratio``.

    Args:
        names: Iterable of company names to look for.
        href: URL of the listing page.

    Returns:
        A list of booleans, one per entry of ``names`` and in the same order.
        An entry is True when the best match scores above 90. If the page
        cannot be loaded or parsed, every entry is False. An empty ``names``
        yields ``[]`` without launching a browser.
    """
    if not names:
        # Nothing to look up; do not pay for a browser start-up.
        return []

    names = [_strip_company_suffixes(name) for name in names]

    chrome_options = uc.ChromeOptions()
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--incognito")
    # Headless mode is optional: add the "--headless" argument to hide the browser.

    # Present a randomly chosen User-Agent string.
    chrome_options.add_argument(f"user-agent={UserAgent().random}")
    driver = uc.Chrome(options=chrome_options)

    wait = WebDriverWait(driver, 20)
    result = []
    try:
        driver.get(href)
        # Wait for the page-length dropdown to load and select 'All'.
        try:
            entries_dropdown = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="example1_length"]/label/select')))
            Select(entries_dropdown).select_by_visible_text("All")
        except Exception as exc:
            # `except Exception` (not a bare except) lets Ctrl-C / SystemExit through.
            print("Dropdown not found or not interactable.")
            raise Exception("Dropdown not found or not interactable (possible 503/404 or site error).") from exc

        # Give the table time to re-render with all rows.
        time.sleep(5)
        table = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="example1"]')))

        data = []
        # The first <tr> is skipped (treated as the header row).
        for row in table.find_elements(By.TAG_NAME, "tr")[1:]:
            cols = row.find_elements(By.TAG_NAME, "td")
            if len(cols) < 2:
                # Rows without a second cell (e.g. a single-cell placeholder
                # row) carry no company name; skip instead of raising IndexError.
                continue
            data.append(_strip_company_suffixes(cols[1].text.strip()))
        print(len(data))

        for name in names:
            best = process.extractOne(name, data, scorer=fuzz.token_set_ratio)
            if best is None:
                # extractOne returns None when there is nothing to match against.
                result.append(False)
                continue
            match, score, _ = best
            print(name, match, score)
            result.append(score > 90)
    except Exception:
        print("Error occurred while processing the table.")
        logging.exception("Lookup failed for %s", href)
        # Keep the one-flag-per-name contract even on failure.
        result = [False] * len(names)
    finally:
        driver.quit()

    return result

In [None]:
def _candidate_urls(nic_codes, main_activity_codes, state):
    """Return the distinct search URLs to try for one company, in first-seen order."""
    ordered_codes = []
    for nic_code in nic_codes:
        for main_code in main_activity_codes:
            ordered_codes.extend((nic_code, main_code))
    # dict.fromkeys() drops repeats while keeping first-occurrence order, so no
    # URL is fetched twice; blank codes (empty CSV cells) form no usable query.
    unique_codes = [code for code in dict.fromkeys(ordered_codes) if code]
    return [scrape_udyam(code, state) for code in unique_codes]


def process_companies(input_file, output_file, backup_file='rowwise_backup.csv', error_log='error_rows.csv'):
    """Look up every company of ``input_file`` and record the outcome per row.

    For each input row the NIC codes from 'Industry Codes (NIC 2008)' and
    'Main Activities Codes (NIC 2008)' (both ';'-separated) are combined with
    'State/County' into search URLs, which are tried one by one via
    ``get_href`` until the company name is matched.

    Args:
        input_file: CSV to read. Must provide the columns 'Company',
            'Industry Codes (NIC 2008)', 'Main Activities Codes (NIC 2008)'
            and 'State/County'. A leading UTF-8 BOM is tolerated.
        output_file: CSV (overwritten) receiving one row per looked-up company
            with a 'Match Found' flag.
        backup_file: CSV (overwritten) receiving only the matched rows; flushed
            after every write.
        error_log: CSV (appended to) receiving the original row plus an
            'Error' column for rows that were skipped because of NIC code
            99999, not found, or failed with an exception.

    Returns:
        None. Progress is also logged to 'process_log.txt'.
    """
    # utf-8-sig strips a BOM if present; otherwise the first header would be
    # read as '\ufeff<name>' and lookups on that column would fail.
    with open(input_file, mode='r', newline='', encoding='utf-8-sig') as infile, \
         open(output_file, mode='w', newline='', encoding='utf-8') as outfile, \
         open(backup_file, mode='w', newline='', encoding='utf-8') as backupfile, \
         open(error_log, mode='a', newline='', encoding='utf-8') as errorfile:

        reader = csv.DictReader(infile)
        fieldnames = ['Company Name', 'Industry Codes (NIC 2008)', 'Main Activities Codes (NIC 2008)', 'State/County', 'Match Found']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        backup_writer = csv.DictWriter(backupfile, fieldnames=fieldnames)
        writer.writeheader()
        backup_writer.writeheader()

        error_writer = None

        def write_error(row, message):
            """Append ``row`` with ``message`` in the 'Error' column to the error log."""
            nonlocal error_writer
            if error_writer is None:
                # Created lazily so an error-free run adds nothing to the log.
                # extrasaction='ignore' keeps a malformed row (surplus cells)
                # from raising inside the error path itself.
                error_writer = csv.DictWriter(
                    errorfile,
                    fieldnames=list(reader.fieldnames) + ['Error'],
                    extrasaction='ignore',
                )
                # Only a brand-new (empty) log gets a header line.
                if os.stat(error_log).st_size == 0:
                    error_writer.writeheader()
            error_row = dict(row)
            error_row['Error'] = message
            error_writer.writerow(error_row)
            errorfile.flush()

        logging.basicConfig(filename='process_log.txt', level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s')

        for row in reader:
            try:
                company_name = row['Company']
                print(f"Searching for company: {company_name}")
                nic_codes = [code.strip() for code in row['Industry Codes (NIC 2008)'].split(';')]
                main_activity_codes = [code.strip() for code in row['Main Activities Codes (NIC 2008)'].split(';')]
                state = row['State/County']

                # Skip if any NIC code is 99999
                if '99999' in nic_codes or '99999' in main_activity_codes:
                    logging.info(f"NIC code 99999 found for {company_name}. Writing to error file.")
                    write_error(row, "Skipped due to NIC code 99999")
                    continue

                match_found = False
                for href in _candidate_urls(nic_codes, main_activity_codes, state):
                    try:
                        if get_href([company_name], href)[0]:
                            match_found = True
                            break
                    finally:
                        time.sleep(2)  # Rate limiting

                out_row = {
                    'Company Name': company_name,
                    'Industry Codes (NIC 2008)': ';'.join(nic_codes),
                    'Main Activities Codes (NIC 2008)': ';'.join(main_activity_codes),
                    'State/County': state,
                    'Match Found': match_found
                }
                writer.writerow(out_row)
                if match_found:
                    backup_writer.writerow(out_row)
                    backupfile.flush()
                    logging.info(f"Company '{company_name}' found. Written to backup file.")
                else:
                    write_error(row, "Not found on website")
                    logging.info(f"Company '{company_name}' not found. Written to error file.")

            except Exception as exc:
                # Read the name from the row itself: `company_name` is unbound
                # (or left over from the previous row) when the failure
                # happened while reading the 'Company' column.
                failed_name = row.get('Company', '<unknown>')
                write_error(row, str(exc) + "\n" + traceback.format_exc())
                logging.error(f"Error occurred for company '{failed_name}'. Written to error file.")

            gc.collect()

# Run the pipeline over set1.csv, writing the per-company results to output_matches.csv.
process_companies(input_file='set1.csv', output_file='output_matches.csv')

Searching for company: Danfoss Power Solutions India Private Limited
Searching for company: Geekay Wires Ltd.
Dropdown not found or not interactable.
Error occurred while processing the table.
Searching for company: Sainor Laboratories Private Limited
1351
SAINOR LABORATORIES   M/S SAINOR LABORATORIES   100.0
Searching for company: Girnar Finserv Private Limited
Dropdown not found or not interactable.
Error occurred while processing the table.
6357
GIRNAR FINSERV   M/S GIRNAR FINSERV   100.0
Searching for company: Hyoseong Electric India Private Limited
Dropdown not found or not interactable.
Error occurred while processing the table.
Searching for company: Federation Of Indian Chamber Of Commerce And Industry.
Searching for company: Vivek Tradelink Private Limited
Dropdown not found or not interactable.
Error occurred while processing the table.
Searching for company: Hometrail Buildtech Private Limited
Dropdown not found or not interactable.
Error occurred while processing the table.

KeyboardInterrupt: 

In [11]:
import csv

# Show the column names of the input CSV; utf-8-sig drops a leading BOM if present.
with open('set1.csv', newline='', encoding='utf-8-sig') as csv_file:
    header_reader = csv.DictReader(csv_file)
    column_names = header_reader.fieldnames
    print(column_names)

['Num', 'Country', 'Company', 'Industry (NIC 2008)', 'Main Activities (NIC 2008)', 'Industry Codes (NIC 2008)', 'Main Activities Codes (NIC 2008)', 'Total operating revenue', 'Property, plant and equipment', 'City', 'State/County', 'Postal Code', 'Address', 'Phone', 'Email', 'Website', 'Company ID', 'Fiscal Year', 'Audited', 'Consolidated', 'Source']
