In [1]:
from concurrent.futures import ThreadPoolExecutor
import threading
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pyautogui
import time
import pandas as pd
import traceback
import random
import openpyxl
import zipfile
import base64
import os

def create_proxy_auth_extension(proxy_host, proxy_port, proxy_username, proxy_password):
    """Build a Chrome extension archive that routes traffic through an authenticated HTTP proxy.

    The archive contains a manifest and a background script. The script sets a
    fixed HTTP proxy (``proxy_host``:``proxy_port``) and answers proxy
    authentication challenges with ``proxy_username`` / ``proxy_password``.

    Args:
        proxy_host: Proxy host name or IP address.
        proxy_port: Proxy port (string or int).
        proxy_username: User name sent when the proxy asks for credentials.
        proxy_password: Password sent when the proxy asks for credentials.

    Returns:
        Relative path of the zip archive,
        ``extensions/proxy_auth_plugin_<host>_<port>.zip``.

    Note:
        An archive that already exists for the same host and port is reused
        as-is, so changed credentials are only picked up after the old archive
        is deleted.
    """
    # Local import: json is only needed here to emit safe JavaScript literals.
    import json

    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version": "22.0.0"
    }
    """

    # json.dumps yields a quoted, escaped string literal that is also valid
    # JavaScript, so quotes or backslashes in the credentials cannot break
    # (or inject code into) the background script.
    host_js = json.dumps(str(proxy_host))
    port_js = json.dumps(str(proxy_port))
    username_js = json.dumps(str(proxy_username))
    password_js = json.dumps(str(proxy_password))

    background_js = f"""
    var config = {{
        mode: "fixed_servers",
        rules: {{
            singleProxy: {{
                scheme: "http",
                host: {host_js},
                port: parseInt({port_js})
            }},
            bypassList: ["localhost"]
        }}
    }};

    chrome.proxy.settings.set({{value: config, scope: "regular"}}, function() {{}});

    function callbackFn(details) {{
        return {{
            authCredentials: {{
                username: {username_js},
                password: {password_js}
            }}
        }};
    }}

    chrome.webRequest.onAuthRequired.addListener(
        callbackFn,
        {{urls: ["<all_urls>"]}},
        ['blocking']
    );
    """

    extension_name = f'extensions/proxy_auth_plugin_{proxy_host}_{proxy_port}.zip'
    if os.path.exists(extension_name):
        # Reuse the archive built on an earlier run.
        return extension_name

    # The output folder is not guaranteed to exist on a fresh checkout.
    os.makedirs(os.path.dirname(extension_name), exist_ok=True)
    with zipfile.ZipFile(extension_name, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    return extension_name

def create_all_proxy_extensions():
    """Create one proxy-auth extension per entry in ``proxy_list.txt``.

    Each non-blank line of ``proxy_list.txt`` (read from the current working
    directory) must look like ``host:port:username:password``. Surrounding
    whitespace is ignored and blank lines are skipped. Fields after the
    fourth are ignored.

    Returns:
        List of extension archive paths, in file order.

    Raises:
        ValueError: If a non-blank line has fewer than four ``:``-separated
            fields. The message names the offending line number.
    """
    proxy_list = []
    with open("proxy_list.txt", "r") as file:
        for line_number, raw_line in enumerate(file.read().splitlines(), start=1):
            line = raw_line.strip()
            if not line:
                continue  # blank or whitespace-only line
            fields = line.split(":")
            if len(fields) < 4:
                # Fail with an actionable message instead of a bare IndexError.
                raise ValueError(
                    f"proxy_list.txt line {line_number}: expected "
                    f"'host:port:username:password', got {len(fields)} field(s)"
                )
            proxy_list.append(tuple(fields[:4]))

    # Build the archives only after the whole file has parsed cleanly.
    return [
        create_proxy_auth_extension(host, port, username, password)
        for host, port, username, password in proxy_list
    ]

# Build (or reuse) one proxy-auth extension archive per proxy_list.txt entry, once per
# kernel session; get_driver() picks one of these paths at random for each new browser.
extensions = create_all_proxy_extensions()


def get_driver(url = "https://arc-sos.state.al.us/CGI/CORPNAME.MBR/INPUT"):
    """Start a Chrome WebDriver session that goes through a randomly chosen proxy.

    One archive from the module-level ``extensions`` list is picked at random
    and loaded into the browser. The browser runs in its default, visible
    (non-headless) mode.

    Args:
        url: Kept for backward compatibility. It is currently unused: the
            returned driver has not navigated anywhere.

    Returns:
        A ready ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    extension = random.choice(extensions)

    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument("--log-level=3")
    chromeOptions.add_extension(extension)

    # Installs chromedriver automatically if not present. Resolve it once per
    # call; the original resolved it twice and discarded the first Service.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chromeOptions)

    # Short pause after start-up (presumably to let the proxy extension
    # finish loading before the first request - TODO confirm).
    time.sleep(2)

    return driver

def remove_matching_entity_ids(scraped_entity_ids, all_entity_ids):
    """Drop already-scraped IDs from ``all_entity_ids``.

    Scraped IDs are compared after removing their dashes, so ``"000-000-001"``
    matches ``"000000001"``. Duplicate scraped IDs and IDs that are not in
    ``all_entity_ids`` are ignored instead of raising ``ValueError``.

    Args:
        scraped_entity_ids: Iterable of scraped IDs, with or without dashes.
        all_entity_ids: List of dash-free ID strings. It is modified in place.

    Returns:
        The same ``all_entity_ids`` list object, with scraped IDs removed and
        the remaining order preserved.
    """
    # A set gives O(1) membership tests; repeated list.remove() calls are
    # quadratic on the ~1M-entry ID list.
    already_scraped = {str(entity_id).replace("-", "") for entity_id in scraped_entity_ids}
    if already_scraped:
        # Slice assignment keeps the in-place contract of the original.
        all_entity_ids[:] = [
            entity_id for entity_id in all_entity_ids if entity_id not in already_scraped
        ]
    return all_entity_ids

def random_sleep(min_sec=2, max_sec=4):
    """Pause for a random whole number of seconds in [min_sec, max_sec] to avoid a ban."""
    pause_seconds = random.randint(min_sec, max_sec)
    time.sleep(pause_seconds)

def entity_id_list(first_id=1, last_id=964437):
    """Return every candidate entity ID as a zero-padded 9-digit string.

    With the defaults this covers ``"000000001"`` through ``"000964437"``
    inclusive. The previous nested-loop version started the last three-digit
    group at 001, so it silently skipped every ID ending in ``000`` (for
    example ``"000001000"``). Those IDs are now included.

    Args:
        first_id: Lowest numeric ID to generate (inclusive).
        last_id: Highest numeric ID to generate (inclusive).

    Returns:
        List of 9-character ID strings in ascending order. The list is empty
        if ``first_id > last_id``.

    Raises:
        ValueError: If the range does not fit in nine digits.
    """
    if first_id < 0 or last_id > 999_999_999:
        raise ValueError("entity IDs must fit in nine digits (0 to 999999999)")
    return [f"{number:09d}" for number in range(first_id, last_id + 1)]

In [2]:
filename = 'result.csv'

# After this cell runs, the name `entity_id_list` refers to the list of IDs
# still to scrape and no longer to the function that builds it. Keep a handle
# on the function so re-running this cell does not fail with
# "'list' object is not callable".
if callable(entity_id_list):
    _build_entity_id_list = entity_id_list
entity_id_list = _build_entity_id_list()

# `records` is the shared result list that detailed_scrap() appends to.
records = []

# Resume from an earlier run. A zero-byte file (e.g. left by an interrupted
# write) is treated as "nothing scraped yet".
if os.path.exists(filename) and os.path.getsize(filename) > 0:
    df = pd.read_csv(filename)
    records = df.to_dict(orient='records')
    # Drop missing IDs and force strings so the dash-stripping step is safe.
    scraped_entity_ids = df["Entity ID Number"].dropna().astype(str).tolist()
    # Remove IDs that are already in the result file.
    entity_id_list = remove_matching_entity_ids(scraped_entity_ids, entity_id_list)

In [3]:
# Guards the shared `records` list and the CSV checkpoints, because
# detailed_scrap() is run from several worker threads at the same time.
_records_lock = threading.Lock()


def _save_records(include_backup=True):
    """Write `records` to the result CSV, and to the backup CSV if requested.

    The caller must hold `_records_lock`.
    """
    frame = pd.DataFrame.from_records(records)
    frame.to_csv(filename, index=False)
    if include_backup:
        frame.to_csv("backup_result.csv", index=False)


def _replace_driver(driver):
    """Shut `driver` down completely and return a fresh one from get_driver()."""
    try:
        # quit() ends the whole browser session; close() only closes the
        # current window.
        driver.quit()
    except Exception:
        # Best effort: the browser may already be gone.
        pass
    return get_driver()


def detailed_scrap(entity_id_list):
    """Scrape the detail page of every entity ID in `entity_id_list`.

    For each ID the detail page is loaded in a proxied Chrome session and the
    two-column rows of the first table body are stored as key/value pairs,
    plus an 'Entity Name' entry taken from the first table cell. IDs whose
    page says 'No matches found.' are recorded with that status. Results are
    appended to the module-level `records` list, which is written to
    `filename` and "backup_result.csv" every 50 records and after an
    unexpected error.

    A page showing Chrome's network error element is retried once with a new
    browser on another randomly chosen proxy. An ID whose page cannot be
    parsed is skipped and the browser is replaced. Because it is not
    recorded, the next run picks it up again.

    Args:
        entity_id_list: Iterable of 9-digit, dash-free entity ID strings.

    Returns:
        None. Results are delivered through `records` and the CSV files.
    """
    # Each ID needs its own page visit, so this is slow.
    url = "https://arc-sos.state.al.us/cgi/corpdetail.mbr/detail?corp="
    driver = get_driver()
    try:
        for entity_id in entity_id_list:
            try:
                driver.get(url + entity_id)

                if driver.find_elements(By.ID, "main-frame-error"):
                    print("Proxy did not work so quit it. Using another proxy")
                    driver = _replace_driver(driver)
                    driver.get(url + entity_id)

                random_sleep()
                info_dict = {}
                infos = []
                try:
                    if driver.find_element(By.ID, "block-sos-content").text == 'No matches found.':
                        info_dict['Entity ID Number'] = entity_id[:3] + '-' + entity_id[3:6] + '-' + entity_id[6:9]
                        info_dict['Status'] = 'No matches found.'
                        with _records_lock:
                            records.append(info_dict)
                        continue
                    table_rows = driver.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
                    infos = [row.find_elements(By.TAG_NAME, "td") for row in table_rows]
                except Exception:
                    # The page did not have the expected layout; start over
                    # with a new browser and proxy.
                    driver = _replace_driver(driver)

                if not infos:
                    continue

                for info in infos:
                    try:
                        key = info[0].text
                        value = info[1].text
                        info_dict[key] = value
                    except IndexError:
                        # Rows with fewer than two cells carry no key/value pair.
                        pass

                info_dict['Entity Name'] = driver.find_element(By.TAG_NAME, 'td').text  # first td is entity name

                with _records_lock:
                    records.append(info_dict)
                    if len(records) % 50 == 0:
                        print("Number of records scraped:", len(records))
                        _save_records()

            except Exception:
                # Capture the traceback first, then checkpoint what we have.
                error_text = traceback.format_exc()
                with _records_lock:
                    _save_records()
                try:
                    driver.save_screenshot('screenshot.png')
                    print(driver.current_url)
                except Exception:
                    # The browser may be dead; diagnostics must not end the worker.
                    pass
                print(error_text)
                driver = _replace_driver(driver)
    finally:
        # Runs on normal completion, on Ctrl-C, and when a browser restart
        # fails: persist results and release the browser.
        with _records_lock:
            if len(records) > 0:
                _save_records(include_backup=False)
        try:
            driver.quit()
        except Exception:
            pass

In [4]:

# Multithreaded run. Be careful: simultaneous visits to the website from the
# same IP can lead to a ban. The original author notes having backed away from
# multithreading after trying it; set max_workers = 1 to scrape sequentially.
max_workers = 16

# Ceiling division so the IDs are spread over at most max_workers chunks.
# The minimum of 1 keeps the range() step valid when there is nothing to do.
each_list_len = max(1, -(-len(entity_id_list) // max_workers))

# Only non-empty chunks are produced, so no browser is started for a worker
# that has nothing to scrape. This also works for max_workers = 1.
list_of_list = [
    entity_id_list[start : start + each_list_len]
    for start in range(0, len(entity_id_list), each_list_len)
]

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = [executor.submit(detailed_scrap, chunk) for chunk in list_of_list]
    # Wait for every worker and report failures, which would otherwise be
    # lost because nothing reads the results.
    for worker_index, future in enumerate(futures):
        error = future.exception()
        if error is not None:
            print(f"Worker {worker_index} stopped with an error: {error!r}")

Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Number of records scraped: 4200
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Number of records scraped: 4250
Number of records scraped: 4300
Number of records scraped: 4350
Proxy did not work so quit it. Using another proxy
Proxy did not work so quit it. Using another proxy
Number of records scraped: 4400
Number of records scraped: 4450
Number of records scraped: 4500
Number of records scraped: 4550
Proxy did not work so quit it. Using another proxy
Number of records scraped: 4600
Number of records scraped: 4650
Number of records scraped: 4700
Number of records scraped: 4750
Numb