# For Scrapping entire data from the Exploits-db site (If it failed in between increase the driver wait time)

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # For progress bar

# Setup Chrome options
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")

# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

# Function to fetch and parse the page content with retries
def fetch_page(retries=3, delay=2):
    for attempt in range(retries):
        try:
            return BeautifulSoup(driver.page_source, 'html.parser')
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error fetching page content: {e}")
            time.sleep(delay)
    return None

# Function to scrape the main list of exploits
def scrape_exploit_list():
    soup = fetch_page()
    if not soup:
        return []

    exploit_data = []
    table = soup.find('table', {'id': 'exploits-table'})
    if table:
        rows = table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 1:
                date = cols[0].text.strip() if cols[0] else 'N/A'
                exploit_link = cols[1].find('a', href=True)['href'] if cols[1].find('a', href=True) else 'N/A'
                description = cols[4].find('a').text.strip() if cols[4].find('a') else 'N/A'
                category = cols[5].find('a').text.strip() if cols[5].find('a') else 'N/A'
                platform = cols[6].find('a').text.strip() if cols[6].find('a') else 'N/A'
                author = cols[7].find('a').text.strip() if cols[7].find('a') else 'N/A'
                
                if exploit_link and exploit_link != 'N/A':
                    exploit_link = f"https://www.exploit-db.com{exploit_link}"

                exploit_data.append({
                    'date': date,
                    'exploit_link': exploit_link,
                    'description': description,
                    'category': category,
                    'platform': platform,
                    'author': author
                })
    return exploit_data

# Function to scrape details of each exploit
def scrape_exploit_details(exploit_link):
    try:
        driver.get(exploit_link)
        soup = fetch_page()
        if not soup:
            return {}
        
        title = soup.find('h1').text.strip() if soup.find('h1') else 'N/A'
        platform_detail = soup.find('div', class_='platform').text.strip() if soup.find('div', class_='platform') else 'N/A'
        
        return {
            'exploit_link': exploit_link,
            'title': title,
            'platform_detail': platform_detail
        }
    except Exception as e:
        print(f"Error scraping {exploit_link}: {e}")
        return {}

# Function to click the next page button with retries
def click_next_page(retries=3, delay=2):
    for attempt in range(retries):
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, '//li[@class="paginate_button page-item next"]/a'))
            )
            next_button.click()
            WebDriverWait(driver, 10).until(EC.staleness_of(next_button))
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking next page: {e}")
            time.sleep(delay)
    return False

def set_entries_per_page():
    try:
        select_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'exploits-table_length'))
        )
        select = Select(select_element)
        select.select_by_value('120')
        WebDriverWait(driver, 10).until(EC.staleness_of(select_element))
        print("Set entries per page to 120.")
    except Exception as e:
        print(f"Error setting entries per page: {e}")

# Main scraping loop with visualization
def main():
    base_url = 'https://www.exploit-db.com'
    driver.get(base_url)

    set_entries_per_page()

    with open('exploits1.csv', 'w', newline='', encoding='utf-8') as file:
        fieldnames = ['date', 'exploit_link', 'description', 'category', 'platform', 'author', 'title', 'platform_detail']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        page_number = 1
        while True:
            print(f'Scraping page {page_number} at {driver.current_url}')
            exploit_links = scrape_exploit_list()

            valid_links = [link['exploit_link'] for link in exploit_links if link['exploit_link'] and link['exploit_link'] != 'N/A']

            if not valid_links:
                print("No valid exploit links found on this page.")
                continue

            with ThreadPoolExecutor(max_workers=5) as executor:
                results = list(tqdm(executor.map(scrape_exploit_details, valid_links), total=len(valid_links), desc="Scraping Exploits"))

            for exploit in exploit_links:
                details = next((item for item in results if item['exploit_link'] == exploit['exploit_link']), {})
                if details:
                    exploit.update(details)
                    writer.writerow(exploit)

            if not click_next_page():
                break

            page_number += 1

    print('Scraping completed. Data saved to exploits.csv.')

if __name__ == '__main__':
    main()

# Close the WebDriver
driver.quit()


Error setting entries per page: Message: 

Scraping page 1 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:13<00:00,  1.63it/s]


Scraping page 2 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:21<00:00,  1.47it/s]


Scraping page 3 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:16<00:00,  1.57it/s]


Scraping page 4 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:20<00:00,  1.49it/s]


Scraping page 5 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:18<00:00,  1.52it/s]


Scraping page 6 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:22<00:00,  1.45it/s]


Scraping page 7 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:17<00:00,  1.55it/s]


Scraping page 8 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:16<00:00,  1.57it/s]


Scraping page 9 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:17<00:00,  1.55it/s]


Scraping page 10 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:21<00:00,  1.48it/s]


Scraping page 11 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:22<00:00,  1.45it/s]


Scraping page 12 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:23<00:00,  1.43it/s]


Scraping page 13 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:26<00:00,  1.39it/s]


Scraping page 14 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:32<00:00,  1.30it/s]


Scraping page 15 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:18<00:00,  1.52it/s]


Scraping page 16 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:17<00:00,  1.56it/s]


Scraping page 17 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:23<00:00,  1.43it/s]


Scraping page 18 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:15<00:00,  1.60it/s]


Scraping page 19 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:21<00:00,  1.47it/s]


Scraping page 20 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:23<00:00,  1.43it/s]


Scraping page 21 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:18<00:00,  1.52it/s]


Scraping page 22 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:15<00:00,  1.58it/s]


Scraping page 23 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:47<00:00,  1.11it/s]


Scraping page 24 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:25<00:00,  1.40it/s]


Scraping page 25 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:24<00:00,  1.42it/s]


Scraping page 26 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:20<00:00,  1.50it/s]


Scraping page 27 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:26<00:00,  1.39it/s]


Scraping page 28 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:20<00:00,  1.49it/s]


Scraping page 29 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:15<00:00,  1.59it/s]


Scraping page 30 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:17<00:00,  1.54it/s]


Scraping page 31 at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:23<00:00,  1.44it/s]


Attempt 1: Error clicking next page: Message: 
Stacktrace:
#0 0x58ec823e2a5a <unknown>
#1 0x58ec81ef02f0 <unknown>
#2 0x58ec81f3f235 <unknown>
#3 0x58ec81f3f451 <unknown>
#4 0x58ec81f84dc4 <unknown>
#5 0x58ec81f63bed <unknown>
#6 0x58ec81f8211e <unknown>
#7 0x58ec81f63963 <unknown>
#8 0x58ec81f30eec <unknown>
#9 0x58ec81f3213e <unknown>
#10 0x58ec823af1cf <unknown>
#11 0x58ec823b32dd <unknown>
#12 0x58ec8239d6d7 <unknown>
#13 0x58ec823b3a51 <unknown>
#14 0x58ec82384d1e <unknown>
#15 0x58ec823d1538 <unknown>
#16 0x58ec823d173a <unknown>
#17 0x58ec823e16cc <unknown>
#18 0x7982a73f7ac3 <unknown>

Attempt 2: Error clicking next page: Message: 
Stacktrace:
#0 0x58ec823e2a5a <unknown>
#1 0x58ec81ef02f0 <unknown>
#2 0x58ec81f3f235 <unknown>
#3 0x58ec81f3f451 <unknown>
#4 0x58ec81f84dc4 <unknown>
#5 0x58ec81f63bed <unknown>
#6 0x58ec81f8211e <unknown>
#7 0x58ec81f63963 <unknown>
#8 0x58ec81f30eec <unknown>
#9 0x58ec81f3213e <unknown>
#10 0x58ec823af1cf <unknown>
#11 0x58ec823b32dd <unknown>
#1