# filter the Exploits data related to Router, Camera, Switch or NVRs then scrap(need to update the filter values for each type)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm  # For progress bar
import os

# Setup Chrome options
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")
options.add_argument("--no-sandbox")

# Initialize the WebDriver
driver = webdriver.Chrome(options=options)

# Function to fetch and parse the page content with retries
def fetch_page(retries=5, delay=3):
    for attempt in range(retries):
        try:
            return BeautifulSoup(driver.page_source, 'html.parser')
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error fetching page content: {e}")
            time.sleep(delay)
    return None

# Function to scrape the main list of exploits
def scrape_exploit_list():
    soup = fetch_page()
    if not soup:
        print("Failed to fetch page content.")
        return []

    exploit_data = []
    table = soup.find('table', {'id': 'exploits-table'})
    if table:
        rows = table.find_all('tr')[1:]  # Skip header row
        if not rows:
            print("No rows found in table.")
        for row in rows:
            cols = row.find_all('td')
            if len(cols) > 1:
                try:
                    date = cols[0].text.strip() if cols[0] else 'N/A'
                    exploit_link = cols[1].find('a', href=True)['href'] if cols[1].find('a', href=True) else 'N/A'
                    description = cols[4].find('a').text.strip() if cols[4].find('a') else 'N/A'
                    category = cols[5].find('a').text.strip() if cols[5].find('a') else 'N/A'
                    platform = cols[6].find('a').text.strip() if cols[6].find('a') else 'N/A'
                    author = cols[7].find('a').text.strip() if cols[7].find('a') else 'N/A'
                except Exception as e:
                    print(f"Error processing row: {e}")
                    date, exploit_link, description, category, platform, author = ['N/A'] * 6

                if exploit_link and exploit_link != 'N/A':
                    exploit_link = f"https://www.exploit-db.com{exploit_link}"

                exploit_data.append({
                    'date': date,
                    'exploit_link': exploit_link,
                    'description': description,
                    'category': category,
                    'platform': platform,
                    'author': author
                })
    else:
        print("Exploit table not found.")
    
    return exploit_data

# Function to scrape details of each exploit
def scrape_exploit_details(exploit_link):
    try:
        driver.get(exploit_link)
        soup = fetch_page()
        if not soup:
            return {}
        
        title = soup.find('h1').text.strip() if soup.find('h1') else 'N/A'
        platform_detail = soup.find('div', class_='platform').text.strip() if soup.find('div', class_='platform') else 'N/A'
        
        return {
            'exploit_link': exploit_link,
            'title': title,
            'platform_detail': platform_detail
        }
    except Exception as e:
        print(f"Error scraping {exploit_link}: {e}")
        return {}

# Function to wait for page content to fully load
def wait_for_page_to_load():
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "table#exploits-table"))
    )
    time.sleep(2)  # Optional sleep to ensure full rendering of content

# Function to click the next page button with retries
def click_next_page(retries=5, delay=3):
    for attempt in range(retries):
        try:
            next_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.XPATH, '//li[@class="paginate_button page-item next"]/a'))
            )
            next_button.click()
            WebDriverWait(driver, 20).until(EC.staleness_of(next_button))
            return True
        except Exception as e:
            print(f"Attempt {attempt + 1}: Error clicking next page: {e}")
            time.sleep(delay)
    return False

# Function to set the search filter
def set_search_filter(filter_text):
    search_input = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "input[type='search']"))
    )
    search_input.clear()
    search_input.send_keys(filter_text)
    search_input.send_keys(Keys.RETURN)  # Press Enter to submit search

# Function to set entries per page
def set_entries_per_page():
    try:
        select_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'exploits-table_length'))
        )
        select = Select(select_element)
        select.select_by_value('120')
        WebDriverWait(driver, 10).until(EC.staleness_of(select_element))
        print("Set entries per page to 120.")
    except Exception as e:
        print(f"Error setting entries per page: {e}")

# Function to check if CSV file exists and write the header only once
def write_csv_header(file_path, fieldnames):
    if not os.path.exists(file_path):
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            print("CSV header written.")

# Main scraping loop with individual filter
def main():
    base_url = 'https://www.exploit-db.com'
    driver.get(base_url)

    set_entries_per_page()

    # List of filters to be applied one by one
    filters = ['Router']  # Modify as needed

    file_path = 'exploits_filtered.csv'
    fieldnames = ['date', 'exploit_link', 'description', 'category', 'platform', 'author', 'title', 'platform_detail']
    
    # Write header if the CSV file doesn't exist
    write_csv_header(file_path, fieldnames)

    with open(file_path, 'a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        for filter_text in filters:
            print(f"Applying filter: {filter_text}")
            set_search_filter(filter_text)

            page_number = 1
            while True:
                wait_for_page_to_load()  # Wait for content to load
                print(f'Scraping page {page_number} with filter "{filter_text}" at {driver.current_url}')
                exploit_links = scrape_exploit_list()

                if not exploit_links:
                    print(f"No exploits found on page {page_number} with filter '{filter_text}'.")
                    break

                valid_links = [link['exploit_link'] for link in exploit_links if link['exploit_link'] and link['exploit_link'] != 'N/A']

                if not valid_links:
                    print(f"No valid exploit links found on page {page_number} with filter '{filter_text}'.")
                    break

                with ThreadPoolExecutor(max_workers=3) as executor:  # Reduced workers to avoid overloading
                    results = list(tqdm(executor.map(scrape_exploit_details, valid_links), total=len(valid_links), desc="Scraping Exploits"))

                for exploit in exploit_links:
                    details = next((item for item in results if item['exploit_link'] == exploit['exploit_link']), {})
                    if details:
                        exploit.update(details)
                        writer.writerow(exploit)

                if not click_next_page():
                    print(f"Ending scraping for filter '{filter_text}' after page {page_number}.")
                    break

                page_number += 1

    print('Scraping completed. Data saved to exploits_filtered.csv.')

if __name__ == '__main__':
    main()

# Close the WebDriver
driver.quit()


Error setting entries per page: Message: 

Applying filter: Router
Scraping page 1 with filter "Router" at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:17<00:00,  1.56it/s]


Scraping page 2 with filter "Router" at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 120/120 [01:19<00:00,  1.51it/s]


Scraping page 3 with filter "Router" at https://www.exploit-db.com/


Scraping Exploits: 100%|██████████| 102/102 [01:09<00:00,  1.48it/s]


Attempt 1: Error clicking next page: Message: 
Stacktrace:
#0 0x598e8b618a5a <unknown>
#1 0x598e8b1262f0 <unknown>
#2 0x598e8b175235 <unknown>
#3 0x598e8b175451 <unknown>
#4 0x598e8b1badc4 <unknown>
#5 0x598e8b199bed <unknown>
#6 0x598e8b1b811e <unknown>
#7 0x598e8b199963 <unknown>
#8 0x598e8b166eec <unknown>
#9 0x598e8b16813e <unknown>
#10 0x598e8b5e51cf <unknown>
#11 0x598e8b5e92dd <unknown>
#12 0x598e8b5d36d7 <unknown>
#13 0x598e8b5e9a51 <unknown>
#14 0x598e8b5bad1e <unknown>
#15 0x598e8b607538 <unknown>
#16 0x598e8b60773a <unknown>
#17 0x598e8b6176cc <unknown>
#18 0x7390ce2bbac3 <unknown>

Attempt 2: Error clicking next page: Message: 
Stacktrace:
#0 0x598e8b618a5a <unknown>
#1 0x598e8b1262f0 <unknown>
#2 0x598e8b175235 <unknown>
#3 0x598e8b175451 <unknown>
#4 0x598e8b1badc4 <unknown>
#5 0x598e8b199bed <unknown>
#6 0x598e8b1b811e <unknown>
#7 0x598e8b199963 <unknown>
#8 0x598e8b166eec <unknown>
#9 0x598e8b16813e <unknown>
#10 0x598e8b5e51cf <unknown>
#11 0x598e8b5e92dd <unknown>
#1

# Set the and load the exploits codes

In [11]:
import pandas as pd
import os

# Specify the CSV file path
file_path = './data/exploits_filtered.csv'

# Specify the local directory where files are stored
local_dir = './downloads'

# Load the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Function to update exploit_link to local paths
def update_to_local_paths(row):
    file_name = os.path.basename(row['exploit_link'])
    local_path = os.path.join(local_dir, file_name)
    return local_path

# Update the exploit_link column
data['exploit_link'] = data.apply(update_to_local_paths, axis=1)

# Save the updated DataFrame to a new CSV file
updated_csv_path = '../data/exploits_filtered_updated.csv'
data.to_csv(updated_csv_path, index=False)

print(f"Updated CSV saved to {updated_csv_path}")


Updated CSV saved to ./exploits_filtered_updated.csv


# update the the file data into the csv file

In [12]:
import os
import pandas as pd
import shutil

# Define the download folder path (you can change this to your specific folder)
download_folder = './downloads/'

# Define the output CSV file path
output_csv = './data/extracted_data.csv'

# Example data frame (replace this with your actual data)
data = pd.read_csv('./data/exploits_filtered_updated.csv')

# Convert the data to a DataFrame
columns = ['date', 'exploit_link', 'description', 'category', 'platform', 'author', 'title']
df = pd.DataFrame(data, columns=columns)

# Function to extract files based on the exploit_link
def extract_files_from_links(df, download_folder, output_csv):
    extracted_data = []

    for index, row in df.iterrows():
        file_link = row['exploit_link']
        file_path = os.path.join(download_folder, file_link.split('/')[-1])  # Get file name from the link

        if os.path.exists(file_path):  # Check if file exists
            # Assuming the file is a text file or any readable file
            with open(file_path, 'r') as file:
                file_data = file.read()

            # You can modify this logic if you want to process file contents
            extracted_data.append(row.tolist() + [file_data])  # Add file content to the row data
        else:
            extracted_data.append(row.tolist() + ['File Not Found'])  # If the file is not found

    # Create a new DataFrame for extracted data
    extracted_columns = columns + ['file_data']
    extracted_df = pd.DataFrame(extracted_data, columns=extracted_columns)

    # Save to CSV
    extracted_df.to_csv(output_csv, index=False)
    print(f"Data extracted and saved to {output_csv}")

# Run the extraction function
extract_files_from_links(df, download_folder, output_csv)


Data extracted and saved to ./extracted_data.csv
