In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
from bs4 import BeautifulSoup
import pandas as pd

def extract_row_data(html_row):
    # Parse the HTML content
    soup = BeautifulSoup(html_row, 'html.parser')

    # Find the main row-layout div
    row_div = soup.find("div", class_="row-layout")
    # Extract relevant data from each div inside the row-layout
    data = []
    for div in row_div.find_all("div", recursive=False):
        text = ''.join(div.stripped_strings).replace('</d...', '').strip()
        data.append(text)

    # Ensure data is cleaned and aligned correctly
    if len(data) < 9:
        data.extend(['_'] * (9 - len(data)))  # Fill missing fields with '_'

    # Create a dictionary to hold the final data
    row_data = {
        "Variant ID": data[0] if len(data) > 0 else '_',
        "Gene": data[1] if len(data) > 1 else '_',
        "Consequence": data[2] if len(data) > 2 else '_',
        "Variant Type": data[3] if len(data) > 3 else '_',
        "ClinVar Significance": data[4] if len(data) > 4 else '_',
        "Allele Count": data[5] if len(data) > 5 else '_',
        "Allele Number": data[6] if len(data) > 6 else '_',
        "Allele Frequency": data[7] if len(data) > 7 else '_',
        "Homozygote Count": data[8] if len(data) > 8 else '_'
    }

    return row_data

def process_html_rows(html_rows):
    # Process multiple rows and create a dataframe
    processed_rows = [extract_row_data(row) for row in html_rows]
    df = pd.DataFrame(processed_rows)
    return df


# Define the boundaries and the total number of rows
boundaries = "20000000-20002000"
total_rows = 810  # Total number of rows expected
rows_per_scroll = 200  # Number of rows loaded per scroll

# Set up the WebDriver with headless options
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
#chrome_options.add_argument("--disable-gpu")  # Disable GPU acceleration
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (for Linux)
chrome_options.add_argument("--window-size=1920x1080")  # Set window size

driver = webdriver.Chrome(options=chrome_options)

# Navigate to the specific URL
driver.get(f"https://databrowser.researchallofus.org/variants/chr22:{boundaries}")

# Initialize a variable to store the HTML rows
all_html_rows = set()  # Using a set to avoid duplicates

try:
    # Wait for the scrollable area to load
    scroll_area = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "scroll-area"))
    )
    # Loop until the desired number of rows is loaded
    while len(all_html_rows) < total_rows:
        # Scroll to the bottom of the scrollable area
        driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", scroll_area)
        if len(all_html_rows)%1000 == 0:
            print(len(all_html_rows))
        # Wait for new data to load
        #time.sleep(0.01)  # Adjust based on load time
        
        # Parse the page source after scrolling
        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # Extract rows with the class "row-layout"
        rows = soup.find_all("div", class_="row-layout")

        # Add only new rows to the set to avoid duplicates
        all_html_rows.update(str(row) for row in rows)

        # Stop if we have loaded the expected number of rows
        if len(all_html_rows) >= total_rows:
            break
    
    # Process the collected rows
    if all_html_rows:
        df = process_html_rows(list(all_html_rows))  # Convert the set back to a list
        print(len(df))
        
        # Save the dataframe to an Excel file with the boundaries in the filename
        filename = f"variants_{boundaries}.csv"
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No rows found in the page.")

finally:
    # Close the WebDriver
    driver.quit()


0
810
Data saved to variants_20000000-20002000.csv
