In [4]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException 
import time
import os
import pandas as pd
from tqdm import tqdm 

In [12]:
def liveCrawler(csv_path, output_dir, timeout=200, max_retries=5,
                log_file='downloaded_websites.txt'):
    """Save the rendered HTML of every site listed in a CSV using headless Chrome.

    Each value of the CSV's ``website`` column is loaded as ``http://<website>``
    and the resulting page source is written to ``<output_dir>/<website>.html``.
    Every processed site is appended to ``log_file`` so a later run skips it.
    Note that sites which time out or exhaust their retries are logged too,
    i.e. they are not attempted again on the next run.

    Args:
        csv_path: Path to a CSV file that has a ``website`` column.
        output_dir: Directory for the ``.html`` files; created if missing.
        timeout: Page-load timeout in seconds passed to the driver.
        max_retries: Number of retries after the first failed attempt for
            errors other than a page-load timeout (which is never retried).
        log_file: Path of the text log holding one processed site per line.

    Raises:
        KeyError: If the CSV has no ``website`` column.
    """
    # Read all inputs before launching Chrome so a bad CSV path or a missing
    # column cannot leave a browser process behind.
    websites = pd.read_csv(csv_path)['website']
    os.makedirs(output_dir, exist_ok=True)

    # Load previously processed websites from the log file
    if os.path.exists(log_file):
        with open(log_file, 'r') as log:
            downloaded_websites = set(log.read().splitlines())
    else:
        downloaded_websites = set()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        driver.set_page_load_timeout(timeout)

        with tqdm(total=len(websites), desc="Downloading Websites", unit="website") as pbar:
            for website in websites:
                if pd.isna(website):
                    # Blank CSV cell: nothing to download or log.
                    pass
                elif str(website) in downloaded_websites:
                    print(f"Skipping already downloaded: {website}")
                else:
                    website = str(website)
                    _download_site(driver, website, output_dir, max_retries)

                    # Log the site whatever the outcome, and remember it so a
                    # duplicate row in the same CSV is skipped as well.
                    with open(log_file, 'a') as log:
                        log.write(website + '\n')
                    downloaded_websites.add(website)

                # Exactly one tick per CSV row, including skipped rows.
                pbar.update(1)
    finally:
        # Always shut the browser down, even on Ctrl-C or an unexpected error.
        driver.quit()


def _download_site(driver, website, output_dir, max_retries):
    """Fetch one site and write its page source to disk; return True on success."""
    file_name = f'{website}.html'
    file_path = os.path.join(output_dir, file_name)

    # One initial attempt plus up to `max_retries` retries.
    for _ in range(max(max_retries, 0) + 1):
        try:
            print(f"Downloading {website}...")

            driver.get(f'http://{website}')
            # Save the page source to a file
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(driver.page_source)

            print(f"Saved {file_name}")
            return True

        except TimeoutException:
            # A page-load timeout is not retried.
            print(f"Timeout while downloading {website}, moving to next website.")
            return False

        except Exception as e:
            print(f"Failed to download {website}: {e}")

    return False

# Crawl every site listed in the CSV and store the pages under `download_dir`.
download_dir = "live_websites"
liveCrawler(csv_path='tranco_top_10k.csv', output_dir=download_dir)


Downloading Websites:   0%|          | 0/10000 [00:00<?, ?website/s]

Skipping already downloaded: google.com
Skipping already downloaded: amazonaws.com
Skipping already downloaded: microsoft.com
Skipping already downloaded: facebook.com
Skipping already downloaded: akamai.net
Skipping already downloaded: a-msedge.net
Skipping already downloaded: googleapis.com
Skipping already downloaded: apple.com
Skipping already downloaded: youtube.com
Skipping already downloaded: root-servers.net
Skipping already downloaded: azure.com
Skipping already downloaded: akamaiedge.net
Skipping already downloaded: twitter.com
Skipping already downloaded: cloudflare.com
Skipping already downloaded: instagram.com
Skipping already downloaded: gstatic.com
Skipping already downloaded: office.com
Skipping already downloaded: linkedin.com
Skipping already downloaded: tiktokcdn.com
Skipping already downloaded: live.com
Skipping already downloaded: googletagmanager.com
Skipping already downloaded: googlevideo.com
Skipping already downloaded: akadns.net
Skipping already downloaded: g

Downloading Websites:   1%|          | 88/10000 [00:07<14:44, 11.20website/s]


KeyboardInterrupt: 