In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import requests
from datetime import datetime, timedelta
import os
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed
from dateutil.relativedelta import relativedelta
from selenium.common.exceptions import TimeoutException, WebDriverException
from requests_html import AsyncHTMLSession


# Script to download all versions of a pixel source code given its ID

In [None]:
#Returns a list of all cdx records of a website (also saves them in a text file)
def basicGetCdxRecords(url, filename): #works best for small scale queries only
    base_url = "https://web.archive.org/cdx/search/cdx" # Base URL of the CDX Server API
    params = {
        'url': url,  # URL to fetch
        'output': 'json',      # Output format
        'matchType': 'prefix'  # Match URLs that start with this prefix
        #no limit variable to fetch as many versions available
    }


    # GET request to the API
    response = requests.get(base_url, params=params) # each esponse contains the following information: ["urlkey","timestamp","original","mimetype","statuscode","digest","length"]

    allCdxRecords = []
    # Checks whether the requeust was successful
    if response.status_code == 200:
        data = response.json()  # Parse the JSON response
        with open(filename,'w') as f:
            for record in data:
                f.write(f"{record}\n")
                allCdxRecords.append(record)
                # Each record represents an archived version
    else:
        print(f"Failed to retrieve data: {response.status_code}")
    return allCdxRecords

#Returns a list of all cdx records of a website, saves them in a textfile named according to the filename, and maintaint track of the progress of the progress so far in the progress_file to continue fetching records from where they were left
def getCdxRecords(url, filename, progress_file): #works for large-scale queries
    base_url = "https://web.archive.org/cdx/search/cdx"
    limit = 100000  # Limit per request
    params = {
        'url': url,
        'output': 'json',
        'matchType': 'prefix',
        'limit': limit,
        'showResumeKey': True, #The last entry returned is the resume key, which is then used to begin fetching records exactly from where they were left
        # 'pageSize': 1  # Smallest page size
    }

    # Load progress from the progress file
    resume_key = None
    if os.path.exists(progress_file): #checking the progress file to see whether a resume key exists to continue progress from
        with open(progress_file, 'r') as f:
            resume_key = f.read().strip()
    
    all_cdx_records = []
    while True:
        if resume_key:
            params['resumeKey'] = resume_key #updating the resume key
        
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            data = response.json()
            if not data:
                break
            # print("Fetched Data: ",data)
            
            # Write the records to the file and append to the list
            with open(filename, 'a') as f:
                for record in data[:-1]:  # Last item may be the resume key
                    f.write(f"{record}\n")
                    all_cdx_records.append(record)

            # Update the resume key and save it to the progress file
            resume_key = data[-1]
            print(f"Successfully fetched: {len(data)} records. Resume key: {resume_key}")
            with open(progress_file, 'w') as f:
                f.write(resume_key[0])
            
            # If no more results are available, exit the loop
            if 'resumeKey' not in params or not resume_key:
                break
            time.sleep(3) #To avoid sending too many requests to the server which then ends up refusing the connection
        else:
            print(f"Failed to retrieve data: {response.status_code}")

    return all_cdx_records

# #downloads all webpages inside the all_archives_versions folder 
def downloadArchivedVersions(fileWithRecords, archivedDirectory): #fileWithRecords is the name of the text file with all cdx records to download, archivedDirectory is the name of directory where to save all the web pages.

    wayback_base_url = "https://web.archive.org/web/"
    save_dir = archivedDirectory
    os.makedirs(save_dir, exist_ok=True)

    with open(fileWithRecords, 'r') as f:
        records = f.readlines()

    for record in records:
        record = eval(record.strip())  # Convert the string back to a list
        timestamp = record[1]
        original_url = record[2]

        wayback_url = f"{wayback_base_url}{timestamp}/{original_url}" #A resource at the wayback has a url of this format
        filename = f"{timestamp}.html"
        filepath = os.path.join(save_dir, filename)

        # Check if file already exists
        if os.path.exists(filepath):
            print(f"File already exists: {filepath}") #to continue from saved progress
            continue

        while True:
            try:
                response = requests.get(wayback_url)
                if response.status_code == 200:
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(response.text)
                    print(f"Downloaded and saved: {filepath}")
                    break
                elif response.status_code == 404:
                    print(f"File not found (404): {wayback_url}. Skipping...")
                    break  # Stop retrying on 404 errors since it just doesn't exist
                else:
                    print(f"Failed to download {wayback_url}: {response.status_code}")
                    time.sleep(5) #wait before retrying
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {wayback_url}: {e}")
                time.sleep(5)


# Example usage
url = "https://connect.facebook.net/signals/config/" #the url of the meta pixel
filename = "allPixelRecords.txt"
progress_file = "progress.txt"
cdx_records = getCdxRecords(url, filename, progress_file)

# downloadArchivedVersions(filename,'all_archived_versions')



In [20]:
df

Unnamed: 0,Website Name,Pixel ID,Fetched Date
0,https://onepathnetwork.com/?gad_source=1&gclid...,128726134153194,20240913
1,https://react-portfolio-alpha-nine-57.vercel.app,25826907853621873,20240913


## Extract pixel IDs from Tranc's top 10k websites. Can look at snapshots to find the ID as well.

### Downloading the latest top 1 million websites from Tranco.

In [None]:
import requests
import pandas as pd
from requests.auth import HTTPBasicAuth

# Replace with your Tranco credentials
username = os.getenv('USERNAME')
api_token = os.getenv('TRANCO_TOKEN')

def get_latest_list_metadata():
    url = 'https://tranco-list.eu/api/lists/date/latest'
    try:
        response = requests.get(url, auth=HTTPBasicAuth(username, api_token))
        response.raise_for_status()
        data = response.json()
        if data['available']:
            return data['download']
        else:
            print("No list available at the moment.")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching latest list metadata: {e}")
        return None

def download_list(download_url):
    try:
        response = requests.get(download_url, auth=HTTPBasicAuth(username, api_token))
        response.raise_for_status()
        with open('tranco_top_10k.csv', 'wb') as file:
            file.write(response.content)
        print("Tranco Top 10k list downloaded successfully.")
        # Load into DataFrame
        df = pd.read_csv('tranco_top_10k.csv', header=None, names=['Rank', 'Domain'])
        return df
    except requests.exceptions.RequestException as e:
        print(f"Error downloading Tranco list: {e}")
        return None

download_url = get_latest_list_metadata()

# If a valid URL is found, download the list
if download_url:
    df = download_list(download_url)
    if df is not None:
        print(df.head())

In [14]:

def getPixelID(website, driver, max_retries=5, delay=5):
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1} to fetch Pixel ID from {website}")

            time.sleep(4.5)
            if not website.startswith(('http://', 'https://')): #Adding the protocol since the tranco list does not contain the schema in the urls
                try:
                    driver.get('https://' + website)
                except:
                    driver.get('http://' + website)
                
            else:
                driver.get(website)
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            scripts = soup.find_all('script')

            pattern = re.compile(r'connect\.facebook\.net/signals/config/(\d+)')

            all_pixels = []
            
            for script in scripts:
                if script.has_attr('src'):
                    src = script['src']
                    if "connect.facebook.net/signals/config" in src:
                        match = pattern.search(src)
                        if match:
                            print(f"Pixel ID found: {match.group(1)}")
                            all_pixels.append(match.group(1))
            
            if len(all_pixels)==0:
                print(f"No Pixel ID found on {website}")
                return None
            else:
                return all_pixels
        
        except Exception as e:
            print(f"Error fetching Pixel ID from {website}: {e}")
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
    print(f"Failed to fetch Pixel ID from {website} after {max_retries} attempts.")
    return None

def get_wayback_snapshot(website, date, max_retries=5):
    url = f"http://archive.org/wayback/available?url={website}&timestamp={date}"
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}: Getting snapshot for {date}")
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            print("Check: ",data, " for ",date)
            if 'archived_snapshots' in data and data['archived_snapshots']:
                return (data['archived_snapshots']['closest']['url'],data['archived_snapshots']['closest']['timestamp'])
            return None
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}. Retrying in 3 seconds...")
            time.sleep(5)
    print(f"Failed to get snapshot for {date} after {max_retries} attempts.")
    return None

def check_past_versions(website, driver):
    current_date = datetime.now()
    for i in range(1): #set to 60 later for going back every month for 5 years
        past_date = (current_date - timedelta(days=i*600)).strftime('%Y%m%d') #set days i*30 for 1 month
        snapshot_url = get_wayback_snapshot(website, past_date)
        if snapshot_url:
            print(f"Searching snapshot: {snapshot_url[0]}")
            pixelID = getPixelID(snapshot_url[0], driver)
            if pixelID:
                print(f"Found pixel ID: {pixelID} at the snapsot: {snapshot_url[1]} ")
                return pixelID, snapshot_url[1]
    return None, None

def load_progress(filename):
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        return pd.DataFrame(columns=["Website Name", "Pixel ID", "Fetched Date"])

def save_progress(df, filename):
    df.to_csv(filename, index=False)

def populateDataframe(urls, progress_file):
    # Load previous progress if any
    progress_df = load_progress(progress_file)
    completed_urls = progress_df['Website Name'].tolist()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    total_urls = len(urls)
    
    for idx, website in enumerate(urls):
        if website in completed_urls:
            print(f"Skipping {website}, already processed.")
            continue
        
        # Process the URL
        pixelID = getPixelID(website, driver)
        print("Pixel ID fetched directly: ",pixelID)
        fetch_date = datetime.now().strftime('%Y%m%d')  # Use current date if found in current version

        if not pixelID:  # If no pixel ID found, check Wayback Machine
            pixelID, past_date = check_past_versions(website, driver)
            fetch_date = past_date if past_date else fetch_date  # Use past date if found in Wayback Machine
        
        # Append the result to the DataFrame
        new_record = {"Website Name": website, "Pixel ID": pixelID, "Fetched Date": fetch_date}
        progress_df = pd.concat([progress_df, pd.DataFrame([new_record])], ignore_index=True)

        # Save progress after each URL is processed
        save_progress(progress_df, progress_file)

        # Print progress
        print(f"Processed {idx + 1}/{total_urls}: {website} (Pixel ID: {pixelID})")

    driver.quit()
    return progress_df


progress_file = 'scraping_progress.csv'
# urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()
urls = ['idhfsin.com']

df = populateDataframe(urls, progress_file)
print(df)


Pixel ID fetched directly:  None
Attempt 1: Getting snapshot for 20240914
Check:  {'url': 'idhfsin.com', 'archived_snapshots': {}, 'timestamp': '20240914'}  for  20240914
Processed 1/1: idhfsin.com (Pixel ID: None)
  Website Name              Pixel ID    Fetched Date
0  riteaid.com  ['1264059003707256']  20240910210119
1     bruh.com                   NaN        20240914
2  idhfsin.com                  None        20240914


### Crawler only but using archive.org API to fetch valid snapshots (obsolete)

In [27]:
def get_wayback_snapshot(website, date, max_retries=5):
    url = f"http://archive.org/wayback/available?url={website}&timestamp={date}"
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}: Getting snapshot for {date}")
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()
            print("Check: ",data, " for ",date)
            if 'archived_snapshots' in data and data['archived_snapshots']:
                return (data['archived_snapshots']['closest']['url'],data['archived_snapshots']['closest']['timestamp'])
            return None
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}. Retrying in 3 seconds...")
            time.sleep(5)
    print(f"Failed to get snapshot for {date} after {max_retries} attempts.")
    return None

def download_past_versions(website, max_retries = 10):
    current_date = datetime.now()
    for i in range(60): #set to 60 later for going back every month for 5 years
        past_date = (current_date - timedelta(days=i*30)).strftime('%Y%m%d') #set days i*30 for 1 month
        snapshot = get_wayback_snapshot(website, past_date)
        if snapshot:
            wayback_url = snapshot[0]
            wayback_timestamp = snapshot[1]
            print(f"Downloading snapshot: {wayback_url}")

            if (not os.path.exists(website)):
                os.makedirs(website, exist_ok=True)

            filename = f"{wayback_timestamp}.html"
            filepath = os.path.join(website,filename)
        
            for attempt in range(max_retries):
                print(f"Attempt {attempt + 1}: Getting snapshot for {wayback_timestamp}")
            
                try:
                    time.sleep(3)
                    response = requests.get(wayback_url)
                    if response.status_code == 200:
                        with open(filepath, 'w', encoding='utf-8') as f:
                            f.write(response.text)
                        print(f"Downloaded and saved: {filepath}")
                        break
                    elif response.status_code == 404:
                        print(f"File not found (404): {wayback_url}. Skipping...")
                        break  # Stop retrying on 404 errors since it just doesn't exist
                    else:
                        print(f"Failed to download {wayback_url}: {response.status_code}")
                        time.sleep(5) #wait before retrying
                except requests.exceptions.RequestException as e:
                    print(f"Error downloading {wayback_url}: {e}")
                time.sleep(5)

        else:
            print(f"No snapshot available {website} : {past_date}")

    return None, None

def load_progress(filename):
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        return pd.DataFrame(columns=["Website Name", "Completed"])

def save_progress(df, filename):
    df.to_csv(filename, index=False)

def crawlWayback(urls, progress_file):
    # Load previous progress if any
    progress_df = load_progress(progress_file)
    completed_urls = progress_df['Website Name'].tolist()

    total_urls = len(urls)
    
    for idx, website in enumerate(urls):
        if website in completed_urls:
            print(f"Skipping {website}, already processed.")
            continue
        
        download_past_versions(website)    
        # Save progress after each URL is processed
        record = [website,"completed"]
        progress_df.loc[len(progress_df)] = record

        save_progress(progress_df, progress_file)

        # Print progress
        print(f"Processed {idx + 1}/{total_urls}: {website}")

    return progress_df


progress_file = 'scraping_progress.csv'
urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()

df = crawlWayback(urls, progress_file)
print(df)


Skipping google.com, already processed.
Skipping amazonaws.com, already processed.
Skipping microsoft.com, already processed.
Skipping facebook.com, already processed.
Attempt 1: Getting snapshot for 20240914
Check:  {'url': 'akamai.net', 'archived_snapshots': {'closest': {'status': '200', 'available': True, 'url': 'http://web.archive.org/web/19990208011229/http://www.akamai.net:80/', 'timestamp': '19990208011229'}}, 'timestamp': '20240914'}  for  20240914
Downloading snapshot: http://web.archive.org/web/19990208011229/http://www.akamai.net:80/
Attempt 1: Getting snapshot for 19990208011229
Downloaded and saved: akamai.net/19990208011229.html
Attempt 1: Getting snapshot for 20240815
Check:  {'url': 'akamai.net', 'archived_snapshots': {'closest': {'status': '200', 'available': True, 'url': 'http://web.archive.org/web/19990208011229/http://www.akamai.net:80/', 'timestamp': '19990208011229'}}, 'timestamp': '20240815'}  for  20240815
Downloading snapshot: http://web.archive.org/web/1999020

KeyboardInterrupt: 

In [24]:

def split_dataframe(df, num_splits):
    chunk_size = len(df) // num_splits

    for i in range(num_splits):
        # Calculate the start and end index for each chunk
        start_idx = i * chunk_size
        # Make sure the last chunk includes any leftover rows
        if i == num_splits - 1:
            end_idx = len(df)
        else:
            end_idx = (i + 1) * chunk_size
        
        chunk_df = df.iloc[start_idx:end_idx]
        chunk_df.to_csv(f'tranco_10k_{i+1}.csv', index=False)

        print(f'Part {i+1} saved with rows {start_idx} to {end_idx-1}')

# Example usage
df = pd.read_csv('tranco_top_10k.csv')
split_dataframe(df, 4)

Part 1 saved with rows 0 to 2499
Part 2 saved with rows 2500 to 4999
Part 3 saved with rows 5000 to 7499
Part 4 saved with rows 7500 to 9999


### Crawling using requests library and CDX API for valid results - requests library does not load the webpage entirely before downloading

In [37]:
import requests
import os
from datetime import datetime, timedelta

# CDX API to get snapshots for the past 5 years, month by month
def get_wayback_snapshots_cdx(website, start_date, end_date):
    print(f"Fetching for {website}")
    url = f"http://web.archive.org/cdx/search/cdx?url={website}&from={start_date}&to={end_date}&filter=statuscode:200&output=json&collapse=timestamp:6&limit=61" # Collapse to one per month
    response = requests.get(url)
    print("Here: ",response)
    
    
    if response.status_code == 200:
        snapshots = response.json()
        if len(snapshots) > 1:  # First item in snapshots is the header
            return snapshots[1:]  # Skip the header
    return None

# Function to download and save the archived snapshots
def download_webpage(snapshot, website_folder):
    timestamp = snapshot[1]
    wayback_url = f"http://web.archive.org/web/{timestamp}/{snapshot[2]}"
    print(f"Downloading {wayback_url}")
    
    try:
        response = requests.get(wayback_url)
        time.sleep
        if response.status_code == 200:
            # Save the webpage with timestamp
            filename = f"{timestamp}.html"
            filepath = os.path.join(website_folder, filename)
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(response.text)
            print(f"Saved {filepath}")
        else:
            print(f"Failed to download {wayback_url}: Status {response.status_code}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {wayback_url}: {e}")

# Main function to process website over 5 years and download monthly snapshots
def download_past_versions_cdx(website):
    # Calculate 5 years back
    end_date = datetime.now().strftime('%Y%m%d')
    start_date = (datetime.now() - timedelta(days=5*365)).strftime('%Y%m%d')

    # Fetch all snapshots month by month using CDX API
    print(start_date,end_date)
    snapshots = get_wayback_snapshots_cdx(website, start_date, end_date)
    
    if snapshots:
        print(f"Found {len(snapshots)} snapshots for {website}")
        
        # Create a folder to store snapshots
        if not os.path.exists(website):
            os.makedirs(website)
        
        # Loop through each snapshot and download it
        for snapshot in snapshots:
            download_webpage(snapshot, website)
    else:
        print(f"No snapshots available for {website}")

# Example usage
# urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()
urls = ["googleapis.com"]
for url in urls:
    download_past_versions_cdx(url)


20190916 20240914
Fetching for googleapis.com
Here:  <Response [200]>
No snapshots available for googleapis.com


### Crawling using CDX API and Selenium - works perfectly but slow

In [2]:
def add_month(date_str):
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    new_date_obj = date_obj + relativedelta(months=1)
    new_date_str = new_date_obj.strftime('%Y%m%d')
    return new_date_str

# Function to get snapshots with pagination
def get_all_snapshots(url, start_date, end_date, limit=100000):
    all_snapshots = []
    current_start = start_date
    while current_start <= end_date:
        api_url = f"http://web.archive.org/cdx/search/cdx?url={url}&from={current_start}&to={end_date}&output=json&limit={limit}"
        for i in range(5): #max tries are 5
            try:
                response = requests.get(api_url)
                
                if response.status_code == 200:
                    data = response.json()
                    snapshots = data[1:]  # Excluding the header
                    if not snapshots:
                        print("No snapshots to fetch")
                        current_start = add_month(end_date)
                        break
                    monthly_end = snapshots[-1][1][:8]  # Use the timestamp of the last snapshot to set the new start date
                    if monthly_end !=current_start:
                        current_start = monthly_end

                    all_snapshots.append(filter_snapshots_by_month(snapshots))
                    break
            
                else:
                    print("Failed to fetch data")
                    continue
            except:
                continue
        current_start = add_month(current_start)

        
    return all_snapshots

# Function to filter snapshots to one per month
def filter_snapshots_by_month(snapshots):
    snapshots_by_month = {}
    
    for snapshot in snapshots:
        timestamp = snapshot[1]
        date_str = timestamp[:6]  # Extract YYYYMM
        if date_str not in snapshots_by_month:
            snapshots_by_month[date_str] = snapshot
    
    return snapshots_by_month


# Function to generate a date range
def generate_date_range(years=5):
    current_date = datetime.now()
    start_date = (current_date - timedelta(days=years*365)).strftime('%Y%m%d')
    end_date = current_date.strftime('%Y%m%d')
    return start_date, end_date

def generateAllSnapshots(url):
    start_date, end_date = generate_date_range()
    snapshots = get_all_snapshots(url, start_date, end_date, limit=100000)
    all_snapshots = []
    for snap in snapshots:
        for month, snapshot in snap.items():
            all_snapshots.append(snapshot)
    return all_snapshots


def download_past_versions(website,driver ,max_retries = 5):
    # print(f"Processing {website}")
    wayback_base_url = "https://web.archive.org/web/"

    snapshots = generateAllSnapshots(website)
    print("Generated all snapshots")
    if snapshots:
        for snapshot in snapshots:

            timestamp = snapshot[1]
            original_url = snapshot[2]
            wayback_url = f"{wayback_base_url}{timestamp}/{original_url}" #A resource at the wayback has a url of this format
            wayback_timestamp = snapshot[1]
            # print(f"Downloading snapshot: {wayback_url}")

            if (not os.path.exists(website)):
                os.makedirs(website, exist_ok=True)

            filename = f"{wayback_timestamp}.html"
            filepath = os.path.join(website,filename)
        
            for attempt in range(max_retries):
                # print(f"Attempt {attempt + 1}: Getting snapshot for {wayback_timestamp}")
            
                try:
                    driver.get(wayback_url)
                    page_source = driver.page_source

                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(page_source)
                    print(f"Downloaded and saved: {filepath}")
                    break
                except requests.exceptions.RequestException as e:
                    print(f"Error downloading {wayback_url}: {e}")
                    time.sleep(5)
                except TimeoutException as e:
                    print(f"TimeoutException on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)  # Retry after a delay
                    break
                except WebDriverException as e:
                    print(f"WebDriverException on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)
                except Exception as e:
                    print(f"General error on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)
    

    else:
        print(f"No snapshots available for {website}")

    return None, None

def load_progress(filename):
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        return pd.DataFrame(columns=["Website Name", "Completed"])

def save_progress(df, filename):
    df.to_csv(filename, index=False)


def crawlWayback(urls, progress_file):
    # Load previous progress if any
    progress_df = load_progress(progress_file)
    completed_urls = progress_df['Website Name'].tolist()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


    total_urls = len(urls)
    
    for idx, website in enumerate(urls):
        if website in completed_urls:
            print(f"Skipping {website}, already processed.")
            continue
        print("Processing ",website)
        
        download_past_versions(website,driver)    
        # Save progress after each URL is processed
        record = [website,"completed"]
        progress_df.loc[len(progress_df)] = record

        save_progress(progress_df, progress_file)
        # Print progress
        print(f"Processed {idx + 1}/{total_urls}: {website}")

    driver.quit()
    return progress_df

urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()
progress_file = "progress.csv"
crawlWayback(urls,progress_file)

Skipping google.com, already processed.
Skipping amazonaws.com, already processed.
Skipping microsoft.com, already processed.
Generated all snapshots
Downloaded and saved: facebook.com/20190916002521.html
Downloaded and saved: facebook.com/20191001000810.html
Downloaded and saved: facebook.com/20191101002347.html
Downloaded and saved: facebook.com/20191201000258.html
Downloaded and saved: facebook.com/20200101000525.html
Downloaded and saved: facebook.com/20200201000203.html
Downloaded and saved: facebook.com/20200301000302.html
Downloaded and saved: facebook.com/20200401000113.html
Downloaded and saved: facebook.com/20200501000250.html
Downloaded and saved: facebook.com/20200623000139.html
TimeoutException on attempt 1 for https://web.archive.org/web/20200701000026/https://www.facebook.com/: Message: timeout: Timed out receiving message from renderer: 295.217
  (Session info: chrome-headless-shell=122.0.6261.111)
Stacktrace:
#0 0x561eec45af33 <unknown>
#1 0x561eec152ce6 <unknown>
#2 0

In [8]:
from requests_html import HTMLSession
import asyncio

def add_month(date_str):
    date_obj = datetime.strptime(date_str, '%Y%m%d')
    new_date_obj = date_obj + relativedelta(months=1)
    new_date_str = new_date_obj.strftime('%Y%m%d')
    return new_date_str

# Function to get snapshots with pagination
def get_all_snapshots(url, start_date, end_date, limit=100000):
    all_snapshots = []
    current_start = start_date

    while current_start <= end_date:
        api_url = f"http://web.archive.org/cdx/search/cdx?url={url}&from={current_start}&to={end_date}&output=json&limit={limit}"
        for i in range(5): #max tries are 5
            try:
                response = requests.get(api_url)
                if response.status_code == 200:
                    data = response.json()
                    snapshots = data[1:]  # Excluding the header
                    if not snapshots:
                        break
                    monthly_end = snapshots[-1][1][:8]  # Use the timestamp of the last snapshot to set the new start date
                    if monthly_end !=current_start:
                        current_start = monthly_end
                    current_start = add_month(current_start)

                    all_snapshots.append(filter_snapshots_by_month(snapshots))
                    break
            
                else:
                    print("Failed to fetch data")
                    continue
            except:
                continue

    return all_snapshots

# Function to filter snapshots to one per month
def filter_snapshots_by_month(snapshots):
    snapshots_by_month = {}
    
    for snapshot in snapshots:
        timestamp = snapshot[1]
        date_str = timestamp[:6]  # Extract YYYYMM
        if date_str not in snapshots_by_month:
            snapshots_by_month[date_str] = snapshot
    
    return snapshots_by_month


# Function to generate a date range
def generate_date_range(years=5):
    current_date = datetime.now()
    start_date = (current_date - timedelta(days=years*365)).strftime('%Y%m%d')
    end_date = current_date.strftime('%Y%m%d')
    return start_date, end_date

def generateAllSnapshots(url):
    start_date, end_date = generate_date_range()
    snapshots = get_all_snapshots(url, start_date, end_date, limit=100000)
    all_snapshots = []
    for snap in snapshots:
        for month, snapshot in snap.items():
            all_snapshots.append(snapshot)
    return all_snapshots


async def download_past_versions(website,session ,max_retries = 5):
    # print(f"Processing {website}")
    wayback_base_url = "https://web.archive.org/web/"

    snapshots = generateAllSnapshots(website)
    print("Generated all snapshots")
    if snapshots:
        for snapshot in snapshots:

            timestamp = snapshot[1]
            original_url = snapshot[2]
            wayback_url = f"{wayback_base_url}{timestamp}/{original_url}" #A resource at the wayback has a url of this format
            wayback_timestamp = snapshot[1]
            # print(f"Downloading snapshot: {wayback_url}")

            if (not os.path.exists("experimnt-2")):
                os.makedirs(f"expriment-2 {website}", exist_ok=True)

            filename = f"{wayback_timestamp}.html"
            filepath = os.path.join(website,filename)
        
            for attempt in range(max_retries):
                # print(f"Attempt {attempt + 1}: Getting snapshot for {wayback_timestamp}")
            
                try:
                    response = await session.get(wayback_url)
                    await response.html.arender(timeout=10)

                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(response.html.html)
                    print(f"Downloaded and saved: {filepath}")
                    break
                except requests.exceptions.RequestException as e:
                    print(f"Error downloading {wayback_url}: {e}")
                    time.sleep(5)
                except TimeoutException as e:
                    print(f"TimeoutException on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)  # Retry after a delay
                except WebDriverException as e:
                    print(f"WebDriverException on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)
                except Exception as e:
                    print(f"General error on attempt {attempt + 1} for {wayback_url}: {e}")
                    time.sleep(5)
    

    else:
        print(f"No snapshots available for {website}")

    return None, None

def load_progress(filename):
    if os.path.exists(filename):
        return pd.read_csv(filename)
    else:
        return pd.DataFrame(columns=["Website Name", "Completed"])

def save_progress(df, filename):
    df.to_csv(filename, index=False)


async def crawlWayback(urls, progress_file):
    # Load previous progress if any
    session = AsyncHTMLSession()
    progress_df = load_progress(progress_file)
    completed_urls = progress_df['Website Name'].tolist()

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


    total_urls = len(urls)
    
    for idx, website in enumerate(urls):
        if website in completed_urls:
            print(f"Skipping {website}, already processed.")
            continue
        
        await download_past_versions(website,session)    
        # Save progress after each URL is processed
        record = [website,"completed"]
        progress_df.loc[len(progress_df)] = record

        save_progress(progress_df, progress_file)
        # Print progress
        print(f"Processed {idx + 1}/{total_urls}: {website}")

    driver.quit()
    return progress_df

async def main():
    urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()
    filepath = 'progress.csv'
    await crawlWayback(urls, filepath)

if __name__ == "__main__":
    # Check if an event loop is running, and use it if necessary
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:  # No event loop running
        loop = None

    if loop and loop.is_running():
        # If there is a running loop, use 'await' instead of 'asyncio.run()'
        await main()
    else:
        asyncio.run(main())


Skipping google.com, already processed.
Skipping amazonaws.com, already processed.
Skipping microsoft.com, already processed.


  obj, end = self.scan_once(s, idx)


Generated all snapshots


[INFO] Starting Chromium download.
100%|██████████| 183M/183M [01:21<00:00, 2.25Mb/s] 
[INFO] Beginning extraction
[INFO] Chromium extracted to: /home/abdullah/.local/share/pyppeteer/local-chromium/1181205


Downloaded and saved: facebook.com/20230915000216.html
Downloaded and saved: facebook.com/20231001000730.html
Downloaded and saved: facebook.com/20231101000017.html
Downloaded and saved: facebook.com/20231201000106.html
General error on attempt 1 for https://web.archive.org/web/20240129000310/https://www.facebook.com/: net::ERR_HTTP2_SERVER_REFUSED_STREAM at https://web.archive.org/web/20240129000310/https://www.facebook.com/
Downloaded and saved: facebook.com/20240129000310.html
Downloaded and saved: facebook.com/20240201000041.html
Downloaded and saved: facebook.com/20240301000433.html
Downloaded and saved: facebook.com/20240401000309.html
Downloaded and saved: facebook.com/20240501000331.html
General error on attempt 1 for https://web.archive.org/web/20240615000050/https://www.facebook.com/: net::ERR_HTTP2_SERVER_REFUSED_STREAM at https://web.archive.org/web/20240615000050/https://www.facebook.com/
Downloaded and saved: facebook.com/20240615000050.html
Downloaded and saved: facebook

  continue
