In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
import time
import requests
from datetime import datetime, timedelta
import os
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from concurrent.futures import ThreadPoolExecutor, as_completed


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Script to download all archived versions of a pixel's source code, given its ID

In [None]:
def basicGetCdxRecords(url, filename):
    """Fetch CDX records for ``url`` in a single request and save them.

    Intended for small queries only: no ``limit`` is sent, so the server
    returns every capture whose URL starts with ``url`` in one response.

    Args:
        url: URL prefix to look up (sent with ``matchType=prefix``).
        filename: Text file that is overwritten with one record per line
            (each line is the ``str()`` of the record).

    Returns:
        The list of rows from the JSON response, or an empty list when the
        HTTP status is not 200. Per the original author's note, each row
        holds ["urlkey", "timestamp", "original", "mimetype", "statuscode",
        "digest", "length"].
    """
    query = dict(url=url, output='json', matchType='prefix')
    reply = requests.get("https://web.archive.org/cdx/search/cdx", params=query)

    # Guard clause: report the failure and hand back an empty result.
    if reply.status_code != 200:
        print(f"Failed to retrieve data: {reply.status_code}")
        return []

    records = list(reply.json())
    with open(filename, 'w') as out:
        out.writelines(f"{row}\n" for row in records)
    return records

def _split_resume_key(rows):
    """Split one CDX JSON page into (record rows, resume key or None)."""
    rows = list(rows)
    resume_key = None
    # With showResumeKey set and more results remaining, the CDX server ends
    # the payload with a blank row followed by a one-element row holding the
    # key. Real records have several fields, so a single-element last row is
    # treated as the key.
    if rows and isinstance(rows[-1], list) and len(rows[-1]) == 1:
        resume_key = rows[-1][0]
        rows = rows[:-1]
    # Drop the blank separator row(s); they are never records.
    while rows and not rows[-1]:
        rows.pop()
    return rows, resume_key


def getCdxRecords(url, filename, progress_file, max_retries=5, retry_delay=3):
    """Fetch all CDX records for ``url`` page by page, resuming from saved progress.

    Records are appended to ``filename`` (one ``str(record)`` per line). After
    every page the resume key returned by the server is written to
    ``progress_file``, so a later call continues from where this one stopped.

    Args:
        url: URL prefix to look up (sent with ``matchType=prefix``).
        filename: Text file the records are appended to.
        progress_file: File holding the most recent resume key. If it exists
            and is non-empty, fetching starts from that key.
        max_retries: Number of consecutive failed requests tolerated before
            giving up and returning what has been collected so far.
        retry_delay: Seconds to wait after a failed request.

    Returns:
        List of the record rows fetched during this call. Header rows sent by
        the server are kept, as before, so consumers must skip them.

    Notes:
        When the last page has been fetched, the progress file is left
        untouched. Delete it (and ``filename``) to start a fresh crawl.
    """
    base_url = "https://web.archive.org/cdx/search/cdx"
    limit = 100000  # Limit per request
    params = {
        'url': url,
        'output': 'json',
        'matchType': 'prefix',
        'limit': limit,
        'showResumeKey': 'true',  # ask the server to append a resume key to each page
    }

    # Load the resume key from a previous run, if any.
    resume_key = None
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            resume_key = f.read().strip() or None

    all_cdx_records = []
    failures = 0
    while True:
        if resume_key:
            params['resumeKey'] = resume_key

        error = None
        try:
            response = requests.get(base_url, params=params, timeout=120)
            if response.status_code != 200:
                error = f"Failed to retrieve data: {response.status_code}"
        except requests.exceptions.RequestException as exc:
            error = f"Failed to retrieve data: {exc}"

        if error:
            # Back off and retry a bounded number of times instead of
            # re-requesting in a tight, endless loop.
            failures += 1
            print(error)
            if failures >= max_retries:
                print(f"Giving up after {failures} consecutive failed requests.")
                break
            time.sleep(retry_delay)
            continue
        failures = 0

        data = response.json()
        if not data:
            break

        records, next_key = _split_resume_key(data)
        with open(filename, 'a') as f:
            for record in records:
                f.write(f"{record}\n")
        all_cdx_records.extend(records)
        print(f"Successfully fetched: {len(records)} records. Resume key: {next_key}")

        # No resume key means the server has no further pages.
        if not next_key:
            break

        resume_key = next_key
        with open(progress_file, 'w') as f:
            f.write(resume_key)
        time.sleep(3)  # stay polite so the server does not refuse the connection

    return all_cdx_records

def downloadArchivedVersions(fileWithRecords, archivedDirectory, max_retries=None):
    """Download the archived page for every CDX record listed in a text file.

    Args:
        fileWithRecords: Text file with one CDX record per line, each written
            as the ``str()`` of a Python list (as the CDX fetchers in this
            notebook do).
        archivedDirectory: Directory the pages are saved into; created if
            missing. Each page is stored as ``<timestamp>.html``.
        max_retries: Maximum download attempts per record. ``None`` (default)
            keeps retrying until the page is saved or a 404 is returned.

    Returns:
        None.

    Notes:
        - Lines that are not capture records (blank lines, the CDX header row,
          blank separator rows, resume-key rows) are skipped.
        - Files that already exist are skipped, so an interrupted run resumes.
        - NOTE(review): files are named by timestamp only, so two records with
          the same timestamp but different URLs collide and the second one is
          skipped as "already exists" - confirm whether that is acceptable.
    """
    import ast  # local import: parse record lines without executing them

    wayback_base_url = "https://web.archive.org/web/"
    save_dir = archivedDirectory
    os.makedirs(save_dir, exist_ok=True)

    with open(fileWithRecords, 'r') as f:
        lines = f.readlines()

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # literal_eval only accepts Python literals, unlike eval(), which would
        # execute arbitrary code found in the file.
        try:
            record = ast.literal_eval(line)
        except (ValueError, SyntaxError):
            print(f"Skipping unparsable record line: {line!r}")
            continue

        # A usable record has at least urlkey/timestamp/original and a numeric
        # timestamp; this filters out header, blank and resume-key rows.
        if (not isinstance(record, (list, tuple)) or len(record) < 3
                or not str(record[1]).isdigit()):
            continue

        timestamp = record[1]
        original_url = record[2]

        # A resource in the Wayback Machine is addressed as <base>/<timestamp>/<url>.
        wayback_url = f"{wayback_base_url}{timestamp}/{original_url}"
        filename = f"{timestamp}.html"
        filepath = os.path.join(save_dir, filename)

        # Continue from saved progress.
        if os.path.exists(filepath):
            print(f"File already exists: {filepath}")
            continue

        attempts = 0
        while max_retries is None or attempts < max_retries:
            attempts += 1
            try:
                response = requests.get(wayback_url, timeout=60)
                if response.status_code == 200:
                    with open(filepath, 'w', encoding='utf-8') as f:
                        f.write(response.text)
                    print(f"Downloaded and saved: {filepath}")
                    break
                elif response.status_code == 404:
                    print(f"File not found (404): {wayback_url}. Skipping...")
                    break  # the capture does not exist; retrying cannot help
                else:
                    print(f"Failed to download {wayback_url}: {response.status_code}")
                    time.sleep(5)  # wait before retrying
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {wayback_url}: {e}")
                time.sleep(5)
        else:
            # Reached only when the retry budget ran out without a break.
            print(f"Giving up on {wayback_url} after {attempts} attempts.")


# Example usage: fetch every CDX record whose URL starts with the Meta pixel
# config prefix. This runs when the cell is executed and performs network
# requests; records go to `filename`, the resume key to `progress_file`.
# Note: `progress_file` is a module-level name that later cells reassign.
url = "https://connect.facebook.net/signals/config/" # URL prefix of the Meta pixel config script
filename = "allPixelRecords.txt"
progress_file = "progress.txt"
cdx_records = getCdxRecords(url, filename, progress_file)

# Optional follow-up step (disabled): download the page for every fetched record.
# downloadArchivedVersions(filename,'all_archived_versions')



In [47]:
def getPixelID(website, driver, max_retries=5, delay=5):
    """Load ``website`` in the browser and extract a Meta pixel ID from its scripts.

    The rendered page is scanned for ``<script src=...>`` tags whose source
    matches ``connect.facebook.net/signals/config/<digits>``; the digits of
    the first match are returned.

    Args:
        website: URL to open with ``driver.get``.
        driver: Selenium WebDriver used to render the page.
        max_retries: Number of attempts made when an exception occurs.
        delay: Seconds to wait after a failed attempt.

    Returns:
        The pixel ID as a string, or None if the page has no matching script
        or every attempt raised.
    """
    config_src = re.compile(r'connect\.facebook\.net/signals/config/(\d+)')

    attempt = 0
    while attempt < max_retries:
        attempt += 1
        try:
            print(f"Attempt {attempt} to fetch Pixel ID from {website}")
            # Author's note: Wayback has a rate limit of 15 requests per minute,
            # otherwise it blocks your IP - hence the pause before each load.
            time.sleep(4.5)
            driver.get(website)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            for tag in soup.find_all('script'):
                if not tag.has_attr('src'):
                    continue
                # The regex already requires the config URL prefix, so a
                # separate substring check is unnecessary.
                found = config_src.search(tag['src'])
                if found:
                    print(f"Pixel ID found: {found.group(1)}")
                    return found.group(1)

            # Page loaded fine but no script matched: no retry needed.
            print(f"No Pixel ID found on {website}")
            return None

        except Exception as e:
            # Connection issues, timeouts, etc.: wait and try again.
            print(f"Error fetching Pixel ID from {website}: {e}")
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)

    print(f"Failed to fetch Pixel ID from {website} after {max_retries} attempts.")
    return None

def get_wayback_snapshot(website, date, max_retries=5):
    """Return the URL of the Wayback Machine snapshot closest to ``date``.

    Args:
        website: Site URL to look up. It is passed as a query parameter and
            URL-encoded by ``requests``, so sites whose URL contains ``?`` or
            ``&`` are looked up correctly.
        date: Target date string (callers in this notebook pass ``YYYYMMDD``).
        max_retries: Number of attempts made when the request fails.

    Returns:
        The snapshot URL, or None if the API reports no snapshot or every
        attempt failed.
    """
    api_url = "http://archive.org/wayback/available"
    query = {'url': website, 'timestamp': date}

    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}: Getting snapshot for {date}")
            response = requests.get(api_url, params=query, timeout=30)
            response.raise_for_status()  # Raise an exception for 4xx/5xx errors
            data = response.json()
            # Tolerate a missing/empty 'archived_snapshots' or 'closest' entry.
            closest = (data.get('archived_snapshots') or {}).get('closest') or {}
            return closest.get('url')
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}. Retrying in 3 seconds...")
            time.sleep(3)  # Wait for 3 seconds before retrying
    print(f"Failed to get snapshot for {date} after {max_retries} attempts.")
    return None

def check_past_versions(website, driver):
    """Search Wayback Machine snapshots of ``website`` for a pixel ID.

    Steps back from today in 30-day increments for 60 steps (1800 days,
    roughly five years) and inspects the snapshot closest to each date.

    Args:
        website: Site URL to look up.
        driver: Selenium WebDriver passed through to ``getPixelID``.

    Returns:
        ``(pixel_id, date)`` for the first snapshot containing a pixel ID,
        where ``date`` is the requested ``YYYYMMDD`` date (not the snapshot's
        own capture timestamp), or ``(None, None)`` if none was found.
    """
    current_date = datetime.now()
    # The availability API returns the *closest* snapshot, which can be the
    # same capture for many consecutive dates; remember what was already
    # inspected so each snapshot is rendered only once.
    checked_snapshots = set()
    for i in range(60):  # 60 steps of 30 days
        past_date = (current_date - timedelta(days=i*30)).strftime('%Y%m%d')
        snapshot_url = get_wayback_snapshot(website, past_date)
        if not snapshot_url or snapshot_url in checked_snapshots:
            continue
        checked_snapshots.add(snapshot_url)

        print(f"Checking snapshot: {snapshot_url}")
        pixelID = getPixelID(snapshot_url, driver)
        if pixelID:
            return pixelID, past_date
    return None, None

def load_progress(filename):
    """Load previously saved scraping results, or start an empty table.

    Args:
        filename: Path of the progress CSV.

    Returns:
        A DataFrame with the file's contents if it exists, otherwise an empty
        DataFrame with the columns "Website Name", "Pixel ID", "Fetched Date".

    Notes:
        "Pixel ID" and "Fetched Date" are read as strings. With type inference
        a single missing Pixel ID turns the column into float64, which cannot
        represent IDs above 2**53 exactly and silently alters them.
    """
    if os.path.exists(filename):
        return pd.read_csv(filename, dtype={"Pixel ID": str, "Fetched Date": str})
    return pd.DataFrame(columns=["Website Name", "Pixel ID", "Fetched Date"])

def save_progress(df, filename):
    """Write the progress table ``df`` to ``filename`` as CSV without the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def populateDataframe(urls, progress_file):
    """Look up a pixel ID for every URL and record the results, resumably.

    In this version the live site is not visited (the direct lookup is
    disabled); only Wayback Machine snapshots are searched. Results are saved
    to ``progress_file`` after every URL, and URLs already present in that
    file are skipped.

    Args:
        urls: Iterable of site URLs to process.
        progress_file: CSV path used to load and save progress.

    Returns:
        DataFrame with columns "Website Name", "Pixel ID", "Fetched Date"
        holding earlier and newly collected rows.
    """
    # Load previous progress if any; a set gives fast membership checks.
    progress_df = load_progress(progress_file)
    completed_urls = set(progress_df['Website Name'].tolist())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    total_urls = len(urls)

    # try/finally guarantees the browser process is closed even if a URL raises.
    try:
        for idx, website in enumerate(urls):
            if website in completed_urls:
                print(f"Skipping {website}, already processed.")
                continue

            # Direct lookup on the live site is intentionally disabled here:
            # pixelID = getPixelID(website, driver)
            pixelID = None
            print("Check pixel id: ",pixelID)
            fetch_date = datetime.now().strftime('%Y%m%d')  # default when nothing is found in the archive

            if not pixelID:  # No pixel ID yet: search the Wayback Machine
                pixelID, past_date = check_past_versions(website, driver)
                fetch_date = past_date if past_date else fetch_date

            # Append the result and persist immediately so an interrupted run can resume.
            new_record = {"Website Name": website, "Pixel ID": pixelID, "Fetched Date": fetch_date}
            progress_df = pd.concat([progress_df, pd.DataFrame([new_record])], ignore_index=True)
            save_progress(progress_df, progress_file)
            completed_urls.add(website)  # avoid reprocessing duplicates within this run

            print(f"Processed {idx + 1}/{total_urls}: {website} (Pixel ID: {pixelID})")
    finally:
        driver.quit()

    return progress_df

# List of URLs to scrape
urls = [
    "https://www.riteaid.com/",
]

# Progress file that stores results between runs. This rebinds the
# module-level `progress_file` set by the CDX example cell above.
progress_file = 'scraping_progress.csv'

# Run the scraper (starts a headless Chrome and performs network requests)
# and keep the resulting table in `df`.
df = populateDataframe(urls, progress_file)
print(df)


Check pixel id:  None
Attempt 1: Getting snapshot for 20240913
Checking snapshot: http://web.archive.org/web/20240910210119/https://www.riteaid.com/
Attempt 1 to fetch Pixel ID from http://web.archive.org/web/20240910210119/https://www.riteaid.com/
Pixel ID found: 1264059003707256
Processed 1/1: https://www.riteaid.com/ (Pixel ID: 1264059003707256)
               Website Name          Pixel ID Fetched Date
0  https://www.riteaid.com/  1264059003707256     20240913


In [20]:
# Display the current `df`. NOTE(review): the recorded output below lists
# different sites than the run cell above - presumably it comes from an
# earlier, out-of-order execution; re-run top to bottom to confirm.
df

Unnamed: 0,Website Name,Pixel ID,Fetched Date
0,https://onepathnetwork.com/?gad_source=1&gclid...,128726134153194,20240913
1,https://react-portfolio-alpha-nine-57.vercel.app,25826907853621873,20240913


## Extract pixel IDs from Tranco's top 10k websites. Wayback Machine snapshots can also be searched to find the ID.

In [2]:

def getPixelID(website, driver, max_retries=5, delay=5):
    """Load ``website`` in the browser and extract a Meta pixel ID from its scripts.

    Bare domains (no ``http://`` / ``https://`` prefix, as in the Tranco
    list) are tried over HTTPS first and over HTTP if that load raises.

    Args:
        website: URL or bare domain to open.
        driver: Selenium WebDriver used to render the page.
        max_retries: Number of attempts made when an exception occurs.
        delay: Seconds to wait after a failed attempt.

    Returns:
        The pixel ID as a string (digits following
        ``connect.facebook.net/signals/config/`` in a script ``src``), or
        None if no script matched or every attempt raised.
    """
    pattern = re.compile(r'connect\.facebook\.net/signals/config/(\d+)')

    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1} to fetch Pixel ID from {website}")

            time.sleep(4.5)  # pause before every page load to throttle requests
            if website.startswith(('http://', 'https://')):
                driver.get(website)
            else:
                # The Tranco list has no scheme in its entries.
                try:
                    driver.get('https://' + website)
                except Exception as https_error:
                    # `except Exception` (not a bare except) so Ctrl-C and
                    # interpreter shutdown are not swallowed.
                    print(f"HTTPS load failed for {website} "
                          f"({type(https_error).__name__}); trying HTTP.")
                    driver.get('http://' + website)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for script in soup.find_all('script'):
                if not script.has_attr('src'):
                    continue
                # The regex already requires the config URL prefix.
                match = pattern.search(script['src'])
                if match:
                    print(f"Pixel ID found: {match.group(1)}")
                    return match.group(1)

            # Page loaded but nothing matched: retrying would not help.
            print(f"No Pixel ID found on {website}")
            return None

        except Exception as e:
            print(f"Error fetching Pixel ID from {website}: {e}")
            print(f"Retrying in {delay} seconds...")
            time.sleep(delay)
    print(f"Failed to fetch Pixel ID from {website} after {max_retries} attempts.")
    return None

def get_wayback_snapshot(website, date, max_retries=5):
    """Return the URL of the Wayback Machine snapshot closest to ``date``.

    Args:
        website: Site URL or domain to look up. It is passed as a query
            parameter and URL-encoded by ``requests``, so values containing
            ``?`` or ``&`` are looked up correctly.
        date: Target date string (callers in this notebook pass ``YYYYMMDD``).
        max_retries: Number of attempts made when the request fails.

    Returns:
        The snapshot URL, or None if the API reports no snapshot or every
        attempt failed.
    """
    api_url = "http://archive.org/wayback/available"
    query = {'url': website, 'timestamp': date}
    retry_wait = 5  # seconds; the log message below reports this same value

    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}: Getting snapshot for {date}")
            response = requests.get(api_url, params=query, timeout=30)
            response.raise_for_status()
            data = response.json()
            # Tolerate a missing/empty 'archived_snapshots' or 'closest' entry.
            closest = (data.get('archived_snapshots') or {}).get('closest') or {}
            return closest.get('url')
        except requests.exceptions.RequestException as e:
            print(f"Error: {e}. Retrying in {retry_wait} seconds...")
            time.sleep(retry_wait)
    print(f"Failed to get snapshot for {date} after {max_retries} attempts.")
    return None

def check_past_versions(website, driver):
    """Search an archived snapshot of ``website`` for a pixel ID.

    Currently inspects a single snapshot, the one closest to 600 days ago.

    Args:
        website: Site URL or domain to look up.
        driver: Selenium WebDriver passed through to ``getPixelID``.

    Returns:
        ``(pixel_id, date)`` where ``date`` is the requested ``YYYYMMDD``
        date, or ``(None, None)`` if no pixel ID was found.
    """
    now = datetime.now()
    for step in range(1, 2):  # set to 60 later for going back every month for 5 years
        lookback = timedelta(days=step * 600)  # set days to step*30 for 1 month
        past_date = (now - lookback).strftime('%Y%m%d')

        snapshot_url = get_wayback_snapshot(website, past_date)
        if not snapshot_url:
            continue

        print(f"Searching snapshot: {snapshot_url}")
        found_id = getPixelID(snapshot_url, driver)
        if not found_id:
            continue

        print(f"Found pixel ID: {found_id} at the snapsot: {past_date} ")
        return found_id, past_date
    return None, None

def load_progress(filename):
    """Load previously saved scraping results, or start an empty table.

    Args:
        filename: Path of the progress CSV.

    Returns:
        A DataFrame with the file's contents if it exists, otherwise an empty
        DataFrame with the columns "Website Name", "Pixel ID", "Fetched Date".

    Notes:
        "Pixel ID" and "Fetched Date" are read as strings. With type inference
        a single missing Pixel ID turns the column into float64, which cannot
        represent IDs above 2**53 exactly and silently alters them.
    """
    if os.path.exists(filename):
        return pd.read_csv(filename, dtype={"Pixel ID": str, "Fetched Date": str})
    return pd.DataFrame(columns=["Website Name", "Pixel ID", "Fetched Date"])

def save_progress(df, filename):
    """Write the progress table ``df`` to ``filename`` as CSV without the index column."""
    df.to_csv(path_or_buf=filename, index=False)

def populateDataframe(urls, progress_file):
    """Look up a pixel ID for every URL and record the results, resumably.

    Each site is first loaded directly; if no pixel ID is found there, an
    archived snapshot is searched. Results are saved to ``progress_file``
    after every URL, and URLs already present in that file are skipped.

    Args:
        urls: Iterable of site URLs or bare domains to process.
        progress_file: CSV path used to load and save progress.

    Returns:
        DataFrame with columns "Website Name", "Pixel ID", "Fetched Date"
        holding earlier and newly collected rows.
    """
    # Load previous progress if any; a set gives fast membership checks
    # (the URL list can hold thousands of entries).
    progress_df = load_progress(progress_file)
    completed_urls = set(progress_df['Website Name'].tolist())

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    total_urls = len(urls)

    # try/finally guarantees the browser process is closed even if a URL raises.
    try:
        for idx, website in enumerate(urls):
            if website in completed_urls:
                print(f"Skipping {website}, already processed.")
                continue

            # Try the live site first.
            pixelID = getPixelID(website, driver)
            print("Pixel ID fetched directly: ",pixelID)
            fetch_date = datetime.now().strftime('%Y%m%d')  # current date if found in the live version

            if not pixelID:  # Nothing on the live site: search the Wayback Machine
                pixelID, past_date = check_past_versions(website, driver)
                fetch_date = past_date if past_date else fetch_date

            # Append the result and persist immediately so an interrupted run can resume.
            new_record = {"Website Name": website, "Pixel ID": pixelID, "Fetched Date": fetch_date}
            progress_df = pd.concat([progress_df, pd.DataFrame([new_record])], ignore_index=True)
            save_progress(progress_df, progress_file)
            completed_urls.add(website)  # avoid reprocessing duplicates within this run

            print(f"Processed {idx + 1}/{total_urls}: {website} (Pixel ID: {pixelID})")
    finally:
        driver.quit()

    return progress_df


# Same progress file name as the single-site run cell, so results from both
# runs accumulate in one CSV.
progress_file = 'scraping_progress.csv'
# NOTE(review): this expects a header row with a 'website' column, whereas
# download_list() in this notebook reads the same file as header-less
# Rank/Domain data - confirm which layout 'tranco_top_10k.csv' really has.
urls = pd.read_csv('tranco_top_10k.csv')['website'].to_list()

# Long-running: visits every listed site with a headless browser.
df = populateDataframe(urls, progress_file)
print(df)


Skipping google.com, already processed.
Skipping amazonaws.com, already processed.
Skipping microsoft.com, already processed.
Skipping facebook.com, already processed.
Skipping akamai.net, already processed.
Skipping a-msedge.net, already processed.
Skipping googleapis.com, already processed.
Skipping apple.com, already processed.
Skipping youtube.com, already processed.
Skipping root-servers.net, already processed.
Skipping azure.com, already processed.
Skipping akamaiedge.net, already processed.
Skipping twitter.com, already processed.
Skipping cloudflare.com, already processed.
Skipping instagram.com, already processed.
Skipping gstatic.com, already processed.
Skipping office.com, already processed.
Skipping linkedin.com, already processed.
Skipping tiktokcdn.com, already processed.
Skipping live.com, already processed.
Skipping googletagmanager.com, already processed.
Skipping googlevideo.com, already processed.
Skipping akadns.net, already processed.
Skipping gtld-servers.net, alr

In [12]:
# Debug cell: render one page in headless Chrome and list its <script> tags.
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    driver.get('https://google.com')
    page_source = driver.page_source
finally:
    # Always close the browser; otherwise every run of this cell leaves a
    # Chrome/chromedriver process behind.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
scripts = soup.find_all('script')
print(scripts)

[<script async="" nonce="" src="https://apis.google.com/_/scs/abc-static/_/js/k=gapi.gapi.en.h-1D-JOvizc.O/m=gapi_iframes,googleapis_client/rt=j/sv=1/d=1/ed=1/am=AABA/rs=AHpOoo_3dbjO7NaEjkPT0PwzLRJUFrcOJQ/cb=gapi.loaded_0"></script>, <script nonce="">window._hst=Date.now();performance&&performance.mark&&performance.mark("SearchHeadStart");</script>, <script nonce="">(function(){var _g={kEI:'aNbjZrnLB6ekptQPqqjJqQU',kEXPI:'31',kBL:'j12v',kOPI:89978449};(function(){var a;((a=window.google)==null?0:a.stvsc)?google.kEI=_g.kEI:window.google=_g;}).call(this);})();(function(){google.sn='webhp';google.kHL='en-PK';})();(function(){
var h=this||self;function l(){return window.google!==void 0&&window.google.kOPI!==void 0&&window.google.kOPI!==0?window.google.kOPI:null};var m,n=[];function p(a){for(var b;a&&(!a.getAttribute||!(b=a.getAttribute("eid")));)a=a.parentNode;return b||m}function q(a){for(var b=null;a&&(!a.getAttribute||!(b=a.getAttribute("leid")));)a=a.parentNode;return b}function r(a){/

In [18]:
import requests
import pandas as pd
from requests.auth import HTTPBasicAuth

# Tranco API credentials are read from the environment instead of being
# hard-coded, so they are not stored in the notebook or its version history.
# Set TRANCO_USERNAME (account e-mail) and TRANCO_API_TOKEN before running.
# NOTE(review): the token that was previously committed in this cell should be
# treated as leaked and revoked.
username = os.environ.get('TRANCO_USERNAME', '')
api_token = os.environ.get('TRANCO_API_TOKEN', '')
if not (username and api_token):
    print("Warning: TRANCO_USERNAME / TRANCO_API_TOKEN are not set; "
          "Tranco API requests will likely fail authentication.")

def get_latest_list_metadata():
    """Ask the Tranco API for the latest list and return its download URL.

    Uses the module-level ``username`` / ``api_token`` for HTTP basic auth.

    Returns:
        The value of the response's ``download`` field when ``available`` is
        truthy; None when no list is available or the request fails.
    """
    endpoint = 'https://tranco-list.eu/api/lists/date/latest'
    try:
        reply = requests.get(endpoint, auth=HTTPBasicAuth(username, api_token))
        reply.raise_for_status()
        payload = reply.json()
        # Guard clause: nothing to download yet.
        if not payload['available']:
            print("No list available at the moment.")
            return None
        return payload['download']
    except requests.exceptions.RequestException as e:
        print(f"Error fetching latest list metadata: {e}")
        return None

def download_list(download_url):
    """Download a Tranco list to ``tranco_top_10k.csv`` and load it.

    Uses the module-level ``username`` / ``api_token`` for HTTP basic auth.
    The response body is written to disk unchanged and then read back as a
    header-less CSV with the columns "Rank" and "Domain".

    NOTE(review): nothing here truncates the list, so the file holds whatever
    the URL serves - confirm it really is a top-10k list as the file name and
    message suggest.

    Args:
        download_url: URL of the list to download.

    Returns:
        DataFrame with columns "Rank" and "Domain", or None if the request
        fails.
    """
    target = 'tranco_top_10k.csv'
    try:
        reply = requests.get(download_url, auth=HTTPBasicAuth(username, api_token))
        reply.raise_for_status()
        with open(target, 'wb') as out:
            out.write(reply.content)
        print("Tranco Top 10k list downloaded successfully.")
        # Load the saved file into a DataFrame.
        return pd.read_csv(target, header=None, names=['Rank', 'Domain'])
    except requests.exceptions.RequestException as e:
        print(f"Error downloading Tranco list: {e}")
        return None

# Fetch the download URL for the latest list (None if unavailable or on error)
download_url = get_latest_list_metadata()

# If a valid URL is found, download the list.
# Note: this rebinds the module-level `df` used by the scraping cells.
if download_url:
    df = download_list(download_url)
    if df is not None:
        # Preview only the first rows rather than the whole list.
        print(df.head())

Tranco Top 10k list downloaded successfully.
   Rank         Domain
0     1     google.com
1     2  amazonaws.com
2     3  microsoft.com
3     4   facebook.com
4     5     akamai.net
