In [None]:
import datetime
import httpx
import pandas as pd


def get_wayback_machine_captures(url: str):
    """
    Get a list of all the captures for a given URL in the Wayback Machine.
    """
    try:
        resp = httpx.get(
            "http://web.archive.org/cdx/search/cdx",
            params={"url": url, "output": "json"},
            timeout=60  # Increased timeout to 60 seconds
        )
    except httpx.ReadTimeout:
        print("The request timed out. Please try again later.")
        return []
    resp.raise_for_status()

    # What if we don't get any results?
    if resp.json() == []:
        return []

    fields, *remainder = resp.json()

    captures = []
    for row in remainder:
        data = dict(zip(fields, row))

        data["length"] = int(data["length"])

        # The format used for Wayback captures is yyyyMMddhhmmss
        # e.g. 20200814072506
        data["time"] = datetime.datetime.strptime(data["timestamp"], "%Y%m%d%H%M%S")

        # The capture URL is of the format:
        # https://web.archive.org/web/{timestamp}/{original}
        data["raw_url"] = f"https://web.archive.org/web/{data['timestamp']}if_/{data['original']}"
        data["web_url"] = f"https://web.archive.org/web/{data['timestamp']}/{data['original']}"

        captures.append(data)

    return captures


def filter_captures_last_week(captures):
    """
    Filter captures to include only those from the previous week.
    """
    one_week_ago = datetime.datetime.now() - datetime.timedelta(days=7)
    filtered_captures = [
        capture for capture in captures if one_week_ago <= capture["time"] <= datetime.datetime.now()
    ]
    return pd.DataFrame(filtered_captures)


# Get captures and filter them
captures = get_wayback_machine_captures(url="https://www.youtube.com/channel/UCYfdidRxbB8Qhf0Nx7ioOYw")
df = filter_captures_last_week(captures)

print(df)

                                           urlkey       timestamp  \
0   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250415033055   
1   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250415053525   
2   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250415074011   
3   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250415094849   
4   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250416000531   
5   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250416020714   
6   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250416042851   
7   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250416065718   
8   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250417021039   
9   com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250417061831   
10  com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250417083634   
11  com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250418011732   
12  com,youtube)/channel/ucyfdidrxbb8qhf0nx7iooyw  20250418031927   
13  com,youtube)/channel/ucyfdidrx

In [5]:
pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable
Collecting beautifulsoup4
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl (187 kB)
[K     |████████████████████████████████| 187 kB 1.4 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.7-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.4 soupsieve-2.7
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
from bs4 import BeautifulSoup
import requests


url = 'https://web.archive.org/web/20140307210234/https://www.youtube.com/channel/UCYfdidRxbB8Qhf0Nx7ioOYw'

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.text, 'html.parser')

# Example: Extract all the links on the page
links = soup.find_all('title')

titles = soup.find_all('title')
# Print the links
for link in links:
    href = link.get('href')
    print(href)

None
