In [46]:
import numpy as np
import pandas as pd
import requests
import os
import urllib.robotparser
from urllib.parse import urlparse, urljoin
import random
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.chrome.service import Service


import time
import lxml
from bs4 import BeautifulSoup



In [12]:
web_url = pd.read_excel("Website Samples.xlsx").to_numpy().flatten()

#vars associated with Chrome
service = Service()
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# This code helps us get the wayback machine URLs

In [8]:
def get_wayback(url, timestamp):
    url = f"http://archive.org/wayback/available?url={url}&timestamp={timestamp}"
    print("GET_WAYBACK: " + url)
    res = requests.get(url).json()
     
    if(len(res['archived_snapshots']) == 0):
        return ""
    
    return res['archived_snapshots']['closest']['url']


def gen_url(urls):
    current_date = datetime.now()
    start_date_1_year_ago = current_date - timedelta(days=1*365)

    first_url, second_url = "", ""

    i1, i2 = 0, 0

    first_random_date = None
    second_random_date = None; 


    while (i1 <= 10 and first_url == ""):
        first_random_date = generate_random_date(start_date_1_year_ago, current_date)
        first_url = get_wayback(urls, first_random_date.strftime('%Y%m%d'))
        i1 = i1 + 1



    while(i2 <= 10 and second_url == "" and first_url != ""):
        start_date_before_first_date = first_random_date - timedelta(days=(1.5*364))
        end_date_before_first_date = first_random_date - timedelta(days=0.5*364)
        second_random_date = generate_random_date(start_date_before_first_date, end_date_before_first_date)

        second_url = get_wayback(urls, second_random_date.strftime('%Y%m%d'))
        i2 = i2 + 1
        
    if(first_url != "" and second_url != ""):
        return [first_url, second_url, first_random_date.strftime('%Y%m%d'), second_random_date.strftime('%Y%m%d')]
    
    return None

# The code belows allows us to determine if we can scrape from a website based on robots.txt (lots of webpages have JS protections on them)

In [47]:
def is_scrapeable(url):
    # Parse the URL to get the base URL
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"

    # Create a RobotFileParser object
    robot_parser = urllib.robotparser.RobotFileParser()
    robots_url = urljoin(base_url, "robots.txt")  # Construct the robots.txt URL
    robot_parser.set_url(robots_url)
    robot_parser.read()

    # Check if the URL is allowed to be scraped
    return robot_parser.can_fetch("*", url)

In [48]:
is_scrapeable("https://www.google.com/search/about")

False

# This code below helps us randomize dates/times

In [4]:
def generate_random_date(start_date, end_date):
    time_difference = end_date - start_date
    random_days = random.randint(0, time_difference.days)
    random_date = start_date + timedelta(days=random_days)
    return random_date


# This code is what gets the HTML and saves it to the file

In [24]:


def save_html_with_id(url, name, unique_id):
    # Create a folder name based on the URL
    folder_name = "./Data/" + name
    
    # Create the folder if it doesn't exist
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    
       
    # options.add_argument('--headless')
    driver = webdriver.Chrome(service=service, options=options)
    driver.get(url)
    time.sleep(2)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Create a filename with the URL and ID
    filename = f"{unique_id}.html"
    
    # Save the HTML content to the generated filename
    file_path = os.path.join(folder_name, filename)
    with open(file_path, "w", encoding="utf-8") as html_file:
        html_file.write(str(soup))
    
    driver.quit()
    
    print(f"SAVE_HTML: HTML page saved as '{filename}' in '{folder_name}' folder.")

# Actual Loop

In [30]:

for url in web_url:
    print("Processing " + url)
    url_arr = gen_url(url)
    
    if(url_arr == None):
        continue
    
    
    print(f"URL Arr: {url_arr}\n")
    
    
    save_html_with_id(url_arr[0], url.split(".")[0], "new")
    save_html_with_id(url_arr[1], url.split(".")[0], "old")



Processing orangehrm.com
GET_WAYBACK: http://archive.org/wayback/available?url=orangehrm.com&timestamp=20221020
GET_WAYBACK: http://archive.org/wayback/available?url=orangehrm.com&timestamp=20211015
URL Arr: ['http://web.archive.org/web/20221010123502/https://www.orangehrm.com/', 'http://web.archive.org/web/20211015223005/https://www.orangehrm.com/', '20221020', '20211015']

SAVE_HTML: HTML page saved as 'old.html' in './Data/orangehrm' folder.
SAVE_HTML: HTML page saved as 'new.html' in './Data/orangehrm' folder.
Processing stripe.com
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20230513
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20221020
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20220531
URL Arr: ['http://web.archive.org/web/20230513182042/https://stripe.com/', 'http://web.archive.org/web/20220601001359/https://stripe.com/', '20230513', '20220531']

SAVE_HTML: HTML page saved as 'old.html' 

In [26]:
save_html_with_id("https://www.google.com", "google", "old")

SAVE_HTML: HTML page saved as 'old.html' in './Data/google' folder.


In [33]:
url = "stripe.com"
print("Processing " + url)
url_arr = gen_url(url)
    
    
print(f"URL Arr: {url_arr}\n")
    
    
save_html_with_id(url_arr[0], url.split(".")[0], "old")
save_html_with_id(url_arr[1], url.split(".")[0], "new")

Processing stripe.com
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20220930
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20220906
GET_WAYBACK: http://archive.org/wayback/available?url=stripe.com&timestamp=20210409
URL Arr: ['http://web.archive.org/web/20220906235416/https://stripe.com/', 'http://web.archive.org/web/20210409213739/https://stripe.com/', '20220906', '20210409']

SAVE_HTML: HTML page saved as 'old.html' in './Data/stripe' folder.
SAVE_HTML: HTML page saved as 'new.html' in './Data/stripe' folder.
