In [3]:
from pyppeteer import launch
from sentence_transformers import SentenceTransformer, util
from PIL import Image

import asyncio
import datetime
import difflib
import glob
import os

In [4]:
#Confirm list of websites to currently monitor
list_of_websites = ['http://shielded-harbor-71309.herokuapp.com/login-pc']
print("Current Websites:")
for website in list_of_websites:
    print(website)

print("\n")
# Accept "Y" as well as "y"; any other answer leaves the list unchanged.
action = input("Any new websites to input? (y/n)").strip().lower()
if action == 'y':
    # A non-numeric count is treated as "nothing to add" instead of letting
    # ValueError abort the whole cell.
    try:
        number = int(input("How many? ").strip())
    except ValueError:
        print("Not a whole number; no websites added.")
        number = 0
    for i in range(number):
        new_website = input("Full link of new website: ").strip()
        # Skip blank entries (they cannot be visited) and links that are
        # already being monitored (they would be scraped twice per run).
        if new_website and new_website not in list_of_websites:
            list_of_websites.append(new_website)


Current Websites:
http://shielded-harbor-71309.herokuapp.com/login-pc


Any new websites to input? (y/n)y
How many?2
Full link of new website: https://main.scta.org.sg/
Full link of new website: http://scanme.nmap.org/


In [None]:
#Check if report is wanted, and if yes, since what date and time.
report_wanted = False
# Always defined so later cells can test it; stays None when no report is wanted.
start_datetime = None

action = input("Do you want to generate a report? (y/n)").strip()
if action == 'y':
    report_wanted = True
    # Cell code already runs at module level, so a plain assignment creates the
    # notebook-wide variable; `global name = value` is not valid Python syntax.
    # The answer is kept as the raw string the user typed.
    start_datetime = input("Start date and time (24h format): (yyyy-mm-dd hh_mm)").strip()
    

In [None]:
#Scraping of website for screenshot, html and textcontent
async def main(website):
    """Capture a screenshot, the text content and the HTML of one website.

    The artefacts are stored under ``screenshots/<site folder>/`` in the
    ``images``, ``content`` and ``html`` sub-folders. The matching
    ``defacement/<site folder>/`` tree is created as well.

    The site folder name is the URL without its ``http://``/``https://``
    prefix, with ``/`` and ``:`` replaced by ``_`` and surrounding ``_``
    stripped.

    Args:
        website: Full URL of the page to capture.
    """
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()

        await page.goto(website, waitUntil="load")

        #Creates folders if this is a new webpage
        newpath = website.replace("https://", "").replace("http://", "")
        # ":" is replaced here too, so the folder that gets created is the same
        # one the file paths below point into (matters for URLs with a port).
        newpath = newpath.replace("/", "_").replace(":", "_").strip("_")
        # exist_ok=True creates whichever sub-folders are missing, even when
        # the site's top-level folder is already there.
        for root in ("screenshots", "defacement"):
            for subfolder in ("html", "content", "images"):
                os.makedirs(os.path.join(root, newpath, subfolder), exist_ok=True)

        # One timestamp for all three artefacts of this capture; ":" is not
        # allowed in Windows file names.
        timestamp = str(datetime.datetime.now()).replace(":", "_")
        site_root = os.path.join("screenshots", newpath)

        #Scraping of website
        filename = os.path.join(site_root, "images", f"{timestamp}.png")
        await page.screenshot({'path': filename, 'fullPage': True}) #Screenshot page
        textcontent = await page.evaluate('document.body.textContent', force_expr=True) #Scraping textcontent
        html = await page.content() #Scraping html
    finally:
        # Shut the browser down even when navigation or scraping fails, so a
        # failed run does not leave a browser process behind.
        await browser.close()

    #Writing scraped content into files
    # UTF-8 is set explicitly so non-ASCII page content does not depend on the
    # platform's default encoding.
    with open(os.path.join(site_root, "content", f"{timestamp}.txt"), "w", encoding="utf-8") as file:
        # Fall back to an empty string if the page yielded no text content.
        file.write(textcontent or "")

    with open(os.path.join(site_root, "html", f"{timestamp}.html"), "w", encoding="utf-8") as file:
        file.write(html)

    
# Scrape every monitored website in turn; each capture is driven to completion
# before the next one starts.
# NOTE(review): inside a Jupyter kernel the event loop may already be running,
# in which case run_until_complete would raise RuntimeError - confirm in the
# target environment.
event_loop = asyncio.get_event_loop()
for website in list_of_websites:
    event_loop.run_until_complete(main(website))

In [None]:
#Comparing screenshots for each website

# Load the OpenAI CLIP Model once, outside the loop: the same model is used
# for every website.
print('Loading CLIP Model...')
model = SentenceTransformer('clip-ViT-B-32')

NUM_SIMILAR_IMAGES = 10
# Pairs scoring at or above this cosine similarity are reported as duplicates.
# A duplicate image will have a score of 1.00; it may be 0.9999 due to lossy
# image compression (.jpg).
duplicate_threshold = 0.999
# Pairs scoring below this cosine similarity are reported as a change.
# Threshold range is 0 - 1.00, where 1.00 means the two images are exactly the
# same; lowering it makes the check more tolerant.
threshold = 0.99

for website in list_of_websites:
    # Folder name for this site: the URL without scheme, with "/" and ":"
    # replaced by "_". The raw URL is not a folder name and never matches.
    site_dir = website.replace("https://", "").replace("http://", "")
    site_dir = site_dir.replace("/", "_").replace(":", "_").strip("_")

    # glob returns files in arbitrary order; the file names start with a
    # timestamp, so sorting them puts the two newest captures last.
    image_names = sorted(glob.glob(os.path.join("screenshots", site_dir, "images", "*.png")))[-2:] #compare 2 most recent files
    print("Images:", len(image_names))
    if len(image_names) < 2:
        print("Need at least 2 screenshots to compare; skipping", website)
        continue

    # Next we compute the embeddings. The image files are closed again once
    # the embeddings have been computed.
    opened_images = []
    try:
        for filepath in image_names:
            opened_images.append(Image.open(filepath))
        encoded_image = model.encode(opened_images, batch_size=128, convert_to_tensor=True, show_progress_bar=True)
    finally:
        for opened_image in opened_images:
            opened_image.close()

    # Now we compare the images against each other. The result is a list of
    # triplets (score, image_id1, image_id2) sorted by decreasing score.
    processed_images = util.paraphrase_mining_embeddings(encoded_image)

    # =================
    # DUPLICATES
    # =================
    print('Finding duplicate images...')
    duplicates = [image for image in processed_images if image[0] >= duplicate_threshold]

    # Output the top X duplicate images
    for score, image_id1, image_id2 in duplicates[0:NUM_SIMILAR_IMAGES]:
        print("\nScore: {:.3f}%".format(score * 100))
        print(image_names[image_id1])
        print(image_names[image_id2])

    # =================
    # NEAR DUPLICATES
    # =================
    print('Finding near duplicate images...')
    near_duplicates = [image for image in processed_images if image[0] < threshold]

    for score, image_id1, image_id2 in near_duplicates[0:NUM_SIMILAR_IMAGES]:
        print("Changes detected in screenshot.\n")
        report_dir = os.path.join("defacement", site_dir, "images")
        os.makedirs(report_dir, exist_ok=True)
        # ":" is not allowed in Windows file names.
        stamp = str(datetime.datetime.now()).replace(":", "_")
        with open(os.path.join(report_dir, f"Changes detected at {stamp}.txt"), "w", encoding="utf-8") as file:
            file.write("\nScore: {:.3f}%\n".format(score * 100))
            file.write(image_names[image_id1] + "\n")
            file.write(image_names[image_id2] + "\n")

In [None]:
#Comparing html files

for website in list_of_websites:
    # Folder name for this site: the URL without scheme, with "/" and ":"
    # replaced by "_". The raw URL is not a folder name and never matches.
    site_dir = website.replace("https://", "").replace("http://", "")
    site_dir = site_dir.replace("/", "_").replace(":", "_").strip("_")

    #Compare 2 most recent html files
    # glob returns files in arbitrary order; the file names start with a
    # timestamp, so sorting them puts the two newest snapshots last.
    html_names = sorted(glob.glob(os.path.join("screenshots", site_dir, "html", "*.html")))[-2:]
    print("Files:", len(html_names))
    if len(html_names) < 2:
        print("Need at least 2 html snapshots to compare; skipping", website)
        continue

    # errors="replace" keeps the comparison running even if a snapshot was
    # saved with a different encoding.
    with open(html_names[0], encoding="utf-8", errors="replace") as file_1:
        file_1_text = file_1.read().splitlines()

    with open(html_names[1], encoding="utf-8", errors="replace") as file_2:
        file_2_text = file_2.read().splitlines()

    # Find and save the diff:
    # The input lines carry no line endings and lineterm='' stops difflib from
    # adding any, so joining with "\n" gives exactly one newline per diff line.
    diff_lines = list(difflib.unified_diff(file_1_text, file_2_text, fromfile=html_names[0], tofile=html_names[1], lineterm=''))
    diff = "\n".join(diff_lines) + "\n" if diff_lines else ""

    if diff:
        print("Changes detected in html.\n")
        report_dir = os.path.join("defacement", site_dir, "html")
        os.makedirs(report_dir, exist_ok=True)
        # ":" is not allowed in Windows file names.
        stamp = str(datetime.datetime.now()).replace(":", "_")
        with open(os.path.join(report_dir, f"Changes detected at {stamp}.txt"), "w", encoding="utf-8") as file:
            file.write(diff)


In [None]:
#Comparing textcontent files

for website in list_of_websites:
    # Folder name for this site: the URL without scheme, with "/" and ":"
    # replaced by "_". The raw URL is not a folder name and never matches.
    site_dir = website.replace("https://", "").replace("http://", "")
    site_dir = site_dir.replace("/", "_").replace(":", "_").strip("_")

    #Compare 2 most recent content files
    # glob returns files in arbitrary order; the file names start with a
    # timestamp, so sorting them puts the two newest snapshots last.
    content_names = sorted(glob.glob(os.path.join("screenshots", site_dir, "content", "*.txt")))[-2:]
    print("Files:", len(content_names))
    if len(content_names) < 2:
        print("Need at least 2 content snapshots to compare; skipping", website)
        continue

    # errors="replace" keeps the comparison running even if a snapshot was
    # saved with a different encoding.
    with open(content_names[0], encoding="utf-8", errors="replace") as file_1:
        file_1_text = file_1.read().splitlines()

    with open(content_names[1], encoding="utf-8", errors="replace") as file_2:
        file_2_text = file_2.read().splitlines()

    # Find and save the diff:
    # The input lines carry no line endings and lineterm='' stops difflib from
    # adding any, so joining with "\n" gives exactly one newline per diff line.
    diff_lines = list(difflib.unified_diff(file_1_text, file_2_text, fromfile=content_names[0], tofile=content_names[1], lineterm=''))
    diff = "\n".join(diff_lines) + "\n" if diff_lines else ""

    if diff:
        print("Changes detected in textcontent.\n")
        report_dir = os.path.join("defacement", site_dir, "content")
        os.makedirs(report_dir, exist_ok=True)
        # ":" is not allowed in Windows file names.
        stamp = str(datetime.datetime.now()).replace(":", "_")
        with open(os.path.join(report_dir, f"Changes detected at {stamp}.txt"), "w", encoding="utf-8") as file:
            file.write(diff)

In [None]:
#Generating a report


In [None]:
#References

"""
1. https://www.geeksforgeeks.org/compare-two-files-line-by-line-in-python/
2. https://github.com/pyppeteer/pyppeteer
3. https://stackoverflow.com/questions/11541154/checking-images-for-similarity-with-opencv/71634759#71634759
"""