In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from bs4 import BeautifulSoup, Tag
from sentence_transformers import SentenceTransformer
import os
import re

In [None]:

# Sentence-embedding model shared by the cosine-similarity helpers in later cells.
model = SentenceTransformer('all-MiniLM-L6-v2')


def _load_sample_soup(file_path):
    """Parse one sample HTML file, or return None (with a message) if it is missing.

    `get_html_soup` is only defined in a later cell, so calling it from this
    cell raises NameError on a fresh kernel (Restart & Run All). This local
    loader performs the same read-and-parse steps so the cell runs on its own.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as sample_file:
            return BeautifulSoup(sample_file.read(), 'html.parser')
    except FileNotFoundError:
        print(f"File {file_path} not found.")
        return None


# Small sample inputs: the element spreadsheet plus the old and new page.
sample_elements_data = pd.read_excel("./sample_data/Sample_Element_Comp.xlsx")
sample_old_soup = _load_sample_soup("./sample_data/oldpage.html")
sample_new_soup = _load_sample_soup("./sample_data/newpage.html")


# This function loads an individual HTML file and parses it into a BeautifulSoup object

In [None]:
def get_html_soup(file_path):
    """Read an HTML file from disk and parse it with BeautifulSoup.

    Args:
        file_path: Path of the HTML file; it is read as UTF-8 text.

    Returns:
        The document parsed with ``html.parser``, or ``None`` when the file
        does not exist (a message naming the path is printed in that case).
    """
    parsed = None
    try:
        with open(file_path, 'r', encoding='utf-8') as html_file:
            markup = html_file.read()
            parsed = BeautifulSoup(markup, 'html.parser')
    except FileNotFoundError:
        # Report the missing file and fall through to return None.
        print(f"File {file_path} not found.")
    return parsed

# All the SOUP Codes

In [None]:
def find_elements_by_type(soup, element_type):
    """Return whatever ``soup.find_all`` yields for the given element type.

    Args:
        soup: Object exposing ``find_all`` (e.g. a parsed BeautifulSoup document).
        element_type: Value forwarded unchanged to ``find_all``, such as a tag name.
    """
    return soup.find_all(element_type)

# Excel Codes

In [None]:
def write_to_path(df, file_path, excel_file_name, excel_sheet_name):
    """Write ``df`` to one sheet of an Excel workbook, creating the workbook if needed.

    Args:
        df: DataFrame to write; its index is not written.
        file_path: Directory prefix. It is concatenated directly with
            ``excel_file_name``, so it must already end with a path separator.
        excel_file_name: Workbook file name, e.g. ``"JACCARD.xlsx"``.
        excel_sheet_name: Sheet name; converted with ``str()`` before use.
    """
    excel_file_path = file_path + excel_file_name

    # Append when the workbook already exists so earlier sheets are kept;
    # otherwise create a new workbook.
    writer_mode = 'a' if os.path.isfile(excel_file_path) else 'w'

    # The context manager saves and closes the workbook on exit. The previous
    # explicit `excel_writer.save()` call and the `encoding=` argument of
    # `to_excel` were deprecated in pandas 1.5 and removed in pandas 2.0.
    with pd.ExcelWriter(excel_file_path, engine="openpyxl", mode=writer_mode) as excel_writer:
        df.to_excel(excel_writer, sheet_name=str(excel_sheet_name), index=False)

# This code finds the Jaccard index amongst the attribute sets

In [None]:
def jaccard_index_of_attribute_set(elem1, elem2):
    """Jaccard index of the attribute *names* carried by two elements.

    Only the keys of each element's ``attrs`` mapping are compared; the
    attribute values are ignored.

    Returns:
        ``|shared names| / |all names|``, or ``1`` when neither element has
        any attributes (two empty sets are treated as identical).
    """
    names_a = set(elem1.attrs.keys())
    names_b = set(elem2.attrs.keys())

    all_names = names_a | names_b
    if not all_names:
        # Avoid 0/0: two attribute-less elements count as a perfect match.
        return 1

    shared_names = names_a.intersection(names_b)
    return len(shared_names) / len(all_names)
    
    

# This code finds the cosine similarity for each attribute shared by the two elements

In [None]:
def get_cos_sim_attributes(elem1, elem2):
    """Cosine similarity of the values of every attribute both elements share.

    List-valued attributes are joined with single spaces before being
    embedded with the module-level ``model``. An attribute is skipped when
    exactly one of the two values is empty.

    Returns:
        Dict mapping each compared attribute name to a one-item list holding
        the cosine similarity of the two embedded values.
    """
    attrs_a, attrs_b = elem1.attrs, elem2.attrs
    similarities = {}

    for name in set(attrs_a.keys()) & set(attrs_b.keys()):
        value_a, value_b = attrs_a[name], attrs_b[name]

        # Flatten list values into one space-separated string.
        if isinstance(value_a, list):
            value_a = " ".join(value_a)
        if isinstance(value_b, list):
            value_b = " ".join(value_b)

        # Skip when exactly one side is empty; two empty values are still compared.
        if (len(value_a) == 0) != (len(value_b) == 0):
            continue

        # encode() on a single string gives one vector; reshape it to the
        # (1, n) row that cosine_similarity expects.
        vec_a = model.encode(value_a).reshape(1, -1)
        vec_b = model.encode(value_b).reshape(1, -1)

        similarities[name] = [cosine_similarity(vec_a, vec_b)[0][0]]

    return similarities
    

# This gets the cosine similarity of all the contents

In [None]:
def get_cos_sim_contents(elem1, elem2):
    """Cosine similarity between the combined contents of two elements.

    Each element is iterated to obtain its content pieces (presumably the
    child nodes of a bs4 Tag -- confirm against callers). Every piece is
    stringified with newlines removed; ``None`` entries and bare ``"\\n"``
    pieces are ignored.

    Returns:
        ``1`` when neither element has any content, ``None`` when exactly one
        of them has content, otherwise the cosine similarity of the two
        content embeddings produced by the module-level ``model``.
    """
    def _content_fragments(elem):
        # The previous filter read `if not None`, which is always true; the
        # intended check is on the item itself.
        return [
            str(child).replace('\n', '')
            for child in elem
            if child is not None and str(child) != "\n"
        ]

    fragments_1 = _content_fragments(elem1)
    fragments_2 = _content_fragments(elem2)

    if not fragments_1 and not fragments_2:
        return 1
    if not fragments_1 or not fragments_2:
        return None

    # Embed each element's contents as ONE text. Previously every fragment was
    # embedded separately and only the first-vs-first entry of the similarity
    # matrix was returned, so all later fragments were ignored. For elements
    # with a single fragment the encoded text is unchanged.
    enc1 = model.encode(" ".join(fragments_1)).reshape(1, -1)
    enc2 = model.encode(" ".join(fragments_2)).reshape(1, -1)

    return cosine_similarity(enc1, enc2)[0][0]

# This is the function which generates the data for a single website, given its element spreadsheet

In [None]:
def run_test_on_web(old_soup, new_soup, elements_data, file_path):
    """Score every candidate element of the new page against each tracked old element.

    For each row of ``elements_data`` the old page is searched for an element
    whose attribute (name taken from the row's second column) equals the
    value in the row's first column. The first match is compared with every
    element of the same tag name in the new page using three measures:
    the Jaccard index of attribute names, per-attribute cosine similarity and
    content cosine similarity.

    Side effects (all under ``file_path``, which must end with a path separator):
        * one sheet per row, named by the 1-based row number, is written to
          ``COSINE_ATTR.xlsx``, ``JACCARD.xlsx`` and ``COSINE_CONTENT.xlsx``;
        * ``info.txt`` gets one appended entry per row: the matched old
          element, or a not-found message (which is also printed).

    Args:
        old_soup: Parsed old page; must expose ``find_all``.
        new_soup: Parsed new page; must expose ``find_all``.
        elements_data: DataFrame whose first column holds the attribute value
            and whose second column holds the attribute name to look up.
        file_path: Output directory prefix.
    """
    info_path = file_path + 'info.txt'

    # The sheet id advances for every row, including rows whose element is
    # not found, so ids keep matching the row positions in elements_data.
    for curr_sheet_id, (_, row) in enumerate(elements_data.iterrows(), start=1):
        # Use .iloc for positional access: integer keys on a label-indexed
        # Series (row[0]) are deprecated in recent pandas.
        attr_value, attr_name = row.iloc[0], row.iloc[1]
        old_target_elem_arr = old_soup.find_all(attrs={attr_name: attr_value})

        if len(old_target_elem_arr) == 0:
            msg = f"{curr_sheet_id}: UNABLE TO FIND ELEMENT ON WEBSITE\nELEM NAME: {attr_value}\nELEM DATA: {attr_name}\n\n"
            print(msg)
            # The pages are read as UTF-8, so write the log as UTF-8 too
            # instead of relying on the platform default encoding.
            with open(info_path, 'a', encoding='utf-8') as f:
                f.write(msg)
            continue

        old_target_elem = old_target_elem_arr[0]
        new_elem_arr = new_soup.find_all(old_target_elem.name)

        # Collect plain records and build each DataFrame once afterwards;
        # growing a DataFrame with pd.concat inside the loop is quadratic.
        jaccard_rows = []
        content_rows = []
        attr_rows = []
        attr_columns = ["Element"]  # "Element" first, then attributes in order of first appearance

        for new_elem in new_elem_arr:
            elem_html = str(new_elem)

            jaccard_rows.append({
                "Element": elem_html,
                "Jaccard Index": jaccard_index_of_attribute_set(old_target_elem, new_elem),
            })
            content_rows.append({
                "Element": elem_html,
                "Content Cosine Similarity": get_cos_sim_contents(old_target_elem, new_elem),
            })

            # get_cos_sim_attributes maps attribute name -> one-item list.
            attr_row = {}
            for name, sims in get_cos_sim_attributes(old_target_elem, new_elem).items():
                attr_row[name] = sims[0]
                if name not in attr_columns:
                    attr_columns.append(name)
            # Assigned last so it wins over an attribute literally named "Element".
            attr_row["Element"] = elem_html
            attr_rows.append(attr_row)

        df_jaccard = pd.DataFrame(jaccard_rows, columns=["Element", "Jaccard Index"])
        df_cosine_content = pd.DataFrame(content_rows, columns=["Element", "Content Cosine Similarity"])
        df_cosine_attr = pd.DataFrame(attr_rows, columns=attr_columns)

        # Replace missing scores (an attribute that was not compared for a
        # candidate, or an undefined content similarity) with the text 'NaN'.
        df_cosine_attr = df_cosine_attr.fillna('NaN')
        df_jaccard = df_jaccard.fillna('NaN')
        df_cosine_content = df_cosine_content.fillna('NaN')

        write_to_path(df_cosine_attr, file_path, "COSINE_ATTR.xlsx", curr_sheet_id)
        write_to_path(df_jaccard, file_path, "JACCARD.xlsx", curr_sheet_id)
        write_to_path(df_cosine_content, file_path, "COSINE_CONTENT.xlsx", curr_sheet_id)

        # Record which old element this sheet id refers to.
        with open(info_path, 'a', encoding='utf-8') as f:
            f.write(f"{curr_sheet_id}: {str(old_target_elem)}\n")
        

# Actual code below

In [None]:
# Every sub-folder of ./Data is treated as one website. Sort the names so the
# processing order is reproducible; os.listdir returns entries in arbitrary order.
subdirectories = sorted(
    subdir for subdir in os.listdir("./Data")
    if os.path.isdir(os.path.join("./Data", subdir))
)


for web in subdirectories:
    file_path = f"./Data/{web}/"

    # Reuse the shared loader instead of repeating the open/read/parse steps.
    old_soup = get_html_soup(file_path + "old.html")
    new_soup = get_html_soup(file_path + "new.html")

    if old_soup is None or new_soup is None:
        # get_html_soup has already printed which file is missing. Skip this
        # site rather than aborting the whole batch.
        print(f"Skipping {web}: both old.html and new.html are required.\n\n")
        continue

    # The workbook is expected to hold one sheet per website, named after its folder.
    elements_data = pd.read_excel("./Web Element Data.xlsx", sheet_name=web)

    print(f"Running {web}")
    run_test_on_web(old_soup, new_soup, elements_data, file_path)
    print("Finished!\n\n")


print("DONE!")