# Mount Google Drive

In [None]:
import os

def list_folder_tree(startpath=None, max_files_per_dir=None):
    """Return an indented text tree of the folders and files below ``startpath``.

    Args:
        startpath: Directory to walk. Defaults to the current working directory.
        max_files_per_dir: If given, list at most this many files per directory
            and append a "...and N more files" line for the remainder.
            ``None`` lists every file.

    Returns:
        A single string with one entry per line. Directories end in ``/`` and
        each nesting level is indented by four spaces.
    """
    if startpath is None:
        startpath = os.getcwd()  # Set default to current working directory

    # Normalise first: with a trailing separator the root's basename would be
    # empty and child directories would be rendered at the root's indent level.
    startpath = os.path.normpath(startpath)

    tree = []
    for root, dirs, files in os.walk(startpath):
        # Sort in place so os.walk descends in a deterministic order.
        dirs.sort()
        files = sorted(files)

        # Depth is derived from the path relative to the start directory rather
        # than from string replacement, which also strips the start path when it
        # happens to re-occur deeper inside `root`.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1

        indent = ' ' * 4 * level
        tree.append(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 4 * (level + 1)

        # Apply the file listing limit
        files_to_show = files if max_files_per_dir is None else files[:max_files_per_dir]
        tree.extend(f"{subindent}{f}" for f in files_to_show)

        # Summarise whatever the limit cut off
        if max_files_per_dir is not None and len(files) > max_files_per_dir:
            tree.append(f"{subindent}...and {len(files) - max_files_per_dir} more files")

    return "\n".join(tree)

In [None]:
from google.colab import drive
import os

# Project folder inside the mounted Drive; the relative paths used by later
# cells (e.g. "data/SS.csv") are resolved against it after the chdir below
gdrive_path='/content/gdrive/MyDrive/SWQ'

# Mount Google Drive at /content/gdrive (gdrive_path points into its 'MyDrive' folder)
drive.mount('/content/gdrive')
# Make the project folder the working directory so this notebook can use relative paths
os.chdir(gdrive_path)
# Optional manual check that all files are present (uncomment to print the folder tree)
#print(list_folder_tree(max_files_per_dir=0))

Mounted at /content/gdrive


# Imports

In [None]:
from PIL import Image, ImageDraw
from IPython.display import display
import requests
import time
import base64

import pandas as pd

from tqdm import tqdm

# Pick up edits to imported project modules live, without restarting the kernel
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook
%matplotlib inline

In [None]:
!apt update --quiet
!apt install chromium-chromedriver --quiet
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium --quiet

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [973 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2,591 kB]
Hit:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,125 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,994 kB

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [None]:
from bs4 import BeautifulSoup, Comment

In [None]:
from src.util import ScreenshotTaker, get_thumbnail_for_llava, concat_images_with_line, concat_images_with_line_full_res

# Organizing Files

In [None]:
import os
import shutil
from tqdm import tqdm

def move_files(source_dir, target_dir, prefix='fullres'):
    """Move every file whose name starts with ``prefix`` from ``source_dir`` to ``target_dir``.

    The directory layout below ``source_dir`` is reproduced below ``target_dir``.
    ``target_dir`` may live inside ``source_dir``; it is then excluded from the
    walk, so running the function again does not move already-moved files one
    level deeper (e.g. into ``fullres/fullres``).

    Args:
        source_dir: Directory tree to search.
        target_dir: Directory the matching files are moved into.
        prefix: Filename prefix selecting the files to move.
    """
    target_abs = os.path.abspath(target_dir)

    # First, collect all files that will be moved (the tree is modified afterwards)
    files_to_move = []
    for root, dirs, files in os.walk(source_dir):
        # Prune the target folder in place so os.walk never descends into it
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != target_abs
        ]
        files_to_move.extend((root, file) for file in files if file.startswith(prefix))

    # Use tqdm to display progress
    for root, file in tqdm(files_to_move, desc='Moving files'):
        rel_path = os.path.relpath(root, source_dir)
        # normpath drops the trailing "." produced for files directly in source_dir
        dest_dir = os.path.normpath(os.path.join(target_dir, rel_path))

        # Ensure the destination directory exists
        os.makedirs(dest_dir, exist_ok=True)

        # Move the file
        shutil.move(os.path.join(root, file), os.path.join(dest_dir, file))

In [None]:
# Folder holding the concatenated screenshots, and the sub-folder the
# full-resolution ones are moved into (both are under data/processed)
# NOTE(review): `appname` is first assigned in the "Screenshots" section further
# down, so on a fresh kernel this cell raises NameError unless that cell ran first.
source_dir = f"data/processed/{appname}/screenshots/concat"
target_dir = f"data/processed/{appname}/screenshots/concat/fullres"

# Call the function to move files (left disabled; uncomment to run it)
#move_files(source_dir, target_dir)

Moving files: 100%|██████████| 8515/8515 [00:44<00:00, 192.23it/s]


# Screenshots

In [None]:
# Table of state pairs; later cells read its 'appname', 'state1' and 'state2' columns
all_states_df = pd.read_csv("data/SS.csv")

In [None]:
# Application to process: selects the rows of all_states_df and the
# data/raw/<appname> and data/processed/<appname> folders used below
appname = "phoenix"

In [None]:
# Keep only the rows of the state table that belong to the selected application
is_current_app = all_states_df['appname'].eq(appname)
current_app_states_df = all_states_df.loc[is_current_app]

In [None]:
# Render every raw HTML snapshot of the app to a PNG screenshot
directory = f"data/raw/{appname}"
save_to_directory = f"data/processed/{appname}/screenshots"

# Screenshots exceeding either limit additionally get a thumbnail
# (presumably the image size intended for LLaVA — TODO confirm)
MAX_WIDTH, MAX_HEIGHT = 667, 336

# Create the destination folder once, instead of re-checking it for every file
os.makedirs(save_to_directory, exist_ok=True)

screenshot_taker = ScreenshotTaker()
try:
  for filename in tqdm(os.listdir(directory)):
    if not filename.endswith(".html"):
      continue

    file_path = os.path.join(directory, filename)

    # Same base name as the HTML file, with a .png extension
    filename_without_extension = os.path.splitext(filename)[0]
    save_to_path = os.path.join(save_to_directory, f"{filename_without_extension}.png")

    width, height = screenshot_taker.take_screenshot(file_path, save_to_path)
    if width > MAX_WIDTH or height > MAX_HEIGHT:
      get_thumbnail_for_llava(save_to_path)
finally:
  # Always release the screenshot taker, even if a screenshot raises
  screenshot_taker.close()

100%|██████████| 167/167 [01:52<00:00,  1.48it/s]


In [None]:
images_folder: str = f"data/processed/{appname}/screenshots"
save_folder: str = f"data/processed/{appname}/screenshots/concat"

# ensure concat folder exists
os.makedirs(save_folder, exist_ok=True)


def _thumbnail_or_original(folder, state):
  """Return the state's thumbnail path if that file exists, else its regular screenshot path."""
  thumbnail = f"{folder}/{state}_thumbnail.png"
  return thumbnail if os.path.exists(thumbnail) else f"{folder}/{state}.png"


# Build one side-by-side image per state pair of the current app
for index, row in tqdm(current_app_states_df.iterrows(), total=current_app_states_df.shape[0]):

  output_path = f"{save_folder}/concat_{row['state1']}_{row['state2']}.png"

  # Already rendered: skip before probing the input files, so a re-run is cheap
  if os.path.exists(output_path):
    continue

  state1_thumb = _thumbnail_or_original(images_folder, row['state1'])
  state2_thumb = _thumbnail_or_original(images_folder, row['state2'])

  concat_images_with_line(state1_thumb, state2_thumb, output_path)

100%|██████████| 11175/11175 [08:08<00:00, 22.88it/s]


In [None]:
save_folder: str = f"data/processed/{appname}/screenshots/concat"

# Spot check that one concatenated image exists on disk.
# NOTE(review): the path hard-codes the "petclinic" app instead of using
# save_folder / appname, so it does not check the app selected above — confirm intent.
print(os.path.isfile(f"data/processed/petclinic/screenshots/concat/concat_state518_state714.png"))

True


In [None]:
images_folder: str = f"data/processed/{appname}/screenshots"
save_folder: str = f"data/processed/{appname}/screenshots/concat"

# ensure concat folder exists
os.makedirs(save_folder, exist_ok=True)

# Build one full-resolution side-by-side image per state pair of the current app
for index, row in tqdm(current_app_states_df.iterrows(), total=current_app_states_df.shape[0]):

  output_path = f"{save_folder}/fullres_concat_{row['state1']}_{row['state2']}.png"

  # Skip pairs that were already rendered, as the thumbnail cell does, so this
  # slow cell can be resumed; delete the file to force regeneration
  if os.path.exists(output_path):
    continue

  # Full-resolution inputs: always the regular screenshots, never the thumbnails
  state1_thumb = f"{images_folder}/{row['state1']}.png"
  state2_thumb = f"{images_folder}/{row['state2']}.png"

  concat_images_with_line_full_res(state1_thumb, state2_thumb, output_path)

100%|██████████| 8515/8515 [43:15<00:00,  3.28it/s]


# HTML Preprocessing

In [None]:
def remove_long_comments_and_tags(html, max_length=50):
    """Strip non-structural noise from an HTML document and return its prettified body.

    Removes HTML comments longer than ``max_length`` characters, all
    ``<style>``, ``<meta>`` and ``<script>`` tags, and the ``class``, ``style``
    and ``data-reactid`` attributes of every remaining tag.

    Args:
        html: HTML source as a string.
        max_length: Comments with more characters than this are removed.

    Returns:
        The prettified ``<body>`` element. If the document has no ``<body>``
        (e.g. an HTML fragment), the whole cleaned document is prettified instead.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove long comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if len(comment) > max_length:
            comment.extract()

    # Remove specified tags
    for tag in soup(["style", "meta", "script"]):
        tag.extract()

    # Remove presentation / framework attributes from all tags
    for tag in soup.find_all():
        for attribute in ('class', 'style', 'data-reactid'):
            tag.attrs.pop(attribute, None)

    # Extract only the body content when there is one; soup.body is None for
    # documents without a <body> tag, which previously raised AttributeError
    root = soup.body if soup.body is not None else soup
    return root.prettify()

In [None]:
def get_content_from_html(html):
    """Return the text content of ``html`` with every run of whitespace collapsed to one space."""
    parsed = BeautifulSoup(html, 'html.parser')
    words = parsed.get_text().split()
    return ' '.join(words)

In [None]:
html_folder: str = f"data/raw/{appname}"
target_folder: str = f"data/processed/{appname}/"

# Make sure the output folder is there before anything is written into it
if not os.path.exists(target_folder):
    os.makedirs(target_folder)

# Clean every raw HTML snapshot and store the result in the processed folder
for filename in tqdm(os.listdir(html_folder)):
    # Anything that is not an HTML snapshot is left alone
    if not filename.endswith(".html"):
        continue

    # Output keeps the original file name and appends the list of applied clean-ups
    source_file_path = os.path.join(html_folder, filename)
    target_file_path = os.path.join(
        target_folder, filename + ".body.nostyle.noscript.nometa.nolongcomment"
    )

    with open(source_file_path, 'r', encoding='utf-8') as raw_file:
        html_content = raw_file.read()

    cleaned_html = remove_long_comments_and_tags(html_content)

    with open(target_file_path, 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.write(cleaned_html)

100%|██████████| 167/167 [00:08<00:00, 19.32it/s]


In [None]:
# Spot check that one cleaned HTML file exists on disk.
# NOTE(review): the ".data-reactid" suffix checked here is not what the cleaning
# cell above writes (its file names end in ".nolongcomment"), and the app name is
# hard-coded rather than taken from `appname` — confirm which naming is current.
print(os.path.isfile(f"data/processed/phoenix/state123.html.body.nostyle.noscript.nometa.nolongcomment.data-reactid"))

True


In [None]:
html_folder: str = f"data/raw/{appname}"

# Read one sample page. The context manager closes the file handle, and the
# explicit encoding matches how the batch-cleaning cell reads the same files.
with open(f"{html_folder}/state10.html", "r", encoding="utf-8") as HTMLFile:
    html_content = HTMLFile.read()

cleaned_html = remove_long_comments_and_tags(html_content)

# Show the cleaned markup and its length in characters
print(cleaned_html)
print(len(cleaned_html))