In [1]:
from src.extract_elements import extract_ui_elements, load_model
from src.UIElementClassifier import UIElementClassifier
from src.UNET import UNet as SegmentationModel
from src.generate_prompt import generate_prompt_for_llm
from src.askgpt35 import get_model_response
import warnings
warnings.filterwarnings("ignore")  # Silence warnings: a zero-division warning appears at the beginning of the run and is not a problem.

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import ast

def calculate_number_accuracy(predicted, actual):
    """
    Score numeric answers by exact-match accuracy and mean absolute error.

    Args:
        predicted: Sequence of predicted values; each must be convertible to float.
        actual: Sequence of ground-truth values, aligned index-by-index with
            `predicted`.

    Returns:
        Tuple `(accuracy, mae)`: the fraction of positions where the two values
        are equal, and the mean absolute difference between them. Both are 0.0
        when there are no answers to score, mirroring the empty-input
        convention of `calculate_coordinates_accuracy`.

    Raises:
        ValueError: If a value is a string that cannot be parsed as a number
            (raised by NumPy during conversion).
    """
    # float64 represents every integer up to 2**53 exactly, whereas float32
    # collapses neighbouring integers above 2**24 and would report them as equal.
    predicted = np.asarray(predicted, dtype=np.float64)
    actual = np.asarray(actual, dtype=np.float64)

    # np.mean of an empty array is nan and emits a RuntimeWarning; report zeros instead.
    if predicted.size == 0 and actual.size == 0:
        return 0.0, 0.0

    accuracy = np.mean(predicted == actual)
    mae = np.mean(np.abs(predicted - actual))
    return accuracy, mae

def calculate_yes_no_accuracy(predicted, actual):
    """
    Score yes/no answers by exact-match accuracy.

    Args:
        predicted: Sequence of predicted answers.
        actual: Sequence of ground-truth answers, aligned index-by-index with
            `predicted`.

    Returns:
        Fraction of positions where the predicted answer equals the actual one
        (element-wise `==`, so no normalisation is applied), or 0.0 when there
        are no answers to score.
    """
    predicted = np.asarray(predicted)
    actual = np.asarray(actual)

    # np.mean of an empty array is nan and emits a RuntimeWarning; report zero instead.
    if predicted.size == 0 and actual.size == 0:
        return 0.0

    return np.mean(predicted == actual)

def calculate_string_accuracy(predicted, actual):
    """
    Score free-text answers by exact-match accuracy.

    Args:
        predicted: Sequence of predicted strings.
        actual: Sequence of ground-truth strings, aligned index-by-index with
            `predicted`.

    Returns:
        Fraction of positions where the predicted string equals the actual one
        (element-wise `==`, so no normalisation is applied), or 0.0 when there
        are no answers to score.
    """
    predicted = np.asarray(predicted)
    actual = np.asarray(actual)

    # np.mean of an empty array is nan and emits a RuntimeWarning; report zero instead.
    if predicted.size == 0 and actual.size == 0:
        return 0.0

    return np.mean(predicted == actual)

def calculate_coordinates_accuracy(predicted, actual):
    """
    Compute the fraction of predicted points that fall inside their target box.

    Args:
        predicted: Sequence of `(x, y)` points.
        actual: Sequence of target boxes aligned index-by-index with
            `predicted`. Each box is two corner points, e.g.
            `[(752, 72), (994, 152)]`, given either as the string form of that
            Python literal or as the already-parsed structure. The corners may
            be listed in either order.

    Returns:
        Fraction of points lying inside their box (edges inclusive), or 0 when
        there are no pairs to score.
    """
    inside = []
    for pred, box in zip(predicted, actual):
        # Boxes loaded from the answers table arrive as text; parse them safely.
        if isinstance(box, str):
            box = ast.literal_eval(box)

        # Flatten the two corner tuples into four scalars.
        x1, y1, x2, y2 = [coord for corner in box for coord in corner]

        # Normalise so the test below holds regardless of corner order.
        xmin, xmax = min(x1, x2), max(x1, x2)
        ymin, ymax = min(y1, y2), max(y1, y2)

        px, py = pred
        inside.append(xmin <= px <= xmax and ymin <= py <= ymax)

    return np.mean(inside) if inside else 0


In [3]:
import pandas as pd
# The question set is stored tab-separated; read_table splits on tabs by default.
questions_df = pd.read_table("data/MAC/ui_questions_test.tsv")
questions_df

Unnamed: 0,App bundle,App name,Screen id,Question,Answer,Answer Type
0,com.onmyway133.Almighty-setapp,Almighty,1707228310,"On the visible screen, how many toggle buttons...",0,number
1,com.onmyway133.Almighty-setapp,Almighty,1707228310,"How many ""+"" buttons are there?",1,number
2,com.onmyway133.Almighty-setapp,Almighty,1707228237,How many toggle buttons are turned on?,0,number
3,com.onmyway133.Almighty-setapp,Almighty,1707228237,How many search bars are on the screen?,1,number
4,com.onmyway133.Almighty-setapp,Almighty,1707228237,Is there a button to take a screenshot?,No,yes/no
...,...,...,...,...,...,...
223,com.devuap.Typing-Mind-setapp,Typing-Mind,1707209841,What is the title of current window?,AI Characters,string
224,com.devuap.Typing-Mind-setapp,Typing-Mind,1707209807,"I want to change the model type, where should ...","[(752, 72), (994, 152)]",coordinates
225,com.devuap.Typing-Mind-setapp,Typing-Mind,1707209975,Is Ukrainian among human-approved languages?,No,yes/no
226,com.devuap.Typing-Mind-setapp,Typing-Mind,1707209975,How many language options are provided?,11,number


In [4]:
# Derive each question's screenshot folder, data/MAC/test/<App name>/<Screen id>/,
# then drop the columns that were only needed to build that path.
# The guard keeps the cell re-runnable: once the source columns are gone a second
# run would otherwise fail with a KeyError. A new frame is built instead of
# mutating the existing one in place.
if 'img_folder' not in questions_df.columns:
    questions_df = questions_df.assign(
        img_folder='data/MAC/test/' + questions_df['App name'] + '/' + questions_df['Screen id'].astype(str) + '/'
    ).drop(columns=['App name', 'Screen id', 'App bundle'])
questions_df

Unnamed: 0,Question,Answer,Answer Type,img_folder
0,"On the visible screen, how many toggle buttons...",0,number,data/MAC/test/Almighty/1707228310/
1,"How many ""+"" buttons are there?",1,number,data/MAC/test/Almighty/1707228310/
2,How many toggle buttons are turned on?,0,number,data/MAC/test/Almighty/1707228237/
3,How many search bars are on the screen?,1,number,data/MAC/test/Almighty/1707228237/
4,Is there a button to take a screenshot?,No,yes/no,data/MAC/test/Almighty/1707228237/
...,...,...,...,...
223,What is the title of current window?,AI Characters,string,data/MAC/test/Typing-Mind/1707209841/
224,"I want to change the model type, where should ...","[(752, 72), (994, 152)]",coordinates,data/MAC/test/Typing-Mind/1707209807/
225,Is Ukrainian among human-approved languages?,No,yes/no,data/MAC/test/Typing-Mind/1707209975/
226,How many language options are provided?,11,number,data/MAC/test/Typing-Mind/1707209975/


In [5]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer; trim_text_to_fit below uses it to count
# and cut tokens.
# NOTE(review): token counts come from GPT-2's vocabulary, so they presumably
# only approximate the limits of the model behind get_model_response — confirm.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

def trim_text_to_fit(text, max_tokens):
    """
    Cap `text` at `max_tokens` tokens, as counted by the module-level tokenizer.

    Text that already fits is returned unchanged. Otherwise only the first
    `max_tokens` tokens are kept and decoded back into a string; everything
    after that point is discarded without any attempt at smarter reduction.
    """
    token_ids = tokenizer.encode(text)

    # Nothing to do when the text is already within budget.
    if len(token_ids) <= max_tokens:
        return text

    # Keep the leading tokens only and turn them back into text.
    return tokenizer.decode(token_ids[:max_tokens], clean_up_tokenization_spaces=True)



In [None]:
import os
import glob
import torch
import json
from tqdm import tqdm


# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Restore both models from their checkpoints onto the chosen device.
classification_model = load_model(
    UIElementClassifier,
    'models_checkpoints/best_model_classification.pth',
    num_classes=3,
    device=device,
)
segmentation_model = load_model(
    SegmentationModel,
    'models_checkpoints/checkpoint_UNET.pt',
    num_classes=1,
    device=device,
)

# One list per answer type: model responses and the matching ground-truth answers.
responses = {answer_type: [] for answer_type in ('number', 'yes/no', 'string', 'coordinates')}
correct_answers = {answer_type: [] for answer_type in ('number', 'yes/no', 'string', 'coordinates')}

# Token budgets for the screen description sent to the LLM: the first attempt
# uses the larger budget, the retry after a failed call uses the smaller one.
MAX_PROMPT_TOKENS = 16000
FALLBACK_PROMPT_TOKENS = 15000

# (row index, error) for every question that could not be processed, so that
# rows missing from the scores stay visible instead of disappearing silently.
skipped_rows = []

for index, row in questions_df.iterrows():
    img_folder = row['img_folder']

    # Find image file in the folder (adjust pattern as necessary for specific image types)
    image_files = glob.glob(os.path.join(img_folder, '*.png')) + glob.glob(os.path.join(img_folder, '*.jpg'))
    json_files = glob.glob(os.path.join(img_folder, '*.json'))

    image_file = image_files[0] if image_files else None
    json_file = json_files[0] if json_files else None

    # Best effort: a screen that cannot be processed is skipped, not fatal.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit through, so the run can still be stopped by hand.
    try:
        ui_elements = extract_ui_elements(image_file, segmentation_model, classification_model, device)
        # The second-to-last path component of img_folder is the screen id.
        text_repr = generate_prompt_for_llm(ui_elements, img_folder.split('/')[-2])
    except Exception as exc:
        skipped_rows.append((index, repr(exc)))
        continue

    # Append the JSON file found next to the screenshot, when there is one.
    if json_file is not None:
        with open(json_file, 'r') as f:
            data = json.load(f)
        text_repr = text_repr + "\n\nHere also goes the JSON accesibility tree data.\n" + json.dumps(data, indent=4)

    # Get the question from the DataFrame
    question = row['Question']
    answer_type = row['Answer Type']
    correct_answer = row['Answer']

    # Get the response from the model. The trimmed text is what gets sent; if
    # the call still fails, retry once with the smaller budget. A failure of
    # the retry propagates.
    try:
        trimmed_text_repr = trim_text_to_fit(text_repr, MAX_PROMPT_TOKENS)
        response_from_llm = get_model_response(answer_type, question, trimmed_text_repr)
    except Exception:
        trimmed_text_repr = trim_text_to_fit(text_repr, FALLBACK_PROMPT_TOKENS)
        response_from_llm = get_model_response(answer_type, question, trimmed_text_repr)

    # Scores are computed from these lists once the loop has finished.
    responses[answer_type].append(response_from_llm)
    correct_answers[answer_type].append(correct_answer)

if skipped_rows:
    print(f"Skipped {len(skipped_rows)} of {len(questions_df)} questions; see skipped_rows for details.")


# Final scores per answer type, computed over everything collected in
# `responses` and `correct_answers`.
number_accuracy, number_mae = calculate_number_accuracy(
    predicted=responses['number'],
    actual=correct_answers['number'],
)
yes_no_accuracy = calculate_yes_no_accuracy(
    predicted=responses['yes/no'],
    actual=correct_answers['yes/no'],
)
string_accuracy = calculate_string_accuracy(
    predicted=responses['string'],
    actual=correct_answers['string'],
)
coordinates_accuracy = calculate_coordinates_accuracy(
    predicted=responses['coordinates'],
    actual=correct_answers['coordinates'],
)

In [3]:
# Report one line per answer type, each value rounded to five decimals.
print(
    f"Number Accuracy: {number_accuracy:.5f}, Number MAE: {number_mae:.5f}",
    f"Yes/No Accuracy: {yes_no_accuracy:.5f}",
    f"String Accuracy: {string_accuracy:.5f}",
    f"Coordinates Accuracy: {coordinates_accuracy:.5f}",
    sep="\n",
)

Number Accuracy: 0.73241, Number MAE: 3.13232
Yes/No Accuracy: 0.86785
String Accuracy: 0.53214
Coordinates Accuracy: 0.70242
