In [None]:
# `display` and `HTML` are part of the public `IPython.display` API; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets the height of scrollable output areas to 44em.
display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

import copy
import json
import math
import os
import pickle
import re
import uuid
from ast import literal_eval
from typing import Dict, Generator, List, Optional, Set, Text, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.axes_grid1 import ImageGrid
# Apply seaborn's default theme to all subsequently created matplotlib figures.
sns.set_theme()

# Raise pandas' display limits so that wide/long records are rendered with far
# less truncation when a frame is shown in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', 5000)

# 1. Load GUI Datasets

In [None]:
# Root folder of the datasets, relative to the notebook's working directory.
data_path = "../data/"
# Folder with the Rico screenshots (one "<id>.jpg" per GUI), located below data_path.
rico_path = data_path + "rico/unique_uis/combined/"

In [None]:
# Load the GUI dataset; each row describes one GUI.
all_guis_with_comps = pd.read_csv(data_path + "all_guis.csv")
# The 'data' column is stored as the textual repr of a Python literal in the CSV;
# literal_eval turns every cell back into the corresponding Python object.
all_guis_with_comps['data'] = all_guis_with_comps['data'].apply(literal_eval)

In [None]:
# Preview the first two GUI records.
all_guis_with_comps.head(2)

In [None]:
def show_images(ranked, img_path, relevances=None):
    """Plot the screenshots of retrieved GUIs in a three-column image grid.

    Args:
        ranked: Sequence of entries whose first item is the GUI id (the screenshot is
            read from ``img_path + str(id) + ".jpg"``) and whose second item is a
            numeric score that is rounded to two decimals for the title.
        img_path: Folder prefix (including the trailing separator) of the screenshots.
        relevances: Optional sequence parallel to ``ranked``. A value different from 0
            is appended to the image title. Defaults to all zeros (nothing appended).

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # Nothing to draw; a grid with zero rows cannot be laid out.
    if len(ranked) == 0:
        return

    n_cols = 3
    n_rows = int(math.ceil(len(ranked) / n_cols))
    fig = plt.figure(figsize=(15, 15))  # overall figure size
    grid = ImageGrid(fig, 111,  # similar to subplot(111)
                     nrows_ncols=(n_rows, n_cols),
                     axes_pad=0.3, share_all=True)
    if relevances is None:
        relevances = np.zeros(len(ranked))
    for ax, rank, relevance in zip(grid, ranked, relevances):
        rico_id = rank[0]
        # NOTE(review): `Image` is not imported in the visible cells; presumably
        # `PIL.Image` imported elsewhere in the notebook - confirm.
        # The context manager closes the underlying file once the pixels are copied.
        with Image.open(img_path + str(rico_id) + ".jpg") as img:
            pixels = np.array(img.resize((1080, 1920)))
        ax.imshow(pixels)
        ax.grid(False)
        title = str(rico_id) + ", " + str(round(rank[1], 2))
        if relevance != 0:
            # str() so numeric relevance labels do not raise a TypeError.
            title += ", " + str(relevance)
        ax.title.set_text(title)

    plt.show()

In [None]:
# Identifiers of the per-component feature extraction methods (dispatched in
# features_to_str).
FEAT_METHOD_TEXT_ONLY = 'feat_method_text_only'
FEAT_METHOD_TEXT_COMP_TYPE = 'feat_method_text_comp_type'
FEAT_METHOD_TEXT_COMP_TYPE_RES_ID = 'feat_method_text_comp_type_res_id'
FEAT_METHOD_HTML = 'feat_method_html'

# Identifiers of the layouts in which the component features are arranged
# (handled in structure_to_str).
STRUCT_METHOD_SIMPLE_BULLETS = 'struct_method_simple_bullets'
STRUCT_METHOD_SIMPLE_BULLETS_SORTED = 'struct_method_simple_bullets_sorted'
STRUCT_METHOD_TWO_LEVEL_BULLETS = 'struct_method_two_level_bullets'
STRUCT_METHOD_TWO_LEVEL_HTML = 'struct_method_two_level_html'

# Keys of the `style` dict passed to the feat_method_* functions; a truthy value
# enables the corresponding style attribute in the generated feature string.
STYLE_SIZE = 'style_size'
STYLE_BOUNDS = 'style_bounds'
STYLE_RICO_ID = 'style_rico_id'
STYLE_BACK_COLOR = 'style_back_color'
STYLE_FONT_COLOR = 'style_font_color'
STYLE_FONT_SIZE = 'style_font_size'

# Default tokens dropped from resource ids by normalize_resource_id
# (compared against the lower-cased token).
stop_words_r_ids = {'main', 'content', 'navigation', 'bar', 'background', 'status',
                    'checkbox', 'widget', 'frame', 'container', 'action', 'btn', 'menu',
                    'label', 'root', 'toolbar', 'view', 'button', 'activity', 'layout',
                    'drawer', 'actionbar', 'icon', 'text', 'banner'}

# Refined component type -> (opening tag WITHOUT the closing '>', closing tag).
# feat_method_html appends optional attributes and the '>' itself.
# NOTE(review): the 'Text' entry already contains '>' unlike all other entries;
# get_refined_comp_type maps 'Text' to 'Label', so it looks unused here - confirm.
html_comp_mapping = {'Web View': ('<div', '</div>'),
                     'Icon': ('<i class="material-icons"', '</i>'),
                     'Button': ('<button type="button"', '</button>'),
                     'Label': ('<p', '</p>'),
                     'Video': ('<video ', '</video> '),
                     'Image': ('<img src="example.jpg"', ''),
                     'Background Image': ('<img src="example.jpg"', ''),
                     'Text': ('<p>', '</p>'),
                     'Checkbox': ('<input type="checkbox"', '</input>'),
                     'Switch': ('<input type="checkbox"', '</input>'),
                     'Text Input': ('<input type="text"', '</input>'),
                     'Input': ('<input type="text"', '</input>'),
                     'Advertisement': ('<div', '</div>'),
                     'Slider': ('<input type="range" min="1" max="100"', '</input>'),
                     'Radio Button': ('<input type="radio"', '</input>'),
                     'Pager Indicator': ('<div', '</div>'),
                     'Map View': ('<div', '</div>')}

# Component-group label -> (opening tag WITHOUT the closing '>', closing tag);
# used by structure_to_str for STRUCT_METHOD_TWO_LEVEL_HTML.
html_comp_group_mapping = {
    'List Item': ('<li', '</li>'),
    'Card': ('<div', '</div>'),
    'Modal': ('<div class="modal"', '</div>'),
    'Map View': ('<div class="map"', '</div>'),
    'Toolbar': ('<menu', '</menu>'),
    'Multi-Tab': ('<div class="tab"', '</div>'),
    'Layout': ('<div class="layout"', '</div>')
}

def normalize_resource_id(resource_id: Text, filter_tokens: Optional[Set[Text]] = None,
                          tokenize: Optional[bool] = False) -> Union[List[Text], Text]:
    """Reduce a resource id to its meaningful name tokens.

    Only the part after the last '/' is kept. It is split on underscores and
    camel-case boundaries, and every token whose lower-cased form is a stop word
    is dropped.

    Args:
        resource_id: Full resource id, e.g. ``'com.app:id/loginButton'``.
        filter_tokens: Lower-case tokens to drop. ``None`` selects the module default
            ``stop_words_r_ids``; an explicitly passed empty set disables filtering.
        tokenize: If true, return the token list; otherwise the tokens joined by a
            single space.

    Returns:
        A list of tokens when ``tokenize`` is true, else one space-joined string.
    """
    # `is None` (not truthiness) so an explicit empty set means "filter nothing".
    stopwords = stop_words_r_ids if filter_tokens is None else filter_tokens
    name = resource_id.rsplit('/', 1)[-1]
    norm_name = [token for token in snake_camel_case_split(name) if token.lower() not in stopwords]
    return norm_name if tokenize else ' '.join(norm_name)

def camel_case_split(identifier: Text) -> List[Text]:
    """Split ``identifier`` at camel-case boundaries.

    A boundary lies between a lower-case letter and a following upper-case letter,
    and before the last capital of an upper-case run that is followed by a
    lower-case letter (``'HTMLParser'`` -> ``['HTML', 'Parser']``).
    An empty string yields an empty list.
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    # The pattern has no capturing group, so findall returns the full matches.
    return re.findall(boundary_pattern, identifier)

def snake_case_split(identifier: Text) -> List[Text]:
    """Split ``identifier`` on underscores; empty segments are kept."""
    segments = identifier.split('_')
    return segments

def snake_camel_case_split(identifier: Text) -> List[Text]:
    """Split ``identifier`` on underscores first, then each part on camel-case boundaries."""
    tokens: List[Text] = []
    for snake_part in snake_case_split(identifier):
        tokens.extend(camel_case_split(snake_part))
    return tokens

def get_refined_comp_type(comp):
    """Map a component's ``componentLabel`` to the refined type name used in feature strings.

    'Input' and 'Text Button' components are refined by looking for substrings in the
    lower-cased ``class`` entry; every label without a special rule is returned as is.
    """
    label = comp['componentLabel']
    if label == 'On/Off Switch':
        return 'Switch'
    if label == 'Text':
        return 'Label'
    if label == 'Input':
        class_lower = comp['class'].lower()
        # Checked in this order; the first substring hit wins.
        for needle, refined in (('edittext', 'Text Input'),
                                ('checkbox', 'Checkbox'),
                                ('switch', 'Switch')):
            if needle in class_lower:
                return refined
        return 'Input'
    if label == 'Text Button':
        return 'Checkbox' if 'checkbox' in comp['class'].lower() else 'Button'
    return label

def feat_method_text_only(gui, n, m, to_lower, quote, style):
    """Describe a GUI only by the texts of its components.

    Args:
        gui: Dict with a ``'ui_comps'`` list of component dicts.
        n: Maximum number of text-bearing components to keep (in GUI order).
        m: Maximum number of space-separated words kept per text.
        to_lower: Lower-case every text.
        quote: Wrap every text in double quotes (applied after lower-casing).
        style: Unused by this method.

    Returns:
        Dict mapping component id -> text string.
        NOTE(review): the values are plain strings, whereas the other feat_method_*
        functions in this file return ``(text, bounds)`` tuples - confirm that
        consumers handle both shapes.
    """
    truncated = []
    for comp in gui['ui_comps']:
        if comp.get('text'):
            words = comp['text'].split(' ')[:m]
            truncated.append((comp['id'], ' '.join(words)))

    result = {}
    for comp_id, text in truncated[:n]:
        if to_lower:
            text = text.lower()
        if quote:
            text = '"' + text + '"'
        result[comp_id] = text
    return result

def feat_method_text_comp_type(gui, n, m, to_lower, quote, style):
    """Describe every component as ``<text> (<refined type>)`` plus optional style attributes.

    Args:
        gui: Dict with a ``'ui_comps'`` list of component dicts.
        n: Unused by this method.
        m: Unused by this method.
        to_lower: Lower-case the complete feature string.
        quote: Wrap the textual part in double quotes.
        style: Dict keyed by the ``STYLE_*`` constants; a truthy value adds the
            corresponding attribute in a trailing ``(...)`` group.

    Returns:
        Dict mapping component id -> ``(feature string, bounds)``.
    """
    def _quoted(text):
        return '"' + text + '"' if quote else text

    features = {}
    for ui_comp in gui['ui_comps']:
        comp_type = get_refined_comp_type(ui_comp)
        label = ui_comp.get('componentLabel')
        # Visible text, falling back to 'text_updated'; already quoted if requested.
        uic_text = _quoted((ui_comp.get('text') or ui_comp.get('text_updated') or '').strip())

        # `lead` is the textual part before the type; None -> only the type is shown.
        lead = None
        if label == 'Icon':
            icon_class = ui_comp.get('iconClass')
            # A missing iconClass previously crashed on `None.split`.
            if icon_class is not None:
                lead = _quoted(' '.join(icon_class.split('_')).strip())
        elif label == 'Text Button':
            button_class = ui_comp.get('buttonClass') or ui_comp.get('textButtonClass')
            lead = _quoted(' '.join(button_class.split('_')).strip()) if button_class else uic_text
        elif label == 'Input':
            if ui_comp.get('text'):
                # uic_text is already quoted once. The previous code quoted it a second
                # time or, with quote=False, read an unrelated `button_text` variable.
                lead = uic_text
            elif ui_comp.get('text_updated'):
                lead = _quoted(ui_comp.get('text_updated').strip().replace('\n', '').replace('\f', ''))
        elif ui_comp.get('text'):
            lead = uic_text
        feat_str = '(' + comp_type + ')' if lead is None else lead + ' (' + comp_type + ')'

        style_attrs = []
        if style.get(STYLE_SIZE):
            bounds = ui_comp['bounds']
            width, height = bounds[2] - bounds[0], bounds[3] - bounds[1]
            style_attrs.append('width:' + str(width))
            style_attrs.append('height:' + str(height))
        if style.get(STYLE_BOUNDS):
            bounds = ui_comp['bounds']
            # Bottom-right x is bounds[2], consistent with the width computation above.
            style_attrs.append('top_left_corner: (' + str(bounds[0]) + ", " + str(bounds[1]) + "), " +
                               'bottom_right_corner: (' + str(bounds[2]) + ", " + str(bounds[3]) + ")")
        if style.get(STYLE_RICO_ID):
            style_attrs.append('id: ' + str(ui_comp["id"]))
        if style.get(STYLE_BACK_COLOR) and ui_comp.get('bg_color'):
            style_attrs.append('bg_color:' + ui_comp.get('bg_color'))
        if style.get(STYLE_FONT_COLOR) and ui_comp.get('text_color'):
            style_attrs.append('text_color:' + ui_comp.get('text_color'))
        if style.get(STYLE_FONT_SIZE) and ui_comp.get('font_size'):
            style_attrs.append('font_size:' + str(int(ui_comp.get('font_size'))))
        if style_attrs:
            feat_str = feat_str + ' (' + '; '.join(style_attrs) + ')'
        if to_lower:
            feat_str = feat_str.lower()
        features[ui_comp.get('id')] = (feat_str, ui_comp.get('bounds'))
    return features

def feat_method_text_comp_type_res_id(gui, n, m, to_lower, quote, style, id=False):
    """Describe every component as ``<text> (<refined type>) (<resource id tokens>)``.

    Optional style attributes and an ``(id=...)`` suffix can be appended.

    Args:
        gui: Dict with a ``'ui_comps'`` list of component dicts.
        n: Unused by this method.
        m: Unused by this method.
        to_lower: Lower-case the feature string (applied before the id suffix is added).
        quote: Wrap the textual part in double quotes.
        style: Dict keyed by the ``STYLE_*`` constants; a truthy value adds the
            corresponding attribute in a trailing ``(...)`` group.
        id: If true, append `` (id=<second '_'-separated part of the component id>)``.

    Returns:
        Dict mapping component id -> ``(feature string, bounds)``.
    """
    def _quoted(text):
        return '"' + text + '"' if quote else text

    features = {}
    for ui_comp in gui['ui_comps']:
        comp_type = get_refined_comp_type(ui_comp)
        label = ui_comp.get('componentLabel')
        # Visible text, falling back to 'text_updated'; already quoted if requested.
        uic_text = _quoted((ui_comp.get('text') or ui_comp.get('text_updated') or '').strip())

        # `lead` is the textual part before the type; None -> only the type is shown.
        lead = None
        if label == 'Icon':
            icon_class = ui_comp.get('iconClass')
            # A missing iconClass previously crashed on `None.split`.
            if icon_class is not None:
                lead = _quoted(' '.join(icon_class.split('_')).strip())
        elif label == 'Text Button':
            button_class = ui_comp.get('buttonClass') or ui_comp.get('textButtonClass')
            lead = _quoted(' '.join(button_class.split('_')).strip()) if button_class else uic_text
        elif label == 'Input':
            if ui_comp.get('text'):
                # uic_text is already quoted once. The previous code quoted it a second
                # time or, with quote=False, read an unrelated `button_text` variable.
                lead = uic_text
            elif ui_comp.get('text_updated'):
                lead = _quoted(ui_comp.get('text_updated').strip().replace('\n', '').replace('\f', ''))
        elif ui_comp.get('text'):
            lead = uic_text
        feat_str = '(' + comp_type + ')' if lead is None else lead + ' (' + comp_type + ')'

        if ui_comp.get('resource-id'):
            feat_str += ' (' + normalize_resource_id(ui_comp.get('resource-id')) + ')'

        style_attrs = []
        if style.get(STYLE_SIZE):
            bounds = ui_comp['bounds']
            width, height = bounds[2] - bounds[0], bounds[3] - bounds[1]
            style_attrs.append('width:' + str(width))
            style_attrs.append('height:' + str(height))
        if style.get(STYLE_BOUNDS):
            bounds = ui_comp['bounds']
            # Bottom-right x is bounds[2], consistent with the width computation above.
            style_attrs.append('top_left_corner: (' + str(bounds[0]) + ", " + str(bounds[1]) + "), " +
                               'bottom_right_corner: (' + str(bounds[2]) + ", " + str(bounds[3]) + ")")
        if style.get(STYLE_RICO_ID):
            style_attrs.append('id: ' + str(ui_comp["id"]))
        if style.get(STYLE_BACK_COLOR) and ui_comp.get('bg_color'):
            style_attrs.append('bg_color:' + ui_comp.get('bg_color'))
        if style.get(STYLE_FONT_COLOR) and ui_comp.get('text_color'):
            style_attrs.append('text_color:' + ui_comp.get('text_color'))
        if style.get(STYLE_FONT_SIZE) and ui_comp.get('font_size'):
            style_attrs.append('font_size:' + str(int(ui_comp.get('font_size'))))
        if style_attrs:
            feat_str = feat_str + ' (' + '; '.join(style_attrs) + ')'
        if to_lower:
            feat_str = feat_str.lower()
        if id:
            feat_str = feat_str + ' (id=' + ui_comp.get('id').split('_')[1] + ')'
        features[ui_comp.get('id')] = (feat_str, ui_comp.get('bounds'))
    return features

def feat_method_html(gui, n, m, to_lower, quote, style):
    """Describe every component as a small HTML element.

    The tag comes from ``html_comp_mapping`` (keyed by the refined component type);
    component types without an entry fall back to a plain ``<div>``. The normalized
    resource id becomes the ``id`` attribute and enabled style attributes are put
    into a ``style`` attribute.

    Args:
        gui: Dict with a ``'ui_comps'`` list of component dicts.
        n: Unused by this method.
        m: Unused by this method.
        to_lower: Lower-case the complete element string.
        quote: Unused by this method.
        style: Dict keyed by the ``STYLE_*`` constants; a truthy value adds the
            corresponding attribute.

    Returns:
        Dict mapping component id -> ``(element string, bounds)``, the same shape the
        other tuple-returning feat_method_* functions produce.
    """
    features = {}
    for ui_comp in gui['ui_comps']:
        uic_text = (ui_comp.get('text') or ui_comp.get('text_updated') or '').strip()
        # Unknown component types used to crash on `None[0]`; use a generic container.
        open_tag, close_tag = html_comp_mapping.get(get_refined_comp_type(ui_comp), ('<div', '</div>'))
        feat_str = open_tag
        if ui_comp.get('resource-id'):
            feat_str += ' id="' + '-'.join(normalize_resource_id(ui_comp.get('resource-id')).split(' ')) + '"'

        style_attrs = []
        if style.get(STYLE_SIZE):
            bounds = ui_comp['bounds']
            width, height = bounds[2] - bounds[0], bounds[3] - bounds[1]
            style_attrs.append('width:' + str(width))
            style_attrs.append('height:' + str(height))
        if style.get(STYLE_BOUNDS):
            bounds = ui_comp['bounds']
            # Bottom-right x is bounds[2], consistent with the width computation above.
            style_attrs.append('top_left_corner: (' + str(bounds[0]) + ", " + str(bounds[1]) + "), " +
                               'bottom_right_corner: (' + str(bounds[2]) + ", " + str(bounds[3]) + ")")
        if style.get(STYLE_RICO_ID):
            style_attrs.append('id: ' + str(ui_comp["id"]))
        if style.get(STYLE_BACK_COLOR) and ui_comp.get('bg_color'):
            style_attrs.append('bg_color:' + ui_comp.get('bg_color'))
        if style.get(STYLE_FONT_COLOR) and ui_comp.get('text_color'):
            style_attrs.append('text_color:' + ui_comp.get('text_color'))
        if style.get(STYLE_FONT_SIZE) and ui_comp.get('font_size'):
            style_attrs.append('font_size:' + str(int(ui_comp.get('font_size'))))
        if style_attrs:
            feat_str = feat_str + ' style="' + ';'.join(style_attrs) + '"'
        feat_str += '>'

        label = ui_comp.get('componentLabel')
        if label == 'Icon':
            icon_class = ui_comp.get('iconClass')
            # A missing iconClass previously crashed on `None.split`.
            if icon_class is not None:
                feat_str += ' '.join(icon_class.split('_')).strip()
        elif label == 'Text Button':
            button_class = ui_comp.get('buttonClass') or ui_comp.get('textButtonClass')
            feat_str += ' '.join(button_class.split('_')).strip() if button_class else uic_text
        elif ui_comp.get('text'):
            feat_str += uic_text
        feat_str += close_tag

        # Lower-casing must happen per component, before the feature is stored.
        if to_lower:
            feat_str = feat_str.lower()
        # Store the bounds as well so the result is a (text, bounds) pair per component.
        features[ui_comp.get('id')] = (feat_str, ui_comp.get('bounds'))
    return features

def features_to_str(gui, feat_method, n, m, to_lower, quote, style, id):
    """Dispatch to the feature extraction function selected by ``feat_method``.

    Args:
        gui: GUI dict passed through to the selected method.
        feat_method: One of the ``FEAT_METHOD_*`` constants.
        n, m, to_lower, quote, style: Passed through unchanged.
        id: Only forwarded to ``feat_method_text_comp_type_res_id``.

    Returns:
        The mapping produced by the selected feat_method_* function.

    Raises:
        ValueError: If ``feat_method`` is not a known method. Previously ``None`` was
            returned silently and the failure only surfaced later.
    """
    if feat_method == FEAT_METHOD_TEXT_ONLY:
        return feat_method_text_only(gui, n, m, to_lower, quote, style)
    if feat_method == FEAT_METHOD_TEXT_COMP_TYPE:
        return feat_method_text_comp_type(gui, n, m, to_lower, quote, style)
    if feat_method == FEAT_METHOD_TEXT_COMP_TYPE_RES_ID:
        return feat_method_text_comp_type_res_id(gui, n, m, to_lower, quote, style, id)
    if feat_method == FEAT_METHOD_HTML:
        return feat_method_html(gui, n, m, to_lower, quote, style)
    raise ValueError('Unknown feat_method: ' + repr(feat_method))

def filter_uic_groups(uic_groups):
    """Keep only the groups that do not contain another group as a subset.

    Groups are ordered by descending member count. A group is dropped when any
    group later in that order (with a different ``'id'``) has all of its
    ``'ui_comp_ids'`` contained in the group's own ``'ui_comp_ids'``.
    The result keeps the size-descending order.
    """
    by_size_desc = sorted(uic_groups, key=lambda group: len(group['ui_comp_ids']), reverse=True)
    kept = []
    for position, candidate in enumerate(by_size_desc):
        contains_other_group = any(
            other['id'] != candidate['id']
            and set(other['ui_comp_ids']).issubset(candidate['ui_comp_ids'])
            for other in by_size_desc[position + 1:]
        )
        if not contains_other_group:
            kept.append(candidate)
    return kept

def comp_in_uic(ui_comp_id, ui_comp_groups):
    """Return True if ``ui_comp_id`` is listed in the ``'ui_comp_ids'`` of any group."""
    return any(ui_comp_id in group.get('ui_comp_ids') for group in ui_comp_groups)

def structure_to_str(gui, feat_mappings, struct_method, style):
    """Arrange the per-component feature strings into the final GUI text.

    Args:
        gui: Dict with ``'ui_comps'`` (component dicts with ``'id'`` and ``'bounds'``)
            and, for the two-level methods, ``'ui_comp_groups'``. ``gui`` is not modified.
        feat_mappings: Dict component id -> ``(feature text, bounds)`` as produced by
            the feat_method_* functions. Plain-string values (as returned by
            ``feat_method_text_only``) are accepted as well.
        struct_method: One of the ``STRUCT_METHOD_*`` constants.
        style: Unused by this function.

    Returns:
        The formatted string: a flat bullet list (optionally sorted top-to-bottom,
        left-to-right), or components nested below their group as bullets or HTML.

    Raises:
        ValueError: If ``struct_method`` is unknown (previously ``None`` was returned).
    """
    def _feat_text(value):
        # Indexing a plain string with [0] would silently yield its first character.
        return value if isinstance(value, str) else value[0]

    def _new_layout_group(bounds, ui_comp_ids):
        return {
            "componentLabel": "Layout",
            "bounds": bounds,
            "class": "android.widget.LinearLayout",
            "bg_color": "#FFFFFF",
            "ui_comp_ids": ui_comp_ids,
            'id': str(uuid.uuid4())
        }

    gui_mapping = {elem['id']: elem for elem in gui['ui_comps']}

    if struct_method == STRUCT_METHOD_SIMPLE_BULLETS:
        return '\n- ' + '\n- '.join([_feat_text(val) for val in feat_mappings.values()])

    if struct_method == STRUCT_METHOD_SIMPLE_BULLETS_SORTED:
        def _position(item):
            bounds = gui_mapping[item[0]]['bounds']
            return (bounds[1], bounds[0])
        uic_sorted = sorted(feat_mappings.items(), key=_position)
        return '\n- ' + '\n- '.join([_feat_text(val) for _, val in uic_sorted])

    if struct_method not in (STRUCT_METHOD_TWO_LEVEL_BULLETS, STRUCT_METHOD_TWO_LEVEL_HTML):
        raise ValueError('Unknown struct_method: ' + repr(struct_method))

    # Only the groups are extended below, so only they need to be copied.
    uic_groups = copy.deepcopy(gui['ui_comp_groups'])

    # Components that belong to no group get a synthetic single-element Layout group.
    # The ids are collected first so the newly added groups do not influence the check.
    single_ui_comp_ids = [ui_comp_id for ui_comp_id in feat_mappings
                          if not comp_in_uic(ui_comp_id, uic_groups)]
    for ui_comp_id in single_ui_comp_ids:
        uic_groups.append(_new_layout_group(gui_mapping[ui_comp_id].get('bounds'), [ui_comp_id]))

    filtered_uic_groups = filter_uic_groups(uic_groups)
    covered_ids = {comp_id for group in filtered_uic_groups for comp_id in group['ui_comp_ids']}
    # Iterate in GUI order (not set order) so the result is reproducible across runs.
    missing_ui_comp_ids = [comp_id for comp_id in gui_mapping if comp_id not in covered_ids]

    # Components lost by the filtering are collected in a new Layout group per
    # smallest original group that contains them (the new group reuses its bounds).
    groups_by_size = sorted(uic_groups, key=lambda group: len(group['ui_comp_ids']))
    matched_ui_comp_groups = {}
    for miss_ui_comp_id in missing_ui_comp_ids:
        parent = next((group for group in groups_by_size
                       if miss_ui_comp_id in group['ui_comp_ids']), None)
        if parent is None:
            continue
        if parent['id'] in matched_ui_comp_groups:
            matched_ui_comp_groups[parent['id']]['ui_comp_ids'].append(miss_ui_comp_id)
        else:
            matched_ui_comp_groups[parent['id']] = _new_layout_group(parent['bounds'], [miss_ui_comp_id])
    filtered_uic_groups.extend(matched_ui_comp_groups.values())

    uic_groups_sorted = sorted(filtered_uic_groups, key=lambda x: (x['bounds'][1], x['bounds'][0]))
    feat_str = ''
    for uic_group in uic_groups_sorted:
        # Members without a feature entry (or unknown to the GUI) are skipped
        # instead of crashing on `None[0]`.
        members = [(_feat_text(feat_mappings[idd]), gui_mapping[idd])
                   for idd in uic_group['ui_comp_ids']
                   if idd in feat_mappings and idd in gui_mapping]
        members.sort(key=lambda x: (x[1]['bounds'][1], x[1]['bounds'][0]))
        member_texts = [text for text, _ in members]
        if struct_method == STRUCT_METHOD_TWO_LEVEL_BULLETS:
            feat_str += '- ' + uic_group.get('componentLabel')
            feat_str += '\n\t- ' + '\n\t- '.join(member_texts) + '\n'
        else:
            # Group labels without an HTML mapping fall back to a generic container.
            open_tag, close_tag = html_comp_group_mapping.get(
                uic_group.get('componentLabel'), ('<div', '</div>'))
            feat_str += open_tag + '>'
            feat_str += '\n\t' + '\n\t'.join(member_texts) + '\n'
            feat_str += close_tag + '\n'
    return feat_str

def get_str_repr_gui(gui, n, m, to_lower, quote, style, id, feat_method, struct_method):
    """Build the string representation of a GUI.

    First extracts the per-component features with ``feat_method`` and then
    arranges them according to ``struct_method``.
    """
    return structure_to_str(
        gui,
        features_to_str(gui, feat_method, n, m, to_lower, quote, style, id),
        struct_method,
        style,
    )

# 2. OpenAI API

In [None]:
from openai import OpenAI
# Credentials are read from the environment instead of being hardcoded in the
# notebook (set OPENAI_API_KEY and, optionally, OPENAI_ORG_ID before starting the kernel).
organization = os.environ.get("OPENAI_ORG_ID")
api_key = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=api_key, organization=organization)

In [None]:
def generate_completion(prompt, model='gpt-4o', temp=0.7, n=1, max_tokens=15500, logprobs=True, top_logprobs=5, return_obj=True):
    """Send a single user prompt to the chat completions endpoint.

    Args:
        prompt: Text of the single user message.
        model: Model name.
        temp: Sampling temperature.
        n: Number of choices to request.
        max_tokens: NOTE(review): accepted but not forwarded to the API - confirm intent.
        logprobs: If truthy, request log probabilities together with ``top_logprobs``.
        top_logprobs: Number of top log probabilities per token (only sent with logprobs).
        return_obj: If true, return the completion object; otherwise the list of
            message contents of all choices.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temp,
        "n": n,
    }
    if logprobs:
        request_kwargs["logprobs"] = logprobs
        request_kwargs["top_logprobs"] = top_logprobs
    chat_completion = client.chat.completions.create(**request_kwargs)
    if return_obj:
        return chat_completion
    return [choice.message.content for choice in chat_completion.choices]

In [None]:
def generate_completion_multiple(prompts, model='gpt-4o', temp=0.7, n=1, max_tokens=15500, logprobs=True, top_logprobs=5, return_obj=True):
    """Send several user prompts as consecutive user messages of one chat request.

    Args:
        prompts: Iterable of prompt texts; each becomes one user message, in order.
        model: Model name.
        temp: Sampling temperature.
        n: Number of choices to request.
        max_tokens: NOTE(review): accepted but not forwarded to the API - confirm intent.
        logprobs: If truthy, request log probabilities together with ``top_logprobs``.
        top_logprobs: Number of top log probabilities per token (only sent with logprobs).
        return_obj: If true, return the completion object; otherwise the list of
            message contents of all choices.
    """
    request_kwargs = {
        "model": model,
        "messages": [{"role": "user", "content": prompt} for prompt in prompts],
        "temperature": temp,
        "n": n,
    }
    if logprobs:
        request_kwargs["logprobs"] = logprobs
        request_kwargs["top_logprobs"] = top_logprobs
    chat_completion = client.chat.completions.create(**request_kwargs)
    if return_obj:
        return chat_completion
    return [choice.message.content for choice in chat_completion.choices]

In [None]:
import base64
import requests

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

def respond_to_image(prompt, image, model="gpt-4o", temp=0.7, n=1, max_tokens=15500, return_obj=True, timeout=300):
    """Send one user message consisting of a text prompt and a JPEG image.

    The request is posted directly to the chat completions REST endpoint.

    Args:
        prompt: Text part of the message.
        image: Base64 text of the image (see ``encode_image``); embedded as a data URL.
        model: Model name.
        temp: Sampling temperature.
        n: Number of choices to request.
        max_tokens: NOTE(review): accepted but not sent in the payload - confirm intent.
        return_obj: NOTE(review): accepted but not used; the decoded JSON is always returned.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{image}",
        },
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": [text_part, image_part]}],
        "temperature": temp,
        "n": n,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    http_response = requests.post("https://api.openai.com/v1/chat/completions",
                                  headers=headers, json=payload, timeout=timeout)
    return http_response.json()

In [None]:
def respond_to_image_multiple(prompts_and_images, model="gpt-4o", temp=0.7, n=1, max_tokens=15500, return_obj=True, timeout=300):
    """Send several user messages, each with a text prompt and an optional JPEG image.

    Args:
        prompts_and_images: Iterable of ``(prompt, image)`` pairs. ``image`` is base64
            text or ``None``; with ``None`` the message contains only the text part.
        model: Model name.
        temp: Sampling temperature.
        n: Number of choices to request.
        max_tokens: NOTE(review): accepted but not sent in the payload - confirm intent.
        return_obj: NOTE(review): accepted but not used; the decoded JSON is always returned.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The decoded JSON body of the HTTP response.
    """
    messages = []
    for prompt, image in prompts_and_images:
        content = [{"type": "text", "text": prompt}]
        if image is not None:
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image}",
                },
            })
        messages.append({"role": "user", "content": content})

    payload = {
        "model": model,
        "messages": messages,
        "temperature": temp,
        "n": n,
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    http_response = requests.post("https://api.openai.com/v1/chat/completions",
                                  headers=headers, json=payload, timeout=timeout)
    return http_response.json()

# 3. Annotation and Reranking Methods

In [None]:
from itertools import compress

"""
Possible methods: 
 - 'one_step': Use the raw GUI2String representation
 - 'two_step': Create a natural-language description based on the GUI2String representation
 - 'image': Use a screenshot of the GUI
Possible ranking types:
 - 'binary': Assign the UI a relevance score of 1 or 0
 - 'three_level' and 'three_level_v2': Assign the UI a relevance score of 2, 1, or 0
 - 'one_to_ten': Assign the UI a relevance score between 1 and 10
 - 'one_to_ten_rate': Assign the UI a relevance score between 1 and 10 and give its feature set, design, and layout a score between 1 and 10
"""

def rank_and_filter(top_k, query, retrieve, method="two_step", reasoning=False, ranking_type="binary"):
    """Let the LLM judge how well each retrieved GUI fits ``query``.

    Args:
        top_k: Retrieved GUIs; each entry is either a GUI id or a sequence whose first
            item is the GUI id (e.g. ``(id, confidence)``).
        query: The user's search query.
        retrieve: Maximum number of result entries returned (the first ``retrieve``
            entries in input order; the results are not re-sorted here).
        method: ``"two_step"`` (LLM-written description of the GUI string),
            ``"image"`` (screenshot), anything else: ``"one_step"`` (raw GUI string).
        reasoning: If true, the model is asked to add its reasoning to the JSON answer.
        ranking_type: ``"binary"``, ``"one_to_ten"``, ``"one_to_ten_rate"``,
            ``"three_level"``; anything else selects the ``three_level_v2`` prompt.

    Returns:
        List of dicts, one per processed GUI, with the keys ``id``, ``score``,
        ``reasoning``, ``description``, ``prompt``, ``response``, ``sub_response``,
        ``full_response`` and ``error`` (0 = ok, 1 = answer could not be parsed,
        2 = request failed), plus ``feature_set``, ``design`` and ``layout`` for
        ``"one_to_ten_rate"``. On errors the score and ratings are set to 0.
    """
    rated = ranking_type == "one_to_ten_rate"
    rating_keys = ("feature_set", "design", "layout")

    def _mark_failed(resp, error_code):
        # Shared reset used for both parse failures (1) and request failures (2).
        resp["error"] = error_code
        resp["score"] = 0
        resp["reasoning"] = ""
        if rated:
            for key in rating_keys:
                resp[key] = 0

    def _parse_answer(json_str):
        # The prompt shows single-quoted pseudo-JSON, but models often answer with
        # strict JSON (true/false/null), which literal_eval alone cannot parse.
        try:
            return json.loads(json_str)
        except ValueError:
            return literal_eval(json_str)

    ret = []

    for retrieved in top_k:
        print("   ", len(ret))
        # Entries are either bare ids or sequences starting with the id.
        if isinstance(retrieved, (str, bytes)):
            rico_id = retrieved
        else:
            try:
                rico_id = retrieved[0]
            except (TypeError, IndexError, KeyError):
                rico_id = retrieved
        resp = {"id": rico_id, "score": "", "reasoning": "", "description": "", "prompt": "", "response": "", "sub_response": "", "full_response": "", "error": 0}
        gui = all_guis_with_comps[all_guis_with_comps['id'] == rico_id]['data'].values.tolist()[0]
        desc = get_str_repr_gui(gui, n=30, m=30, to_lower=False, quote=True, style={}, id=True,
                            feat_method=FEAT_METHOD_TEXT_COMP_TYPE_RES_ID,
                            struct_method=STRUCT_METHOD_TWO_LEVEL_HTML)

        if method == "two_step":
            prompt = "I have extracted some information about the contents of a mobile UI from its source code:\n" + desc +\
            "\nDescribe its semantic contents succinctly."
            sub_resp = generate_completion(prompt, temp=0.7)
            desc_gpt = sub_resp.choices[0].message.content
            resp["description"] = desc_gpt
            resp["sub_response"] = sub_resp

            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: \"" + query + "\"\n My retrieval algorithm returned a UI " +\
            "that can be described as follows:\n\"" + desc_gpt +\
            "\"\nDoes this specific UI fit your query? Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
        elif method == "image":
            image = encode_image(rico_path + str(rico_id) + ".jpg")
            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: \"" + query + "\"\n My retrieval algorithm returned " +\
            "the following image." +\
            "\nDoes this specific UI fit your query? Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
        else:
            #one_step
            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: " + query + "\n My retrieval algorithm returned a UI " +\
            "containing the following elements:\n" + desc + "Does the UI fit your query? " +\
            "Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
            resp["description"] = desc

        if ranking_type == "binary":
            prompt += "Answer in json format, like so: {'relevance_score': X}, X being 1 if the UI is relevant and 0 if it is irrelevant."
        elif ranking_type == "one_to_ten":
            prompt += "Assign the UI a relevance score between 1 and 10, with 1 meaning that it is not relevant at all, and 10 meaning that it " +\
                      "is completely relevant, and the numbers inbetween indicating various levels of relevance depending on features and functions " +\
                      "present in the UI. Answer in json format, like so: {'relevance_score': X}. "
        elif ranking_type == "one_to_ten_rate":
            prompt += "Assign the UI a relevance score between 1 and 10, with 1 meaning that it is not relevant at all, and 10 meaning that it " +\
                      "is completely relevant, and the numbers inbetween indicating various levels of relevance depending on features and functions " +\
                      "present in the UI. Also rate the UI in the same way regarding these attributes: feature set (related to the query), " +\
                      "design, and layout. Answer in json format, like so: {'relevance_score': X, 'feature_set': X, 'design': X, 'layout': X}. "
            resp["feature_set"] = ""
            resp["design"] = ""
            resp["layout"] = ""
        elif ranking_type == "three_level":
            prompt += "Assign the UI a relevance score of 0, 1, or 2. You should give it a 2 if both the function and the content domain of the UI align with " +\
            "the query, a 1 if only the function fits and a 0 if neither or only the content domain fits. " +\
            "Answer in json format, like so: {'relevance_score': X}, with X being the relevance score you assign to the UI. "
        else:
            #three_level_v2
            prompt += "Assign the UI a relevance score of 0, 1, or 2. You should give it a 2 if it is relevant to the query, a 0 if it is irrelevant to the query " +\
            "and a 1 if it is somewhere inbetween. Answer in json format, like so: {'relevance_score': X}, with X being the relevance score you assign to the UI. "

        if reasoning:
            prompt += "Briefly explain your chain of thoughts and append it to the json like so: {'relevance_score': X, 'reasoning': 'your reasoning'}. "

        resp["prompt"] = prompt

        try:
            if method == "image":
                full_response = respond_to_image(prompt, image, temp=0.7)
                response = full_response["choices"][0]["message"]["content"]
            else:
                full_response = generate_completion(prompt, temp=0.7)
                response = full_response.choices[0].message.content

            resp["response"] = response
            resp["full_response"] = full_response

            # Cut the outermost {...} span out of the (possibly chatty) answer.
            json_str = response[response.find("{"):response.rfind("}") + 1]
            try:
                response_json = _parse_answer(json_str)
                resp["score"] = response_json["relevance_score"]
                # 'reasoning' is only present when it was requested. Requiring it made
                # every answer without reasoning count as a parse error with score 0.
                resp["reasoning"] = response_json.get("reasoning", "")
                if rated:
                    for key in rating_keys:
                        resp[key] = response_json[key]
            except Exception as e:
                # Best effort: an unparsable answer is recorded, not raised.
                print(type(e), e)
                _mark_failed(resp, 1)
        except Exception as e:
            # Best effort: a failed request is recorded, not raised.
            print(type(e), e)
            _mark_failed(resp, 2)

        ret.append(resp)

    return ret[:retrieve]

In [None]:
def rank_and_filter_n(top_k, query, retrieve, method="two_step", reasoning=False, ranking_type="binary", n=1):
    """Rate each retrieved UI against the query, requesting ``n`` answers per UI.

    Same approach as the single-answer filter, but the ``n`` parameter sets the
    number of answers requested per UI; the average and standard deviation of
    the resulting relevance scores are computed as well.

    Args:
        top_k: Retrieved UIs; each entry is a bare id or a sequence whose first
            element is the id.
        query: Query text the UIs are judged against.
        retrieve: Maximum number of entries in the returned list.
        method: "two_step" (model-written description of the GUI string),
            "image" (screenshot); any other value sends the raw GUI string.
        reasoning: If True, the prompt additionally asks for a 'reasoning' field.
        ranking_type: "binary", "one_to_ten", "one_to_ten_rate" or "three_level";
            any other value selects the generic 0/1/2 prompt.
        n: Number of answers requested per UI.

    Returns:
        A list (truncated to ``retrieve``) with one dict per UI. The lists
        "scores", "reasonings" and "errors" (plus "feature_set", "design",
        "layout" for "one_to_ten_rate") hold one entry per answer and are kept
        index-aligned. Error codes: 0 = parsed, 1 = answer could not be parsed,
        2 = the request itself failed.
    """
    # Per-attribute ratings are only requested and stored for this ranking type.
    rating_keys = ("feature_set", "design", "layout") if ranking_type == "one_to_ten_rate" else ()

    def _record(resp, score, reasoning_text, error, ratings):
        # Append one answer to every per-answer list at once so they stay index-aligned.
        resp["scores"].append(score)
        resp["reasonings"].append(reasoning_text)
        resp["errors"].append(error)
        for key, rating in zip(rating_keys, ratings):
            resp[key].append(rating)

    ret = []

    for retrieved in top_k:
        print("   ", len(ret))
        # A string is always a bare id; indexing it would silently yield its first character.
        if isinstance(retrieved, (str, bytes)):
            rico_id = retrieved
        else:
            try:
                rico_id = retrieved[0]
            except (TypeError, IndexError, KeyError):
                rico_id = retrieved
        # "full_response" is pre-initialised so the key exists even when the request fails.
        resp = {"id": rico_id, "scores": [], "reasonings": [], "description": "", "prompt": "", "responses": [],
                "sub_response": "", "full_response": "", "full_responses": [], "errors": [], "avg_score": "", "std_dev": ""}
        for key in rating_keys:
            resp[key] = []
        gui = all_guis_with_comps[all_guis_with_comps['id'] == rico_id]['data'].values.tolist()[0]
        desc = get_str_repr_gui(gui, n=30, m=30, to_lower=False, quote=True, style={}, id=True,
                            feat_method=FEAT_METHOD_TEXT_COMP_TYPE_RES_ID,
                            struct_method=STRUCT_METHOD_TWO_LEVEL_HTML)

        if method == "two_step":
            prompt = "I have extracted some information about the contents of a mobile UI from its source code:\n" + desc +\
            "\nDescribe its semantic contents succinctly."
            sub_resp = generate_completion(prompt, temp=0.7)
            desc_gpt = sub_resp.choices[0].message.content
            resp["description"] = desc_gpt
            resp["sub_response"] = sub_resp

            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: \"" + query + "\"\n My retrieval algorithm returned a UI " +\
            "that can be described as follows:\n\"" + desc_gpt +\
            "\"\nDoes this specific UI fit your query? Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
        elif method == "image":
            image = encode_image(rico_path + str(rico_id) + ".jpg")
            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: \"" + query + "\"\n My retrieval algorithm returned " +\
            "the following image." +\
            "\nDoes this specific UI fit your query? Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
        else:
            prompt = "You are a user searching for mobile UIs to base your design" +\
            " on using the following query: " + query + "\n My retrieval algorithm returned a UI " +\
            "containing the following elements:\n" + desc + "Does the UI fit your query? " +\
            "Do not think about what the rest of the app might contain and do not be afraid to say no. " +\
            "I would rather filter out a (semi-) relevant UI than not remove an irrelevant UI."
            resp["description"] = desc

        if ranking_type == "binary":
            prompt += "Answer in json format, like so: {'relevance_score': X}, X being 1 if the UI is relevant and 0 if it is irrelevant."
        elif ranking_type == "one_to_ten":
            prompt += "Assign the UI a relevance score between 1 and 10, with 1 meaning that it is not relevant at all, and 10 meaning that it " +\
                      "is completely relevant, and the numbers inbetween indicating various levels of relevance depending on features and functions " +\
                      "present in the UI. Answer in json format, like so: {'relevance_score': X}. "
        elif ranking_type == "one_to_ten_rate":
            prompt += "Assign the UI a relevance score between 1 and 10, with 1 meaning that it is not relevant at all, and 10 meaning that it " +\
                      "is completely relevant, and the numbers inbetween indicating various levels of relevance depending on features and functions " +\
                      "present in the UI. Also rate the UI in the same way regarding these attributes: feature set (related to the query), " +\
                      "design, and layout. Answer in json format, like so: {'relevance_score': X, 'feature_set': X, 'design': X, 'layout': X}. "
        elif ranking_type == "three_level":
            prompt += "Assign the UI a relevance score of 0, 1, or 2. You should give it a 2 if both the function and the content domain of the UI align with " +\
            "the query, a 1 if only the function fits and a 0 if neither or only the content domain fits. " +\
            "Answer in json format, like so: {'relevance_score': X}, with X being the relevance score you assign to the UI. "
        else:
            prompt += "Assign the UI a relevance score of 0, 1, or 2. You should give it a 2 if it is relevant to the query, a 0 if it is irrelevant to the query " +\
            "and a 1 if it is somewhere inbetween. Answer in json format, like so: {'relevance_score': X}, with X being the relevance score you assign to the UI. "

        if reasoning:
            prompt += "Briefly explain your chain of thoughts and append it to the json like so: {'relevance_score': X, 'reasoning': 'your reasoning'}. "

        resp["prompt"] = prompt

        # Step 1: request n answers. A failure here is recorded as a single error-2 entry.
        try:
            if method == "image":
                full_response = respond_to_image(prompt, image, temp=0.7, n=n)
                responses = [full_response["choices"][i]["message"]["content"] for i in range(n)]
            else:
                full_response = generate_completion(prompt, temp=0.7, n=n)
                responses = [full_response.choices[i].message.content for i in range(n)]
        except Exception as e:
            print(type(e), e)
            _record(resp, 0, "", 2, [0] * len(rating_keys))
            resp["avg_score"] = 0
            resp["std_dev"] = 0
            ret.append(resp)
            continue

        resp["responses"] = responses
        resp["full_response"] = full_response

        # Step 2: parse each answer. All fields are extracted before anything is
        # stored, so a failing answer contributes exactly one (error-1) entry.
        for response in responses:
            try:
                json_str = response[response.find("{"):response.rfind("}") + 1]
                response_json = literal_eval(json_str)
                score = response_json["relevance_score"]
                # 'reasoning' is only requested when reasoning=True, so it is optional here.
                reasoning_text = response_json.get("reasoning", "")
                ratings = [response_json[key] for key in rating_keys]
            except Exception as e:
                print(type(e), e)
                _record(resp, 0, "", 1, [0] * len(rating_keys))
            else:
                _record(resp, score, reasoning_text, 0, ratings)

        # Step 3: aggregate. Non-numeric scores fall back to 0 instead of adding a bogus answer.
        try:
            resp["avg_score"] = np.mean(resp["scores"])
            resp["std_dev"] = np.std(resp["scores"])
        except (TypeError, ValueError) as e:
            print(type(e), e)
            resp["avg_score"] = 0
            resp["std_dev"] = 0

        ret.append(resp)

    return ret[:retrieve]

In [None]:
def rank_and_rerank(top_k, query, retrieve, method="two_step", reasoning=False, ranking_type="v1", temp=0.7):
    """Ask the model to reorder the retrieved UIs in a single conversation.

    Possible methods:
     - 'one_step': Use the raw GUI2String representation
     - 'two_step': Create a natural-language description based on the GUI2String representation
     - 'image': Use a screenshot of the GUI
    Possible ranking types:
     - 'v1': Reorder the list of UIs based on their relevance to the query
     - 'v2': Reorder the list of UIs based on their relevance to the query, their design,
       their layout, and their usability (selected by any value other than 'v1')

    Args:
        top_k: Retrieved UIs; each entry is a bare id or a sequence whose first
            element is the id.
        query: Query text the UIs are ranked against.
        retrieve: Not used by this function; kept so existing callers keep working.
        method: See above.
        reasoning: If True, the prompt additionally asks for per-UI reasoning.
        ranking_type: See above.
        temp: Sampling temperature passed to every model call.

    Returns:
        A dict with the prompts, the raw answer, the mapping from positional ids
        (0..len(top_k)-1, as shown to the model) to the real ids, the parsed
        ranking in both id spaces, and "error": 0 on success, 1 if the answer
        could not be parsed, 2 if the request failed.
    """
    descs = []
    sub_responses = []
    ret = {"retrieved": top_k, "ranked": "", "ranked_unmapped": "", "reasoning": "", "descriptions": "", "prompt": "", "response": "", "full_response": "", "sub_responses": [], "full_prompts": "", "id_mappings": "", "error": 0}
    id_mappings = {}

    for counter, retrieved in enumerate(top_k):
        # A string is always a bare id; indexing it would silently yield its first character.
        if isinstance(retrieved, (str, bytes)):
            rico_id = retrieved
        else:
            try:
                rico_id = retrieved[0]
            except (TypeError, IndexError, KeyError):
                rico_id = retrieved
        gui = all_guis_with_comps[all_guis_with_comps['id'] == rico_id]['data'].values.tolist()[0]
        desc = get_str_repr_gui(gui, n=30, m=30, to_lower=False, quote=True, style={}, id=True,
                            feat_method=FEAT_METHOD_TEXT_COMP_TYPE_RES_ID,
                            struct_method=STRUCT_METHOD_TWO_LEVEL_HTML)
        # The model only sees the position; this mapping translates its answer back.
        id_mappings[counter] = rico_id

        if method == "two_step":
            prompt = "I have extracted some information about the contents of a mobile UI from its source code:\n" + desc +\
            "\nDescribe its semantic contents succinctly."
            sub_resp = generate_completion(prompt, temp=temp)
            desc_gpt = sub_resp.choices[0].message.content
            descs.append("id: " + str(counter) + ", description:" + desc_gpt)
            sub_responses.append(sub_resp)
        elif method == "image":
            image = encode_image(rico_path + str(rico_id) + ".jpg")
            descs.append(("id: " + str(counter), image))
        else:
            descs.append("id: " + str(counter) + ", description:" + desc)

    # Image entries carry encoded image data and are therefore not copied into the result.
    if method != "image":
        ret["descriptions"] = descs
        ret["sub_responses"] = sub_responses

    ret["id_mappings"] = id_mappings

    # NOTE(review): the example "{'ranking': 0, 1, 8, ...}" below is not a valid
    # literal; an answer that copies it verbatim cannot be parsed and ends up as
    # error 1. The prompt text is left untouched here.
    if ranking_type == "v1":
        prompt = "You are a user searching for mobile UIs to base your design on using the following query: \"" + query + "\"\nMy retrieval algorithm returned" +\
        " some UIs, whose ids and descriptions I will send you next. Please order the UIs based on how relevant you think they are to the query and answer" +\
        " with a list of ids (beginning with the most relevant UI). Answer in json format like so (this ranking is just a random example: " +\
        "{'ranking': 0, 1, 8, 9, 18, 19, 7, 5, 3, 6, 2, 15, 4, 10, 13, 11, 12, 17, 16, 14}"
    else:
        prompt = "You are a user searching for mobile UIs to base your design on using the following query: \"" + query + "\"\nMy retrieval algorithm returned" +\
        " some UIs, whose ids and descriptions I will send you next. Please judge the UIs based on their design, their layout, their usability, and how relevant you think their feature set is to the query," +\
        " and answer with a list of ids (beginning with the best and most relevant UI). Answer in json format like so (this ranking is just a random example: " +\
        "{'ranking': 0, 1, 8, 9, 18, 19, 7, 5, 3, 6, 2, 15, 4, 10, 13, 11, 12, 17, 16, 14}"

    # The reasoning request is identical for both ranking types.
    if reasoning:
        prompt += (
            " Additionally, shortly explain your chain of thoughts for the ranking, going into detail about each UI and append it to "
            "the json like so (this ranking is just a random example): {'ranking': 0, 1, 8, 9, 18, 19, 7, 5, 3, 6, 2, 15, 4, 10, 13, 11, 12, 17, 16, 14,"
            "'reasoning': '0: reasoning, 1: reasoning, 8: reasoning, 9: reasoning, 18: reasoning, 19: reasoning, 7: reasoning, 5: reasoning, "
            "3: reasoning, 6: reasoning, 2: reasoning, 15: reasoning, 4: reasoning, 10: reasoning, 13: reasoning, 11: reasoning, 12: reasoning, "
            "17: reasoning, 16: reasoning, 14: reasoning'}. Make sure your response can be correctly parsed"
            " as json data. Also, be sure that the lists in the ranking and the reasoning sections are the same and that you don't miss any of the UIs in the list."
        )

    ret["prompt"] = prompt

    # The instruction goes first, followed by one message per UI.
    if method != "image":
        prompts = [prompt]
    else:
        prompts = [(prompt, None)]

    prompts.extend(descs)
    ret["full_prompts"] = prompts

    try:
        if method != "image":
            full_response = generate_completion_multiple(prompts, temp=temp, timeout=420)
            response = full_response.choices[0].message.content
        else:
            full_response = respond_to_image_multiple(prompts, temp=temp, timeout=420)
            response = full_response["choices"][0]["message"]["content"]

        # Keep only the outermost {...} span; the model may wrap it in extra text.
        json_start_loc = response.find("{")
        json_end_loc = response.rfind("}")+1
        json_str = response[json_start_loc:json_end_loc]
        try:
            response_json = literal_eval(json_str)
            ranked_unmapped = response_json["ranking"]
            ranked = [id_mappings[int(mapped_id)] for mapped_id in ranked_unmapped]
            ret["ranked_unmapped"] = ranked_unmapped
            ret["ranked"] = ranked
            # 'reasoning' is only requested when reasoning=True; its absence must
            # not invalidate an otherwise usable ranking.
            ret["reasoning"] = response_json.get("reasoning", "")

        except Exception as e:
            print(type(e), e)
            print(json_str)
            ret["error"] = 1

        ret["response"] = response

        ret["full_response"] = full_response
    except Exception as e:
        print(type(e), e)
        ret["error"] = 2

    return ret

In [None]:
def return_best(top_k, query, method="two_step", reasoning=False):
    """Return the single UI that the model deems most relevant to the query.

    Possible methods:
     - 'one_step': Use the raw GUI2String representation
     - 'two_step': Create a natural-language description based on the GUI2String representation
     - 'image': Use a screenshot of the GUI

    Args:
        top_k: Retrieved UIs; each entry is a bare id or a sequence whose first
            element is the id.
        query: Query text the UIs are judged against.
        method: See above.
        reasoning: If True, the prompt additionally asks for an explanation.

    Returns:
        A dict with the prompts, the raw answer and "best": the id chosen by the
        model as a string of digits, or "" when the answer contains no number.
    """
    descs = []
    sub_responses = []
    candidate_ids = set()
    ret = {"retrieved": top_k, "best": "", "descriptions": "", "prompt": "", "response": "", "full_response": "", "sub_responses": [], "full_prompts": ""}

    for retrieved in top_k:
        # A string is always a bare id; indexing it would silently yield its first character.
        if isinstance(retrieved, (str, bytes)):
            rico_id = retrieved
        else:
            try:
                rico_id = retrieved[0]
            except (TypeError, IndexError, KeyError):
                rico_id = retrieved
        print(rico_id)
        candidate_ids.add(str(rico_id))
        gui = all_guis_with_comps[all_guis_with_comps['id'] == rico_id]['data'].values.tolist()[0]
        desc = get_str_repr_gui(gui, n=30, m=30, to_lower=False, quote=True, style={}, id=True,
                            feat_method=FEAT_METHOD_TEXT_COMP_TYPE_RES_ID,
                            struct_method=STRUCT_METHOD_TWO_LEVEL_HTML)

        if method == "two_step":
            prompt = "I have extracted some information about the contents of a mobile UI from its source code:\n" + desc +\
            "\nDescribe its semantic contents succinctly."
            sub_resp = generate_completion(prompt, temp=0.7)
            desc_gpt = sub_resp.choices[0].message.content
            descs.append("id: " + str(rico_id) + ", description:" + desc_gpt)
            sub_responses.append(sub_resp)
        elif method == "image":
            image = encode_image(rico_path + str(rico_id) + ".jpg")
            descs.append(("id: " + str(rico_id), image))
        else:
            descs.append("id: " + str(rico_id) + ", description:" + desc)
    # Image entries carry encoded image data and are therefore not copied into the result.
    if method != "image":
        ret["descriptions"] = descs
        ret["sub_responses"] = sub_responses

    prompt = "You are a user searching for mobile UIs to base your design on using the following query: \"" + query + "\"\nMy retrieval algorithm returned" +\
    " some UIs, whose ids and descriptions I will send you next. Please rate the UIs in terms of the features they provide, usability, design, and how well" +\
    " they fit the query and return the id of the UI you think is the best overall at the end of your answer."

    if reasoning:
        prompt += " Additionally, shortly explain your chain of thoughts, going into detail about each UI."

    ret["prompt"] = prompt

    # The instruction goes first, followed by one message per UI.
    if method != "image":
        prompts = [prompt]
    else:
        prompts = [(prompt, None)]

    prompts.extend(descs)
    ret["full_prompts"] = prompts

    if method != "image":
        full_response = generate_completion_multiple(prompts,  temp=0.7)
        response = full_response.choices[0].message.content
    else:
        full_response = respond_to_image_multiple(prompts,  temp=0.7)
        response = full_response["choices"][0]["message"]["content"]

    # The chosen id is expected at the end of the answer. Prefer the last number
    # that is one of the candidate ids (a trailing rating such as "9/10" would
    # otherwise win); fall back to the last number, and to "" if there is none.
    numbers = re.findall(r'\d+', response)
    id_mentions = [number for number in numbers if number in candidate_ids]
    if id_mentions:
        best = id_mentions[-1]
    elif numbers:
        best = numbers[-1]
    else:
        best = ""

    ret["response"] = response
    ret["best"] = best
    ret["full_response"] = full_response

    return ret

# 4. Examples/Experiments

In [None]:
# Initial example query; the next example cell assigns `query` again before it is used.
query = "A page from a shopping app presenting a t-shirt for purchase."

## Rank and Filter

### Two Step

In [None]:
# Three-level ranking, two-step method (model-written description of each UI)
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 1

# Take the first `retrieve` hits of the initial retrieval and let the model rate each one.
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(
    top_k,
    query,
    retrieve,
    method="two_step",
    reasoning=True,
    ranking_type="three_level",
)

In [None]:
# Running token totals: the rating call and the preceding description call.
total_input_tokens = total_completion_tokens = 0
total_sub_input_tokens = total_sub_completion_tokens = 0

for resp in filtered_top_k:
    # Token usage reported for both calls made for this UI.
    input_tokens, completion_tokens = (
        resp["full_response"].usage.prompt_tokens,
        resp["full_response"].usage.completion_tokens,
    )
    sub_input_tokens, sub_completion_tokens = (
        resp["sub_response"].usage.prompt_tokens,
        resp["sub_response"].usage.completion_tokens,
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens
    total_sub_input_tokens += sub_input_tokens
    total_sub_completion_tokens += sub_completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # Description (if one was generated), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")
print(f"SUB INPUT {total_sub_input_tokens}")
print(f"SUB COMPLETION {total_sub_completion_tokens}")

In [None]:
# Binary ranking, two-step method (model-written description of each UI)

# The query is set explicitly, as in the sibling example cells, so this cell
# does not depend on whichever cell last assigned `query`.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(top_k, query, retrieve, method="two_step", reasoning=True, ranking_type="binary")

In [None]:
# Running token totals: the rating call and the preceding description call.
total_input_tokens = total_completion_tokens = 0
total_sub_input_tokens = total_sub_completion_tokens = 0

for resp in filtered_top_k:
    # Token usage reported for both calls made for this UI.
    input_tokens, completion_tokens = (
        resp["full_response"].usage.prompt_tokens,
        resp["full_response"].usage.completion_tokens,
    )
    sub_input_tokens, sub_completion_tokens = (
        resp["sub_response"].usage.prompt_tokens,
        resp["sub_response"].usage.completion_tokens,
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens
    total_sub_input_tokens += sub_input_tokens
    total_sub_completion_tokens += sub_completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # Description (if one was generated), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")
print(f"SUB INPUT {total_sub_input_tokens}")
print(f"SUB COMPLETION {total_sub_completion_tokens}")

### Image

In [None]:
# Three-level ranking, image method (screenshot of each UI)
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 1

# Take the first `retrieve` hits of the initial retrieval and let the model rate each one.
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(
    top_k,
    query,
    retrieve,
    method="image",
    reasoning=True,
    ranking_type="three_level",
)

In [None]:
# Running token totals of the rating calls (the image response is indexed like a dict).
total_input_tokens = total_completion_tokens = 0

for resp in filtered_top_k:
    input_tokens, completion_tokens = (
        resp["full_response"]["usage"]["prompt_tokens"],
        resp["full_response"]["usage"]["completion_tokens"],
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # Description (if present), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")

In [None]:
# Binary ranking, image method (screenshot of each UI)
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 1

# Take the first `retrieve` hits of the initial retrieval and let the model rate each one.
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(
    top_k,
    query,
    retrieve,
    method="image",
    reasoning=True,
    ranking_type="binary",
)

In [None]:
# Running token totals of the rating calls (the image response is indexed like a dict).
total_input_tokens = total_completion_tokens = 0

for resp in filtered_top_k:
    input_tokens, completion_tokens = (
        resp["full_response"]["usage"]["prompt_tokens"],
        resp["full_response"]["usage"]["completion_tokens"],
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # Description (if present), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")

### One Step

In [None]:
# Three-level ranking, one-step method (raw GUI string of each UI)
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 1

# Take the first `retrieve` hits of the initial retrieval and let the model rate each one.
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(
    top_k,
    query,
    retrieve,
    method="one_step",
    reasoning=True,
    ranking_type="three_level",
)

In [None]:
# Running token totals of the rating calls. The sub-call counters are reset
# here as well but never incremented or printed in this cell.
total_input_tokens = total_completion_tokens = 0
total_sub_input_tokens = total_sub_completion_tokens = 0

for resp in filtered_top_k:
    input_tokens, completion_tokens = (
        resp["full_response"].usage.prompt_tokens,
        resp["full_response"].usage.completion_tokens,
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # GUI description (if present), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")


In [None]:
# Binary ranking, one-step method (raw GUI string of each UI)
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 1

# Take the first `retrieve` hits of the initial retrieval and let the model rate each one.
top_k = rank(query)[:retrieve]
filtered_top_k = rank_and_filter(
    top_k,
    query,
    retrieve,
    method="one_step",
    reasoning=True,
    ranking_type="binary",
)

In [None]:
# Running token totals of the rating calls. The sub-call counters are reset
# here as well but never incremented or printed in this cell.
total_input_tokens = total_completion_tokens = 0
total_sub_input_tokens = total_sub_completion_tokens = 0

for resp in filtered_top_k:
    input_tokens, completion_tokens = (
        resp["full_response"].usage.prompt_tokens,
        resp["full_response"].usage.completion_tokens,
    )
    total_input_tokens += input_tokens
    total_completion_tokens += completion_tokens

    # Screenshot of the rated UI, titled with its id.
    rico_id = resp["id"]
    title = str(rico_id)
    img = Image.open(rico_path + title + ".jpg").resize((1080, 1920))
    plt.imshow(img)
    plt.grid(False)
    plt.title(title)
    plt.show()

    # GUI description (if present), then raw answer, reasoning and score.
    if resp["description"] != "":
        print(resp["description"], end="\n\n")
    for field in ("response", "reasoning", "score"):
        print(resp[field])

print("\nUSAGE")
print(f"INPUT {total_input_tokens}")
print(f"COMPLETION {total_completion_tokens}")

## Rank and Rerank

### Two Step

In [None]:
# Rerank the first five hits with the two-step (description-based) method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = rank_and_rerank(
    top_k,
    query,
    retrieve,
    method="two_step",
    reasoning=True,
)

In [None]:
print(reranked["response"])

# Token usage summed over the description sub-calls.
total_sub_input_tokens = sum(
    sub_resp.usage.prompt_tokens for sub_resp in reranked["sub_responses"]
)
total_sub_completion_tokens = sum(
    sub_resp.usage.completion_tokens for sub_resp in reranked["sub_responses"]
)

print("\nUSAGE")
print(f'INPUT {reranked["full_response"].usage.prompt_tokens}')
print(f'COMPLETION {reranked["full_response"].usage.completion_tokens}')
print(f"SUB INPUT {total_sub_input_tokens}")
print(f"SUB COMPLETION {total_sub_completion_tokens}")

# Plots the UIs in their original retrieval order.
show_images(reranked["retrieved"], rico_path)

### One Step

In [None]:
# Rerank the first five hits with the one-step (raw GUI string) method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = rank_and_rerank(
    top_k,
    query,
    retrieve,
    method="one_step",
    reasoning=True,
)

In [None]:
print(reranked["response"])

# Token usage of the single reranking call, followed by the parsed ranking.
print("\nUSAGE")
print(f'INPUT {reranked["full_response"].usage.prompt_tokens}')
print(f'COMPLETION {reranked["full_response"].usage.completion_tokens}')
print("RANKING", reranked["ranked"])

# Plots the UIs in their original retrieval order.
show_images(reranked["retrieved"], rico_path)

### Image

In [None]:
# Rerank the first five hits with the image (screenshot) method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = rank_and_rerank(
    top_k,
    query,
    retrieve,
    method="image",
    reasoning=True,
)

In [None]:
print(reranked["response"])

# The image response is indexed like a dict.
print("\nUSAGE")
print(f'INPUT {reranked["full_response"]["usage"]["prompt_tokens"]}')
print(f'COMPLETION {reranked["full_response"]["usage"]["completion_tokens"]}')

# Plots the UIs in their original retrieval order.
show_images(reranked["retrieved"], rico_path)

## Get best UI

### Two Step

In [None]:
# Pick the single best UI among the first five hits, two-step method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = return_best(
    top_k,
    query,
    method="two_step",
    reasoning=True,
)

In [None]:
print(reranked["response"])
print(reranked["best"])

# Token usage summed over the description sub-calls.
total_sub_input_tokens = sum(
    sub_resp.usage.prompt_tokens for sub_resp in reranked["sub_responses"]
)
total_sub_completion_tokens = sum(
    sub_resp.usage.completion_tokens for sub_resp in reranked["sub_responses"]
)

print("\nUSAGE")
print(f'INPUT {reranked["full_response"].usage.prompt_tokens}')
print(f'COMPLETION {reranked["full_response"].usage.completion_tokens}')
print(f"SUB INPUT {total_sub_input_tokens}")
print(f"SUB COMPLETION {total_sub_completion_tokens}")

# Plots the candidate UIs in their original retrieval order.
show_images(reranked["retrieved"], rico_path)

### One Step

In [None]:
# Pick the single best UI among the first five hits, one-step method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = return_best(
    top_k,
    query,
    method="one_step",
    reasoning=True,
)

In [None]:
print(reranked["response"])
print(reranked["best"])

# Sub-call usage; the sums are 0 when the result carries no sub-responses.
total_sub_input_tokens = sum(
    sub_resp.usage.prompt_tokens for sub_resp in reranked["sub_responses"]
)
total_sub_completion_tokens = sum(
    sub_resp.usage.completion_tokens for sub_resp in reranked["sub_responses"]
)

print("\nUSAGE")
print(f'INPUT {reranked["full_response"].usage.prompt_tokens}')
print(f'COMPLETION {reranked["full_response"].usage.completion_tokens}')
print(f"SUB INPUT {total_sub_input_tokens}")
print(f"SUB COMPLETION {total_sub_completion_tokens}")

# Plots the candidate UIs in their original retrieval order.
show_images(reranked["retrieved"], rico_path)

### Image

In [None]:
# Pick the single best UI among the first five hits, image method.
query = "A page from a music player app, presenting playback controls for audio content"
retrieve = 5

top_k = rank(query)[:retrieve]
reranked = return_best(
    top_k,
    query,
    method="image",
    reasoning=True,
)

In [None]:
print(reranked["response"])
print(reranked["best"])

# Sub-call usage; the image method stores no sub-responses, so both totals stay 0.
total_sub_input_tokens = 0
total_sub_completion_tokens = 0
for sub_resp in reranked["sub_responses"]:
    sub_input_tokens = sub_resp.usage.prompt_tokens
    sub_completion_tokens = sub_resp.usage.completion_tokens
    total_sub_input_tokens += sub_input_tokens
    total_sub_completion_tokens += sub_completion_tokens

# The image response is indexed like a dict (as in the other image cells and in
# return_best itself), so usage is read by key rather than by attribute.
print("\nUSAGE")
print("INPUT", reranked["full_response"]["usage"]["prompt_tokens"])
print("COMPLETION", reranked["full_response"]["usage"]["completion_tokens"])
print("SUB INPUT", total_sub_input_tokens)
print("SUB COMPLETION", total_sub_completion_tokens)

show_images(reranked["retrieved"], rico_path)

# 5. Annotation CSV

## Set up (run one time only)

In [None]:
# Load the semicolon-separated description dataset from the data directory.
dataset = pd.read_csv(data_path + "Description Dataset/dataset.csv", sep=";")

In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

In [None]:
# Draw 15 random rows. The fixed seed makes a re-run of this set-up cell select
# the same rows instead of a different sample every time.
rand = dataset.sample(15, random_state=42)
rand.head()

In [None]:
# Build one row to annotate per (query, retrieved UI) pair: up to ten hits per sampled query.
annotate = []
for _, row in rand.iterrows():
    top_10 = rank(row["Summary"])[:10]
    # Iterate over the hits themselves; indexing 0..9 raised IndexError whenever
    # fewer than ten results came back.
    for hit in top_10:
        annotate.append({"query": row["Summary"], "id": hit[0], "annotation": 0})

anno_df = pd.DataFrame(annotate)
anno_df.head()

In [None]:
# Write the rows to annotate (pandas defaults: comma-separated, index column included).
# NOTE(review): the annotation section reads annotations.csv with sep=";" and a
# "rico_id" column, whereas this file has an "id" column — presumably the file is
# edited and re-exported by hand in between; confirm.
anno_df.to_csv(data_path + "annotations.csv")

## Annotate with GPT (binary)

In [None]:
# Load the semicolon-separated annotation sheet; the annotation loop reads its
# "query" and "rico_id" columns.
annotated = pd.read_csv(data_path + "annotations.csv", sep = ";")
annotated.head()

In [None]:
# One score column and one reasoning column per method, filled with defaults.
for column in ("two_step", "one_step", "image"):
    annotated[column] = 0
for column in ("two_step_reasoning", "one_step_reasoning", "image_reasoning"):
    annotated[column] = ""
annotated.head()

In [None]:
# Rate every (query, UI) row once per method and store score and reasoning in place.
methods = ["two_step", "one_step", "image"]
count = 0
for index, row in annotated.iterrows():
    query, rico_id = row["query"], row["rico_id"]
    for method in methods:
        # A single (id, confidence) pair is passed, so exactly one result comes back.
        ranked = rank_and_filter(
            [(rico_id, 0.0)],
            query,
            1,
            method=method,
            ranking_type="binary",
            reasoning=True,
        )
        score, reasoning = ranked[0]["score"], ranked[0]["reasoning"]
        annotated.at[index, method] = score
        annotated.at[index, method + "_reasoning"] = reasoning
annotated.head()

In [None]:
# Persist the sheet including the model's binary scores and reasonings.
annotated.to_csv(data_path + "annotations_with_gpt4o.csv")

## Annotate with GPT (three-level)

In [None]:
# Load the semicolon-separated three-level annotation sheet; the annotation loop
# reads its "query" and "rico_id" columns.
annotated = pd.read_csv(data_path + "annotations_three_level.csv", sep = ";")
annotated.head()

In [None]:
# One score column and one reasoning column per method, filled with defaults.
for column in ("two_step", "one_step", "image"):
    annotated[column] = 0
for column in ("two_step_reasoning", "one_step_reasoning", "image_reasoning"):
    annotated[column] = ""
annotated.head()

In [None]:
# Rate every (query, UI) row once per method and store score and reasoning in place.
methods = ["two_step", "one_step", "image"]
count = 0
for index, row in annotated.iterrows():
    print(index)  # progress indicator
    query, rico_id = row["query"], row["rico_id"]
    for method in methods:
        # A single (id, confidence) pair is passed, so exactly one result comes back.
        ranked = rank_and_filter(
            [(rico_id, 0.0)],
            query,
            1,
            method=method,
            ranking_type="three_level",
            reasoning=True,
        )
        score, reasoning = ranked[0]["score"], ranked[0]["reasoning"]
        annotated.at[index, method] = score
        annotated.at[index, method + "_reasoning"] = reasoning
annotated.head()

In [None]:
# Write `annotated` (including the per-method score and reasoning columns) to
# CSV with the to_csv defaults, i.e. comma-separated and with the index column.
annotated.to_csv(data_path + "annotations_three_level_with_gpt4o.csv")

# 6. Annotate Gold Standard

In [None]:
# Load the gold standard and parse the two columns that hold Python literals
# serialised as text back into Python objects.
goldstandard_csv = pd.read_csv(f"{data_path}goldstandard.csv")
for literal_column in ("gui_indexes", "relevance"):
    goldstandard_csv[literal_column] = goldstandard_csv[literal_column].apply(literal_eval)
goldstandard_csv.head()

In [None]:
# Smoke test: take the first gold-standard row and rerank only its first three
# GUI indexes with the "image" method, asking for reasoning as well.
test_data = goldstandard_csv.iloc[0]
reranked = rank_and_rerank(test_data["gui_indexes"][:3], test_data["query"], retrieve=3, method="image", reasoning=True)

In [None]:
# Peek at the first two GUI indexes of the test row.
test_data["gui_indexes"][:2]

In [None]:
# Cast the reranked ids to int in place, then dump the ranking, the id
# mappings, the unmapped ranking and the reasoning of the response.
reranked["ranked"] = [int(gui_id) for gui_id in reranked["ranked"]]
for field in ("ranked", "id_mappings", "ranked_unmapped", "reasoning"):
    print(reranked[field])

In [None]:
# Check whether the reranked ids are exactly the row's GUI ids (order ignored).
# NOTE(review): the rerank call in this notebook was given only
# gui_indexes[:3], while this compares against the full list — confirm that
# this is intended.
set(test_data["gui_indexes"]) == set(reranked["ranked"])

In [None]:
# Inspect the usage section of the raw response: the Python type of the
# prompt-token count and the value of the completion-token count.
print(type(reranked["full_response"]["usage"]["prompt_tokens"]))
print(reranked["full_response"]["usage"]["completion_tokens"])

## Three-level/Binary annotations

In [None]:
# Score every GUI of each gold-standard query individually with rank_and_filter
# and record, per query: the scores, reasonings, responses, error flags and the
# token usage (per GUI and summed).
anno = goldstandard_csv.copy()

methods = ["image"]
ranking_type = "one_to_ten"  # the closing quote was missing here (SyntaxError)
# The cell used to stop right after the rows with index 0 and 1 (a leftover
# `if index >= 1: break`). That cap is kept, but made explicit; set it to None
# to annotate every row.
max_rows = 2
for method in methods:
    # Result columns are initialised with empty strings, in this order.
    for suffix in (
        "",
        "_reasonings",
        "_errors",
        "_responses",
        "_prompt_tokens",
        "_completion_tokens",
        "_total_prompt_tokens",
        "_total_completion_tokens",
    ):
        anno[method + suffix] = ""
    print(method)
    counter = 0  # number of rows annotated so far for this method
    for index, row in anno.iterrows():
        if max_rows is not None and counter >= max_rows:
            break
        print(index)
        filtered = rank_and_filter(row["gui_indexes"], row["query"], retrieve=20, method=method, reasoning=True, ranking_type=ranking_type)
        scores = []
        reasonings = []
        errors = []
        responses = []
        prompt_tokens = []
        completion_tokens = []
        total_prompt_tokens = 0
        total_completion_tokens = 0

        for filt in filtered:
            if filt["error"] == 0:
                scores.append(filt["score"])
                reasonings.append(filt["reasoning"])
                responses.append(filt["response"])
            else:
                # Failed judgement: neutral score, no reasoning, and keep (and
                # print) the raw response for debugging.
                scores.append(0)
                reasonings.append("")
                responses.append(filt["full_response"])
                print(filt["full_response"])
            errors.append(filt["error"])

            # Token usage is best effort. The "image" method exposes usage via
            # dict keys, the other methods via attributes.
            try:
                full_response = filt["full_response"]
                if method == "image":
                    usage = full_response["usage"]
                    n_prompt = usage["prompt_tokens"]
                    n_completion = usage["completion_tokens"]
                else:
                    usage = full_response.usage
                    n_prompt = usage.prompt_tokens
                    n_completion = usage.completion_tokens
                # Compute both new totals before committing either one, so a
                # failure can no longer leave the totals half-updated while the
                # per-GUI lists record 0.
                updated_totals = (
                    total_prompt_tokens + n_prompt,
                    total_completion_tokens + n_completion,
                )
            except Exception as e:
                print("Error!", type(e), e)
                print(filt)
                n_prompt, n_completion = 0, 0
            else:
                total_prompt_tokens, total_completion_tokens = updated_totals
            prompt_tokens.append(n_prompt)
            completion_tokens.append(n_completion)

        anno.at[index, method] = scores
        anno.at[index, method + "_reasonings"] = reasonings
        anno.at[index, method + "_errors"] = errors
        anno.at[index, method + "_responses"] = responses
        anno.at[index, method + "_prompt_tokens"] = prompt_tokens
        anno.at[index, method + "_completion_tokens"] = completion_tokens
        anno.at[index, method + "_total_prompt_tokens"] = total_prompt_tokens
        anno.at[index, method + "_total_completion_tokens"] = total_completion_tokens
        counter += 1
anno.to_csv(data_path + "goldstandard_one_to_ten_image_anno.csv")
anno.head()

In [None]:
# Write the current `anno` frame to the three-level (v2) result file.
# NOTE(review): the annotation cell in this notebook appears to use the
# one_to_ten ranking type and already saves `anno` under a one_to_ten file
# name — confirm `anno` holds three-level results before running this cell.
anno.to_csv(data_path + "goldstandard_three_level_v2_image_anno.csv")

## Reranking Annotations

In [None]:
# Rerank the full GUI list of every gold-standard query with rank_and_rerank
# and store the ranking, reasoning, raw response, id mappings, token usage and
# an error flag per query.
reranking_anno = goldstandard_csv.copy()

methods = ["image"]
# Forwarded to rank_and_rerank as `temp` — presumably the sampling temperature; confirm.
temp = 0.25
for method in methods:
    # Result columns are initialised with empty strings.
    reranking_anno[method] = ""
    reranking_anno[method + "_reasoning"] = ""
    reranking_anno[method + "_response"] = ""
    reranking_anno[method + "_id_mappings"] = ""
    reranking_anno[method + "_ranked_unmapped"] = ""
    reranking_anno[method + "_full_response"] = ""
    reranking_anno[method + "_prompt_tokens"] = ""
    reranking_anno[method + "_completion_tokens"] = ""
    reranking_anno[method + "_error"] = ""
    
    print(method)
    # NOTE(review): `counter` is not used anywhere else in this cell.
    counter = 0
    for index, row in reranking_anno.iterrows():
        print(index)
        # Only set to True when the call reported no error AND the returned
        # ranking passes the validation below.
        no_error = False
        reranked = rank_and_rerank(row["gui_indexes"], row["query"], retrieve=len(row["gui_indexes"]), method=method, reasoning=True, temp=temp)
        if reranked["error"] == 0:
            reranked["ranked"] = list(map(int, reranked["ranked"]))
            # Valid only if the ranking holds exactly the input ids: same set
            # of ids and same length as the input list.
            if set(row["gui_indexes"]) == set(reranked["ranked"]) and len(reranked["ranked"]) == len(row["gui_indexes"]):
                no_error = True
            else:
                # Report which input ids are missing and how many ids came back.
                print("Error!")
                print(set(row["gui_indexes"]).difference(set(reranked["ranked"])))
                print(len(reranked["ranked"]))
        # The writes below run in order; if any lookup fails, the columns
        # written so far keep their values and the row is flagged with error 1.
        try:
            reranking_anno.at[index, method + "_full_response"] = reranked["full_response"]
            # The "image" method exposes usage via dict keys, other methods via attributes.
            if method == "image":
                reranking_anno.at[index, method + "_prompt_tokens"] = reranked["full_response"]["usage"]["prompt_tokens"]
                reranking_anno.at[index, method + "_completion_tokens"] = reranked["full_response"]["usage"]["completion_tokens"]
            else:
                reranking_anno.at[index, method + "_prompt_tokens"] = int(reranked["full_response"].usage.prompt_tokens)
                reranking_anno.at[index, method + "_completion_tokens"] = int(reranked["full_response"].usage.completion_tokens)
            reranking_anno.at[index, method] = reranked["ranked"]
            reranking_anno.at[index, method + "_reasoning"] = reranked["reasoning"]
            reranking_anno.at[index, method + "_response"] = reranked["response"]
            reranking_anno.at[index, method + "_id_mappings"] = reranked["id_mappings"]
            reranking_anno.at[index, method + "_ranked_unmapped"] = reranked["ranked_unmapped"]
            # no_error implies reranked["error"] == 0, so the stored flag is
            # 0 on success and 1 for every kind of failure.
            if no_error:
                reranking_anno.at[index, method + "_error"] = reranked["error"]
            else:
                reranking_anno.at[index, method + "_error"] = 1
        except Exception as e:
            print(type(e), e)
            reranking_anno.at[index, method + "_error"] = 1       
reranking_anno.head()

In [None]:
# Diagnose failed reranking runs: for every row flagged with image_error == 1
# that has a stored ranking, list the input GUI ids that are missing from the
# ranking and collect them in `missing_uis`.
reranking_anno = pd.read_csv(data_path + "goldstandard_reranking_anno_image_run1_fix.csv")
missing_uis = []
for index, row in reranking_anno.iterrows():
    # read_csv loads empty cells as NaN by default, so comparing against ""
    # alone never skipped rows without a ranking; test for NaN explicitly.
    has_ranking = pd.notna(row["image"]) and row["image"] != ""
    if row["image_error"] == 1 and has_ranking:
        try:
            print()
            # Parse into local variables instead of writing back into `row`,
            # which is only a per-iteration copy handed out by iterrows().
            expected_ids = sorted(literal_eval(row["gui_indexes"]))
            ranked_ids = sorted(literal_eval(row["image"]))

            print(expected_ids)
            print(ranked_ids)
            dropped_ids = set(expected_ids).difference(ranked_ids)
            print(len(dropped_ids))
            for diff in dropped_ids:
                print(diff)
                # Was the missing id at least mentioned in the reasoning text?
                print(str(diff) in row["image_reasoning"])
                print(diff in ranked_ids)
                print(diff in expected_ids)
                missing_uis.append(diff)
            print(row["image_reasoning"])
            #show_images(row["gui_indexes"], rico_path)
        except Exception as e:
            # Best effort: report rows whose stored values cannot be analysed
            # and continue with the next row.
            print(e)
            print(row["image"])
print()
print("MISSING")
print(missing_uis)

# 7. Annotate Test Dataset

In [None]:
# Load the test set and parse the serialised "rico_ranking" column back into
# Python objects.
test_dataset = pd.read_csv(f"{data_path}dataset_test_k_top_20.csv")
ranking_column = "rico_ranking"
test_dataset[ranking_column] = test_dataset[ranking_column].apply(literal_eval)
test_dataset.head()

## Binary/One to ten annotation

In [None]:
# Annotate every test query twice (binary and one-to-ten) with the "image"
# method: each GUI in the query's rico_ranking is judged by rank_and_filter and
# the scores, reasonings, raw responses and token usage are stored per query.
method = "image"
ranking_types = ["binary", "one_to_ten"]

for ranking_type in ranking_types:
    print(ranking_type)
    # Result columns are initialised with empty strings, in this order.
    for suffix in (
        "_annotation",
        "_reasonings",
        "_full_responses",
        "_prompt_tokens",
        "_completion_tokens",
        "_total_prompt_tokens",
        "_total_completion_tokens",
    ):
        test_dataset[ranking_type + suffix] = ""
    for index, row in test_dataset.iterrows():
        print(index)
        filtered = rank_and_filter(row["rico_ranking"], row["Descriptions"], retrieve=len(row["rico_ranking"]), method=method, reasoning=True, ranking_type=ranking_type)
        scores = []
        reasonings = []
        full_responses = []
        prompt_tokens = []
        completion_tokens = []
        total_prompt_tokens = 0
        total_completion_tokens = 0
        for filt in filtered:
            succeeded = filt["error"] == 0
            scores.append(filt["score"] if succeeded else 0)
            reasonings.append(filt["reasoning"] if succeeded else "")
            # Extract everything first and append exactly once afterwards.
            # Previously a failure halfway through the error branch appended
            # the response (and possibly the prompt tokens) and then appended
            # the fallback values as well, so the per-GUI lists got longer
            # than `scores` and lost their alignment.
            try:
                full_response = filt["full_response"]
                usage = full_response["usage"]
                n_prompt = usage["prompt_tokens"]
                n_completion = usage["completion_tokens"]
                updated_totals = (
                    total_prompt_tokens + n_prompt,
                    total_completion_tokens + n_completion,
                )
            except Exception as e:
                if succeeded:
                    # Successful judgements must carry usage data; fail loudly
                    # as before instead of silently recording zeros.
                    raise
                print(type(e), e)
                full_response, n_prompt, n_completion = "Error", 0, 0
            else:
                total_prompt_tokens, total_completion_tokens = updated_totals
            full_responses.append(full_response)
            prompt_tokens.append(n_prompt)
            completion_tokens.append(n_completion)

        # The list-valued results are stored as their string representation,
        # except the raw responses, which are stored as a list object.
        test_dataset.at[index, ranking_type + "_annotation"] = str(scores)
        test_dataset.at[index, ranking_type + "_reasonings"] = str(reasonings)
        test_dataset.at[index, ranking_type + "_full_responses"] = full_responses
        test_dataset.at[index, ranking_type + "_prompt_tokens"] = str(prompt_tokens)
        test_dataset.at[index, ranking_type + "_completion_tokens"] = str(completion_tokens)
        test_dataset.at[index, ranking_type + "_total_prompt_tokens"] = total_prompt_tokens
        test_dataset.at[index, ranking_type + "_total_completion_tokens"] = total_completion_tokens

test_dataset.head()

In [None]:
# Turn the string-encoded list columns of every ranking type back into lists.
stringified_suffixes = ("_annotation", "_reasonings", "_prompt_tokens", "_completion_tokens")
for ranking_type in ranking_types:
    print(ranking_type)
    for suffix in stringified_suffixes:
        column_name = ranking_type + suffix
        test_dataset[column_name] = test_dataset[column_name].apply(literal_eval)


In [None]:
# Export the annotated test set; index=False leaves the row index out of the file.
test_dataset.to_csv(data_path + "dataset_test_k_top_20_binary_one_to_ten_annotated.csv", index=False)

In [None]:
# Preview the first rows of the annotated test set.
test_dataset.head()