<h2> Load Data</h2>

In [10]:
import ast
import json
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

import utils.GenSnippetsLib as gsl

In [3]:

def string_to_list_string(data):
    """Turn a space/newline separated array string into a comma separated one.

    Every space and newline is treated as a separator, empty elements are
    dropped, and the remaining elements are joined with single commas.
    If the result has a comma as its second character (as happens when a
    separator directly followed the opening bracket), its first two
    characters are replaced by ``"["``.

    Example: ``"[ 1.5 2.5\\n 3.5]"`` becomes ``"[1.5,2.5,3.5]"``.

    Parameters
    ----------
    data : str
        Raw string representation of a sequence of values.

    Returns
    -------
    str
        The normalised string. Strings that collapse to fewer than two
        characters (e.g. empty or whitespace-only input) are returned
        as-is instead of raising ``IndexError``.
    """
    data = data.replace(' ', ',')
    data = data.replace('\n', ',')
    data = ','.join([element for element in data.split(",") if len(element) > 0])
    # Guard the index access: the cleaned string may be empty or one char long.
    if len(data) > 1 and data[1] == ",":
        data = "[" + data[2:]
    return data


# Load the fixation statistics, keep only rows not flagged as outliers and
# drop the two columns that are not needed for the fixation analysis.
df_fixation = (
    pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
    .loc[lambda frame: frame["IsOutlier"] == False]
    .drop(columns=["IsOutlier", "Behavioral"])
)

# Every list-valued column is stored as a whitespace separated string;
# normalise each of them to a comma separated list string.
for list_column in (
    "Fixation_startT",
    "Fixation_endT",
    "Fixation_x",
    "Fixation_y",
    "Fixation_x_range",
    "Fixation_y_range",
):
    df_fixation[list_column] = df_fixation[list_column].apply(string_to_list_string)

In [6]:
# Get Bounding Boxes for Lines Of Code
# NOTE(review): presumably the resolution (in pixels) of the screen the
# snippets were shown on -- confirm against the experiment setup.
SCREEN_WIDTH, SCREEN_HEIGHT = 1920, 1080


def _cluster_letters_into_lines(aoi_list):
    """Merge per-letter bounding boxes into one bounding box per line.

    Parameters
    ----------
    aoi_list : iterable of dict
        Entries with a ``"letter"`` key and a ``"BoundingBox"`` value that is
        indexed as ``(left, top, right, bottom)``. An entry whose letter is
        ``'\\n'`` ends the current line.

    Returns
    -------
    list of tuple
        ``(line_number, left, top, right, bottom)`` for every line that
        contains at least one letter. Lines without letters are skipped but
        still advance the line counter.
    """
    line_boxes = []
    current_box = None
    current_line = 0
    for letter in aoi_list:
        if letter["letter"] == '\n':
            if current_box is not None:
                line_boxes.append((current_line, *current_box))
            current_box = None
            current_line += 1
            continue
        box = letter["BoundingBox"]
        if current_box is None:
            current_box = (box[0], box[1], box[2], box[3])
        else:
            current_box = (
                min(current_box[0], box[0]),
                min(current_box[1], box[1]),
                max(current_box[2], box[2]),
                max(current_box[3], box[3]),
            )
    # Flush the last line: it is only terminated by '\n' if the letter list
    # ends with one, otherwise it would be lost.
    if current_box is not None:
        line_boxes.append((current_line, *current_box))
    return line_boxes


snippets = df_fixation["Algorithm"].unique()
line_records = []
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    # NOTE(review): assumes image.size is (width, height), as for PIL images
    # -- confirm for gsl.create_image.
    image_width, image_height = image.size
    # Offsets that place the image centred on the screen.
    width_offset = int(SCREEN_WIDTH * 0.5) - int(image_width / 2)
    height_offset = int(SCREEN_HEIGHT * 0.5) - int(image_height / 2)

    for line_number, left, top, right, bottom in _cluster_letters_into_lines(aoi_list):
        line_records.append([snippet, line_number,
                             (left + width_offset,
                              top + height_offset,
                              right + width_offset,
                              bottom + height_offset)])

# Build the frame once instead of enlarging it row by row.
df_lines = pd.DataFrame(line_records, columns=["Algorithm", "Line", "BoundingBox"])
df_lines

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Line,BoundingBox
0,IsPrime,0,"(768, 467, 1152, 482)"
1,IsPrime,1,"(768, 486, 1112, 501)"
2,IsPrime,2,"(768, 505, 1049, 519)"
3,IsPrime,3,"(768, 526, 1008, 539)"
4,IsPrime,4,"(768, 543, 880, 557)"
...,...,...,...
429,Rectangle,12,"(740, 601, 1012, 615)"
430,Rectangle,13,"(740, 619, 788, 633)"
431,Rectangle,15,"(740, 657, 956, 672)"
432,Rectangle,16,"(740, 676, 1100, 691)"


In [9]:
# Map every fixation to the first code line whose vertical extent it overlaps.
# A fixation covers the integer y positions from y - ceil(y_range) to
# y + ceil(y_range); fixations that overlap no line are not recorded.
participants = df_fixation["Participant"].unique()
fixation_line_records = []
for snippet in tqdm(snippets):
    df_token_per_algo = df_lines[df_lines["Algorithm"] == snippet]
    # Hoisted out of the fixation loop: for every line, the first and last
    # integer y position that lies inside its bounding box (top = index 1,
    # bottom = index 3).
    line_spans = [
        (line_number, math.ceil(bounding_box[1]), math.floor(bounding_box[3]))
        for line_number, bounding_box in zip(df_token_per_algo["Line"], df_token_per_algo["BoundingBox"])
    ]
    df_fixation_snippet = df_fixation[df_fixation["Algorithm"] == snippet]

    for participant in participants:
        df_fixation_participant = df_fixation_snippet[df_fixation_snippet["Participant"] == participant]
        if len(df_fixation_participant) == 0:
            continue
        # Only the first row per (snippet, participant) is used.
        trial = df_fixation_participant.iloc[0]
        # literal_eval parses the list strings without executing them as
        # code, which eval() would do for anything found in the CSV.
        start_times = ast.literal_eval(trial["Fixation_startT"])
        end_times = ast.literal_eval(trial["Fixation_endT"])
        y_coordinates = ast.literal_eval(trial["Fixation_y"])
        y_ranges = ast.literal_eval(trial["Fixation_y_range"])

        fixations = zip(start_times, end_times, y_coordinates, y_ranges)
        for fix_idx, (start, end, y, y_spread) in enumerate(fixations):
            low_y = int(float(y) - math.ceil(float(y_spread)))
            high_y = int(float(y) + math.ceil(float(y_spread)))

            for line_number, first_y, last_y in line_spans:
                # Some integer y lies in both [low_y, high_y] and the line's
                # box exactly when the two integer intervals overlap.
                if max(low_y, first_y) <= min(high_y, last_y):
                    fixation_line_records.append([snippet, participant, fix_idx, start, end, line_number])
                    break

# Build the frame once instead of enlarging it row by row.
df_line_fixation_per_participant = pd.DataFrame(
    fixation_line_records,
    columns=["Algorithm", "Participant", "FixationNumber", "FixationStart", "FixationEnd", "LineNumber"],
)
df_line_fixation_per_participant

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationStart,FixationEnd,LineNumber
0,IsPrime,1,3,1136.027,1332.034,0
1,IsPrime,1,4,1356.034,1512.039,0
2,IsPrime,1,5,1536.038,1812.044,0
3,IsPrime,1,6,1840.047,2132.055,0
4,IsPrime,1,7,2156.055,2932.074,0
...,...,...,...,...,...,...
77833,Rectangle,71,26,8160.260,8312.263,7
77834,Rectangle,71,27,8332.263,8520.265,13
77835,Rectangle,71,29,8800.282,8948.286,3
77836,Rectangle,71,31,9428.297,9596.305,2


In [11]:
# Length of every snippet, taken from the "source-code" entry of its AOI
# generator file.
# NOTE(review): assumes "source-code" holds one element per line of code; if
# it is a single string, len() counts characters instead -- confirm.
snippets = df_fixation["Algorithm"].unique()
snippet_length_records = []
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    # Explicit encoding so the result does not depend on the platform default.
    with open(aoi_token_generator, encoding="utf-8") as f:
        generator_spec = json.load(f)
    LOC = len(generator_spec["source-code"])
    snippet_length_records.append([snippet, LOC])

# Build the frame once instead of enlarging it row by row.
df_snippet_length = pd.DataFrame(snippet_length_records, columns=["Algorithm", "LOC"])
df_snippet_length

  0%|          | 0/32 [00:00<?, ?it/s]

In [14]:
# Behavioural measures per participant and snippet, restricted to the rows
# that are not flagged as outliers.
df_behavioral = (
    pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
    .loc[lambda frame: frame["IsOutlier"] == False]
    [["Participant", "Algorithm", "Duration", "SkillScore"]]
)
df_behavioral

Unnamed: 0,Participant,Algorithm,Duration,SkillScore
0,1,IsPrime,12.390280,0.331385
1,1,SiebDesEratosthenes,152.571914,0.331385
2,1,IsAnagram,109.615724,0.331385
3,1,RemoveDoubleChar,53.456276,0.331385
4,1,BinToDecimal,49.922091,0.331385
...,...,...,...,...
1066,71,GreatestCommonDivisor,30.757360,0.435651
1067,71,DumpSorting,113.368945,0.435651
1068,71,BinomialCoefficient,50.637861,0.435651
1069,71,IsAnagram,110.995754,0.435651


In [53]:
# Attach snippet length and behavioural measures to every line-level fixation.
df_combined = pd.merge(df_line_fixation_per_participant, df_snippet_length, on=["Algorithm"])
df_combined = pd.merge(df_combined, df_behavioral, on=["Participant", "Algorithm"])
# NOTE(review): presumably converts fixation times from milliseconds to
# seconds so that they are comparable with "Duration" -- confirm the units.
df_combined["FixationStart"] = df_combined["FixationStart"] / 1000.0
df_combined["FixationEnd"] = df_combined["FixationEnd"] / 1000.0


def loc_coverage_after_time_percentage(df, percentage):
    """Share of a snippet's lines fixated within the first part of a trial.

    Parameters
    ----------
    df : pandas.DataFrame
        Fixations of one participant on one snippet with the columns
        "Duration", "LOC", "FixationEnd" and "LineNumber". "Duration" and
        "LOC" are read from the first row only.
    percentage : float
        Fraction of "Duration" that defines the time window (0.2 = 20 %).

    Returns
    -------
    float
        Number of distinct "LineNumber" values among the fixations that
        ended within ``Duration * percentage``, divided by "LOC".
    """
    end_duration = df["Duration"].iloc[0]
    loc = df["LOC"].iloc[0]
    max_duration = end_duration * percentage
    df_filtered = df[df["FixationEnd"] <= max_duration]
    unique_loc = df_filtered["LineNumber"].nunique()
    return unique_loc / loc


COVERAGE_FRACTIONS = {"20%": 0.2, "30%": 0.3, "40%": 0.4, "50%": 0.5}

# One group per trial; only the columns the coverage function reads are kept.
trials = df_combined.groupby(["Algorithm", "Participant"])[["Duration", "LOC", "FixationEnd", "LineNumber"]]

coverage_by_participant = {}
for label, fraction in COVERAGE_FRACTIONS.items():
    coverage_per_trial = trials.apply(
        lambda df, fraction=fraction: loc_coverage_after_time_percentage(df, fraction)
    )
    # Average over snippets. The result stays indexed by participant, so no
    # participant count is hard-coded and no ordering is assumed.
    coverage_by_participant[label] = coverage_per_trial.groupby(level="Participant").mean()

# Per-participant coverage as plain arrays, kept under their previous names.
df_20 = coverage_by_participant["20%"].values
df_30 = coverage_by_participant["30%"].values
df_40 = coverage_by_participant["40%"].values
df_50 = coverage_by_participant["50%"].values

df_skill = df_behavioral[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# The coverage columns are aligned on the participant index and the skill
# score is matched by participant value rather than by row position.
df_code_coverage = (
    pd.DataFrame(coverage_by_participant)
    .reset_index()
    .merge(df_skill, on=["Participant"])
    .set_index("Participant")
)
df_code_coverage

Unnamed: 0_level_0,20%,30%,40%,50%,SkillScore
Participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.245391,0.348396,0.432725,0.49312,0.331385
2,0.251232,0.355527,0.418872,0.464605,0.379187
3,0.415433,0.521158,0.638263,0.71783,0.311264
4,0.245859,0.331361,0.427805,0.500664,0.424727
5,0.298898,0.390414,0.469532,0.536783,0.313031
6,0.255461,0.391562,0.485468,0.578998,0.315932
7,0.267153,0.392478,0.459563,0.495865,0.420873
10,0.152989,0.243787,0.323574,0.405975,0.350392
11,0.332465,0.457989,0.540979,0.610152,0.178206
12,0.192586,0.29123,0.38224,0.483282,0.309233


In [54]:
# Mean over all participants of the line coverage in the "30%" column.
df_code_coverage["30%"].mean()

0.3102354705268953

In [55]:
# Pairwise Spearman rank correlation between all columns of the coverage
# table (the four coverage columns and SkillScore).
df_code_coverage.corr(method="spearman")

Unnamed: 0,20%,30%,40%,50%,SkillScore
20%,1.0,0.974633,0.964438,0.944523,-0.08606
30%,0.974633,1.0,0.982693,0.955666,-0.11119
40%,0.964438,0.982693,1.0,0.978426,-0.110479
50%,0.944523,0.955666,0.978426,1.0,-0.172831
SkillScore,-0.08606,-0.11119,-0.110479,-0.172831,1.0
