In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import utils.GenSnippetsLib as gsl
import math

## Import Fixation Data

In [2]:
def string_to_list_string(data):
    data = data.replace(' ', ',')
    data = data.replace('\n', ',')
    data = ','.join([element for element in data.split(",") if len(element) > 0])
    if data[1] == ",":
        data = "[" + data[2:]
    return data

df_fixation = pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
df_fixation = df_fixation[df_fixation["IsOutlier"] == False]
df_fixation = df_fixation.drop(columns=["IsOutlier", "Behavioral"])
df_fixation["Fixation_startT"] = df_fixation["Fixation_startT"].apply(string_to_list_string)
df_fixation["Fixation_endT"] = df_fixation["Fixation_endT"].apply(string_to_list_string)
df_fixation["Fixation_x"] = df_fixation["Fixation_x"].apply(string_to_list_string)
df_fixation["Fixation_y"] = df_fixation["Fixation_y"].apply(string_to_list_string)
df_fixation["Fixation_x_range"] = df_fixation["Fixation_x_range"].apply(string_to_list_string)
df_fixation["Fixation_y_range"] = df_fixation["Fixation_y_range"].apply(string_to_list_string)

# Token Based Metrics

### Read in the Generator for Token Based Metrics to get The BoundingBoxes and Indices of each Token per Algorithm

In [3]:
# Get Token based AOIs
snippets = df_fixation["Algorithm"].unique()
df_token_aois = pd.DataFrame(columns=["Algorithm", "Token", "TokenIdx", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = None
    current_top = None
    current_right = None
    current_bottom = None
    current_aoi = None
    color = None
    for letter in aoi_list:
        if len(letter["AOI"]) == 1 or letter["letter"] == '\n':
            if current_aoi is not None:
                aoi_clustered.append(
                    (len(aoi_clustered), current_aoi, current_left, current_top, current_right, current_bottom, color))
            current_aoi = None
            color = None
            current_left = None
            current_top = None
            current_right = None
            current_bottom = None
            continue
        if current_aoi is None:
            current_aoi = letter["AOI"][1]
            color = letter["color"]
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]
        elif current_aoi == letter["AOI"][1]:
            current_left = min(current_left, letter["BoundingBox"][0])
            current_top = min(current_top, letter["BoundingBox"][1])
            current_right = max(current_right, letter["BoundingBox"][2])
            current_bottom = max(current_bottom, letter["BoundingBox"][3])
        else:
            aoi_clustered.append(
                (len(aoi_clustered), current_aoi, current_left, current_top, current_right, current_bottom, color))
            current_aoi = letter["AOI"][1]
            color = letter["color"]
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]

    for token in aoi_clustered:
        df_token_aois.loc[len(df_token_aois)] = [snippet, token[1], token[0],
                                                 (token[2] + width_offset,
                                                  token[3] + height_offset,
                                                  token[4] + width_offset,
                                                  token[5] + height_offset)]
df_token_aois

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Token,TokenIdx,BoundingBox
0,IsPrime,Modifier,0,"(808, 468, 856, 482)"
1,IsPrime,Modifier,1,"(864, 468, 912, 479)"
2,IsPrime,BasicType,2,"(920, 469, 976, 479)"
3,IsPrime,Identifier,3,"(984, 468, 1040, 479)"
4,IsPrime,Separator,4,"(1040, 467, 1048, 481)"
...,...,...,...,...
2700,Rectangle,Keyword,82,"(988, 677, 1020, 688)"
2701,Rectangle,Separator,83,"(1020, 685, 1028, 688)"
2702,Rectangle,Identifier,84,"(1028, 677, 1076, 691)"
2703,Rectangle,Separator,85,"(1076, 676, 1100, 691)"


### Check which Fixation of which Participant is in which Token

In [4]:
df_token_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber",
                                                              "FixationDuration", "TokenIdx"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_token_per_algo = df_token_aois[df_token_aois["Algorithm"] == snippet]

    for participant in participants:
        df_fixation_participant = df_fixation[
            (df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        x_coordinates = eval(df_fixation_participant["Fixation_x"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        x_range = eval(df_fixation_participant["Fixation_x_range"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, x, y, x_range, y_range) in zip(idx_values, start_times, end_times, x_coordinates, y_coordinates, x_range, y_range):
            low_x = int(float(x) - math.ceil(float(x_range)))
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_x = int(float(x) + math.ceil(float(x_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [(x, y) for x in range(low_x, high_x + 1) for y in range(low_y, high_y + 1)]

            found = False
            for idx, row in df_token_per_algo.iterrows():
                token_idx = row["TokenIdx"]
                bounding_box = row["BoundingBox"]

                for possible_x, possible_y in possible_coordinates:
                    if bounding_box[0] <= possible_x <= bounding_box[2] and bounding_box[1] <= possible_y <=bounding_box[3]:
                        df_token_fixation_per_participant.loc[len(df_token_fixation_per_participant)] = [snippet, participant,fix_idx,end - start,token_idx]
                        found = True
                        break
                if found:
                    break

df_token_fixation_per_participant

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationDuration,TokenIdx
0,IsPrime,1,4,156.005,0
1,IsPrime,1,6,292.008,1
2,IsPrime,1,7,776.019,3
3,IsPrime,1,11,196.006,13
4,IsPrime,1,16,480.011,38
...,...,...,...,...,...
37381,Rectangle,70,70,347.998,66
37382,Rectangle,71,16,212.006,43
37383,Rectangle,71,17,776.027,42
37384,Rectangle,71,26,152.003,42


### Transform the Data to a Fixation/ Refixation split by Participant

In [5]:
df_token = df_token_aois.copy()
df_token = df_token.drop(["BoundingBox", "Token"], axis=1)
for participant in participants:
    df_token.loc[:, f"TokenFixation_P{participant}"] = [[] for _ in range(len(df_token))]
    df_token.loc[:, f"TokenReFixation_P{participant}"] = [[] for _ in range(len(df_token))]


prev_participant = df_token_fixation_per_participant["Participant"].iloc[0]
prev_token_idx = df_token_fixation_per_participant["TokenIdx"].iloc[0]
prev_algorithm = df_token_fixation_per_participant["Algorithm"].iloc[0]
fixations = []
re_fixation = False
for idx, row in tqdm(df_token_fixation_per_participant.iterrows(), total=len(df_token_fixation_per_participant)):
    participant = row["Participant"]
    token_idx = row["TokenIdx"]
    algorithm = row["Algorithm"]
    FixationDuration = row["FixationDuration"]

    # fixation switches
    if prev_participant != participant or prev_token_idx != token_idx:
        index = df_token[(df_token["TokenIdx"] == prev_token_idx) & (df_token["Algorithm"] == prev_algorithm)].index[0]
        if re_fixation:
            re_fixations = df_token.loc[index, f"TokenReFixation_P{prev_participant}"]
            re_fixations.extend(fixations.copy())
            df_token.loc[index, f"TokenReFixation_P{prev_participant}"] = re_fixations.copy()
        else:
            df_token.loc[index, f"TokenFixation_P{prev_participant}"] = fixations.copy()
        fixations = []
        # possible new fixation
        re_fixation = False

    sub_frame = df_token[(df_token["TokenIdx"] == token_idx) & (df_token["Algorithm"] == algorithm)]
    if len(sub_frame) == 0:
        raise Exception(f"No Token found for {token_idx} in Algorithm {algorithm}")
    len_of_fixation = len(sub_frame[f"TokenFixation_P{participant}"].iloc[0])
    if re_fixation == False and len_of_fixation > 0 and len(fixations) == 0:
        re_fixation = True

    fixations.append(FixationDuration)
    prev_participant = participant
    prev_token_idx = token_idx
    prev_algorithm = algorithm

df_token

  0%|          | 0/37386 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,TokenIdx,TokenFixation_P1,TokenReFixation_P1,TokenFixation_P2,TokenReFixation_P2,TokenFixation_P3,TokenReFixation_P3,TokenFixation_P4,TokenReFixation_P4,...,TokenFixation_P66,TokenReFixation_P66,TokenFixation_P67,TokenReFixation_P67,TokenFixation_P68,TokenReFixation_P68,TokenFixation_P70,TokenReFixation_P70,TokenFixation_P71,TokenReFixation_P71
0,IsPrime,0,[156.00499999999988],[],[296.01099999999997],[156.0050000000001],[],[],[],[],...,[],[],[312.0119999999988],[],[],[],[],[],[],[]
1,IsPrime,1,[292.0079999999998],[],[388.01199999999994],"[360.0160000000001, 96.0029999999997]",[216.00700000000143],[],[208.0069999999996],[],...,[],[],[1108.0370000000003],"[1240.0420000000013, 232.00499999999738, 648.0...",[],[],[],[],[],[]
2,IsPrime,2,[],[],[192.01600000000008],"[212.0050000000001, 176.00800000000027, 996.04...",[],[],[440.0110000000004],[],...,[],[],[1944.0699999999997],"[235.9960000000001, 3036.1009999999987]",[704.0219999999999],"[236.00700000000143, 368.01300000000265, 3268....",[],[],[],[]
3,IsPrime,3,[776.0190000000002],[204.00900000000001],[],[],[],[],[620.0229999999997],[],...,[156.00400000000081],[500.012999999999],[740.028],"[900.031, 276.0159999999996, 212.0099999999984...",[320.0129999999999],[],[],[],[],[]
4,IsPrime,4,[],[],[236.00099999999998],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2700,Rectangle,82,[376.0139999999992],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2701,Rectangle,83,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2702,Rectangle,84,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2703,Rectangle,85,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


### Calculate the Token Based Eyetracking Metrics

In [6]:
# Melt the Dataframe to be thinner so that we have Algorithm TokenIdx and Participant and Keys per Fixation / Refixation
df_token_melted = pd.melt(df_token, id_vars=["Algorithm", "TokenIdx"], var_name="KindOfFixation", value_name="FixationDurations")

# Classify Participant and the Kind and Number of Fixations / Refixations
df_token_melted["Participant"] = df_token_melted["KindOfFixation"].apply(lambda x: int(x.split("_")[1][1:]))
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: x.split("_")[0])
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: "Fixation" if x == "TokenFixation" else "ReFixation")
df_token_melted["NumberOfFixations"] = df_token_melted["FixationDurations"].apply(lambda x: len(x))

# Get the number of Participants for further calculations
number_of_participants = len(participants)

# Calculate the First Fixation Duration per Participant per Token
df_token_melted["FirstFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "FirstFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) > 0 else None)

# Calculate the Single Fixation Duration per Participant per Token
df_token_melted["SingleFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "SingleFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) == 1 else None)

# Calculate the Gaze Duration per Participant per Token
df_token_melted["GazeDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "GazeDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: sum(x) if len(x) > 0 else None)

# Calculate the Total Time per Participant per Token
df_token_melted_total_time = df_token_melted.groupby(["Participant", "Algorithm", "TokenIdx"])\
    .agg({"FixationDurations": lambda x: sum(x.values.sum())})
df_token_melted_total_time = df_token_melted_total_time.rename(columns={"FixationDurations": "TotalTime"})

# Merge the Dataframes
df_token_melted = pd.merge(df_token_melted, df_token_melted_total_time, on=["Participant", "Algorithm", "TokenIdx"], how="left")

# Cast the Dataframes to the right datatype
df_token_melted["FirstFixationDuration"] = df_token_melted["FirstFixationDuration"].astype(float)
df_token_melted["SingleFixationDuration"] = df_token_melted["SingleFixationDuration"].astype(float)
df_token_melted["GazeDuration"] = df_token_melted["GazeDuration"].astype(float)
df_token_melted["TotalTime"] = df_token_melted["TotalTime"].astype(float).replace(0, np.nan)

# Read in the Skilllevel
df_skill = pd.read_csv(f"./data/filteredData/filtered_data.csv")
df_skill = df_skill[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# Merge the Dataframes to combine metrics with the Skilllevel
df_metrics_skill = pd.merge(df_token_melted, df_skill, on=["Participant"], how="left")

# Helper Methods for the Metrics
def get_single_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 1]
    return df_fixations

def get_no_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 0]
    return df_fixations


def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] > 1]
    return df_fixations

def get_multiple_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[(df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)) & (df_fixations["NumberOfFixations"] >= 1)]
    return df_fixations

def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on where no Fixation is found
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] > 1]
    return df_fixations



# dataframe for number of fixations per participant
number_of_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])["NumberOfFixations"].count()
number_of_fixation_per_algorithm = number_of_fixation_per_algorithm.reset_index()

# dataframe for number of tokens per algorithm
number_of_tokens_per_algorithm = df_metrics_skill.groupby(["Algorithm"])["TokenIdx"].max()
number_of_tokens_per_algorithm = number_of_tokens_per_algorithm.reset_index()

# dataframe for number of tokens with no fixation per algorithm per participant
df_no_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_no_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with only one fixation per algorithm per participant
df_single_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_single_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more than one fixation per algorithm per participant
df_multiple_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_multiple_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more or equal than one fixation per algorithm per participant
df_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_fixations)\
    .drop(["Algorithm"], axis=1)

# Reformat the dataframes
no_fixation_per_algorithm = df_no_fixation_per_algorithm[["TokenIdx"]]
no_fixation_per_algorithm = no_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

single_fixation_per_algorithm = df_single_fixation_per_algorithm[["TokenIdx"]]
single_fixation_per_algorithm = single_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

multiple_fixation_per_algorithm = df_multiple_fixation_per_algorithm[["TokenIdx"]]
multiple_fixation_per_algorithm = multiple_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

fixations_per_algorithm = df_fixation_per_algorithm[["TokenIdx"]]
fixations_per_algorithm = fixations_per_algorithm.reset_index().drop(["level_2"], axis=1)

# Helper Method for Probability Metrics
def group_len_divided_by_number(current_df, counting_df):
    algorithm = current_df["Algorithm"].iloc[0]
    number_of_tokens = counting_df[counting_df["Algorithm"] == algorithm]["TokenIdx"].iloc[0]
    value = len(current_df) / number_of_tokens
    return len(current_df) / (number_of_tokens + 1)

# Calculate the Metrics per Participant
# Probability of no fixation
df_no_fixation_probability = no_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_no_fixation_probability = df_no_fixation_probability.reset_index()

# Probability of single fixation
df_single_fixation_probability = single_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_single_fixation_probability = df_single_fixation_probability.reset_index()

# Probability of multiple fixation
df_multiple_fixation_probability = multiple_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_multiple_fixation_probability = df_multiple_fixation_probability.reset_index()

# Probability of fixation
df_fixation_probability = fixations_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_fixation_probability = df_fixation_probability.reset_index()

# Calculate the means for the metrics per algorithm
df_no_fixation_probability = df_no_fixation_probability.groupby(["Participant"]).mean()
df_single_fixation_probability = df_single_fixation_probability.groupby(["Participant"]).mean()
df_multiple_fixation_probability = df_multiple_fixation_probability.groupby(["Participant"]).mean()
df_fixation_probability = df_fixation_probability.groupby(["Participant"]).mean()

# Raw Durations Metrics
# Duration of first fixation
df_first_fixation = df_metrics_skill[~df_metrics_skill["FirstFixationDuration"].isnull()]
df_first_fixation = df_first_fixation.groupby(["Participant"])["FirstFixationDuration"].mean()

# Duration of single fixation
df_single_fixation = df_metrics_skill[~df_metrics_skill["SingleFixationDuration"].isnull()]
df_single_fixation = df_single_fixation.groupby(["Participant"])["SingleFixationDuration"].mean()

# Duration of gaze duration
df_gaze_duration = df_metrics_skill[~df_metrics_skill["GazeDuration"].isnull()]
df_gaze_duration = df_gaze_duration.groupby(["Participant"])["GazeDuration"].mean()

# Total Time
df_total_time = df_metrics_skill[~df_metrics_skill["TotalTime"].isnull()]
df_total_time = df_total_time.groupby(["Participant"])["TotalTime"].mean()

# Put every metric dataframe together into one
df_combined = pd.DataFrame({"FirstFixationDuration": df_first_fixation.values,
                            "SingleFixationDuration": df_single_fixation.values,
                            "GazeDuration": df_gaze_duration.values,
                            "TotalTime": df_total_time.values,
                            "TokenNoFixationProbability": df_no_fixation_probability.values.reshape(37, ),
                            "TokenSingleFixationProbability": df_single_fixation_probability.values.reshape(37, ),
                            "TokenMultipleFixationProbability": df_multiple_fixation_probability.values.reshape(37, ),
                            "TokenFixationProbability": df_fixation_probability.values.reshape(37, ),
                            "Skill": df_metrics_skill.groupby(["Participant"])["SkillScore"].mean().values})
# get spearman correlation for metrics and skill level
df_combined.corr(method="spearman")

Unnamed: 0,FirstFixationDuration,SingleFixationDuration,GazeDuration,TotalTime,TokenNoFixationProbability,TokenSingleFixationProbability,TokenMultipleFixationProbability,TokenFixationProbability,Skill
FirstFixationDuration,1.0,0.995258,0.970602,0.822191,0.122333,0.117117,-0.070175,0.083452,-0.141299
SingleFixationDuration,0.995258,1.0,0.968468,0.82385,0.130394,0.117591,-0.064011,0.059033,-0.143433
GazeDuration,0.970602,0.968468,1.0,0.78165,0.174964,0.066619,-0.136795,0.120199,-0.188952
TotalTime,0.822191,0.82385,0.78165,1.0,-0.040778,0.27193,0.28734,0.307729,-0.370792
TokenNoFixationProbability,0.122333,0.130394,0.174964,-0.040778,1.0,-0.740161,-0.717639,-0.541489,0.081555
TokenSingleFixationProbability,0.117117,0.117591,0.066619,0.27193,-0.740161,1.0,0.800379,0.625652,-0.311759
TokenMultipleFixationProbability,-0.070175,-0.064011,-0.136795,0.28734,-0.717639,0.800379,1.0,0.646278,-0.374111
TokenFixationProbability,0.083452,0.059033,0.120199,0.307729,-0.541489,0.625652,0.646278,1.0,-0.586297
Skill,-0.141299,-0.143433,-0.188952,-0.370792,0.081555,-0.311759,-0.374111,-0.586297,1.0


In [43]:
df_combined

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationStart,FixationEnd,LineNumber,LOC,Duration,SkillScore
0,IsPrime,1,3,1.136027,1.332034,0,8,12.390280,0.331385
1,IsPrime,1,4,1.356034,1.512039,0,8,12.390280,0.331385
2,IsPrime,1,5,1.536038,1.812044,0,8,12.390280,0.331385
3,IsPrime,1,6,1.840047,2.132055,0,8,12.390280,0.331385
4,IsPrime,1,7,2.156055,2.932074,0,8,12.390280,0.331385
...,...,...,...,...,...,...,...,...,...
77833,Rectangle,71,26,8.160260,8.312263,7,18,9.794147,0.435651
77834,Rectangle,71,27,8.332263,8.520265,13,18,9.794147,0.435651
77835,Rectangle,71,29,8.800282,8.948286,3,18,9.794147,0.435651
77836,Rectangle,71,31,9.428297,9.596305,2,18,9.794147,0.435651


# AOI based analysis

### Read in the Generator for AOI Based Metrics to get The BoundingBoxes and Indices of each Token per Algorithm

In [31]:
# Get Token based AOIs
snippets = df_fixation["Algorithm"].unique()
df_aois = pd.DataFrame(columns=["Algorithm", "AOI", "AOIIdx", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}.json"
    try:
        image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    except:
        print(f"{snippet} failed")
        image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/", logging=True)
        break
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = []
    current_top = []
    current_right = []
    current_bottom = []
    current_aoi = []
    color = []
    for letter in aoi_list:
        # Close AOI
        if letter['letter'] in " \t\n":
            continue
        if len(letter["AOI"]) == 1:
            for idx in range(len(current_aoi)):
                aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))

            current_aoi = []
            color = []
            current_left = []
            current_top = []
            current_right = []
            current_bottom = []
            continue
        # There is no AOI set
        if len(current_aoi) == 0:
            current_aoi = []
            color = []
            current_left = []
            current_top = []
            current_right = []
            current_bottom = []
            for idx in range(1, len(letter["AOI"])):
                current_aoi.append(letter["AOI"][idx])
                current_left.append(letter["BoundingBox"][0])
                current_top.append(letter["BoundingBox"][1])
                current_right.append(letter["BoundingBox"][2])
                current_bottom.append(letter["BoundingBox"][3])
                color.append(letter["color"])
            continue


        for idx in reversed(range(len(current_aoi))):
            if current_aoi[idx] in letter["AOI"]:
                current_left[idx] = min(current_left[idx], letter["BoundingBox"][0])
                current_top[idx] = min(current_top[idx], letter["BoundingBox"][1])
                current_right[idx] = max(current_right[idx], letter["BoundingBox"][2])
                current_bottom[idx] = max(current_bottom[idx], letter["BoundingBox"][3])
                # remove the AOI from the letter
                letter["AOI"].remove(current_aoi[idx])
            else:
                aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))
                del current_aoi[idx]
                del current_left[idx]
                del current_top[idx]
                del current_right[idx]
                del current_bottom[idx]
                del color[idx]

        for idx in range(1, len(letter["AOI"])):
            current_aoi.append(letter["AOI"][idx])
            current_left.append(letter["BoundingBox"][0])
            current_top.append(letter["BoundingBox"][1])
            current_right.append(letter["BoundingBox"][2])
            current_bottom.append(letter["BoundingBox"][3])
            color.append(letter["color"])

    for idx in range(len(current_aoi)):
        aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))

    for token in aoi_clustered:
        df_aois.loc[len(df_aois)] = [snippet, token[1], token[0],
                                                 (token[2] + width_offset,
                                                  token[3] + height_offset,
                                                  token[4] + width_offset,
                                                  token[5] + height_offset)]

df_aois

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,AOI,AOIIdx,BoundingBox
0,IsPrime,method_identifier,0,"(993, 444, 1070, 460)"
1,IsPrime,method_signature,1,"(751, 444, 1202, 464)"
2,IsPrime,method_argument_declaration,2,"(1070, 444, 1202, 463)"
3,IsPrime,for_head,3,"(795, 469, 1147, 489)"
4,IsPrime,arithmetic_expression,4,"(883, 494, 994, 510)"
...,...,...,...,...
626,Rectangle,method_call_identifier,14,"(1048, 769, 1114, 789)"
627,Rectangle,method_arguments,15,"(1114, 769, 1136, 788)"
628,Rectangle,method_call,16,"(993, 769, 1136, 789)"
629,Rectangle,arithmetic_expression,17,"(828, 769, 1136, 789)"


### Check which Fixation of which Participant is in which AOI

In [33]:
df_aoi_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber",
                                                              "FixationDuration", "AOIIdx", "AOIName"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_aois_per_algo = df_aois[df_aois["Algorithm"] == snippet]

    for participant in tqdm(participants):

        df_fixation_participant = df_fixation[(df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        x_coordinates = eval(df_fixation_participant["Fixation_x"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        x_range = eval(df_fixation_participant["Fixation_x_range"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, x, y, x_range, y_range) in zip(idx_values, start_times, end_times, x_coordinates, y_coordinates, x_range, y_range):
            low_x = int(float(x) - math.ceil(float(x_range)))
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_x = int(float(x) + math.ceil(float(x_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [(x, y) for x in range(low_x, high_x + 1) for y in range(low_y, high_y + 1)]

            for idx, row in df_aois_per_algo.iterrows():
                aoi_idx = row["AOIIdx"]
                aoi_name = row["AOI"]
                bounding_box = row["BoundingBox"]

                for possible_x, possible_y in possible_coordinates:
                    if bounding_box[0] <= possible_x <= bounding_box[2] and bounding_box[1] <= possible_y <=bounding_box[3]:
                        df_aoi_fixation_per_participant.loc[len(df_aoi_fixation_per_participant)] = [snippet, participant, fix_idx ,end - start, aoi_idx, aoi_name]
                        break

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]


### Transform the Data to a Fixation/ Refixation split by Participant

In [34]:
df_token = df_token_aois.copy()
df_token = df_token.drop(["BoundingBox", "Token"], axis=1)
for participant in participants:
    df_token.loc[:, f"TokenFixation_P{participant}"] = [[] for _ in range(len(df_token))]
    df_token.loc[:, f"TokenReFixation_P{participant}"] = [[] for _ in range(len(df_token))]


prev_participant = df_token_fixation_per_participant["Participant"].iloc[0]
prev_token_idx = df_token_fixation_per_participant["TokenIdx"].iloc[0]
prev_algorithm = df_token_fixation_per_participant["Algorithm"].iloc[0]
fixations = []
re_fixation = False
for idx, row in tqdm(df_token_fixation_per_participant.iterrows(), total=len(df_token_fixation_per_participant)):
    participant = row["Participant"]
    token_idx = row["TokenIdx"]
    algorithm = row["Algorithm"]
    FixationDuration = row["FixationDuration"]

    # fixation switches
    if prev_participant != participant or prev_token_idx != token_idx:
        index = df_token[(df_token["TokenIdx"] == prev_token_idx) & (df_token["Algorithm"] == prev_algorithm)].index[0]
        if re_fixation:
            re_fixations = df_token.loc[index, f"TokenReFixation_P{prev_participant}"]
            re_fixations.extend(fixations.copy())
            df_token.loc[index, f"TokenReFixation_P{prev_participant}"] = re_fixations.copy()
        else:
            df_token.loc[index, f"TokenFixation_P{prev_participant}"] = fixations.copy()
        fixations = []
        # possible new fixation
        re_fixation = False

    sub_frame = df_token[(df_token["TokenIdx"] == token_idx) & (df_token["Algorithm"] == algorithm)]
    if len(sub_frame) == 0:
        raise Exception(f"No Token found for {token_idx} in Algorithm {algorithm}")
    len_of_fixation = len(sub_frame[f"TokenFixation_P{participant}"].iloc[0])
    if re_fixation == False and len_of_fixation > 0 and len(fixations) == 0:
        re_fixation = True

    fixations.append(FixationDuration)
    prev_participant = participant
    prev_token_idx = token_idx
    prev_algorithm = algorithm

  0%|          | 0/37386 [00:00<?, ?it/s]

### Calculate the AOI Based Eyetracking Metrics

In [35]:
# Melt the Dataframe to be thinner so that we have Algorithm TokenIdx and Participant and Keys per Fixation / Refixation
df_token_melted = pd.melt(df_token, id_vars=["Algorithm", "TokenIdx"], var_name="KindOfFixation", value_name="FixationDurations")

# Classify Participant and the Kind and Number of Fixations / Refixations
df_token_melted["Participant"] = df_token_melted["KindOfFixation"].apply(lambda x: int(x.split("_")[1][1:]))
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: x.split("_")[0])
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: "Fixation" if x == "TokenFixation" else "ReFixation")
df_token_melted["NumberOfFixations"] = df_token_melted["FixationDurations"].apply(lambda x: len(x))

# Get the number of Participants for further calculations
number_of_participants = len(participants)

# Calculate the First Fixation Duration per Participant per Token
df_token_melted["FirstFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "FirstFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) > 0 else None)

# Calculate the Single Fixation Duration per Participant per Token
df_token_melted["SingleFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "SingleFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) == 1 else None)

# Calculate the Gaze Duration per Participant per Token
df_token_melted["GazeDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "GazeDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: sum(x) if len(x) > 0 else None)

# Calculate the Total Time per Participant per Token
df_token_melted_total_time = df_token_melted.groupby(["Participant", "Algorithm", "TokenIdx"])\
    .agg({"FixationDurations": lambda x: sum(x.values.sum())})
df_token_melted_total_time = df_token_melted_total_time.rename(columns={"FixationDurations": "TotalTime"})

# Merge the Dataframes
df_token_melted = pd.merge(df_token_melted, df_token_melted_total_time, on=["Participant", "Algorithm", "TokenIdx"], how="left")

# Cast the Dataframes to the right datatype
df_token_melted["FirstFixationDuration"] = df_token_melted["FirstFixationDuration"].astype(float)
df_token_melted["SingleFixationDuration"] = df_token_melted["SingleFixationDuration"].astype(float)
df_token_melted["GazeDuration"] = df_token_melted["GazeDuration"].astype(float)
df_token_melted["TotalTime"] = df_token_melted["TotalTime"].astype(float).replace(0, np.nan)

# Read in the Skilllevel
df_skill = pd.read_csv(f"./data/filteredData/filtered_data.csv")
df_skill = df_skill[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# Merge the Dataframes to combine metrics with the Skilllevel
df_metrics_skill = pd.merge(df_token_melted, df_skill, on=["Participant"], how="left")

# Helper Methods for the Metrics
def get_single_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 1]
    return df_fixations

def get_no_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 0]
    return df_fixations


def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] > 1]
    return df_fixations

def get_multiple_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[(df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)) & (df_fixations["NumberOfFixations"] >= 1)]
    return df_fixations

def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on where no Fixation is found
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] > 1]
    return df_fixations



# dataframe for number of fixations per participant
number_of_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])["NumberOfFixations"].count()
number_of_fixation_per_algorithm = number_of_fixation_per_algorithm.reset_index()

# dataframe for number of tokens per algorithm
number_of_tokens_per_algorithm = df_metrics_skill.groupby(["Algorithm"])["TokenIdx"].max()
number_of_tokens_per_algorithm = number_of_tokens_per_algorithm.reset_index()

# dataframe for number of tokens with no fixation per algorithm per participant
df_no_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_no_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with only one fixation per algorithm per participant
df_single_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_single_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more than one fixation per algorithm per participant
df_multiple_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_multiple_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more or equal than one fixation per algorithm per participant
df_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_fixations)\
    .drop(["Algorithm"], axis=1)

# Reformat the dataframes
no_fixation_per_algorithm = df_no_fixation_per_algorithm[["TokenIdx"]]
no_fixation_per_algorithm = no_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

single_fixation_per_algorithm = df_single_fixation_per_algorithm[["TokenIdx"]]
single_fixation_per_algorithm = single_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

multiple_fixation_per_algorithm = df_multiple_fixation_per_algorithm[["TokenIdx"]]
multiple_fixation_per_algorithm = multiple_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

fixations_per_algorithm = df_fixation_per_algorithm[["TokenIdx"]]
fixations_per_algorithm = fixations_per_algorithm.reset_index().drop(["level_2"], axis=1)

# Helper Method for Probability Metrics
def group_len_divided_by_number(current_df, counting_df):
    algorithm = current_df["Algorithm"].iloc[0]
    number_of_tokens = counting_df[counting_df["Algorithm"] == algorithm]["TokenIdx"].iloc[0]
    value = len(current_df) / number_of_tokens
    return len(current_df) / (number_of_tokens + 1)

# Calculate the Metrics per Participant
# Probability of no fixation
df_no_fixation_probability = no_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_no_fixation_probability = df_no_fixation_probability.reset_index()

# Probability of single fixation
df_single_fixation_probability = single_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_single_fixation_probability = df_single_fixation_probability.reset_index()

# Probability of multiple fixation
df_multiple_fixation_probability = multiple_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_multiple_fixation_probability = df_multiple_fixation_probability.reset_index()

# Probability of fixation
df_fixation_probability = fixations_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_fixation_probability = df_fixation_probability.reset_index()

# Calculate the means for the metrics per algorithm
df_no_fixation_probability = df_no_fixation_probability.groupby(["Participant"]).mean()
df_single_fixation_probability = df_single_fixation_probability.groupby(["Participant"]).mean()
df_multiple_fixation_probability = df_multiple_fixation_probability.groupby(["Participant"]).mean()
df_fixation_probability = df_fixation_probability.groupby(["Participant"]).mean()

# Raw Durations Metrics
# Duration of first fixation
df_first_fixation = df_metrics_skill[~df_metrics_skill["FirstFixationDuration"].isnull()]
df_first_fixation = df_first_fixation.groupby(["Participant"])["FirstFixationDuration"].mean()

# Duration of single fixation
df_single_fixation = df_metrics_skill[~df_metrics_skill["SingleFixationDuration"].isnull()]
df_single_fixation = df_single_fixation.groupby(["Participant"])["SingleFixationDuration"].mean()

# Duration of gaze duration
df_gaze_duration = df_metrics_skill[~df_metrics_skill["GazeDuration"].isnull()]
df_gaze_duration = df_gaze_duration.groupby(["Participant"])["GazeDuration"].mean()

# Total Time
df_total_time = df_metrics_skill[~df_metrics_skill["TotalTime"].isnull()]
df_total_time = df_total_time.groupby(["Participant"])["TotalTime"].mean()

# Put every metric dataframe together into one
df_combined = pd.DataFrame({"FirstFixationDuration": df_first_fixation.values,
                            "SingleFixationDuration": df_single_fixation.values,
                            "GazeDuration": df_gaze_duration.values,
                            "TotalTime": df_total_time.values,
                            "TokenNoFixationProbability": df_no_fixation_probability.values.reshape(37, ),
                            "TokenSingleFixationProbability": df_single_fixation_probability.values.reshape(37, ),
                            "TokenMultipleFixationProbability": df_multiple_fixation_probability.values.reshape(37, ),
                            "TokenFixationProbability": df_fixation_probability.values.reshape(37, ),
                            "Skill": df_metrics_skill.groupby(["Participant"])["SkillScore"].mean().values})
# get spearman correlation for metrics and skill level
df_combined.corr(method="spearman")

Unnamed: 0,FirstFixationDuration,SingleFixationDuration,GazeDuration,TotalTime,TokenNoFixationProbability,TokenSingleFixationProbability,TokenMultipleFixationProbability,TokenFixationProbability,Skill
FirstFixationDuration,1.0,0.995258,0.970602,0.822191,0.122333,0.117117,-0.070175,0.083452,-0.141299
SingleFixationDuration,0.995258,1.0,0.968468,0.82385,0.130394,0.117591,-0.064011,0.059033,-0.143433
GazeDuration,0.970602,0.968468,1.0,0.78165,0.174964,0.066619,-0.136795,0.120199,-0.188952
TotalTime,0.822191,0.82385,0.78165,1.0,-0.040778,0.27193,0.28734,0.307729,-0.370792
TokenNoFixationProbability,0.122333,0.130394,0.174964,-0.040778,1.0,-0.740161,-0.717639,-0.541489,0.081555
TokenSingleFixationProbability,0.117117,0.117591,0.066619,0.27193,-0.740161,1.0,0.800379,0.625652,-0.311759
TokenMultipleFixationProbability,-0.070175,-0.064011,-0.136795,0.28734,-0.717639,0.800379,1.0,0.646278,-0.374111
TokenFixationProbability,0.083452,0.059033,0.120199,0.307729,-0.541489,0.625652,0.646278,1.0,-0.586297
Skill,-0.141299,-0.143433,-0.188952,-0.370792,0.081555,-0.311759,-0.374111,-0.586297,1.0


# Get the LOCs

In [36]:
# Get Bounding Boxes for Lines Of Code
snippets = df_fixation["Algorithm"].unique()
df_lines = pd.DataFrame(columns=["Algorithm", "Line", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = None
    current_top = None
    current_right = None
    current_bottom = None
    current_line = 0
    for letter in aoi_list:
        if letter["letter"] == '\n':
            if current_left is not None:
                aoi_clustered.append((current_line, current_left, current_top, current_right, current_bottom))
            current_left = None
            current_top = None
            current_right = None
            current_bottom = None
            current_line += 1
            continue
        if current_left is None:
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]
        else:
            current_left = min(current_left, letter["BoundingBox"][0])
            current_top = min(current_top, letter["BoundingBox"][1])
            current_right = max(current_right, letter["BoundingBox"][2])
            current_bottom = max(current_bottom, letter["BoundingBox"][3])

    for token in aoi_clustered:
        df_lines.loc[len(df_lines)] = [snippet, token[0],
                                       (token[1] + width_offset,
                                        token[2] + height_offset,
                                        token[3] + width_offset,
                                        token[4] + height_offset)]
df_lines

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Line,BoundingBox
0,IsPrime,0,"(768, 467, 1152, 482)"
1,IsPrime,1,"(768, 486, 1112, 501)"
2,IsPrime,2,"(768, 505, 1049, 519)"
3,IsPrime,3,"(768, 526, 1008, 539)"
4,IsPrime,4,"(768, 543, 880, 557)"
...,...,...,...
429,Rectangle,12,"(740, 601, 1012, 615)"
430,Rectangle,13,"(740, 619, 788, 633)"
431,Rectangle,15,"(740, 657, 956, 672)"
432,Rectangle,16,"(740, 676, 1100, 691)"


In [37]:
df_line_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber", "FixationStart", "FixationEnd", "LineNumber"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_token_per_algo = df_lines[df_lines["Algorithm"] == snippet]

    for participant in participants:
        df_fixation_participant = df_fixation[(df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, y, y_range) in zip(idx_values, start_times, end_times, y_coordinates, y_range):
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [y for y in range(low_y, high_y + 1)]

            found = False
            for idx, row in df_token_per_algo.iterrows():
                line_number = row["Line"]
                bounding_box = row["BoundingBox"]

                for possible_y in possible_coordinates:
                    if bounding_box[1] <= possible_y <= bounding_box[3]:
                        df_line_fixation_per_participant.loc[len(df_line_fixation_per_participant)] = [snippet, participant, fix_idx, start, end, line_number]
                        found = True
                        break
                if found:
                    break

df_line_fixation_per_participant

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationStart,FixationEnd,LineNumber
0,IsPrime,1,3,1136.027,1332.034,0
1,IsPrime,1,4,1356.034,1512.039,0
2,IsPrime,1,5,1536.038,1812.044,0
3,IsPrime,1,6,1840.047,2132.055,0
4,IsPrime,1,7,2156.055,2932.074,0
...,...,...,...,...,...,...
77833,Rectangle,71,26,8160.260,8312.263,7
77834,Rectangle,71,27,8332.263,8520.265,13
77835,Rectangle,71,29,8800.282,8948.286,3
77836,Rectangle,71,31,9428.297,9596.305,2


In [38]:
import json
snippets = df_fixation["Algorithm"].unique()
df_snippet_length = pd.DataFrame(columns=["Algorithm", "LOC"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    with open(aoi_token_generator) as f:
        aoi_list = json.load(f)
        data = aoi_list["source-code"]
        LOC = len(data)
        df_snippet_length.loc[len(df_snippet_length)] = [snippet, LOC]
df_snippet_length

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,LOC
0,IsPrime,8
1,SiebDesEratosthenes,20
2,IsAnagram,28
3,RemoveDoubleChar,19
4,BinToDecimal,9
5,PermuteString,23
6,Power,7
7,BinarySearch,15
8,ContainsSubstring,21
9,ReverseArray,7


In [39]:
df_behavioral = pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
df_behavioral = df_behavioral[df_behavioral["IsOutlier"] == False]
df_behavioral = df_behavioral[["Participant", "Algorithm", "Duration", "SkillScore"]]
df_behavioral

Unnamed: 0,Participant,Algorithm,Duration,SkillScore
0,1,IsPrime,12.390280,0.331385
1,1,SiebDesEratosthenes,152.571914,0.331385
2,1,IsAnagram,109.615724,0.331385
3,1,RemoveDoubleChar,53.456276,0.331385
4,1,BinToDecimal,49.922091,0.331385
...,...,...,...,...
1066,71,GreatestCommonDivisor,30.757360,0.435651
1067,71,DumpSorting,113.368945,0.435651
1068,71,BinomialCoefficient,50.637861,0.435651
1069,71,IsAnagram,110.995754,0.435651


In [40]:
df_combined = pd.merge(df_line_fixation_per_participant, df_snippet_length, on=["Algorithm"])
df_combined = pd.merge(df_combined, df_behavioral, on=["Participant", "Algorithm"])
df_combined["FixationStart"] = df_combined["FixationStart"] / 1000.0
df_combined["FixationEnd"] = df_combined["FixationEnd"] / 1000.0


def loc_coverage_after_time_percentage(df, percentage):
    end_duration = df["Duration"].iloc[0]
    loc = df["LOC"].iloc[0]
    max_duration = end_duration * percentage
    df_filtered = df[df["FixationEnd"] <= max_duration]
    unique_loc = df_filtered["LineNumber"].nunique()
    return unique_loc / loc

df_20 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.2))
df_20 = df_20.reset_index()

df_30 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.3))
df_30 = df_30.reset_index()

df_40 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.4))
df_40 = df_40.reset_index()

df_50 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.5))
df_50 = df_50.reset_index()

df_20 = df_20.groupby(["Participant"]).mean().values.reshape(37, )
df_30 = df_30.groupby(["Participant"]).mean().values.reshape(37, )
df_40 = df_40.groupby(["Participant"]).mean().values.reshape(37, )
df_50 = df_50.groupby(["Participant"]).mean().values.reshape(37, )


df_skill = df_behavioral[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

df_code_coverage = pd.DataFrame({"Participant": participants, "20%": df_20, "30%": df_30, "40%": df_40, "50%": df_50})
df_code_coverage.set_index("Participant", inplace=True, drop=True)
df_code_coverage = pd.merge(df_code_coverage, df_skill, on=["Participant"])
df_code_coverage.set_index("Participant", inplace=True, drop=True)
df_code_coverage

Unnamed: 0_level_0,20%,30%,40%,50%,SkillScore
Participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.245391,0.348396,0.432725,0.49312,0.331385
2,0.251232,0.355527,0.418872,0.464605,0.379187
3,0.415433,0.521158,0.638263,0.71783,0.311264
4,0.245859,0.331361,0.427805,0.500664,0.424727
5,0.298898,0.390414,0.469532,0.536783,0.313031
6,0.255461,0.391562,0.485468,0.578998,0.315932
7,0.267153,0.392478,0.459563,0.495865,0.420873
10,0.152989,0.243787,0.323574,0.405975,0.350392
11,0.332465,0.457989,0.540979,0.610152,0.178206
12,0.192586,0.29123,0.38224,0.483282,0.309233


In [41]:
df_code_coverage["30%"].mean()

0.3102354705268953

In [42]:
# spearman correlation
df_code_coverage.corr(method="spearman")

Unnamed: 0,20%,30%,40%,50%,SkillScore
20%,1.0,0.974633,0.964438,0.944523,-0.08606
30%,0.974633,1.0,0.982693,0.955666,-0.11119
40%,0.964438,0.982693,1.0,0.978426,-0.110479
50%,0.944523,0.955666,0.978426,1.0,-0.172831
SkillScore,-0.08606,-0.11119,-0.110479,-0.172831,1.0
