# RQ1 - Eyetracking

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import utils.GenSnippetsLib as gsl
import math
import json

## Import Fixation Data

In [2]:
# function to convert a string seperated by whitespace characters back to python list
def string_to_list_string(data):
    data = data.replace(' ', ',')
    data = data.replace('\n', ',')
    data = ','.join([element for element in data.split(",") if len(element) > 0])
    if data[1] == ",":
        data = "[" + data[2:]
    return data

# read in the fixation data
df_fixation = pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
# kick out outliers
df_fixation = df_fixation[df_fixation["IsOutlier"] == False]
df_fixation = df_fixation.drop(columns=["IsOutlier", "Behavioral"])

# transform fixation strings to lists
df_fixation["Fixation_startT"] = df_fixation["Fixation_startT"].apply(string_to_list_string)
df_fixation["Fixation_endT"] = df_fixation["Fixation_endT"].apply(string_to_list_string)
df_fixation["Fixation_x"] = df_fixation["Fixation_x"].apply(string_to_list_string)
df_fixation["Fixation_y"] = df_fixation["Fixation_y"].apply(string_to_list_string)
df_fixation["Fixation_x_range"] = df_fixation["Fixation_x_range"].apply(string_to_list_string)
df_fixation["Fixation_y_range"] = df_fixation["Fixation_y_range"].apply(string_to_list_string)

# Token Based Metrics

### Read in the Generator for Token Based Metrics to get The BoundingBoxes and Indices of each Token per Algorithm

In [3]:
# Get Token based AOIs
snippets = df_fixation["Algorithm"].unique()
df_token_aois = pd.DataFrame(columns=["Algorithm", "Token", "TokenIdx", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/Generators_Labeled/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = None
    current_top = None
    current_right = None
    current_bottom = None
    current_aoi = None
    color = None
    for letter in aoi_list:
        if len(letter["AOI"]) == 1 or letter["letter"] == '\n':
            if current_aoi is not None:
                aoi_clustered.append(
                    (len(aoi_clustered), current_aoi, current_left, current_top, current_right, current_bottom, color))
            current_aoi = None
            color = None
            current_left = None
            current_top = None
            current_right = None
            current_bottom = None
            continue
        if current_aoi is None:
            current_aoi = letter["AOI"][1]
            color = letter["color"]
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]
        elif current_aoi == letter["AOI"][1]:
            current_left = min(current_left, letter["BoundingBox"][0])
            current_top = min(current_top, letter["BoundingBox"][1])
            current_right = max(current_right, letter["BoundingBox"][2])
            current_bottom = max(current_bottom, letter["BoundingBox"][3])
        else:
            aoi_clustered.append(
                (len(aoi_clustered), current_aoi, current_left, current_top, current_right, current_bottom, color))
            current_aoi = letter["AOI"][1]
            color = letter["color"]
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]

    for token in aoi_clustered:
        df_token_aois.loc[len(df_token_aois)] = [snippet, token[1], token[0],
                                                 (token[2] + width_offset,
                                                  token[3] + height_offset,
                                                  token[4] + width_offset,
                                                  token[5] + height_offset)]
df_token_aois

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Token,TokenIdx,BoundingBox
0,IsPrime,Modifier,0,"(808, 468, 856, 482)"
1,IsPrime,Modifier,1,"(864, 468, 912, 479)"
2,IsPrime,BasicType,2,"(920, 469, 976, 479)"
3,IsPrime,Identifier,3,"(984, 468, 1040, 479)"
4,IsPrime,Separator,4,"(1040, 467, 1048, 481)"
...,...,...,...,...
2700,SiebDesEratosthenes,Separator,133,"(788, 676, 796, 690)"
2701,SiebDesEratosthenes,Keyword,134,"(788, 697, 836, 707)"
2702,SiebDesEratosthenes,Identifier,135,"(844, 696, 892, 710)"
2703,SiebDesEratosthenes,Separator,136,"(892, 699, 900, 710)"


### Check which Fixation of which Participant is in which Token

In [5]:
df_token_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber",
                                                              "FixationDuration", "TokenIdx"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_token_per_algo = df_token_aois[df_token_aois["Algorithm"] == snippet]

    for participant in participants:
        df_fixation_participant = df_fixation[
            (df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        x_coordinates = eval(df_fixation_participant["Fixation_x"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        x_range = eval(df_fixation_participant["Fixation_x_range"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, x, y, x_range, y_range) in zip(idx_values, start_times, end_times, x_coordinates, y_coordinates, x_range, y_range):
            low_x = int(float(x) - math.ceil(float(x_range)))
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_x = int(float(x) + math.ceil(float(x_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [(x, y) for x in range(low_x, high_x + 1) for y in range(low_y, high_y + 1)]

            found = False
            for idx, row in df_token_per_algo.iterrows():
                token_idx = row["TokenIdx"]
                bounding_box = row["BoundingBox"]

                for possible_x, possible_y in possible_coordinates:
                    if bounding_box[0] <= possible_x <= bounding_box[2] and bounding_box[1] <= possible_y <=bounding_box[3]:
                        df_token_fixation_per_participant.loc[len(df_token_fixation_per_participant)] = [snippet, participant,fix_idx,end - start,token_idx]
                        found = True
                        break
                if found:
                    break

df_token_fixation_per_participant

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationDuration,TokenIdx
0,IsPrime,1,4,156.005,0
1,IsPrime,1,6,292.008,1
2,IsPrime,1,7,776.019,3
3,IsPrime,1,11,196.006,13
4,IsPrime,1,16,480.011,38
...,...,...,...,...,...
36145,SiebDesEratosthenes,71,97,156.003,122
36146,SiebDesEratosthenes,71,101,176.006,94
36147,SiebDesEratosthenes,71,105,228.007,121
36148,SiebDesEratosthenes,71,126,156.008,134


### Transform the Data to a Fixation/ Refixation split by Participant

In [6]:
df_token = df_token_aois.copy()
df_token = df_token.drop(["BoundingBox", "Token"], axis=1)
for participant in participants:
    df_token.loc[:, f"TokenFixation_P{participant}"] = [[] for _ in range(len(df_token))]
    df_token.loc[:, f"TokenReFixation_P{participant}"] = [[] for _ in range(len(df_token))]


prev_participant = df_token_fixation_per_participant["Participant"].iloc[0]
prev_token_idx = df_token_fixation_per_participant["TokenIdx"].iloc[0]
prev_algorithm = df_token_fixation_per_participant["Algorithm"].iloc[0]
fixations = []
re_fixation = False
for idx, row in tqdm(df_token_fixation_per_participant.iterrows(), total=len(df_token_fixation_per_participant)):
    participant = row["Participant"]
    token_idx = row["TokenIdx"]
    algorithm = row["Algorithm"]
    FixationDuration = row["FixationDuration"]

    # fixation switches
    if prev_participant != participant or prev_token_idx != token_idx:
        index = df_token[(df_token["TokenIdx"] == prev_token_idx) & (df_token["Algorithm"] == prev_algorithm)].index[0]
        if re_fixation:
            re_fixations = df_token.loc[index, f"TokenReFixation_P{prev_participant}"]
            re_fixations.extend(fixations.copy())
            df_token.loc[index, f"TokenReFixation_P{prev_participant}"] = re_fixations.copy()
        else:
            df_token.loc[index, f"TokenFixation_P{prev_participant}"] = fixations.copy()
        fixations = []
        # possible new fixation
        re_fixation = False

    sub_frame = df_token[(df_token["TokenIdx"] == token_idx) & (df_token["Algorithm"] == algorithm)]
    if len(sub_frame) == 0:
        raise Exception(f"No Token found for {token_idx} in Algorithm {algorithm}")
    len_of_fixation = len(sub_frame[f"TokenFixation_P{participant}"].iloc[0])
    if re_fixation == False and len_of_fixation > 0 and len(fixations) == 0:
        re_fixation = True

    fixations.append(FixationDuration)
    prev_participant = participant
    prev_token_idx = token_idx
    prev_algorithm = algorithm

df_token

  0%|          | 0/36150 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,TokenIdx,TokenFixation_P1,TokenReFixation_P1,TokenFixation_P2,TokenReFixation_P2,TokenFixation_P3,TokenReFixation_P3,TokenFixation_P4,TokenReFixation_P4,...,TokenFixation_P66,TokenReFixation_P66,TokenFixation_P67,TokenReFixation_P67,TokenFixation_P68,TokenReFixation_P68,TokenFixation_P70,TokenReFixation_P70,TokenFixation_P71,TokenReFixation_P71
0,IsPrime,0,[156.00499999999988],[],[296.01099999999997],[156.0050000000001],[],[],[],[],...,[],[],[312.0119999999988],[],[],[],[],[],[],[]
1,IsPrime,1,[292.0079999999998],[],[388.01199999999994],"[360.0160000000001, 96.0029999999997]",[220.00700000000143],[],[208.0069999999996],[],...,[],[],[616.0129999999999],"[1104.0370000000003, 684.0220000000008, 232.00...",[],[],[],[],[],[]
2,IsPrime,2,[],[],[192.01600000000008],"[212.0050000000001, 176.00800000000027, 996.04...",[],[],[440.0110000000004],[],...,[],[],[1956.0699999999997],"[235.9960000000001, 272.0109999999986, 1268.04...",[704.0219999999999],"[236.00700000000143, 368.01300000000265, 3268....",[],[],[],[]
3,IsPrime,3,[776.0190000000002],[204.00900000000001],[],[],[],[],[620.0229999999997],[],...,[156.00400000000081],[524.0169999999998],[740.028],"[900.031, 276.0159999999996, 208.0099999999984...",[316.0129999999999],[],[],[],[],[]
4,IsPrime,4,[],[],[236.00099999999998],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2700,SiebDesEratosthenes,133,[],[],[],[],[676.025999999998],[],[],[],...,[],[],[],[],[],[],[],[],[],[]
2701,SiebDesEratosthenes,134,[],[],[296.0109999999986],[],"[292.0069999999978, 704.0249999999942, 780.025...",[],[132.00599999999395],[],...,[],[],[],[],[],[],[],[],[156.00800000000163],[]
2702,SiebDesEratosthenes,135,[],[],[],[],[776.028999999995],[416.01299999999173],[],[],...,[],[],[],[],[],[],[],[],[],[]
2703,SiebDesEratosthenes,136,[],[],[],[],[],[],[],[],...,[],[],[],[],[],[],[],[],[],[]


### Calculate the Token Based Eyetracking Metrics

In [9]:
# Melt the Dataframe to be thinner so that we have Algorithm TokenIdx and Participant and Keys per Fixation / Refixation
df_token_melted = pd.melt(df_token, id_vars=["Algorithm", "TokenIdx"], var_name="KindOfFixation", value_name="FixationDurations")

# Classify Participant and the Kind and Number of Fixations / Refixations
df_token_melted["Participant"] = df_token_melted["KindOfFixation"].apply(lambda x: int(x.split("_")[1][1:]))
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: x.split("_")[0])
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: "Fixation" if x == "TokenFixation" else "ReFixation")
df_token_melted["NumberOfFixations"] = df_token_melted["FixationDurations"].apply(lambda x: len(x))

# Get the number of Participants for further calculations
number_of_participants = len(participants)

# Calculate the First Fixation Duration per Participant per Token
df_token_melted["FirstFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "FirstFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) > 0 else None)

# Calculate the Single Fixation Duration per Participant per Token
df_token_melted["SingleFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "SingleFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) == 1 else None)

# Calculate the Gaze Duration per Participant per Token
df_token_melted["GazeDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "GazeDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: sum(x) if len(x) > 0 else None)

# Calculate the Total Time per Participant per Token
df_token_melted_total_time = df_token_melted.groupby(["Participant", "Algorithm", "TokenIdx"])\
    .agg({"FixationDurations": lambda x: sum(x.values.sum())})
df_token_melted_total_time = df_token_melted_total_time.rename(columns={"FixationDurations": "TotalTime"})

# Merge the Dataframes
df_token_melted = pd.merge(df_token_melted, df_token_melted_total_time, on=["Participant", "Algorithm", "TokenIdx"], how="left")

# Cast the Dataframes to the right datatype
df_token_melted["FirstFixationDuration"] = df_token_melted["FirstFixationDuration"].astype(float)
df_token_melted["SingleFixationDuration"] = df_token_melted["SingleFixationDuration"].astype(float)
df_token_melted["GazeDuration"] = df_token_melted["GazeDuration"].astype(float)
df_token_melted["TotalTime"] = df_token_melted["TotalTime"].astype(float).replace(0, np.nan)

# Read in the Skilllevel
df_skill = pd.read_csv(f"./data/filteredData/filtered_data.csv")
df_skill = df_skill[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# Merge the Dataframes to combine metrics with the Skilllevel
df_metrics_skill = pd.merge(df_token_melted, df_skill, on=["Participant"], how="left")

# Helper Methods for the Metrics
def get_no_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 0]
    return df_fixations


def get_single_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 1]
    return df_fixations

def get_multiple_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[(df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)) & (df_fixations["NumberOfFixations"] >= 1)]
    return df_fixations

def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on where no Fixation is found
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] >= 1]
    return df_fixations



# dataframe for number of fixations per participant
number_of_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])["NumberOfFixations"].count()
number_of_fixation_per_algorithm = number_of_fixation_per_algorithm.reset_index()

# dataframe for number of tokens per algorithm
number_of_tokens_per_algorithm = df_metrics_skill.groupby(["Algorithm"])["TokenIdx"].max()
number_of_tokens_per_algorithm = number_of_tokens_per_algorithm.reset_index()

# dataframe for number of tokens with no fixation per algorithm per participant
df_no_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_no_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with only one fixation per algorithm per participant
df_single_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_single_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more than one fixation per algorithm per participant
df_multiple_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_multiple_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more or equal than one fixation per algorithm per participant
df_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_fixations)\
    .drop(["Algorithm"], axis=1)

# Reformat the dataframes
no_fixation_per_algorithm = df_no_fixation_per_algorithm[["TokenIdx"]]
no_fixation_per_algorithm = no_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

single_fixation_per_algorithm = df_single_fixation_per_algorithm[["TokenIdx"]]
single_fixation_per_algorithm = single_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

multiple_fixation_per_algorithm = df_multiple_fixation_per_algorithm[["TokenIdx"]]
multiple_fixation_per_algorithm = multiple_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

fixations_per_algorithm = df_fixation_per_algorithm[["TokenIdx"]]
fixations_per_algorithm = fixations_per_algorithm.reset_index().drop(["level_2"], axis=1)

# Helper Method for Probability Metrics
def group_len_divided_by_number(current_df, counting_df):
    algorithm = current_df["Algorithm"].iloc[0]
    number_of_tokens = counting_df[counting_df["Algorithm"] == algorithm]["TokenIdx"].iloc[0]
    value = len(current_df) / number_of_tokens
    return len(current_df) / (number_of_tokens + 1)

# Calculate the Metrics per Participant
# Probability of no fixation
df_no_fixation_probability = no_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_no_fixation_probability = df_no_fixation_probability.reset_index()

# Probability of single fixation
df_single_fixation_probability = single_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_single_fixation_probability = df_single_fixation_probability.reset_index()

# Probability of multiple fixation
df_multiple_fixation_probability = multiple_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_multiple_fixation_probability = df_multiple_fixation_probability.reset_index()

# Probability of fixation
df_fixation_probability = fixations_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_fixation_probability = df_fixation_probability.reset_index()

# Calculate the means for the metrics per algorithm
df_no_fixation_probability = df_no_fixation_probability.groupby(["Participant"]).mean()
df_single_fixation_probability = df_single_fixation_probability.groupby(["Participant"]).mean()
df_multiple_fixation_probability = df_multiple_fixation_probability.groupby(["Participant"]).mean()
df_fixation_probability = df_fixation_probability.groupby(["Participant"]).mean()

# Raw Durations Metrics
# Duration of first fixation
df_first_fixation = df_metrics_skill[~df_metrics_skill["FirstFixationDuration"].isnull()]
df_first_fixation = df_first_fixation.groupby(["Participant"])["FirstFixationDuration"].mean()

# Duration of single fixation
df_single_fixation = df_metrics_skill[~df_metrics_skill["SingleFixationDuration"].isnull()]
df_single_fixation = df_single_fixation.groupby(["Participant"])["SingleFixationDuration"].mean()

# Duration of gaze duration
df_gaze_duration = df_metrics_skill[~df_metrics_skill["GazeDuration"].isnull()]
df_gaze_duration = df_gaze_duration.groupby(["Participant"])["GazeDuration"].mean()

# Total Time
df_total_time = df_metrics_skill[~df_metrics_skill["TotalTime"].isnull()]
df_total_time = df_total_time.groupby(["Participant"])["TotalTime"].mean()

# Put every metric dataframe together into one
df_combined = pd.DataFrame({"FirstFixationDuration": df_first_fixation.values,
                            "SingleFixationDuration": df_single_fixation.values,
                            "GazeDuration": df_gaze_duration.values,
                            "TotalTime": df_total_time.values,
                            "TokenNoFixationProbability": df_no_fixation_probability.values.reshape(37, ),
                            "TokenSingleFixationProbability": df_single_fixation_probability.values.reshape(37, ),
                            "TokenMultipleFixationProbability": df_multiple_fixation_probability.values.reshape(37, ),
                            "TokenFixationProbability": df_fixation_probability.values.reshape(37, ),
                            "Skill": df_metrics_skill.groupby(["Participant"])["SkillScore"].mean().values})
# get spearman correlation for metrics and skill level
df_combined.corrwith(df_combined["Skill"])

FirstFixationDuration              -0.125248
SingleFixationDuration             -0.126261
GazeDuration                       -0.147658
TotalTime                          -0.355490
TokenNoFixationProbability          0.134989
TokenSingleFixationProbability     -0.358763
TokenMultipleFixationProbability   -0.521773
TokenFixationProbability           -0.484857
Skill                               1.000000
dtype: float64

In [10]:
df_combined

Unnamed: 0,FirstFixationDuration,SingleFixationDuration,GazeDuration,TotalTime,TokenNoFixationProbability,TokenSingleFixationProbability,TokenMultipleFixationProbability,TokenFixationProbability,Skill
0,326.931769,323.38939,368.500693,690.261543,0.755473,0.141586,0.111086,0.252415,0.332799
1,314.006129,315.116805,348.919826,646.956938,0.711373,0.168476,0.121324,0.288627,0.381621
2,389.421699,390.4639,423.5242,780.283557,0.851402,0.179015,0.131822,0.317008,0.315012
3,320.08962,318.304065,344.742871,629.863462,0.788228,0.138839,0.077509,0.211772,0.426317
4,407.63699,400.311422,449.510906,783.451509,0.702468,0.185875,0.111608,0.297532,0.313899
5,324.361689,325.332447,337.821778,635.195144,0.719093,0.186016,0.147945,0.332927,0.318673
6,315.629899,314.516943,325.566804,610.13196,0.813849,0.137622,0.08437,0.220623,0.408083
7,373.162479,372.87589,390.279733,590.853231,0.819189,0.130122,0.07108,0.192865,0.350811
8,309.216488,309.613451,329.377227,677.987323,0.837094,0.157021,0.135047,0.28961,0.165306
9,327.892523,327.960447,355.498907,720.907264,0.802349,0.150001,0.101681,0.252993,0.309593


# AOI based analysis

### Read in the Generator for AOI Based Metrics to get The BoundingBoxes and Indices of each Token per Algorithm

In [11]:
# Get Token based AOIs
snippets = df_fixation["Algorithm"].unique()
df_aois = pd.DataFrame(columns=["Algorithm", "AOI", "AOIIdx", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/Generators_Labeled/Generators/{snippet}.json"
    try:
        image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    except:
        print(f"{snippet} failed")
        image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/", logging=True)
        break
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = []
    current_top = []
    current_right = []
    current_bottom = []
    current_aoi = []
    color = []
    for letter in aoi_list:
        # Close AOI
        if letter['letter'] in " \t\n":
            continue
        if len(letter["AOI"]) == 1:
            for idx in range(len(current_aoi)):
                aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))

            current_aoi = []
            color = []
            current_left = []
            current_top = []
            current_right = []
            current_bottom = []
            continue
        # There is no AOI set
        if len(current_aoi) == 0:
            current_aoi = []
            color = []
            current_left = []
            current_top = []
            current_right = []
            current_bottom = []
            for idx in range(1, len(letter["AOI"])):
                current_aoi.append(letter["AOI"][idx])
                current_left.append(letter["BoundingBox"][0])
                current_top.append(letter["BoundingBox"][1])
                current_right.append(letter["BoundingBox"][2])
                current_bottom.append(letter["BoundingBox"][3])
                color.append(letter["color"])
            continue


        for idx in reversed(range(len(current_aoi))):
            if current_aoi[idx] in letter["AOI"]:
                current_left[idx] = min(current_left[idx], letter["BoundingBox"][0])
                current_top[idx] = min(current_top[idx], letter["BoundingBox"][1])
                current_right[idx] = max(current_right[idx], letter["BoundingBox"][2])
                current_bottom[idx] = max(current_bottom[idx], letter["BoundingBox"][3])
                # remove the AOI from the letter
                letter["AOI"].remove(current_aoi[idx])
            else:
                aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))
                del current_aoi[idx]
                del current_left[idx]
                del current_top[idx]
                del current_right[idx]
                del current_bottom[idx]
                del color[idx]

        for idx in range(1, len(letter["AOI"])):
            current_aoi.append(letter["AOI"][idx])
            current_left.append(letter["BoundingBox"][0])
            current_top.append(letter["BoundingBox"][1])
            current_right.append(letter["BoundingBox"][2])
            current_bottom.append(letter["BoundingBox"][3])
            color.append(letter["color"])

    for idx in range(len(current_aoi)):
        aoi_clustered.append((len(aoi_clustered), current_aoi[idx], current_left[idx], current_top[idx], current_right[idx], current_bottom[idx], color[idx]))

    for token in aoi_clustered:
        df_aois.loc[len(df_aois)] = [snippet, token[1], token[0],
                                                 (token[2] + width_offset,
                                                  token[3] + height_offset,
                                                  token[4] + width_offset,
                                                  token[5] + height_offset)]

df_aois

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,AOI,AOIIdx,BoundingBox
0,IsPrime,method_identifier,0,"(993, 444, 1070, 460)"
1,IsPrime,method_signature,1,"(751, 444, 1202, 464)"
2,IsPrime,method_argument_declaration,2,"(1070, 444, 1202, 463)"
3,IsPrime,for_head,3,"(795, 469, 1147, 489)"
4,IsPrime,arithmetic_expression,4,"(883, 494, 994, 510)"
...,...,...,...,...
626,SiebDesEratosthenes,if_head,21,"(762, 643, 905, 664)"
627,SiebDesEratosthenes,if_body,22,"(762, 643, 960, 712)"
628,SiebDesEratosthenes,if_statement,23,"(762, 643, 960, 712)"
629,SiebDesEratosthenes,for_loop,24,"(718, 618, 1048, 737)"


### Check which Fixation of which Participant is in which AOI

In [24]:
df_aoi_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber",
                                                              "FixationDuration", "AOIIdx", "AOIName"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_aois_per_algo = df_aois[df_aois["Algorithm"] == snippet]

    for participant in tqdm(participants):

        df_fixation_participant = df_fixation[(df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        x_coordinates = eval(df_fixation_participant["Fixation_x"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        x_range = eval(df_fixation_participant["Fixation_x_range"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, x, y, x_range, y_range) in zip(idx_values, start_times, end_times, x_coordinates, y_coordinates, x_range, y_range):
            low_x = int(float(x) - math.ceil(float(x_range)))
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_x = int(float(x) + math.ceil(float(x_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [(x, y) for x in range(low_x, high_x + 1) for y in range(low_y, high_y + 1)]

            for idx, row in df_aois_per_algo.iterrows():
                aoi_idx = row["AOIIdx"]
                aoi_name = row["AOI"]
                bounding_box = row["BoundingBox"]

                for possible_x, possible_y in possible_coordinates:
                    if bounding_box[0] <= possible_x <= bounding_box[2] and bounding_box[1] <= possible_y <=bounding_box[3]:
                        df_aoi_fixation_per_participant.loc[len(df_aoi_fixation_per_participant)] = [snippet, participant, fix_idx ,end - start, aoi_idx, aoi_name]
                        break

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]


### Transform the Data to a Fixation/ Refixation split by Participant

In [25]:
df_token = df_token_aois.copy()
df_token = df_token.drop(["BoundingBox", "Token"], axis=1)
for participant in participants:
    df_token.loc[:, f"TokenFixation_P{participant}"] = [[] for _ in range(len(df_token))]
    df_token.loc[:, f"TokenReFixation_P{participant}"] = [[] for _ in range(len(df_token))]


prev_participant = df_token_fixation_per_participant["Participant"].iloc[0]
prev_token_idx = df_token_fixation_per_participant["TokenIdx"].iloc[0]
prev_algorithm = df_token_fixation_per_participant["Algorithm"].iloc[0]
fixations = []
re_fixation = False
for idx, row in tqdm(df_token_fixation_per_participant.iterrows(), total=len(df_token_fixation_per_participant)):
    participant = row["Participant"]
    token_idx = row["TokenIdx"]
    algorithm = row["Algorithm"]
    FixationDuration = row["FixationDuration"]

    # fixation switches
    if prev_participant != participant or prev_token_idx != token_idx:
        index = df_token[(df_token["TokenIdx"] == prev_token_idx) & (df_token["Algorithm"] == prev_algorithm)].index[0]
        if re_fixation:
            re_fixations = df_token.loc[index, f"TokenReFixation_P{prev_participant}"]
            re_fixations.extend(fixations.copy())
            df_token.loc[index, f"TokenReFixation_P{prev_participant}"] = re_fixations.copy()
        else:
            df_token.loc[index, f"TokenFixation_P{prev_participant}"] = fixations.copy()
        fixations = []
        # possible new fixation
        re_fixation = False

    sub_frame = df_token[(df_token["TokenIdx"] == token_idx) & (df_token["Algorithm"] == algorithm)]
    if len(sub_frame) == 0:
        raise Exception(f"No Token found for {token_idx} in Algorithm {algorithm}")
    len_of_fixation = len(sub_frame[f"TokenFixation_P{participant}"].iloc[0])
    if re_fixation == False and len_of_fixation > 0 and len(fixations) == 0:
        re_fixation = True

    fixations.append(FixationDuration)
    prev_participant = participant
    prev_token_idx = token_idx
    prev_algorithm = algorithm

  0%|          | 0/36150 [00:00<?, ?it/s]

### Calculate the AOI Based Eyetracking Metrics

In [15]:
# Melt the Dataframe to be thinner so that we have Algorithm TokenIdx and Participant and Keys per Fixation / Refixation
df_token_melted = pd.melt(df_token, id_vars=["Algorithm", "TokenIdx"], var_name="KindOfFixation", value_name="FixationDurations")

# Classify Participant and the Kind and Number of Fixations / Refixations
df_token_melted["Participant"] = df_token_melted["KindOfFixation"].apply(lambda x: int(x.split("_")[1][1:]))
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: x.split("_")[0])
df_token_melted["KindOfFixation"] = df_token_melted["KindOfFixation"].apply(lambda x: "Fixation" if x == "TokenFixation" else "ReFixation")
df_token_melted["NumberOfFixations"] = df_token_melted["FixationDurations"].apply(lambda x: len(x))

# Get the number of Participants for further calculations
number_of_participants = len(participants)

# Calculate the First Fixation Duration per Participant per Token
df_token_melted["FirstFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "FirstFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) > 0 else None)

# Calculate the Single Fixation Duration per Participant per Token
df_token_melted["SingleFixationDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "SingleFixationDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: x[0] if len(x) == 1 else None)

# Calculate the Gaze Duration per Participant per Token
df_token_melted["GazeDuration"] = None
df_token_melted.loc[df_token_melted["KindOfFixation"] == "Fixation" , "GazeDuration"] = df_token_melted["FixationDurations"]\
    .apply(lambda x: sum(x) if len(x) > 0 else None)

# Calculate the Total Time per Participant per Token
df_token_melted_total_time = df_token_melted.groupby(["Participant", "Algorithm", "TokenIdx"])\
    .agg({"FixationDurations": lambda x: sum(x.values.sum())})
df_token_melted_total_time = df_token_melted_total_time.rename(columns={"FixationDurations": "TotalTime"})

# Merge the Dataframes
df_token_melted = pd.merge(df_token_melted, df_token_melted_total_time, on=["Participant", "Algorithm", "TokenIdx"], how="left")

# Cast the Dataframes to the right datatype
df_token_melted["FirstFixationDuration"] = df_token_melted["FirstFixationDuration"].astype(float)
df_token_melted["SingleFixationDuration"] = df_token_melted["SingleFixationDuration"].astype(float)
df_token_melted["GazeDuration"] = df_token_melted["GazeDuration"].astype(float)
df_token_melted["TotalTime"] = df_token_melted["TotalTime"].astype(float).replace(0, np.nan)

# Read in the Skilllevel
df_skill = pd.read_csv(f"./data/filteredData/filtered_data.csv")
df_skill = df_skill[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# Merge the Dataframes to combine metrics with the Skilllevel
df_metrics_skill = pd.merge(df_token_melted, df_skill, on=["Participant"], how="left")

# Helper Methods for the Metrics (2. Define because of nearness of the code)
def get_no_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 0]
    return df_fixations


def get_single_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[~df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)]
    # remove every entry from df fixations on where Number Of Fixations is not 1
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] == 1]
    return df_fixations

def get_multiple_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    df_refixations = df[df["KindOfFixation"] == "ReFixation"]
    df_refixations = df_refixations[df_refixations["NumberOfFixations"] > 0]
    # remove every entry from df fixations on ["Algorithm", "TokenIdx"] where there is a refixation6
    df_fixations = df_fixations[(df_fixations["TokenIdx"].isin(df_refixations["TokenIdx"].values)) & (df_fixations["NumberOfFixations"] >= 1)]
    return df_fixations

def get_fixations(df):
    df_fixations = df[df["KindOfFixation"] == "Fixation"]
    # remove every entry from df fixations on where no Fixation is found
    df_fixations = df_fixations[df_fixations["NumberOfFixations"] >= 1]
    return df_fixations



# dataframe for number of fixations per participant
number_of_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])["NumberOfFixations"].count()
number_of_fixation_per_algorithm = number_of_fixation_per_algorithm.reset_index()

# dataframe for number of tokens per algorithm
number_of_tokens_per_algorithm = df_metrics_skill.groupby(["Algorithm"])["TokenIdx"].max()
number_of_tokens_per_algorithm = number_of_tokens_per_algorithm.reset_index()

# dataframe for number of tokens with no fixation per algorithm per participant
df_no_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_no_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with only one fixation per algorithm per participant
df_single_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_single_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more than one fixation per algorithm per participant
df_multiple_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_multiple_fixations)\
    .drop(["Algorithm"], axis=1)

# dataframe for number of tokens with more or equal than one fixation per algorithm per participant
df_fixation_per_algorithm = df_metrics_skill.groupby(["Participant", "Algorithm"])\
    .apply(get_fixations)\
    .drop(["Algorithm"], axis=1)

# Reformat the dataframes
no_fixation_per_algorithm = df_no_fixation_per_algorithm[["TokenIdx"]]
no_fixation_per_algorithm = no_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

single_fixation_per_algorithm = df_single_fixation_per_algorithm[["TokenIdx"]]
single_fixation_per_algorithm = single_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

multiple_fixation_per_algorithm = df_multiple_fixation_per_algorithm[["TokenIdx"]]
multiple_fixation_per_algorithm = multiple_fixation_per_algorithm.reset_index().drop(["level_2"], axis=1)

fixations_per_algorithm = df_fixation_per_algorithm[["TokenIdx"]]
fixations_per_algorithm = fixations_per_algorithm.reset_index().drop(["level_2"], axis=1)

# Helper Method for Probability Metrics
def group_len_divided_by_number(current_df, counting_df):
    algorithm = current_df["Algorithm"].iloc[0]
    number_of_tokens = counting_df[counting_df["Algorithm"] == algorithm]["TokenIdx"].iloc[0]
    value = len(current_df) / number_of_tokens
    return len(current_df) / (number_of_tokens + 1)

# Calculate the Metrics per Participant
# Probability of no fixation
df_no_fixation_probability = no_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_no_fixation_probability = df_no_fixation_probability.reset_index()

# Probability of single fixation
df_single_fixation_probability = single_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_single_fixation_probability = df_single_fixation_probability.reset_index()

# Probability of multiple fixation
df_multiple_fixation_probability = multiple_fixation_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_multiple_fixation_probability = df_multiple_fixation_probability.reset_index()

# Probability of fixation
df_fixation_probability = fixations_per_algorithm.groupby(["Participant", "Algorithm"]).apply(lambda df: group_len_divided_by_number(df, number_of_tokens_per_algorithm))
df_fixation_probability = df_fixation_probability.reset_index()

# Calculate the means for the metrics per algorithm
df_no_fixation_probability = df_no_fixation_probability.groupby(["Participant"]).mean()
df_single_fixation_probability = df_single_fixation_probability.groupby(["Participant"]).mean()
df_multiple_fixation_probability = df_multiple_fixation_probability.groupby(["Participant"]).mean()
df_fixation_probability = df_fixation_probability.groupby(["Participant"]).mean()

# Raw Durations Metrics
# Duration of first fixation
df_first_fixation = df_metrics_skill[~df_metrics_skill["FirstFixationDuration"].isnull()]
df_first_fixation = df_first_fixation.groupby(["Participant"])["FirstFixationDuration"].mean()

# Duration of single fixation
df_single_fixation = df_metrics_skill[~df_metrics_skill["SingleFixationDuration"].isnull()]
df_single_fixation = df_single_fixation.groupby(["Participant"])["SingleFixationDuration"].mean()

# Duration of gaze duration
df_gaze_duration = df_metrics_skill[~df_metrics_skill["GazeDuration"].isnull()]
df_gaze_duration = df_gaze_duration.groupby(["Participant"])["GazeDuration"].mean()

# Total Time
df_total_time = df_metrics_skill[~df_metrics_skill["TotalTime"].isnull()]
df_total_time = df_total_time.groupby(["Participant"])["TotalTime"].mean()

# Put every metric dataframe together into one
df_combined = pd.DataFrame({"FirstFixationDuration": df_first_fixation.values,
                            "SingleFixationDuration": df_single_fixation.values,
                            "GazeDuration": df_gaze_duration.values,
                            "TotalTime": df_total_time.values,
                            "TokenNoFixationProbability": df_no_fixation_probability.values.reshape(37, ),
                            "TokenSingleFixationProbability": df_single_fixation_probability.values.reshape(37, ),
                            "TokenMultipleFixationProbability": df_multiple_fixation_probability.values.reshape(37, ),
                            "TokenFixationProbability": df_fixation_probability.values.reshape(37, ),
                            "Skill": df_metrics_skill.groupby(["Participant"])["SkillScore"].mean().values})
# get spearman correlation for metrics and skill level
df_combined.corrwith(df_metrics_skill["SkillScore"])

FirstFixationDuration               2.489449e-16
SingleFixationDuration             -3.289924e-16
GazeDuration                       -4.530810e-16
TotalTime                           5.697798e-16
TokenNoFixationProbability          2.780766e-15
TokenSingleFixationProbability      2.107193e-16
TokenMultipleFixationProbability    2.159747e-16
TokenFixationProbability           -4.425315e-16
Skill                               6.151681e-16
dtype: float64

# Get the LOCs

In [16]:
# Get Bounding Boxes for Lines Of Code
snippets = df_fixation["Algorithm"].unique()
df_lines = pd.DataFrame(columns=["Algorithm", "Line", "BoundingBox"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/Generators_Labeled/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    height, width = image.size
    width_offset = int(1920 * 0.5) - int(height / 2)
    height_offset = int(1080 * 0.5) - int(width / 2)
    aoi_clustered = []
    current_left = None
    current_top = None
    current_right = None
    current_bottom = None
    current_line = 0
    for letter in aoi_list:
        if letter["letter"] == '\n':
            if current_left is not None:
                aoi_clustered.append((current_line, current_left, current_top, current_right, current_bottom))
            current_left = None
            current_top = None
            current_right = None
            current_bottom = None
            current_line += 1
            continue
        if current_left is None:
            current_left = letter["BoundingBox"][0]
            current_top = letter["BoundingBox"][1]
            current_right = letter["BoundingBox"][2]
            current_bottom = letter["BoundingBox"][3]
        else:
            current_left = min(current_left, letter["BoundingBox"][0])
            current_top = min(current_top, letter["BoundingBox"][1])
            current_right = max(current_right, letter["BoundingBox"][2])
            current_bottom = max(current_bottom, letter["BoundingBox"][3])

    for token in aoi_clustered:
        df_lines.loc[len(df_lines)] = [snippet, token[0],
                                       (token[1] + width_offset,
                                        token[2] + height_offset,
                                        token[3] + width_offset,
                                        token[4] + height_offset)]
df_lines

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Line,BoundingBox
0,IsPrime,0,"(768, 467, 1152, 482)"
1,IsPrime,1,"(768, 486, 1112, 501)"
2,IsPrime,2,"(768, 505, 1049, 519)"
3,IsPrime,3,"(768, 526, 1008, 539)"
4,IsPrime,4,"(768, 543, 880, 557)"
...,...,...,...
429,SiebDesEratosthenes,15,"(716, 638, 964, 653)"
430,SiebDesEratosthenes,16,"(716, 657, 828, 671)"
431,SiebDesEratosthenes,17,"(716, 676, 796, 690)"
432,SiebDesEratosthenes,18,"(716, 696, 900, 710)"


In [17]:
df_line_fixation_per_participant = pd.DataFrame([], columns=["Algorithm", "Participant", "FixationNumber", "FixationStart", "FixationEnd", "LineNumber"])
participants = df_fixation["Participant"].unique()
for snippet in tqdm(snippets):
    df_token_per_algo = df_lines[df_lines["Algorithm"] == snippet]

    for participant in participants:
        df_fixation_participant = df_fixation[(df_fixation["Algorithm"] == snippet) & (df_fixation["Participant"] == participant)]
        if len(df_fixation_participant) == 0:
            continue
        start_times = eval(df_fixation_participant["Fixation_startT"].values[0])
        end_times = eval(df_fixation_participant["Fixation_endT"].values[0])
        y_coordinates = eval(df_fixation_participant["Fixation_y"].values[0])
        y_range = eval(df_fixation_participant["Fixation_y_range"].values[0])
        idx_values = range(len(start_times))
        for (fix_idx, start, end, y, y_range) in zip(idx_values, start_times, end_times, y_coordinates, y_range):
            low_y = int(float(y) - math.ceil(float(y_range)))
            high_y = int(float(y) + math.ceil(float(y_range)))
            possible_coordinates = [y for y in range(low_y, high_y + 1)]

            found = False
            for idx, row in df_token_per_algo.iterrows():
                line_number = row["Line"]
                bounding_box = row["BoundingBox"]

                for possible_y in possible_coordinates:
                    if bounding_box[1] <= possible_y <= bounding_box[3]:
                        df_line_fixation_per_participant.loc[len(df_line_fixation_per_participant)] = [snippet, participant, fix_idx, start, end, line_number]
                        found = True
                        break
                if found:
                    break

df_line_fixation_per_participant

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,Participant,FixationNumber,FixationStart,FixationEnd,LineNumber
0,IsPrime,1,3,1136.027,1332.034,0
1,IsPrime,1,4,1356.034,1512.039,0
2,IsPrime,1,5,1536.038,1812.044,0
3,IsPrime,1,6,1840.047,2132.055,0
4,IsPrime,1,7,2156.055,2932.074,0
...,...,...,...,...,...,...
75295,SiebDesEratosthenes,71,114,32113.081,32401.090,10
75296,SiebDesEratosthenes,71,116,32645.097,32817.102,13
75297,SiebDesEratosthenes,71,125,34709.168,34881.169,15
75298,SiebDesEratosthenes,71,126,34893.169,35049.177,18


In [18]:
# Calculate the LOCs
snippets = df_fixation["Algorithm"].unique()
df_snippet_length = pd.DataFrame(columns=["Algorithm", "LOC"])
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/Generators_Labeled/Generators/{snippet}_ast.json"
    with open(aoi_token_generator) as f:
        aoi_list = json.load(f)
        data = aoi_list["source-code"]
        LOC = len(data)
        df_snippet_length.loc[len(df_snippet_length)] = [snippet, LOC]
df_snippet_length

  0%|          | 0/32 [00:00<?, ?it/s]

Unnamed: 0,Algorithm,LOC
0,IsPrime,8
1,IsAnagram,28
2,RemoveDoubleChar,19
3,BinToDecimal,9
4,PermuteString,23
5,Power,7
6,BinarySearch,15
7,ContainsSubstring,21
8,ReverseArray,7
9,SumArray,9


In [19]:
# read in Behavioural and Skills data
df_behavioral = pd.read_csv('./data/filteredData/fixation_stats.csv', sep=";")
df_behavioral = df_behavioral[df_behavioral["IsOutlier"] == False]
df_behavioral = df_behavioral[["Participant", "Algorithm", "Duration", "SkillScore"]]
df_behavioral

Unnamed: 0,Participant,Algorithm,Duration,SkillScore
0,1,IsPrime,12.390280,0.332799
2,1,IsAnagram,109.615724,0.332799
3,1,RemoveDoubleChar,53.456276,0.332799
4,1,BinToDecimal,49.922091,0.332799
5,1,PermuteString,109.888549,0.332799
...,...,...,...,...
1066,71,GreatestCommonDivisor,30.757360,0.437218
1067,71,DumpSorting,113.368945,0.437218
1068,71,BinomialCoefficient,50.637861,0.437218
1069,71,IsAnagram,110.995754,0.437218


In [20]:
# merge dataframes together to access every possible combination of participant, algorithm and duration,skillscore
df_combined = pd.merge(df_line_fixation_per_participant, df_snippet_length, on=["Algorithm"])
df_combined = pd.merge(df_combined, df_behavioral, on=["Participant", "Algorithm"])

# transform the data to seconds
df_combined["FixationStart"] = df_combined["FixationStart"] / 1000.0
df_combined["FixationEnd"] = df_combined["FixationEnd"] / 1000.0

# Helper function to calculate the coverage of LOC after 'percentage' percent from the whole duration
def loc_coverage_after_time_percentage(df, percentage):
    end_duration = df["Duration"].iloc[0]
    loc = df["LOC"].iloc[0]
    max_duration = end_duration * percentage
    df_filtered = df[df["FixationEnd"] <= max_duration]
    unique_loc = df_filtered["LineNumber"].nunique()
    return unique_loc / loc

# calculate LOC coverage after a certain percentage of the duration
df_10 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.1))
df_20 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.2))
df_30 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.3))
df_40 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.4))
df_50 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.5))
df_60 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.6))
df_70 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.7))
df_80 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.8))
df_90 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 0.9))
df_100 = df_combined.groupby(["Algorithm", "Participant"]).apply(lambda df : loc_coverage_after_time_percentage(df, 1.))

df_10 = df_10.reset_index()
df_20 = df_20.reset_index()
df_30 = df_30.reset_index()
df_40 = df_40.reset_index()
df_50 = df_50.reset_index()
df_60 = df_60.reset_index()
df_70 = df_70.reset_index()
df_80 = df_80.reset_index()
df_90 = df_90.reset_index()
df_100 = df_100.reset_index()


df_10 = df_10.groupby(["Participant"]).mean().values.reshape(37, )
df_20 = df_20.groupby(["Participant"]).mean().values.reshape(37, )
df_30 = df_30.groupby(["Participant"]).mean().values.reshape(37, )
df_40 = df_40.groupby(["Participant"]).mean().values.reshape(37, )
df_50 = df_50.groupby(["Participant"]).mean().values.reshape(37, )
df_60 = df_60.groupby(["Participant"]).mean().values.reshape(37, )
df_70 = df_70.groupby(["Participant"]).mean().values.reshape(37, )
df_80 = df_80.groupby(["Participant"]).mean().values.reshape(37, )
df_90 = df_90.groupby(["Participant"]).mean().values.reshape(37, )
df_100 = df_100.groupby(["Participant"]).mean().values.reshape(37, )

# extract the raw skillscore per participant
df_skill = df_behavioral[["Participant", "SkillScore"]]
df_skill = df_skill.drop_duplicates()

# create a dataframe with all the LOC coverage and Participant
df_code_coverage = pd.DataFrame({"Participant": participants,
                                 "10%" : df_10,
                                 "20%" : df_20,
                                 "30%" : df_30,
                                 "40%" : df_40,
                                 "50%" : df_50,
                                 "60%" : df_60,
                                 "70%" : df_70,
                                 "80%" : df_80,
                                 "90%" : df_90,
                                 "100%" : df_100})

df_code_coverage.set_index("Participant", inplace=True, drop=True)

# Merge LOC coverage with skillscore with correlations
df_code_coverage = pd.merge(df_code_coverage, df_skill, on=["Participant"])
df_code_coverage.set_index("Participant", inplace=True, drop=True)
df_code_coverage

Unnamed: 0_level_0,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,SkillScore
Participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0.106241,0.246855,0.346731,0.425863,0.489672,0.540275,0.580584,0.612115,0.667838,0.699548,0.332799
2,0.145467,0.251232,0.35162,0.418872,0.464605,0.498878,0.556996,0.597462,0.649594,0.684737,0.381621
3,0.290797,0.415433,0.518259,0.638263,0.71783,0.764809,0.807005,0.813672,0.824958,0.845963,0.315012
4,0.081401,0.245859,0.331361,0.425967,0.498826,0.552694,0.59222,0.63411,0.664308,0.710296,0.426317
5,0.204225,0.301503,0.396022,0.476784,0.545772,0.640267,0.679653,0.704485,0.735941,0.7805,0.313899
6,0.121506,0.256986,0.379679,0.479012,0.576007,0.640235,0.690551,0.749088,0.800574,0.816752,0.318673
7,0.103185,0.249724,0.381651,0.449579,0.487177,0.523057,0.556342,0.594654,0.625733,0.644708,0.408083
10,0.076049,0.152989,0.243787,0.323574,0.405975,0.446269,0.498451,0.551098,0.587737,0.615488,0.350811
11,0.14004,0.307725,0.44332,0.529157,0.599477,0.646496,0.708409,0.738717,0.765619,0.826433,0.165306
12,0.048213,0.175058,0.275616,0.367022,0.459118,0.511863,0.575108,0.638642,0.664823,0.719213,0.309593


In [21]:
# describe the LOC coverage
df_code_coverage.describe()

Unnamed: 0,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%,SkillScore
count,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0
mean,0.108211,0.216642,0.308442,0.388187,0.460408,0.511375,0.561771,0.603061,0.643536,0.690561,0.325261
std,0.093715,0.113918,0.125393,0.128693,0.123185,0.118993,0.114672,0.109949,0.106552,0.101566,0.102608
min,0.0,0.044056,0.118774,0.193105,0.273844,0.327964,0.380924,0.394535,0.394535,0.410324,0.144164
25%,0.038878,0.12255,0.216412,0.28932,0.36559,0.421855,0.47012,0.524087,0.576259,0.643385,0.262502
50%,0.096695,0.225601,0.320445,0.387982,0.459118,0.51161,0.556996,0.597462,0.649594,0.687911,0.317626
75%,0.142011,0.256986,0.356613,0.449579,0.522202,0.573625,0.636023,0.67023,0.711342,0.77903,0.381621
max,0.418425,0.552526,0.687982,0.765678,0.800206,0.808313,0.824526,0.846036,0.867382,0.888266,0.659556


In [23]:
# spearman correlation for LOC coverage and skillscore
df_code_coverage.corrwith(df_code_coverage["SkillScore"])

10%          -0.277507
20%          -0.288134
30%          -0.310443
40%          -0.289692
50%          -0.320991
60%          -0.326301
70%          -0.350759
80%          -0.358753
90%          -0.340106
100%         -0.376314
SkillScore    1.000000
dtype: float64

In [None]:
from matplotlib.patches import Circle
import matplotlib.pyplot as plt


# check if folder exists and create it if not
if not os.path.exists("./data/RQ1/plots/scanpaths"):
    os.makedirs("./data/RQ1/plots/scanpaths")

if not os.path.exists("./data/RQ1/plots/vertical_scanpaths"):
    os.makedirs("./data/RQ1/plots/vertical_scanpaths")

def clamp(low, value, high):
    return max(low, min(value, high))

snippets = df_fixation["Algorithm"].unique()
# iterate over all algorithms
for snippet in tqdm(snippets):
    aoi_token_generator = f"./../CodeSnippets/aois/Generators/{snippet}_ast.json"
    image, aoi_list = gsl.create_image(aoi_token_generator, font_path="./../CodeSnippets/fonts/ttf/")
    width, height = image.size
    x_low = int(1920 * 0.5) - int(width / 2)
    y_low = int(1080 * 0.5) - int(height / 2)
    x_high = int(1920 * 0.5) + int(width / 2)
    y_high = int(1080 * 0.5) + int(height / 2)
    df_algo = df_fixation[df_fixation["Algorithm"] == snippet]
    df_algo = df_algo.sort_values(by=["SkillScore"])
    participants = df_algo["Participant"].unique()
    # iterate over all participants
    for participant in participants:
        if len(df_algo[df_algo["Participant"] == participant]) != 1:
            display(df_algo[df_algo["Participant"] == participant])
            raise ValueError("Participant has more than not exactly one entry")
        df_algo_part = df_algo[df_algo["Participant"] == participant].reset_index().iloc[0]
        skillscore = df_algo_part["SkillScore"]
        current_image = image.copy()
        fixation_start_array = np.array(eval(df_algo_part["Fixation_startT"]))
        fixation_end_array = np.array(eval(df_algo_part["Fixation_endT"]))
        fixation_x_coordinates = np.array(eval(df_algo_part["Fixation_x"]))
        fixation_y_coordinates = np.array(eval(df_algo_part["Fixation_y"]))
        fixations = np.stack((fixation_start_array, fixation_end_array, fixation_x_coordinates, fixation_y_coordinates), axis=1)

        cm = plt.cm.get_cmap('inferno')
        cm = plt.cm.ScalarMappable(cmap=cm, norm=plt.Normalize(vmin=0, vmax=len(fixations)))
        patches = []

        # total scanpath
        for idx, (start, end, x, y) in enumerate(fixations):
            if x_low <= x <= x_high and y_low <= y <= y_high:
                x = int(x)
                y = int(y)
                x = x - x_low
                y = y - y_low
                color = cm.to_rgba(idx)
                patches.append(Circle((x, y), radius=10, color=color, alpha=0.5))

        fig, ax = plt.subplots(1)
        ax.imshow(current_image)
        for p in patches:
            ax.add_patch(p)

        for idx in range(len(fixations) - 1):
            start_x = fixations[idx, 2] - x_low
            start_y = fixations[idx, 3] - y_low
            end_x = fixations[idx+1, 2] - x_low
            end_y = fixations[idx+1, 3] - y_low
            if 0 <= start_x <= width and 0 <= start_y <= height and 0 <= end_x <= width and 0 <= end_y <= height:
                color = cm.to_rgba(idx)
                ax.plot([start_x, end_x], [start_y, end_y], color=color, alpha=0.3)

        plt.xlim(0, width)
        plt.ylim(height, 0)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(f"./data/RQ1/plots/scanpaths/scanpath_{snippet}_{skillscore}_{participant}.pdf")
        plt.close()

        # vertical scanpath
        vertical_points_and_time = []
        for idx, (start, end, x, y) in enumerate(fixations):
            if x_low <= x <= x_high and y_low <= y <= y_high:
                y = int(y) - y_low
                duration = (end - start) / 1000
                vertical_points_and_time.append((y, duration))

        pixel_per_second = 3
        current_x = 0
        if len(vertical_points_and_time) == 0:
            continue
        line_points = [(current_x, vertical_points_and_time[0][0])]
        current_x += vertical_points_and_time[0][1] * pixel_per_second
        line_points.append((current_x, vertical_points_and_time[0][0]))
        for idx, (y, duration) in enumerate(vertical_points_and_time[1:]):
            line_points.append((current_x, y))
            current_x += duration * pixel_per_second
            line_points.append((current_x, y))

        max_x = max([x for x, y in line_points])
        line_points = [(x - max_x, y) for x, y in line_points]
        cm = plt.cm.get_cmap('inferno')
        cm = plt.cm.ScalarMappable(cmap=cm, norm=plt.Normalize(vmin=0, vmax=len(line_points)))
        fig, ax = plt.subplots(1)
        ax.imshow(current_image)
        for idx in range(len(line_points) - 1):
            start_x = line_points[idx][0]
            start_y = line_points[idx][1]
            end_x = line_points[idx+1][0]
            end_y = line_points[idx+1][1]
            color = cm.to_rgba(idx)
            ax.plot([start_x, end_x], [start_y, end_y], color=color)
        plt.xlim(-max_x, width)
        plt.ylim(height, 0)
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(f"./data/RQ1/plots/vertical_scanpaths/scanpath_vertical_{snippet}_{skillscore}_{participant}.pdf", dpi=200)
        plt.close()