# Main Analysis

## Imports

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import pingouin as pg
import scipy.stats as stats
import seaborn as sns
import statsmodels.formula.api as smf
from pandas import DataFrame
from pandas.api.types import CategoricalDtype
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

## Variable Definitions and Constants

Here I declare all necessary variables and constants. This includes the snippet names as well as the correct answers for each snippet. I further add the possible variations.

### Constants

In [None]:
# Expected answer for every snippet, keyed by the snippet's base name (no variation suffix).
# The answers are stored as strings and are compared for exact equality with the
# participants' `Answer_Out` values further down in this notebook.
snippet_answer: dict[str, str] = {
    'arrayAverage': '4.0',
    'binarySearch': '1',
    'binaryToDecimal': '13',
    'bubbleSort': '[1,2,3,4,5]',
    'capitalizeFirstLetter': 'Hello World',
    'commonChars': '2',
    'containsSubstring': 'True',
    'countIntegerInterval': '4',
    'countLetters': '4',
    'crossSum': '16',
    'factorial': '24',
    'forwardBackward': 'pricelesssselecirp',
    'leastCommonMultiple': '30',
    'linearSearch': '1',
    'palindrome': 'True',
    'power': '8',
    'prime': 'True',
    'squareRoot': '[3.0, 5.0, 4.0, 10.0]',
    'unrolledSort': '[8, 9, 11, 12]',
    'validParentheses': 'False',
    'WarmUp': 'oefl',
}

# Two-letter variation suffixes per experimental group.
# NOTE(review): presumably first letter M/L = meaningful/obfuscated identifiers and
# second letter T/N = with/without type annotations — confirm against the study design.
possible_variations: dict[str, list[str]] = {
    'group_meaningful': ['MT', 'MN'],
    'group_obfuscated': ['LT', 'LN'],
}

# Column names of the General Information data. The four lists below are concatenated
# (together with `necessary_columns`) to form the columns of the personal-information DataFrame.

# Questions about the participant's studies
studentQuestions: list[str] = [
    'StudyBefore',
    'Job',
    'CourseOfStudy',
    'Semester',
    'Algorithms',
    'NrFalseInputs_studentQuestions',
]
# Questions about the participant's programming experience
progQuestions: list[str] = [
    'YearsProgramming',
    'ProgrammingLately',
    'ProgrammingLanguages',
    'RecentProgrammingLanguages',
    'PythonProgramming',
    'OverallExperience',
    'Classmates',
    'NrFalseInputs_progQuestions',
]
# General demographic questions
generalQuestions: list[str] = [
    'Age',
    'Gender',
    'Eyesight',
    'NrFalseInputs_generalQuestions',
]
# Screen geometry and subject identifier stored alongside the questionnaire answers
miscellaneous_general_information: list[str] = [
    'ActualScreenWidth',
    'ActualScreenHeight',
    'EyeXScreenWidth',
    'EyeXScreenHeight',
    'SubjectID',
]

# Column names of the Correctness and Time data (one row per participant and task)
resultsOverall: list[str] = [
    'Number',
    'Task',
    'Answer_Out',
    'Time',
    'TimeOut',
    'SubjectID',
    'CorrectAnswer',
    'Meaningful',
    'TypeAnnotation',
]

# Key column shared by all DataFrames: the participant ID
necessary_columns: list[str] = [
    'ID',
]

# Per-participant bookkeeping columns (which files exist, group assignment, completion state)
meta_data_columns: list[str] = [
    'GazeData',
    'GazeDataFilled',
    'PersonalInformation',
    'ResultsOverall',
    'PostQuestionnaire',
    'DifficultyRating',
    'InterviewData',
    'OverallTime',
    'Meaningful',
    'Finished',
    'NumberOfMissingSnippets',
    'TrialData',
]

# Numeric encoding (0-4) of the five experience labels.
# NOTE(review): the middle label here is 'Neutral', whereas `experience_type` below uses
# 'Average' — confirm which label the questionnaire data actually contains.
likert_mapping: dict[str, int] = {
    'Very Inexperienced': 0,
    'Inexperienced': 1,
    'Neutral': 2,
    'Experienced': 3,
    'Very Experienced': 4,
}

# presumably the output directory for generated figures — TODO confirm where it is used
figure_path: str = './figures/'

# Ordered categorical dtypes: difficulty ratings 1..5 and the five experience labels
# (ordered from 'Very Inexperienced' to 'Very Experienced').
difficulty_type = CategoricalDtype(categories=[1, 2, 3, 4, 5], ordered=True)
experience_type = CategoricalDtype(
    categories=['Very Inexperienced', 'Inexperienced', 'Average', 'Experienced', 'Very Experienced'], ordered=True)

### Global Definitions

In [None]:
# Module-level containers that the following cells fill step by step.
participants: list[str] = []  # list of participant numbers
meaningful_participants: list[str] = []  # list of participant numbers who were assigned to the meaningful group
obfuscated_participants: list[str] = []  # list of participant numbers who were assigned to the obfuscated group
# One row per participant: questionnaire answers plus screen information
df_personal_information: DataFrame = pd.DataFrame(
    columns=necessary_columns + generalQuestions + studentQuestions + progQuestions + miscellaneous_general_information)
# One row per participant: which data files exist and derived bookkeeping flags
df_meta_data: DataFrame = pd.DataFrame(columns=necessary_columns + meta_data_columns)
# One row per participant and task: the subjective difficulty rating
df_difficulty_rating: DataFrame = pd.DataFrame(columns=['ID', 'Task', 'Difficulty', 'Comment'])
# Post-questionnaire answers; the columns are taken from the files that are read in later
df_subjective_feelings: DataFrame = pd.DataFrame()

### Generally helpful Functions

In [None]:
def define_box_properties(plot_name, color_code, label, axis=None):
    """Colour every artist group of a plot and register a legend entry for it.

    Args:
        plot_name: Mapping from a part name to the artists of that part; every
            group is recoloured via ``plt.setp``.
        color_code: Colour applied to all artists and to the legend handle.
        label: Text of the legend entry.
        axis: Axes that receives the legend entry; the current pyplot figure
            is used when omitted.
    """
    for part in plot_name:
        plt.setp(plot_name.get(part), color=color_code)

    # An empty line plot is the cheapest way to obtain a legend handle in the right colour.
    target = plt if axis is None else axis
    target.plot([], c=color_code, label=label)
    target.legend()


def calculate_pvalues(df):
    """Return the matrix of pairwise Pearson p-values for the columns of *df*.

    For every pair of columns only the rows in which both values are present
    are used. The p-values are rounded to four decimals.

    Args:
        df: DataFrame whose columns are numeric.

    Returns:
        A square object-dtype DataFrame indexed and labelled by ``df.columns``.
        A cell stays NaN when fewer than two complete observations exist for
        the pair, because a correlation is undefined in that case.
    """
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=object)
    for r in df.columns:
        for c in df.columns:
            tmp = df[df[r].notnull() & df[c].notnull()]
            if len(tmp) < 2:
                # pearsonr needs at least two pairs; leave the cell as NaN
                continue
            # A single .loc write replaces the chained `pvalues[r][c] = ...`,
            # which does not update the frame under pandas copy-on-write.
            pvalues.loc[c, r] = round(stats.pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues


def save_trial_start_times(file_path):
    """Write the start time of every trial of a gaze recording next to that file.

    The semicolon-separated gaze file is read, fixation-cross samples are
    discarded and the first remaining sample of each ``TrialNumber`` is kept.
    The result is written to ``<name>_start_times.csv`` in the same directory
    with the columns ``Sequence``, ``ID``, ``Snippet``, ``StartingTime`` and
    ``SnippetPath``.

    Args:
        file_path: Path to the gaze CSV; it must contain the columns
            ``TrialNumber``, ``Snippet`` and ``Time``.
    """
    # Load with semicolon separator
    df = pd.read_csv(file_path, sep=";")

    # Remove fixation cross entries. `na=False` treats rows without a snippet
    # name as non-matching instead of producing NaN in the mask, and `.copy()`
    # gives an independent frame so the column assignment below is well-defined.
    is_fixation_cross = df["Snippet"].str.contains("FixationCross", na=False)
    stimulus_df = df[~is_fixation_cross].copy()

    # Add SnippetPath column similar to Snippet but with .jpg ending
    stimulus_df["SnippetPath"] = stimulus_df["Snippet"] + ".jpg"

    # Get first entry for each trial
    first_times = (
        stimulus_df.groupby("TrialNumber")
        .first()
        .reset_index()[["TrialNumber", "Snippet", "Time", "SnippetPath"]]
    )

    # Auto-number sequence
    first_times.insert(0, "Sequence", range(len(first_times)))

    # Clean the time stamp: replace the decimal comma with a dot, then truncate to int
    first_times["Time"] = (
        first_times["Time"]
        .astype(str)
        .str.replace(",", ".", regex=False)
        .astype(float)
        .astype(int)
    )

    # Rename columns
    first_times = first_times.rename(columns={"TrialNumber": "ID", "Time": "StartingTime"})
    if first_times.empty:
        print(len(first_times), "start times found in", file_path)

    # Save to CSV
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    parent_dir = os.path.dirname(file_path)
    output_path = os.path.join(parent_dir, f"{base_name}_start_times.csv")
    first_times.to_csv(output_path, index=False, sep=";")


## Read in all data

This includes:
- [x] ~~Gaze Data~~
- [x] ~~Individual Summary~~
- [x] ~~Personal Information~~
- [x] ~~Correctness and Time~~
- [x] ~~Interview Data~~

First, we need to find all participant folders and files. Each participant has its own sub-folder inside `./data`.

### All Participants

In [None]:
# Every direct sub-directory of ./data is one participant; the folder name is the participant ID.
participants: list[str] = sorted(next(os.walk('./data'))[1])

# One meta-data row per participant
df_meta_data['ID'] = participants

### Check for all files

Collect all files that are present and collect which are still missing.

In [None]:
# For every participant, record in df_meta_data which of the expected files and folders exist.
for participant in participants:
    participant_dir = f'./data/{participant}'
    trial_dir = f'{participant_dir}/Trial_{participant}'
    is_current = df_meta_data['ID'] == participant

    # '`ID` Post-Questionnaire.csv' in the participant folder
    df_meta_data.loc[is_current, 'PostQuestionnaire'] = os.path.isfile(
        f'{participant_dir}/{participant} Post-Questionnaire.csv')

    # 'DifficultyRating_`ID`.csv' in the participant folder
    df_meta_data.loc[is_current, 'DifficultyRating'] = os.path.isfile(
        f'{participant_dir}/DifficultyRating_{participant}.csv')

    # the folder 'Trial_`ID`'
    df_meta_data.loc[is_current, 'TrialData'] = os.path.isdir(trial_dir)

    # 'GeneralInfo_`ID`.csv' inside the trial folder
    df_meta_data.loc[is_current, 'PersonalInformation'] = os.path.isfile(
        f'{trial_dir}/GeneralInfo_{participant}.csv')

    # 'ResultsOverall_`ID`.csv' inside the trial folder
    df_meta_data.loc[is_current, 'ResultsOverall'] = os.path.isfile(
        f'{trial_dir}/ResultsOverall_{participant}.csv')

    # 'GazeData_`ID`.csv' inside the trial folder; when present, also export the trial start times
    gaze_file = f'{trial_dir}/GazeData_{participant}.csv'
    has_gaze_data = os.path.isfile(gaze_file)
    df_meta_data.loc[is_current, 'GazeData'] = has_gaze_data
    if has_gaze_data:
        save_trial_start_times(gaze_file)

Print all Meta Information about Files concerning the study itself.

In [None]:
# Print, for every kind of file, how many participants have it.
# `.get(True, 0)` is used instead of `[True]` because `value_counts()` has no `True`
# entry when not a single file of that kind was found, which would raise a KeyError.
gaze_data_count = df_meta_data['GazeData'].value_counts()
print('{:<32} {:2}/{:>2}'.format('GazeDataFiles found:', gaze_data_count.get(True, 0),
                                 len(df_meta_data['GazeData'])))

personal_information_count = df_meta_data['PersonalInformation'].value_counts()
print('{:<32} {:2}/{:>2}'.format('PersonalInformationFiles found:', personal_information_count.get(True, 0),
                                 len(df_meta_data['PersonalInformation'])))

results_overall_count = df_meta_data['ResultsOverall'].value_counts()
print('{:<32} {:2}/{:>2}'.format('ResultsOverallFiles found:', results_overall_count.get(True, 0),
                                 len(df_meta_data['ResultsOverall'])))

post_questionnaire_count = df_meta_data['PostQuestionnaire'].value_counts()
print('{:<32} {:2}/{:>2}'.format('PostQuestionnaireFiles found:', post_questionnaire_count.get(True, 0),
                                 len(df_meta_data['PostQuestionnaire'])))

difficulty_rating_count = df_meta_data['DifficultyRating'].value_counts()
print('{:<32} {:2}/{:>2}'.format('DifficultyRatingFiles found:', difficulty_rating_count.get(True, 0),
                                 len(df_meta_data['DifficultyRating'])))

trial_data_count = df_meta_data['TrialData'].value_counts()
print('{:<32} {:2}/{:>2}'.format('TrialDataFolders found:', trial_data_count.get(True, 0),
                                 len(df_meta_data['TrialData'])))

# print which participants are missing which kind of data
for meta_column in ('GazeData', 'PersonalInformation', 'ResultsOverall',
                    'PostQuestionnaire', 'DifficultyRating', 'TrialData'):
    print('{:56} {}'.format(f'The following participants have no {meta_column}:',
                            list(df_meta_data.loc[df_meta_data[meta_column] == False, 'ID'].values)))

Clean the `df_meta_data` DataFrame of columns that are no longer needed.

In [None]:
# Existence flags that are only needed for the report printed before this cell
unnecessary_columns_meta_data: list[str] = [
    'PersonalInformation',
    'ResultsOverall',
    'PostQuestionnaire',
    'DifficultyRating',
    'TrialData',
]

# remove unnecessary columns from df_meta_data;
# errors='ignore' keeps the cell re-runnable (columns that are already gone are skipped)
# without hiding unrelated errors the way a bare `except: pass` would
df_meta_data.drop(columns=unnecessary_columns_meta_data, inplace=True, errors='ignore')

### Personal Information

I read in all data from the files containing the personal information (`GeneralInfo_{ID}`) and add the `ID` for the participant.

In [None]:
# Read the first row of every participant's GeneralInfo file into df_personal_information
# and note in df_meta_data whether that worked.
for i, participant in enumerate(participants):
    general_info_path = f'./data/{participant}/Trial_{participant}/GeneralInfo_{participant}.csv'
    is_current = df_meta_data['ID'] == participant
    try:
        df_personal_information.loc[i] = pd.read_csv(general_info_path, sep=';').iloc[0]
        df_personal_information.loc[i, 'ID'] = participant
        df_meta_data.loc[is_current, 'PersonalInformation'] = True

    except FileNotFoundError:
        print(f'Participant {participant} has no GeneralInfo.csv file')
        df_meta_data.loc[is_current, 'PersonalInformation'] = False

    except Exception as exc:
        # The file exists but could not be used (empty, malformed, unexpected columns ...).
        # Keep the best-effort behaviour, but say what actually went wrong.
        print(f'Participant {participant}: GeneralInfo.csv could not be loaded ({type(exc).__name__}: {exc})')
        df_meta_data.loc[is_current, 'PersonalInformation'] = False

# write personal_information into a csv
df_personal_information.to_csv('./studies/Linearity/GeneralInfo_AllParticipants.csv', sep=';', index=False)

### Difficulty Rating

Read in all difficulty ratings so that they can later be merged into the results DataFrame.

In [None]:
# Collect the difficulty ratings of all participants and append them in one go.
difficulty_frames: list[DataFrame] = []
for participant in participants:
    rating_path = f'./data/{participant}/DifficultyRating_{participant}.csv'
    try:
        df_to_add = pd.read_csv(rating_path, sep=';')
        unknown_tasks = [x for x in df_to_add["Task"] if x not in snippet_answer]
    except FileNotFoundError:
        # participants without a rating file are simply skipped
        continue
    except Exception as exc:
        # an existing but unusable file must not go unnoticed
        print(f'Participant {participant}: DifficultyRating file could not be loaded ({type(exc).__name__}: {exc})')
        continue

    if unknown_tasks:
        print(
            f'Participant {participant} has a problem in the DifficultyRating file; these names {unknown_tasks} are not in the snippet_answer dictionary')

    df_to_add['ID'] = participant
    difficulty_frames.append(df_to_add)

if difficulty_frames:
    df_difficulty_rating = pd.concat([df_difficulty_rating, *difficulty_frames], ignore_index=True)

# ratings become an ordered categorical (1 < 2 < ... < 5)
df_difficulty_rating['Difficulty'] = df_difficulty_rating['Difficulty'].astype(difficulty_type)

# df_difficulty_rating

### Subjective Feelings

Read in all subjective feelings so that they can later be merged into the personal information DataFrame.

In [None]:
# Collect the post-questionnaire answers of all participants and append them in one go.
subjective_frames: list[DataFrame] = []
for participant in participants:
    questionnaire_path = f'./data/{participant}/{participant} Post-Questionnaire.csv'
    try:
        df_to_add = pd.read_csv(questionnaire_path, sep=';')
    except FileNotFoundError:
        # participants without a post-questionnaire are simply skipped
        continue
    except Exception as exc:
        # an existing but unusable file must not go unnoticed
        print(f'Participant {participant}: Post-Questionnaire file could not be loaded ({type(exc).__name__}: {exc})')
        continue

    df_to_add['ID'] = participant
    subjective_frames.append(df_to_add)

if subjective_frames:
    df_subjective_feelings = pd.concat([df_subjective_feelings, *subjective_frames], ignore_index=True)

#### Data Cleaning for Personal Information

We can drop all columns that contain the number of false Inputs.

In [None]:
# Counters of invalid inputs per questionnaire block; not needed for the analysis
unnecessary_columns_gen_info: list[str] = [
    'NrFalseInputs_studentQuestions',
    'NrFalseInputs_progQuestions',
    'NrFalseInputs_generalQuestions',
]

# remove unnecessary columns;
# errors='ignore' keeps the cell re-runnable (columns that are already gone are skipped)
# without hiding unrelated errors the way a bare `except: pass` would
df_personal_information.drop(columns=unnecessary_columns_gen_info, inplace=True, errors='ignore')

# remove participants that are not currently in their bachelor but also mention them
#print(df_personal_information.query('Job != "Undergraduate Student (Bachelor Studies)"')['ID'])
#df_personal_information.drop(df_personal_information[df_personal_information['Job'] != 'Undergraduate Student (Bachelor Studies)'].index, inplace=True)

# Self-assessment columns: first cast to the ordered experience dtype, then replace the labels by 1..5
category_mapping = {'Very Inexperienced': 1, 'Inexperienced': 2, 'Average': 3, 'Experienced': 4, 'Very Experienced': 5}
for experience_column in ('PythonProgramming', 'OverallExperience', 'Classmates'):
    df_personal_information[experience_column] = (
        df_personal_information[experience_column].astype(experience_type).map(category_mapping)
    )

df_personal_information

#### Subjective Difficulty Merging

Merging the subjective feelings into the personal information 

In [None]:
# attach the subjective feelings to each participant's personal information (all participants are kept)
df_personal_information = df_personal_information.merge(df_subjective_feelings, how='left', on=['ID'])


### Cleaning of the personal information

1. Correct and throw out any person that is not in CS-related courses of study
2. Throw out anyone under the age of 18

In [None]:
# Maps the free-text course-of-study answers to one canonical English name each.
course_of_study_mapping: dict[str, str] = {
    'cs': 'Computer Science',
    'CS': 'Computer Science',
    'BCs. Informatik': 'Computer Science',
    'Cybersicherheit': 'Cybersecurity',
    'DSAI': 'Data Science and Artificial Intelligence',
    'B. Sc. Business Informatics': 'Business Informatics',
    'Informatik B.Sc': 'Computer Science',
    'Eingebettete Systeme': 'Embedded Systems',
    'Informatik': 'Computer Science',
    'Math and Computer Science': 'Mathematics and Computer Science',
    'Computer Science (German)': 'Computer Science',
    'Mathematik und Informatik': 'Mathematics and Computer Science',
    'Medieninformatik': 'Media Informatics',
    'medieninfo': 'Media Informatics',
    'Informatics': 'Computer Science',
    'Informatik Kernbereich': 'Computer Science',
    'Eingebettete Systeme B.Sc.': 'Embedded Systems',
    'Computerlinguistik B.Sc.': 'Computational Linguistics',
    'materialise science': 'Material Science'
}

# Canonical course names that are accepted; participants with any other value are dropped below.
# NOTE(review): 'Material Science' is accepted here although the text above speaks of
# CS-related courses only — confirm this is intended.
possible_courses_of_study: list[str] = ['Computer Science',
                                        'Cybersecurity',
                                        'Business Informatics',
                                        'Data Science and Artificial Intelligence',
                                        'Mathematics and Computer Science',
                                        'Media Informatics',
                                        'Embedded Systems',
                                        'Bioinformatics',
                                        'Computational Linguistics',
                                        'Material Science',
                                        'Bachelors Plus MINT',
                                        'Systems Engineering'
                                        ]

# Coerce `Age` to a numeric dtype *before* filtering on it. The frame was filled row by row
# from an empty frame, so the column may still be object-typed, in which case `Age < 18`
# would compare raw (possibly string) values.
df_personal_information['Age'] = pd.to_numeric(df_personal_information['Age'])

# check the age and throw out anyone under the age of 18
print(f'Have to drop {df_personal_information.query("Age < 18")["ID"].tolist()} because they were under the age of 18')
df_personal_information.drop(df_personal_information[df_personal_information['Age'] < 18].index, inplace=True)

# normalize all the course of study values
df_personal_information['CourseOfStudy'] = df_personal_information['CourseOfStudy'].replace(course_of_study_mapping)
mask = ~df_personal_information['CourseOfStudy'].isin(possible_courses_of_study)
print(
    f"Have to drop {df_personal_information.loc[mask, 'CourseOfStudy'].tolist()} because they were not in the course of study mapping")
print(df_personal_information.loc[mask, 'ID'])

df_personal_information = df_personal_information.loc[~mask].reset_index(drop=True)

# check and throw out anyone that is not studying computer science related fields
# (after the filter above this list is expected to be empty; it is printed as a sanity check)
non_cs_students: list[str] = df_personal_information.query("CourseOfStudy != @possible_courses_of_study")[
    "CourseOfStudy"].tolist()
print(f'Have to drop {len(non_cs_students)} {non_cs_students} because they were not studying Computer Science')

# change the remaining columns to numeric / categorical values
df_personal_information['Semester'] = pd.to_numeric(df_personal_information['Semester'])
df_personal_information['YearsProgramming'] = pd.to_numeric(df_personal_information['YearsProgramming'])
df_personal_information['Eyesight'] = df_personal_information['Eyesight'].astype("category")
df_personal_information['Gender'] = df_personal_information['Gender'].astype("category")

# remaining participant IDs
participants = df_personal_information['ID'].unique().tolist()
print(f'Remaining participants: {len(participants)}')

### Correctness and Time

I read in all data from the files containing the results (`ResultsOverall_{ID}`) and add the `ID` for the participant.

In [None]:
df_results_overall: DataFrame = pd.DataFrame(columns=necessary_columns + resultsOverall)

# Read the results of every remaining participant and note in df_meta_data whether they finished.
# df_meta_data is addressed by participant ID: `participants` has been filtered in the meantime,
# so a position in that list no longer corresponds to the same row of df_meta_data.
for participant in participants:
    is_current = df_meta_data['ID'] == participant
    results_path = f'./data/{participant}/Trial_{participant}/ResultsOverall_{participant}.csv'

    try:
        df_to_add: DataFrame = pd.read_csv(results_path, sep=';').assign(ID=participant)
        # a participant has finished when every snippet has an answer
        finished = not df_to_add['Answer_Out'].isnull().values.any()

    except FileNotFoundError:
        print(f'Participant {participant} has no ResultsOverall.csv file')
        df_meta_data.loc[is_current, 'ResultsOverall'] = False
        continue

    except Exception as exc:
        # The file exists but could not be used; keep going, but say what went wrong.
        print(f'Participant {participant}: ResultsOverall.csv could not be loaded ({type(exc).__name__}: {exc})')
        df_meta_data.loc[is_current, 'ResultsOverall'] = False
        continue

    df_meta_data.loc[is_current, 'Finished'] = finished
    df_meta_data.loc[is_current, 'ResultsOverall'] = True
    df_results_overall = pd.concat([df_results_overall, df_to_add], ignore_index=True)

# now get the values from the eyetracking and merge it into the dataframe
# TODO: this file does not yet exist, create it first
df_eye_tracking_metrics: DataFrame = pd.read_csv('./output/AOI/Metrics_Data_for_Anova.csv', sep=';')
# only the first eight characters of the SubjectID are used as the merge key
df_results_overall['SubjectID'] = df_results_overall['SubjectID'].str[:8]
df_results_overall = pd.merge(df_results_overall, df_eye_tracking_metrics, how='left', left_on=['SubjectID', 'Task'],
                              right_on=['Participant', 'Snippet'])

# sort the dataframe by the participant `ID` and `Number`
df_results_overall.sort_values(by=['ID', 'Number'], inplace=True)

Add the meaningful tag to the personal information dataframe

#### Data Cleaning for Correctness and Time

We can drop the column `SubjectID` and turn the `Time` column into seconds.

In [None]:
# Columns of the merged results frame that are not needed for the analysis
unnecessary_columns_overall_results: list[str] = [
    'SubjectID',
    'Number',
    'Participant',
    'Expert',
    'ExecOrder_Naive_Score',
    'ExecOrder_Dynamic_Score',
    'ExecOrder_Dynamic_Repetitions',
    'Snippet',
]

# remove unnecessary columns
# NOTE: `DataFrame.drop` has no `ignore_index` argument. The previous call therefore always
# raised a TypeError that a bare `except: pass` swallowed, so nothing was ever dropped.
# errors='ignore' skips columns that are absent without hiding other errors.
df_results_overall.drop(columns=unnecessary_columns_overall_results, inplace=True, errors='ignore')

# Show whether a participant did all snippets or if they ran into a timeout.
# df_meta_data is addressed by participant ID: `participants` has been filtered in the meantime,
# so a position in that list no longer corresponds to the same row of df_meta_data.
for participant in participants:
    participant_results = df_results_overall[df_results_overall['ID'] == participant]

    number_of_missing_snippets: int = int(participant_results['Answer_Out'].isnull().sum())
    if number_of_missing_snippets > 0:
        print(
            f'Participant {participant} did not finish all snippets, they missed {number_of_missing_snippets} snippets.')
    df_meta_data.loc[df_meta_data['ID'] == participant, 'NumberOfMissingSnippets'] = number_of_missing_snippets

    if participant_results['TimeOut'].any():
        print(f'Participant {participant} ran into a timeout.')

# drop any line that has a `Time` value of 0 and every line that has a `Task` value of `WarmUp`;
# `.copy()` yields an independent frame so that the column assignments below do not
# write into a slice of the unfiltered data
df_results_overall: DataFrame = df_results_overall[
    (df_results_overall['Time'] != 0) & (df_results_overall['Task'] != 'WarmUp')
].copy()

# turn the `Time` column into seconds as it is currently in milliseconds
# (values of at most 1000 are left untouched)
df_results_overall['Time'] = df_results_overall['Time'].apply(lambda x: x // 1000 if x > 1000 else x)

# fill the columns `Meaningful` and `Type Annotation` from the two-letter suffix of the task name:
# second-to-last letter 'M' -> meaningful, last letter 'T' -> type annotations
df_results_overall['Meaningful'] = df_results_overall['Task'].str[-2] == 'M'
df_results_overall['TypeAnnotation'] = df_results_overall['Task'].str[-1] == 'T'

# set the column of Meaningful to True if the participant has at least one meaningful snippet
# NOTE(review): IDs in df_meta_data without any row in df_results_overall also end up with
# Meaningful == False and are therefore counted as obfuscated below — confirm this is intended.
ids_with_meaningful_snippet = df_results_overall.loc[df_results_overall['Meaningful'], 'ID'].unique()
df_meta_data['Meaningful'] = df_meta_data['ID'].isin(ids_with_meaningful_snippet)

# save which of the participants were assigned to the respective groups
meaningful_participants = df_meta_data.query('Meaningful == True')['ID'].values
obfuscated_participants = df_meta_data.query('Meaningful == False')['ID'].values
print(f'Number of meaningful runs: {len(meaningful_participants)}')
print(
    f'Number of meaningful runs with eye tracking data: {df_meta_data.query("Meaningful == True & GazeData == True")["ID"].nunique()}')
print(f'Number of obfuscated runs: {len(obfuscated_participants)}')
print(
    f'Number of obfuscated runs with eye tracking data: {df_meta_data.query("Meaningful == False & GazeData == True")["ID"].nunique()}')

# remove the last two letters from the `Task` column when they are a variation suffix (M? / L?)
has_variation_suffix = df_results_overall['Task'].str[-2].isin(['M', 'L'])
df_results_overall['Task'] = df_results_overall['Task'].where(~has_variation_suffix,
                                                              df_results_overall['Task'].str[:-2])

# compute if the correct answer was given for each `Task` and `ID` and add it to `CorrectAnswer`
# (exact string comparison; an unknown task name raises a KeyError on purpose)
df_results_overall['CorrectAnswer'] = df_results_overall.apply(
    lambda row: snippet_answer[row['Task']] == row['Answer_Out'], axis=1)


Now merge `df_results_overall` and `df_difficulty_rating` and also merge `personal_information` and the `Meaningful` columns of `df_results_overall`.

In [None]:
# attach the difficulty rating of the matching participant and task to every result row
df_results_overall = df_results_overall.merge(df_difficulty_rating, how='left', on=['ID', 'Task'])

#### Computations for Correctness and Time

First, we create a general overview of the tables data.

We compute the time taken for each `Task` for each participant `ID`.

Add the meaningful and obfuscated values to the personal information DataFrame.

In [None]:
# flag every participant that belongs to the meaningful group
df_personal_information['Meaningful'] = df_personal_information['ID'].isin(meaningful_participants)

df_personal_information

In [None]:
# attach the personal information to every result row of the same participant and group
df_results_overall = df_results_overall.merge(df_personal_information, how='left', on=['ID', 'Meaningful'])

In [None]:
# Overview table: mean time per task (rows) and participant (columns); '-' marks missing combinations
(
    df_results_overall
    .groupby(['ID', 'Task'])
    .agg({'Time': 'mean'})
    .unstack()
    .transpose()
    .fillna('-')
)

In [None]:
import numpy as np
import pandas as pd

# --- One row per participant: the demographic fields are constant per ID, so the first value is taken ---
demographic_fields = ['Meaningful', 'Gender', 'Age', 'Semester', 'YearsProgramming']
df_participants = (
    df_results_overall.groupby('ID')
    .agg({field: 'first' for field in demographic_fields})
    .reset_index()
)

# Group sizes
n_total = len(df_participants)
if df_participants['Meaningful'].dtype == bool:
    n_meaningful = df_participants['Meaningful'].sum()
else:
    # non-boolean dtype: count the rows that compare equal to True
    n_meaningful = df_participants[df_participants['Meaningful'] == True].shape[0]
n_not_meaningful = n_total - n_meaningful

# --- Gender by group: absolute counts and the share within each Meaningful group ---
gender_counts = pd.crosstab(df_participants['Gender'], df_participants['Meaningful'])
# divide every column by its total to obtain percentages per group
gender_percentages = gender_counts.div(gender_counts.sum(axis=0), axis=1) * 100

# --- Plain-text table ---
print("\nGender distribution (counts and % within each Meaningful group):")
title_false = 'Meaningful=False (N=' + str(n_not_meaningful) + ')'
title_true = 'Meaningful=True (N=' + str(n_meaningful) + ')'
header = f"{'Gender':<22} {title_false:<28} {title_true:<28}"
print(header)
print('-' * len(header))

for gender in gender_counts.index:
    rendered_cells = []
    for group in (False, True):
        # a group that does not occur at all has no column in the crosstab
        if group in gender_counts.columns and not pd.isna(gender_counts.loc[gender, group]):
            group_count = int(gender_counts.loc[gender, group])
        else:
            group_count = 0
        group_pct = gender_percentages.loc[gender, group] if group in gender_percentages.columns else 0
        rendered_cells.append(f"{group_count} ({group_pct:.0f}%)" if group_count > 0 else "---")

    val_false, val_true = rendered_cells
    print(f"{gender:<22} {val_false:<28} {val_true:<28}")


# --- Function to compute mean ± sd and overall ---
def mean_sd_text(series):
    """Format the mean and the sample standard deviation of *series* as text.

    Missing values are ignored.

    Returns:
        A ``(mean, sd)`` tuple of strings with two decimals each, or
        ``("---", "---")`` when no values remain after dropping NaN.
    """
    values = series.dropna()
    if values.empty:
        return ("---", "---")
    return tuple(f"{statistic:.2f}" for statistic in (values.mean(), values.std(ddof=1)))


# --- Continuous variables: mean ± SD per group and overall ---
cont_vars = [
    ('Age', 'Age (in Years)'),
    ('Semester', 'Semester'),
    ('YearsProgramming', 'Years of Programming')
]

print("\nContinuous variables (mean ± SD):")
header2 = f"{'Variable':<25} {'Meaningful=False':<20} {'Meaningful=True':<20} {'Overall':<20}"
print(header2)
print('-' * (len(header2) + 10))

for col, label in cont_vars:
    grp = df_participants.groupby('Meaningful')[col]

    # one (mean, sd) pair per group (False first, then True), followed by the overall pair
    summaries = []
    for group in (False, True):
        if group in grp.groups:
            summaries.append(mean_sd_text(grp.get_group(group)))
        else:
            summaries.append(("---", "---"))
    summaries.append(mean_sd_text(df_participants[col]))

    text_false, text_true, text_over = (
        f"{mean_txt} ± {sd_txt}" if mean_txt != "---" else "---"
        for mean_txt, sd_txt in summaries
    )

    print(f"{label:<25} {text_false:<20} {text_true:<20} {text_over:<20}")

# --- Sample sizes once more, for clarity ---
print(
    f"\nSample sizes: Total N = {n_total}, Meaningful=True N = {n_meaningful}, Meaningful=False N = {n_not_meaningful}")


Analyse Correctness and Time

In [None]:
# Per-participant aggregates: number of correct answers, number of tasks, and time statistics
aggregation_computation: dict[str, list] = {
    'CorrectAnswer': [np.count_nonzero, ],
    'TypeAnnotation': ['count', ],
    'Time': ['mean', 'median', 'min', 'max', 'sum', ],
}

overview_correctness_time: DataFrame = df_results_overall.groupby(['ID']).agg(aggregation_computation).reset_index()

fig, axs = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('Distribution of Time and Correctness', fontsize=16)

# Plot 1: Time Distribution
# NOTE(review): the plotted column is the participant's *mean* time per task, while the
# title/axis texts below say "Total Time" — confirm which one is intended.
time_std_threshold = 3
time_col = ('Time', 'mean')
mean_time = overview_correctness_time[time_col].mean()
std_time = overview_correctness_time[time_col].std()
upper_bound_time = mean_time + time_std_threshold * std_time
lower_bound_time = mean_time - time_std_threshold * std_time
outliers_time = overview_correctness_time[
    (overview_correctness_time[time_col] > upper_bound_time) | (overview_correctness_time[time_col] < lower_bound_time)]

sns.histplot(overview_correctness_time[time_col], kde=True, ax=axs[0], label='Time Distribution')
axs[0].set_title(f'Total Time per Participant (Outliers > {time_std_threshold}σ)')
axs[0].set_xlabel('Total Time (seconds)')
axs[0].set_ylabel('Number of Participants')
axs[0].axvline(mean_time, color='red', linestyle='--', label=f'Mean: {mean_time:.2f}s')
axs[0].axvline(upper_bound_time, color='orange', linestyle=':',
               label=f'Upper Bound (+{time_std_threshold}σ): {upper_bound_time:.2f}s')
# a negative lower bound is meaningless for times and is therefore not drawn
if lower_bound_time > 0:
    axs[0].axvline(lower_bound_time, color='orange', linestyle=':',
                   label=f'Lower Bound (-{time_std_threshold}σ): {lower_bound_time:.2f}s')
axs[0].legend()

# Plot 2: Correct Answer Distribution
correct_col = ('CorrectAnswer', 'count_nonzero')
mean_correct = overview_correctness_time[correct_col].mean()
std_correct = overview_correctness_time[correct_col].std()
# participants with fewer correct answers than this are treated as outliers
lower_bound_correct = 11

# Create a boolean Series for filtering
is_outlier_correct = (overview_correctness_time[correct_col] < lower_bound_correct)
outliers_correct = overview_correctness_time[is_outlier_correct]

sns.histplot(overview_correctness_time[correct_col], kde=True, ax=axs[1], color='green', binwidth=1)
axs[1].set_title('Correct Answers per Participant')
axs[1].set_xlabel('Number of Correct Answers')
axs[1].set_ylabel('Number of Participants')
axs[1].axvline(mean_correct, color='red', linestyle='--', label=f'Mean: {mean_correct:.2f}')
# the label is derived from the threshold so that the two can no longer disagree
axs[1].axvline(lower_bound_correct, color='orange', linestyle=':',
               label=f'Lower Bound ({lower_bound_correct}/20)')
axs[1].legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

print(
    f"Time analysis: {len(outliers_time)} participant(s) found as outliers outside of {time_std_threshold} standard deviations from the mean ({mean_time:.2f}s).")
if not outliers_time.empty:
    print("Outlier participant(s) by ID and their total time:")
    print(pd.concat([outliers_time['ID'], outliers_time[time_col]], axis=1))

print(
    f"\nCorrectness analysis: {len(outliers_correct)} participant(s) found as outliers below {lower_bound_correct}/20.")
if not outliers_correct.empty:
    print("Outlier participant(s) by ID and their counts of correct answers:")
    print(pd.concat([outliers_correct['ID'], outliers_correct[correct_col]], axis=1))

Remove participants 12663, 13020, 24280, 29206 and 31195 because their number of correct answers is below 11.

In [None]:
# IDs of the participants that are excluded from all further analysis
outlier_participants: list[str] = ['12663', '13020', '24280', '29206', '31195']

# keep only the rows of the remaining participants in every frame
overview_correctness_time = overview_correctness_time.loc[
    ~overview_correctness_time['ID'].isin(outlier_participants)]
df_results_overall = df_results_overall.loc[
    ~df_results_overall['ID'].isin(outlier_participants)]
df_personal_information = df_personal_information.loc[
    ~df_personal_information['ID'].isin(outlier_participants)]

In [None]:
# Same two histograms as before, now without the excluded participants
fig, axs = plt.subplots(1, 2, figsize=(18, 7))
fig.suptitle('Distribution of Time and Correctness (After Removing Outliers)', fontsize=16)

time_col = ('Time', 'mean')
correct_col = ('CorrectAnswer', 'count_nonzero')
mean_time = overview_correctness_time[time_col].mean()
mean_correct = overview_correctness_time[correct_col].mean()

# Left panel: time distribution with its mean
sns.histplot(overview_correctness_time[time_col], kde=True, ax=axs[0], label='Time Distribution')
axs[0].set(
    title='Total Time per Participant (After Removing Outliers)',
    xlabel='Total Time (seconds)',
    ylabel='Number of Participants',
)
axs[0].axvline(mean_time, color='red', linestyle='--', label=f'Mean: {mean_time:.2f}s')
axs[0].legend()

# Right panel: distribution of the number of correct answers with its mean
sns.histplot(overview_correctness_time[correct_col], kde=True, ax=axs[1], color='green', binwidth=1)
axs[1].set(
    title='Correct Answers per Participant (After Removing Outliers)',
    xlabel='Number of Correct Answers',
    ylabel='Number of Participants',
)
axs[1].axvline(mean_correct, color='red', linestyle='--', label=f'Mean: {mean_correct:.2f}')
axs[1].legend()

Now remove certain data points

In [None]:
# Keep only rows for which both the time and the number of fixations are known
required_metrics = ["Time", "NumberOfFixations"]
df_results_overall = df_results_overall.dropna(subset=required_metrics).copy()

In [None]:
# Fixation rate: number of fixations divided by the time spent on the task
df_results_overall["TrueFixationsPerSecond"] = df_results_overall["NumberOfFixations"].div(
    df_results_overall["Time"]
)

In [None]:
# Absolute plausibility thresholds; a row that violates any of them is flagged as an outlier.

# Max time (in seconds)
MAX_TIME = 300

# Min and Max number of fixations
MIN_FIX = 10
MAX_FIX = 1000

# Min and Max fixations per second
MIN_FPS = 0.5
MAX_FPS = 8

# Minimum value of the `HitsBlock` column
# NOTE(review): presumably a fraction in [0, 1] (0.7 = 70 %) — confirm the column's scale
MIN_HITS_BLOCK = 0.7


# IQR bounds
def iqr_bounds(series, k=1.5):
    """Return the ``(lower, upper)`` fences of *series* based on its interquartile range.

    The lower fence lies ``k`` interquartile ranges below the first quartile,
    the upper fence ``k`` interquartile ranges above the third quartile.
    """
    first_quartile, third_quartile = (series.quantile(q) for q in (0.25, 0.75))
    margin = k * (third_quartile - first_quartile)
    return first_quartile - margin, third_quartile + margin


# Data-driven fences for the fixation count and the fixation rate
fix_lo, fix_hi = iqr_bounds(df_results_overall["NumberOfFixations"])
fps_lo, fps_hi = iqr_bounds(df_results_overall["TrueFixationsPerSecond"])

print("Fix bounds:", fix_lo, fix_hi)
print("FPS bounds:", fps_lo, fps_hi)

# A row is an outlier as soon as it violates one absolute limit or one IQR fence.
mask_outliers = (
    # Absolute limits
    df_results_overall["Time"].gt(MAX_TIME)
    | df_results_overall["NumberOfFixations"].lt(MIN_FIX)
    | df_results_overall["NumberOfFixations"].gt(MAX_FIX)
    | df_results_overall["TrueFixationsPerSecond"].lt(MIN_FPS)
    | df_results_overall["TrueFixationsPerSecond"].gt(MAX_FPS)
    | df_results_overall["HitsBlock"].lt(MIN_HITS_BLOCK)
    # IQR fences
    | df_results_overall["NumberOfFixations"].lt(fix_lo)
    | df_results_overall["NumberOfFixations"].gt(fix_hi)
    | df_results_overall["TrueFixationsPerSecond"].lt(fps_lo)
    | df_results_overall["TrueFixationsPerSecond"].gt(fps_hi)
)


In [None]:
# One boolean mask per exclusion criterion, keyed by the label shown in the report
masks = {
    'Time > MAX_TIME': df_results_overall["Time"].gt(MAX_TIME),
    'NumFix < MIN_FIX': df_results_overall["NumberOfFixations"].lt(MIN_FIX),
    'NumFix > MAX_FIX': df_results_overall["NumberOfFixations"].gt(MAX_FIX),
    'FPS < MIN_FPS': df_results_overall["TrueFixationsPerSecond"].lt(MIN_FPS),
    'FPS > MAX_FPS': df_results_overall["TrueFixationsPerSecond"].gt(MAX_FPS),
    'HitsBlock < MIN': df_results_overall["HitsBlock"].lt(MIN_HITS_BLOCK),
    'NumFix < IQR_low': df_results_overall["NumberOfFixations"].lt(fix_lo),
    'NumFix > IQR_high': df_results_overall["NumberOfFixations"].gt(fix_hi),
    'FPS < IQR_low': df_results_overall["TrueFixationsPerSecond"].lt(fps_lo),
    'FPS > IQR_high': df_results_overall["TrueFixationsPerSecond"].gt(fps_hi),
}

# Rows hit by each criterion; a row can be counted under several criteria
print("Individual criterion removal counts:")
for name, mask in masks.items():
    count = mask.sum()
    pct = (count / len(df_results_overall)) * 100
    print(f"{name:25s}: {count:4d} ({pct:5.2f}%)")

# Rows hit by at least one criterion
total_outliers = mask_outliers.sum()
total_pct = (total_outliers / len(df_results_overall)) * 100
print("\n" + "=" * 60)
print(f"{'Total outliers':25s}: {total_outliers:4d} ({total_pct:5.2f}%)")
print("=" * 60)

In [None]:
# Attach the combined outlier flag as a column and show how many rows are flagged
df_results_overall = df_results_overall.assign(Outlier=mask_outliers)
print(df_results_overall["Outlier"].value_counts())


In [None]:
# Display the columns that feed the outlier decision together with the flag itself
inspection_columns = [
    'ID', 'Task', 'CorrectAnswer', 'Time', 'NumberOfFixations', 'TrueFixationsPerSecond',
    'HitsBlock', 'Outlier', 'HitsType', 'HitsReturnType', 'Meaningful', 'TypeAnnotation',
]
df_results_overall[inspection_columns]

In [None]:
# Keep only the rows that are not flagged as outliers
df_results_overall = df_results_overall.loc[~df_results_overall["Outlier"]].copy()

In [None]:
# Reduced frame holding only the columns used in the analysis below
analysis_columns = [
    'ID', 'Task', 'CorrectAnswer', 'Time', 'NumberOfFixations', 'TrueFixationsPerSecond',
    'HitsBlock', 'Difficulty', 'HitsType', 'HitsReturnType', 'Meaningful', 'TypeAnnotation',
]
df_analysis = df_results_overall[analysis_columns]
df_analysis

In [None]:
# Row selectors for the two naming groups and the two annotation conditions
meaningful_rows = df_analysis["Meaningful"] == True
obfuscated_rows = df_analysis["Meaningful"] == False
annotated_rows = df_analysis["TypeAnnotation"] == True
unannotated_rows = df_analysis["TypeAnnotation"] == False

# number of distinct participants overall
print(df_analysis['ID'].nunique())
# number of participants in the meaningful and the obfuscated group
print("Number of participants in meaningful group:", df_analysis.loc[meaningful_rows, "ID"].nunique())
print("Unique participants in obfuscated group:", df_analysis.loc[obfuscated_rows, "ID"].nunique())

# number of task rows in the meaningful and the obfuscated group
print(f'Number of tasks in meaningful group: {meaningful_rows.sum()}')
print(f'Number of tasks in obfuscated group: {obfuscated_rows.sum()}')

# number of task rows with (TA) and without (NoTA) type annotations
print(f'Number of tasks in TA group: {annotated_rows.sum()}')
print(f'Number of tasks in NoTA group: {unannotated_rows.sum()}')

Describe Data

In [None]:
# Task rows of the two naming groups
meaningful_tasks = df_analysis[df_analysis['Meaningful'] == True]
obfuscated_tasks = df_analysis[df_analysis['Meaningful'] == False]

# time over all tasks
mean_time = df_analysis['Time'].mean()
std_time = df_analysis['Time'].std()
print(f'Mean Time: {mean_time:.1f} seconds, SD: {std_time:.1f} seconds')

# time within the meaningful group
mean_time_meaningful = meaningful_tasks['Time'].mean()
std_time_meaningful = meaningful_tasks['Time'].std()
print(f'Mean Time Meaningful: {mean_time_meaningful:.1f} seconds, SD: {std_time_meaningful:.1f} seconds')

# time within the obfuscated group
mean_time_obfuscated = obfuscated_tasks['Time'].mean()
std_time_obfuscated = obfuscated_tasks['Time'].std()
print(f'Mean Time Obfuscated: {mean_time_obfuscated:.1f} seconds, SD: {std_time_obfuscated:.1f} seconds')

# correct answers within the meaningful group
num_meaningful_tasks = len(meaningful_tasks)
num_correct_meaningful = (meaningful_tasks['CorrectAnswer'] == True).sum()
meaningful_share = (num_correct_meaningful / num_meaningful_tasks) * 100
print(
    f'Number of Correct Meaningful Tasks: {num_correct_meaningful} out of {num_meaningful_tasks} ({meaningful_share:.1f}%)')

# correct answers within the obfuscated group
num_obfuscated_tasks = len(obfuscated_tasks)
num_correct_obfuscated = (obfuscated_tasks['CorrectAnswer'] == True).sum()
obfuscated_share = (num_correct_obfuscated / num_obfuscated_tasks) * 100
print(
    f'Number of Correct Obfuscated Tasks: {num_correct_obfuscated} out of {num_obfuscated_tasks} ({obfuscated_share:.1f}%)')

AOI Analysis

In [None]:
# Derived columns: ratings divided by 5, fixation shares expressed as percentages
df_results_overall['DifficultyNumeric'] = df_results_overall['Difficulty'].dropna().astype(float) / 5
df_results_overall['OverallExperienceNumeric'] = (
    df_results_overall['OverallExperience'].dropna().astype(float) / 5
)
df_results_overall['HitsTypePct'] = df_results_overall['HitsType'] * 100
df_results_overall['HitsReturnTypePct'] = df_results_overall['HitsReturnType'] * 100
df_results_overall['HitsTypeCombinedPct'] = df_results_overall['HitsTypePct'] + df_results_overall['HitsReturnTypePct']

# Only snippets with type annotations, sorted by the combined share (highest first)
df_type = df_results_overall[df_results_overall['TypeAnnotation'] == True]
df_type = df_type.sort_values('HitsTypeCombinedPct', ascending=False)

# Participants ordered by their mean combined share (highest first)
mean_order = (
    df_type.groupby('ID')['HitsTypeCombinedPct']
    .mean()
    .sort_values(ascending=False)
    .index
)

# Plot-only labels P01, P02, ... assigned in that order. Every participant gets the same
# "Pxx" scheme; the group only decides on which side of the separator the label is placed:
# m_labels for participants with any Meaningful == True row, l_labels for all others.
df_plot = df_type.copy()
unique_ids = mean_order.tolist()

label_map = {}
m_labels = []
l_labels = []
for count, uid in enumerate(unique_ids, start=1):
    label = f"P{count:02d}"
    label_map[uid] = label
    is_meaningful = df_plot.loc[df_plot['ID'] == uid, 'Meaningful'].fillna(False).any()
    (m_labels if is_meaningful else l_labels).append(label)

# Replace the real IDs by the plot labels (the 'ID' column itself is overwritten)
df_plot['ID'] = df_plot['ID'].map(label_map)

# x-axis order: meaningful group, an empty separator tick, obfuscated group
sep_label = ''  # empty string used as separator (produces a tick but no data)
plot_order = m_labels + [sep_label] + l_labels

# One dummy row per TAComprehension value carrying the separator label and a NaN y value.
# NOTE(review): the rows use the key 'PlotID' although the labels live in 'ID', and neither
# df_plot_with_sep nor df_counts is used by the plots below — the gap comes from the empty
# entry in plot_order. Both frames are kept in case later cells rely on them; confirm.
dummy_rows = [
    {'PlotID': sep_label, 'HitsTypeCombinedPct': float('nan'), 'TAComprehension': hue_val}
    for hue_val in df_plot['TAComprehension'].dropna().unique()
]
df_plot_with_sep = pd.concat([df_plot, pd.DataFrame(dummy_rows)], ignore_index=True)
df_counts = df_plot_with_sep.copy()

# Publication styling
sns.set_style('white')
sns.set_context('paper', font_scale=1.05)

fig, ax = plt.subplots(figsize=(10, 3))

# Per-participant mean, drawn as a horizontal dash
sns.pointplot(
    data=df_plot,
    x='ID',
    y='HitsTypeCombinedPct',
    color='#8CA8AC',
    order=plot_order,
    dodge=0,
    linestyle='none',
    errorbar=None,
    markers='_',
    markersize=15,
    markeredgewidth=2,
    ax=ax
)

# Raw observations as small jittered points
sns.stripplot(
    data=df_plot,
    x='ID',
    y='HitsTypeCombinedPct',
    color='#8CA8AC',
    order=plot_order,
    dodge=False,
    jitter=0.15,
    size=4,
    linewidth=0,
    ax=ax
)

# The figure carries no legend; remove one only if seaborn happened to create it
legend = ax.get_legend()
if legend is not None:
    legend.remove()

# Group captions and underlines are placed relative to the current upper y limit
ymin, ymax = ax.get_ylim()
label_y = ymax * 1.1
line_y = ymax * 1.08

# Tick positions covered by each group
m_start = 0
m_end = len(m_labels) - 1
l_start = len(m_labels) + 1  # skip separator tick
l_end = l_start + len(l_labels) - 1

# Captions are centred over their group
m_center = (m_start + m_end) / 2
l_center = (l_start + l_end) / 2

ax.text(m_center, label_y, 'Meaningful',
        ha='center', va='bottom', fontsize=10, fontweight='bold')
ax.text(l_center, label_y, 'Obfuscated',
        ha='center', va='bottom', fontsize=10, fontweight='bold')

ax.hlines(line_y, m_start, m_end, color='gray', lw=0.8)
ax.hlines(line_y, l_start, l_end, color='gray', lw=0.8)

# Extend the y axis so captions and lines stay inside the axes
ax.set_ylim(ymin, ymax * 1.15)

# Aesthetics for publication
ax.set_xlabel('Participant')
ax.set_ylabel('Fixation % on Type Annotations')
plt.xticks(rotation=45)
sns.despine(trim=True)
plt.tight_layout()
# savefig does not create missing directories
os.makedirs('output', exist_ok=True)
plt.savefig('output/FixationsOnTypePart.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Tasks ordered by their mean combined fixation share (highest first)
task_order = (
    df_type.groupby('Task')['HitsTypeCombinedPct']
    .mean()
    .sort_values(ascending=False)
    .index
)

plt.figure(figsize=(10, 4))
# Per-task mean, drawn as a horizontal dash
sns.pointplot(
    data=df_type,
    x='Task',
    y='HitsTypeCombinedPct',
    color='#8CA8AC',
    dodge=False,
    order=task_order,
    linestyle='none',
    errorbar=None,
    markers='_',
    markersize=15,
    markeredgewidth=4,
)

# Raw observations as jittered points on the same axes
sns.stripplot(data=df_type,
              x='Task',
              y='HitsTypeCombinedPct',
              color='#8CA8AC',
              order=task_order,
              jitter=0.15,
              size=6,
              dodge=False,
              alpha=0.8)
plt.ylabel('Fixation % on Type Annotations', fontsize=20)
plt.xlabel('')
plt.yticks(fontsize=20)
plt.xticks(rotation=80,
           fontsize=20,
           ha='right',
           rotation_mode='anchor',  # prevents rotation from shifting tick positions
           )
# savefig does not create missing directories
os.makedirs('output', exist_ok=True)
plt.savefig('output/FixationsOnTypeSnippet.pdf', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Mean of each measure per task
task_measures = ['HitsTypeCombinedPct', 'DifficultyNumeric', 'TAComprehension', 'YearsProgramming',
                 'OverallExperienceNumeric']
df_task = df_type.groupby('Task')[task_measures].mean().reset_index()

# Tasks with the highest fixation share on top
df_task_sorted = df_task.sort_values('HitsTypeCombinedPct', ascending=False)

# Fixation share and difficulty as two single-column heatmaps side by side
df_task_heat = df_task_sorted.set_index('Task')[['HitsTypeCombinedPct', 'DifficultyNumeric']]

fig, axes = plt.subplots(1, 2, figsize=(12, 6), gridspec_kw={'width_ratios': [1, 1]})
hits_ax, difficulty_ax = axes

# Fixation share (blue)
sns.heatmap(df_task_heat[['HitsTypeCombinedPct']], annot=True, fmt=".1f", cmap='Blues', cbar=True, ax=hits_ax)
hits_ax.set_title('HitsTypeCombinedPct Fixations (%)')
hits_ax.set_ylabel('Task')
hits_ax.set_xlabel('')

# Difficulty (red); the task names are only shown on the left heatmap
sns.heatmap(df_task_heat[['DifficultyNumeric']], annot=True, fmt=".2f", cmap='Reds', cbar=True, ax=difficulty_ax)
difficulty_ax.set_title('Difficulty (Normalized)')
difficulty_ax.set_ylabel('')
difficulty_ax.set_xlabel('')
difficulty_ax.set_yticks([])

plt.tight_layout()
plt.show()


In [None]:
# Mean of each measure per participant
participant_measures = ['HitsTypeCombinedPct', 'DifficultyNumeric', 'TAComprehension', 'YearsProgramming',
                        'OverallExperienceNumeric']
df_participant = df_type.groupby('ID')[participant_measures].mean().reset_index()

# Participants with both a fixation share and a difficulty value
df_corr = df_participant[['HitsTypeCombinedPct', 'DifficultyNumeric']].dropna()

# Spearman rank correlation between fixation share and difficulty
spearman_r, spearman_p = spearmanr(df_corr['HitsTypeCombinedPct'], df_corr['DifficultyNumeric'])
print(f"Spearman correlation: ρ = {spearman_r:.3f}, p = {spearman_p:.4f}")

# Scatter plot with a regression line and no confidence band
plt.figure(figsize=(7, 5))
sns.regplot(data=df_corr, x='DifficultyNumeric', y='HitsTypeCombinedPct', ci=None)
# Both axes start at 0; the x axis ends at 1.0
plt.xlim(left=0, right=1.0)
plt.ylim(bottom=0)
plt.title('Correlation between Difficulty and Type-related Fixations')
plt.xlabel('Difficulty')
plt.ylabel('Fixation % (Type + Return Type)')
plt.show()


In [None]:
from scipy.stats import kendalltau

# Participants with both a fixation share and a value for the years of programming
df_corr = df_participant[['HitsTypeCombinedPct', 'YearsProgramming']].dropna()

# Kendall's tau rank correlation. The result gets its own names and label so it is
# neither reported as nor stored over the Spearman result of the previous cell.
kendall_tau, kendall_p = kendalltau(df_corr['HitsTypeCombinedPct'], df_corr['YearsProgramming'])
print(f"Kendall correlation: τ = {kendall_tau:.3f}, p = {kendall_p:.4f}")

# Scatter plot with a regression line and no confidence band
plt.figure(figsize=(7, 5))
sns.regplot(
    data=df_corr,
    x='YearsProgramming',
    y='HitsTypeCombinedPct',
    ci=None,
)
# Force axes to start from 0
plt.xlim(left=0)
plt.ylim(bottom=0)
plt.title('Correlation between Programming Experience and Type-related Fixations')
plt.xlabel('Years of Programming Experience')
plt.ylabel('Fixation % (Type + Return Type)')
plt.show()


In [None]:
# Relative frequency of each course of study
print(df_personal_information['CourseOfStudy'].value_counts(normalize=True))

In [None]:
# Number of participants per course of study
sns.countplot(data=df_personal_information, x='CourseOfStudy')
plt.show()


### People Distributions and Data

1. The age
2. Their experience
3. Their Course of Study

In [None]:
# Mean and standard deviation of the age, followed by a box plot of its distribution
age = df_personal_information['Age']
mean_age, std_age = age.mean(), age.std()
print(f'Mean Age: {mean_age:.1f}, SD: {std_age:.1f}')
sns.boxplot(data=df_personal_information, y='Age')
plt.show()

In [None]:
# Age distribution split by the Meaningful flag
sns.boxplot(data=df_personal_information, y='Age', hue='Meaningful', gap=.1)
plt.show()

# Mean and standard deviation of the age per group
group_stats_age = df_personal_information.groupby('Meaningful')['Age'].agg(['mean', 'std'])

# One line per group
for group, group_mean, group_std in zip(group_stats_age.index, group_stats_age['mean'], group_stats_age['std']):
    print(f"{group} -> Mean: {group_mean:.2f}, SD: {group_std:.2f}")




The courses of study

In [None]:
# Participants per course of study, split by gender; the course names are rotated to stay readable
sns.countplot(
    data=df_personal_information,
    x='CourseOfStudy',
    hue='Gender',
)
plt.xticks(rotation=90)
plt.show()

The number of semesters

In [None]:
# Semester per course of study, split by gender; the course names are rotated to stay readable
sns.barplot(
    data=df_personal_information,
    x='CourseOfStudy',
    y='Semester',
    hue='Gender',
)
plt.xticks(rotation=90)
plt.show()

The number of semesters in general

In [None]:
# Mean and standard deviation of the semester over all participants
semester = df_personal_information['Semester']
mean_semester, std_semester = semester.mean(), semester.std()
print(f'Mean Semester: {mean_semester:.2f}, SD: {std_semester:.2f}')

In [None]:
# Semester distribution split by the Meaningful flag
sns.boxplot(data=df_personal_information, y='Semester', hue='Meaningful', gap=.1)
plt.show()

# Mean and standard deviation of the semester per group
group_stats = df_personal_information.groupby('Meaningful')['Semester'].agg(['mean', 'std'])

# One line per group
for group, group_mean, group_std in zip(group_stats.index, group_stats['mean'], group_stats['std']):
    print(f"{group} -> Mean: {group_mean:.2f}, SD: {group_std:.2f}")

Classmates and overall experience

In [None]:
# Overall experience per semester, split by the Meaningful flag
# (disabled category order: ['Very Inexperienced', 'Inexperienced', 'Average', 'Experienced', 'Very Experienced'])
sns.boxplot(
    data=df_personal_information,
    x='Semester',
    y='OverallExperience',
    hue='Meaningful',
    gap=.1,
)
plt.show()

Classmates

In [None]:
# Classmates rating per semester, split by the Meaningful flag
# (disabled category order: ['Very Inexperienced', 'Inexperienced', 'Average', 'Experienced', 'Very Experienced'])
sns.boxplot(data=df_personal_information, x='Semester', y='Classmates', hue='Meaningful', gap=.1)
plt.show()

OverallExperience by ProgrammingLately

In [None]:
# Swarm plot of the Classmates rating per ProgrammingLately category, coloured by the Meaningful flag
programming_frequency_order = ['Daily', 'Weekly', 'Monthly', 'Yearly']
sns.catplot(
    data=df_personal_information,
    kind='swarm',
    x='ProgrammingLately',
    y='Classmates',
    hue='Meaningful',
    order=programming_frequency_order,
)
plt.show()

#### Standard Data for the personal information

In [None]:
# Descriptive statistics for these columns
for column in ("Age", "Semester", "YearsProgramming"):
    print(f'{column}:\n{df_personal_information[column].describe()}\n')

# Number of participants per value for these columns
for column in ("Gender", "Classmates"):
    print(f'{column}:\n{df_personal_information.groupby(column).agg({column: "count"})}\n')

Standard Data for personal information divided by Meaningful Data

In [None]:
# Mean and standard deviation per Meaningful group
by_group = df_personal_information.groupby("Meaningful")
print(f'Age:\n{by_group.agg({"Age": ["mean", "std"]})}')
for column in ("Semester", "YearsProgramming"):
    print(f'{column}:\n{by_group.agg({column: ["mean", "std"]})}\n')

# Participants per value within each Meaningful group (printed heading, column name)
for heading, column in (("Gender", "Gender"), ("EyeSight", "Eyesight"), ("StudyBefore", "StudyBefore")):
    counts = df_personal_information.groupby(["Meaningful", column]).agg({column: "count"})
    print(f'{heading}:\n{counts}\n')

Efficacy for every participant:


In [None]:
# Efficacy = correct answers per minute, per participant and annotation condition.
# 'CorrectAnswer' is summed (number of correct answers); counting it would only give the
# number of answered tasks, so correctness would never enter the efficacy value.
df = df_results_overall.groupby(['ID', 'TypeAnnotation']).agg({'Time': 'sum', 'CorrectAnswer': 'sum'})
df['efficacy'] = df.CorrectAnswer / (df.Time / 60)  # Time / 60 converts the summed time to minutes

# One line per (ID, TypeAnnotation) pair, reusing the column computed above
for index, row in df.iterrows():
    print(f'Efficacy of Participant {index}: {row.efficacy}')

sns.violinplot(df, y='efficacy', hue='TypeAnnotation', cut=0)
plt.ylim(bottom=0)
plt.show()

Statistical Methods one after the other

In [None]:



def compute_statistics(data: DataFrame, with_TA, without_TA, input_type: str = 'continuous',
                       output_type: str = 'continuous', population: str = 'within', p_value: float = 0.05):
    """Check the test assumptions for two samples and print a matching significance test.

    Normality of each sample is checked with Shapiro-Wilk and homogeneity of variance with
    Levene's test. Depending on those results and on the arguments, one of the following
    tests is printed: paired or independent t-test, chi-square test of independence,
    Wilcoxon signed-rank test or Mann-Whitney U test.

    Args:
        data: Returned unchanged; it is not used by any of the tests.
        with_TA: Observations of the sample with type annotations.
        without_TA: Observations of the sample without type annotations.
        input_type: 'categorical' or 'continuous'.
        output_type: 'categorical' or 'continuous'.
        population: 'within' selects the paired tests; any other value selects the
            independent tests.
        p_value: Significance level for the Shapiro-Wilk and Levene checks.

    Returns:
        The ``data`` argument, unchanged.
    """
    # check for normality
    with_TA_normality = stats.shapiro(with_TA)
    without_TA_normality = stats.shapiro(without_TA)

    # check for homogeneity of variance
    homogeneity_of_variance = stats.levene(with_TA, without_TA)

    both_normal = with_TA_normality.pvalue > p_value and without_TA_normality.pvalue > p_value
    equal_variance = homogeneity_of_variance.pvalue > p_value

    if both_normal:
        print(
            f'Both groups are normally distributed with pvalues {with_TA_normality.pvalue} and {without_TA_normality.pvalue}')
    else:
        print(
            f'At least one group is not normally distributed with pvalues {with_TA_normality.pvalue} and {without_TA_normality.pvalue}')

    if equal_variance:
        print(f'Both groups have homogeneity of variance with pvalue {homogeneity_of_variance.pvalue}')
    else:
        print(f'Both groups do not have homogeneity of variance with pvalue {homogeneity_of_variance.pvalue}')

    continuous_outcome = input_type == 'categorical' and output_type == 'continuous'
    categorical_outcome = input_type == 'categorical' and output_type == 'categorical'
    same_length = len(with_TA) == len(without_TA)

    if both_normal and equal_variance:
        if population == 'within':
            if continuous_outcome:
                # the paired t-test needs two samples of the same length
                print(f'TTEST_REL: {stats.ttest_rel(without_TA, with_TA)}')
            elif categorical_outcome:
                # the chi-square test works on category counts, not on the raw observations
                print(f'CHISQUARE: {stats.chi2_contingency(_category_counts(with_TA, without_TA))}')
            elif same_length:
                print(f'WILCOXON TWO-SIDED: {stats.wilcoxon(without_TA, with_TA)}')
                print(f'WILCOXON ONE-SIDED Greater: {stats.wilcoxon(without_TA, with_TA, alternative="greater")}')
                print(f'WILCOXON ONE-SIDED Less: {stats.wilcoxon(without_TA, with_TA, alternative="less")}')
            else:
                print(f'MANNWHITNEYU: {stats.mannwhitneyu(without_TA, with_TA)}')
        elif continuous_outcome:
            print(f'TTEST_IND: {stats.ttest_ind(without_TA, with_TA)}')
        else:
            print(f'MANNWHITNEYU: {stats.mannwhitneyu(without_TA, with_TA)}')
    elif population == 'within' and same_length:
        # NOTE(review): equal length is used as the indicator for paired samples — confirm
        # that equally long samples are really matched observation by observation.
        print(f'WILCOXON TWO-SIDED: {stats.wilcoxon(without_TA, with_TA)}')
    elif categorical_outcome:
        print(f'CHISQUARE: {stats.chi2_contingency(_category_counts(with_TA, without_TA))}')
    else:
        # separate names so the significance level `p_value` is not overwritten
        u_statistic, u_p_value = stats.mannwhitneyu(without_TA, with_TA)
        print(f'The U-statistic is {u_statistic} and the p-value is {u_p_value}')

    return data


def _category_counts(with_TA, without_TA) -> DataFrame:
    """Cross-tabulate sample membership (rows) against the observed categories (columns)."""
    values = pd.concat([pd.Series(without_TA), pd.Series(with_TA)], ignore_index=True)
    groups = pd.Series(['without_TA'] * len(without_TA) + ['with_TA'] * len(with_TA), name='group')
    return pd.crosstab(groups, values)

#### Table for Type Annotations and Correctness RQ 1.1

Do everything correctly for time

1. Check for normality
2. Check for the variances
3. then check for wilcoxon two-sided
4. then check for wilcoxon one-sided


In [None]:
# Aggregations for the per-task table: time (mean, SD) and correctness (correct, answered)
table_data: dict[str, list[str] | str] = {
    'Time': ['mean', 'std'],
    'CorrectAnswer': ['sum', 'count'],
}

table: DataFrame = df_results_overall.groupby(['Task']).agg(table_data)

# Despite the names, these hold the individual task times of each condition, not means
TA_mean = df_results_overall.query('TypeAnnotation == True')['Time']
NoTA_mean = df_results_overall.query('TypeAnnotation == False')['Time']

data = compute_statistics(table, TA_mean, NoTA_mean, input_type='categorical', output_type='continuous',
                          population='within')

# Assumption checks, printed again with their full results
print(f'Normality of TA:\n{stats.shapiro(TA_mean)}')
print(f'Normality of NoTA:\n{stats.shapiro(NoTA_mean)}')

print(f'Levene Test:\n{stats.levene(TA_mean, NoTA_mean)}')

# One LaTeX table row per task: task & correct/answered (percent\%) & mean $\pm$ SD \\
# Backslashes are doubled because `\%` and `\p` are invalid escape sequences in a normal string.
for task_tuple, x in table.iterrows():
    correct = x.CorrectAnswer["sum"]
    answered = x.CorrectAnswer["count"]
    print(
        f'{task_tuple} & {correct}/{answered} ({round((correct / answered) * 100)}\\%)'
        f' & {round(x.Time["mean"], 2)} $\\pm$ {round(x.Time["std"], 2)}\\\\')

# Overall time (mean + SD) and overall correctness
print(
    f'{df_results_overall.Time.mean()} + {df_results_overall.Time.std()} | {df_results_overall.CorrectAnswer.sum()}/{df_results_overall.CorrectAnswer.count()} ({df_results_overall.CorrectAnswer.sum() / df_results_overall.CorrectAnswer.count()})')

In [None]:
# Time distribution per annotation condition
sns.boxplot(data=df_results_overall, x='TypeAnnotation', y='Time')
plt.show()

# Mean and standard deviation of the time per condition
group_stats = df_results_overall.groupby('TypeAnnotation')['Time'].agg(['mean', 'std'])

# One line per condition
for group, group_mean, group_std in zip(group_stats.index, group_stats['mean'], group_stats['std']):
    print(f"{group} -> Mean: {group_mean:.2f}, SD: {group_std:.2f}")

Do the same as above for the correctness

In [None]:
# Contingency table of CorrectAnswer by TypeAnnotation and its chi-square test of independence
contingency_table = pd.crosstab(df_results_overall['TypeAnnotation'], df_results_overall['CorrectAnswer'])
print(stats.chi2_contingency(contingency_table))
print(contingency_table)

# Best effort: additionally run the generic helper. A failure must not stop the cell,
# but it is reported instead of being swallowed silently.
try:
    data = compute_statistics(DataFrame(), df_results_overall.query("TypeAnnotation == True")["CorrectAnswer"],
                              df_results_overall.query("TypeAnnotation == False")["CorrectAnswer"],
                              input_type='categorical', output_type='categorical', population='within')
except Exception as err:
    print(f'compute_statistics failed for CorrectAnswer: {type(err).__name__}: {err}')

# Correct answers and number of answers per condition
print(df_results_overall.groupby('TypeAnnotation')['CorrectAnswer'].agg(['sum', 'count']))

In [None]:
# Calculate correctness values for TA and NoTA
ta_correct = df_results_overall[df_results_overall['TypeAnnotation'] == True]['CorrectAnswer'].sum()
ta_total = len(df_results_overall[df_results_overall['TypeAnnotation'] == True])
ta_percent = round((ta_correct / ta_total) * 100)

nota_correct = df_results_overall[df_results_overall['TypeAnnotation'] == False]['CorrectAnswer'].sum()
nota_total = len(df_results_overall[df_results_overall['TypeAnnotation'] == False])
nota_percent = round((nota_correct / nota_total) * 100)

all_correct = df_results_overall['CorrectAnswer'].sum()
all_total = len(df_results_overall)
all_percent = round((all_correct / all_total) * 100)

# Print LaTeX commands
print("% The general correctness values for TA and without")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllTATotal}}{{{ta_total}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllTACorrect}}{{{ta_correct}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessTAPercent}}{{{ta_percent}\\%}}")
print()
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllNoTATotal}}{{{nota_total}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllNoTACorrect}}{{{nota_correct}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessNoTAPercent}}{{{nota_percent}\\%}}")
print()
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllTotal}}{{{all_total}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessAllCorrect}}{{{all_correct}}}")
print(f"\\newcommand{{\\rqOneOneGeneralCorrectnessPercent}}{{{all_percent}\\%}}")

#### Measures for RQ 1.2

First we should check for VerticalNext, VerticalLater, Regression, HorizontalLater, and LineRegression

But first let's create the table to analyze

In [None]:
# Measures compared between the two annotation conditions
list_measure: list[str] = [
    'VerticalNext',
    'VerticalLater',
    'Regression',
    'HorizontalLater',
    'LineRegression',
    'StoryOrder_Naive_Score',
    'StoryOrder_Dynamic_Score',
    'StoryOrder_Dynamic_Repetitions',
    'SaccadeLength',
]

# Time and correctness per task and condition; groups with a missing aggregate are dropped
table: DataFrame = df_results_overall.groupby(['Task', 'TypeAnnotation']).agg(table_data).dropna()

In [None]:
# Run the assumption checks and the matching test for every measure
for measure in list_measure:
    print(f'\nMeasure: {measure}')
    measure_with_ta = df_results_overall.query("TypeAnnotation == True")[measure].dropna()
    measure_without_ta = df_results_overall.query("TypeAnnotation == False")[measure].dropna()
    data = compute_statistics(
        table,
        measure_with_ta,
        measure_without_ta,
        input_type='categorical',
        output_type='continuous',
        population='within',
    )

#### Measures for RQ 1.3

This should include:

1. The difficulty of each snippet as rated by the participants.
2. Did the Type Annotations help with the snippets?
3. Do Type Annotations help in general?

Do a Likert-scale analysis of the difficulty.

In [None]:
# Share (in percent) of each difficulty rating within each annotation condition
df1 = (
    df_results_overall.groupby('TypeAnnotation')['Difficulty']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

# catplot is figure-level and creates its own figure, so the size is passed via
# height/aspect (10 x 6 inches) instead of a preceding plt.figure call.
sns.catplot(x='Difficulty',
            hue='TypeAnnotation',
            data=df1,
            kind='bar',
            y='percent',
            height=6,
            aspect=10 / 6,
            #legend=False,
            )

# NOTE(review): the labels assume all five difficulty levels occur, in ascending order — confirm.
plt.xticks(list(range(5)), ['Very Easy', 'Easy', 'Neutral', 'Difficult', 'Very Difficult'])
# plt.title('Difficulty grouped by Type Annotation')
# plt.legend(title='Type Annotation', loc='upper right', labels=['Non-Annotated', 'Annotated'])
# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/rq13difficultyLikertCatPlot.pdf', bbox_inches='tight')
plt.show()

And now with meaningful

In [None]:
# Share (in percent) of each difficulty rating per naming group and annotation condition
df1 = (
    df_results_overall.groupby(['Meaningful', 'TypeAnnotation'])['Difficulty']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

print(df1[df1['Meaningful'] == True])

# Meaningful group only. catplot is figure-level and creates its own figure, so the size
# is passed via height/aspect (10 x 6 inches) instead of a preceding plt.figure call.
sns.catplot(x='Difficulty',
            hue='TypeAnnotation',
            data=df1[df1['Meaningful'] == True],
            kind='bar',
            y='percent',
            height=6,
            aspect=10 / 6,
            )

# NOTE(review): the labels assume all five difficulty levels occur, in ascending order — confirm.
plt.xticks(list(range(5)), ['Very Easy', 'Easy', 'Neutral', 'Difficult', 'Very Difficult'])
# plt.title('Difficulty grouped by Meaningful')
# TODO: uncomment the following line to save the figure
#plt.savefig(f'{figure_path}/rq23difficultyLikertCatPlotMeaningful.pdf', bbox_inches='tight', dpi=1200)
plt.show()

In [None]:
# Share (in percent) of each difficulty rating per naming group and annotation condition
df1 = (
    df_results_overall.groupby(['Meaningful', 'TypeAnnotation'])['Difficulty']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

# One facet per value of Meaningful, bars coloured by annotation condition
sns.catplot(
    data=df1,
    kind='bar',
    x='Difficulty',
    y='percent',
    hue='TypeAnnotation',
    col='Meaningful',
)

plt.xticks(list(range(5)), ['Very Easy', 'Easy', 'Neutral', 'Difficult', 'Very Difficult'])
# plt.title('Difficulty grouped by Meaningful')
# TODO: uncomment the following line to save the figure
#plt.savefig(f'{figure_path}/rq23difficultyLikertCatPlotCombined.pdf', bbox_inches='tight', dpi=1200)
plt.show()

And now both?

In [None]:
# Obfuscated group only (reuses df1 from the previous cell). catplot is figure-level and
# creates its own figure, so the size is passed via height/aspect (10 x 6 inches) instead
# of a preceding plt.figure call.
sns.catplot(x='Difficulty',
            hue='TypeAnnotation',
            data=df1[df1['Meaningful'] == False],
            kind='bar',
            y='percent',
            height=6,
            aspect=10 / 6,
            )

# NOTE(review): the labels assume all five difficulty levels occur, in ascending order — confirm.
plt.xticks(list(range(5)), ['Very Easy', 'Easy', 'Neutral', 'Difficult', 'Very Difficult'])
plt.title('Difficulty grouped by Type Annotation')
# TODO: uncomment the following line to save the figure
#plt.savefig(f'{figure_path}/rq23difficultyLikertCatPlotObfuscated.pdf', bbox_inches='tight')
plt.show()

Can we also find a statistically significant effect here, e.g. using the chi-square test?

In [None]:
# Counts of each difficulty rating with and without type annotations,
# followed by a chi-square test on that table.
annotation_column = df_results_overall['TypeAnnotation']
difficulty_column = df_results_overall['Difficulty']
contingency_table = pd.crosstab(annotation_column, difficulty_column)

chi2_result = stats.chi2_contingency(contingency_table)
print(chi2_result)
print(contingency_table)

In [None]:
# Same test as before, restricted to the rows with Meaningful == True.
cont_df = df_results_overall.query('Meaningful == True')
contingency_table = pd.crosstab(index=cont_df['TypeAnnotation'], columns=cont_df['Difficulty'])

chi2_result = stats.chi2_contingency(contingency_table)
print(chi2_result)
print(contingency_table)

In [None]:
# Same test as before, restricted to the rows with Meaningful == False.
cont_df = df_results_overall.query('Meaningful == False')
contingency_table = pd.crosstab(index=cont_df['TypeAnnotation'], columns=cont_df['Difficulty'])

chi2_result = stats.chi2_contingency(contingency_table)
print(chi2_result)
print(contingency_table)

In [None]:
# Counts of each difficulty rating per value of Meaningful, followed by a
# chi-square test on that table.
meaningful_column = df_results_overall['Meaningful']
difficulty_column = df_results_overall['Difficulty']
contingency_table = pd.crosstab(meaningful_column, difficulty_column)

chi2_result = stats.chi2_contingency(contingency_table)
print(chi2_result)
print(contingency_table)

How about some significance for the Comprehension within the study?

In [None]:
# Cross-tabulate the two comprehension answers and run Fisher's exact test
# on the resulting table.
comprehension_in_study = df_personal_information['TAComprehension']
comprehension_in_general = df_personal_information['TAComprehensionGeneral']
contingency_table = pd.crosstab(comprehension_in_study, comprehension_in_general)

fisher_result = stats.fisher_exact(contingency_table)
print(fisher_result)
print(contingency_table)

In [None]:
# Cross-tabulate TAComprehension against Meaningful and run Fisher's exact
# test on the resulting table.
comprehension_in_study = df_personal_information['TAComprehension']
naming_group = df_personal_information['Meaningful']
contingency_table = pd.crosstab(comprehension_in_study, naming_group)

fisher_result = stats.fisher_exact(contingency_table)
print(fisher_result)
print(contingency_table)

In [None]:
# Cross-tabulate TAComprehensionGeneral against Meaningful and run Fisher's
# exact test on the resulting table.
comprehension_in_general = df_personal_information['TAComprehensionGeneral']
naming_group = df_personal_information['Meaningful']
contingency_table = pd.crosstab(comprehension_in_general, naming_group)

fisher_result = stats.fisher_exact(contingency_table)
print(fisher_result)
print(contingency_table)

Did the type annotations (TA) help with comprehension?

In [None]:
# Aggregation specification (column name -> list of aggregation names).
# The annotation now names both the key and the value type; the previous
# `dict[str | list[str]]` passed a single type argument to `dict`.
table_data: dict[str, list[str]] = {
    # 'Difficulty': ['mean'],
    'TAComprehension': [],
    # 'TAComprehensionGeneral': [],
}

# IDs of the participants whose TAComprehension answer is False.
print(df_personal_information.query('TAComprehension == False')['ID'])

# Number of participants per TAComprehension answer (shown as the cell output).
df_personal_information['TAComprehension'].value_counts()

Did the type annotations help with comprehension, split by identifier naming (meaningful vs. obfuscated)?

In [None]:
# TAComprehension answer counts within each Meaningful group (cell output).
comprehension_per_group = df_personal_information.groupby('Meaningful')['TAComprehension']
comprehension_per_group.value_counts()

In [None]:
# Test whether TAComprehension is associated with the Meaningful group.
# stats.chisquare on the flattened group counts is a goodness-of-fit test
# against equal frequencies in all cells, which does not answer that
# question; the chi-square test of independence on the 2-D contingency
# table does.
# NOTE(review): with small cell counts the Fisher exact test on the same
# table may be the better choice.
stats.chi2_contingency(
    pd.crosstab(df_personal_information['Meaningful'], df_personal_information['TAComprehension'])
)

Do type annotations help in general?

In [None]:
# IDs of the participants whose TAComprehensionGeneral answer is False.
answered_no = df_personal_information.query('TAComprehensionGeneral == False')
print(answered_no['ID'])

# Number of participants per TAComprehensionGeneral answer (cell output).
general_answers = df_personal_information['TAComprehensionGeneral']
general_answers.value_counts()

In [None]:
# TAComprehensionGeneral answer counts within each Meaningful group (cell output).
general_answers_per_group = df_personal_information.groupby('Meaningful')['TAComprehensionGeneral']
general_answers_per_group.value_counts()

Create the plot for the difficulty of the snippets grouped by meaningful

In [None]:
# Difficulty ratings per task for the rows with Meaningful == True,
# split by TypeAnnotation.
meaningful_rows = df_results_overall.query('Meaningful == True')
sns.boxplot(data=meaningful_rows, x='Task', y="Difficulty", hue="TypeAnnotation")

plt.xticks(rotation=90)
plt.yticks(np.arange(1, 6))  # ticks at 1, 2, 3, 4, 5
# TODO: uncomment the following line to save the figure
#plt.savefig(f'{figure_path}/difficultyrq23_Meaningful.pdf', bbox_inches='tight')
plt.show()

In [None]:
# Difficulty ratings per task for the rows with Meaningful == False,
# split by TypeAnnotation.
obfuscated_rows = df_results_overall.query('Meaningful == False')
sns.boxplot(data=obfuscated_rows, x='Task', y="Difficulty", hue="TypeAnnotation")

plt.xticks(rotation=90)
plt.yticks(np.arange(1, 6))  # ticks at 1, 2, 3, 4, 5
# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/difficultyrq23_Obfuscated.pdf', bbox_inches='tight')
plt.show()

### Measures for RQ 2.1

This section includes all information for the behavioral measures seen above.

In [None]:
# Mean and standard deviation of the time per (Task, TypeAnnotation, Meaningful).
table_data: dict[str, list[str] | str] = {
    'Time': ['mean', 'std'],
}
table: DataFrame = df_results_overall.groupby(['Task', 'TypeAnnotation', 'Meaningful']).agg(table_data)

# Time samples of the four conditions
# (TA / NoTA = TypeAnnotation True / False, M / L = Meaningful True / False).
_time = df_results_overall['Time']
_with_ta = df_results_overall['TypeAnnotation'] == True
_without_ta = df_results_overall['TypeAnnotation'] == False
_meaningful = df_results_overall['Meaningful'] == True
_obfuscated = df_results_overall['Meaningful'] == False

TA_L = _time[_with_ta & _obfuscated]
NoTA_L = _time[_without_ta & _obfuscated]
TA_M = _time[_with_ta & _meaningful]
NoTA_M = _time[_without_ta & _meaningful]

# Normality of the time within the levels of each factor.
for factor in ('TypeAnnotation', 'Meaningful'):
    print(f'NORMALITY:\n{pg.normality(df_results_overall, dv="Time", group=factor)}')

# Levene test across the four condition samples.
print(f'Levene:\n{stats.levene(TA_L, NoTA_M, TA_M, NoTA_L)}')

# Rows whose recorded time is exactly zero.
print(f'{df_results_overall.query("Time == 0")}')

# Sphericity per dependent variable and within-factor, subjects identified by ID.
for dependent_variable in ('Time', 'CorrectAnswer'):
    for factor in ('Meaningful', 'TypeAnnotation'):
        print(pg.sphericity(df_results_overall, dv=dependent_variable, within=[factor], subject='ID'))


In [None]:
# Mixed linear model for the time with both factors and their interaction;
# the observations are grouped by task.
time_formula = 'Time ~ TypeAnnotation * Meaningful'
task_groups = df_results_overall['Task']

time_model = smf.mixedlm(time_formula, data=df_results_overall, groups=task_groups)
model = time_model.fit()
print(model.summary())

Check for contingency in CorrectAnswer

In [None]:
# Encode the answers as 0/1 so they can serve as the outcome of the logit model.
df_results_overall['CorrectAnswerNumeric'] = df_results_overall['CorrectAnswer'].apply(lambda x: 1 if x else 0)

# Logistic regression of correctness on both factors and their interaction.
# Logit (unlike mixedlm) has no `groups` parameter: the keyword did not enter
# the estimation, so it is no longer passed.
# NOTE(review): answers to the same task are therefore treated as independent;
# consider cluster-robust standard errors if task clustering should be modelled.
model = smf.logit('CorrectAnswerNumeric ~ C(Meaningful) * C(TypeAnnotation)', data=df_results_overall).fit()
print(model.summary())
print(model.wald_test_terms(scalar=True))

In [None]:
# Mixed linear model for the time with both factors declared categorical,
# grouped by task; additionally prints the Wald tests of the model terms.
time_formula = 'Time ~ C(Meaningful) * C(TypeAnnotation)'
task_groups = df_results_overall['Task']

time_model = smf.mixedlm(time_formula, data=df_results_overall, groups=task_groups)
model = time_model.fit()
print(model.summary())
print(model.wald_test_terms(scalar=True))

#### RQ 2.2 Linearity

In [None]:
# Measures that are modelled one by one in the following cell.
list_measure: list[str] = ['VerticalNext',
                           'VerticalLater', 'Regression', 'HorizontalLater', 'LineRegression', 'StoryOrder_Naive_Score',
                           'StoryOrder_Dynamic_Score', 'StoryOrder_Dynamic_Repetitions', 'SaccadeLength', 'Linearity'
                           ]

# Keep only the rows that have both a LineRegression and a SaccadeLength value.
# drop=True discards the old index instead of inserting it as a column: the
# previous two plain reset_index() calls added 'index' and 'level_0' columns
# and raised on a re-run because 'level_0' already existed.
has_gaze_measures = df_results_overall['LineRegression'].notna() & df_results_overall['SaccadeLength'].notna()
df_results_overall = df_results_overall[has_gaze_measures].reset_index(drop=True)

In [None]:
# Fit one mixed linear model per measure (both factors plus interaction,
# grouped by task) and print its summary and the Wald tests of its terms.
# The former `if measure not in []` was always true, so its `else` branch
# could never run and has been removed.
for measure in list_measure:
    print(f'\nMeasure: {measure}')
    model = smf.mixedlm(f'{measure} ~ C(TypeAnnotation) * C(Meaningful)', data=df_results_overall,
                        groups=df_results_overall['Task']).fit()
    print(model.summary())
    print(model.wald_test_terms(scalar=True))

#### RQ 2.3

Check for the difficulty

In [None]:
# NOTE(review): 'DifficultyNumeric' is not created in this cell (the line
# below is commented out); it must already exist as a 0/1 column for the
# model to fit - confirm where it is defined.
# df_results_overall['DifficultyNumeric'] = df_results_overall['Difficulty'].apply(lambda x: 1 if x == 'Very Difficult' else  0)

# Logistic regression of the difficulty indicator on both factors and their
# interaction. Logit (unlike mixedlm) has no `groups` parameter: the keyword
# did not enter the estimation, so it is no longer passed.
model = smf.logit('DifficultyNumeric ~ C(Meaningful) * C(TypeAnnotation)', data=df_results_overall).fit()
print(model.summary())
print(model.wald_test_terms(scalar=True))

TAComprehension influenced by Meaningful?

In [None]:
# Quartiles, mean and standard deviation of the time.
time_values = df_results_overall['Time']
Q1 = time_values.quantile(0.25)
Q3 = time_values.quantile(0.75)
print(time_values.mean())
print(time_values.std())

# Rows whose time lies more than `threshold` IQRs outside the quartiles.
IQR = Q3 - Q1
threshold = 1.5
lower_fence = Q1 - threshold * IQR
upper_fence = Q3 + threshold * IQR

is_outlier = (df_results_overall['Time'] < lower_fence) | (df_results_overall['Time'] > upper_fence)
outliers = df_results_overall[is_outlier]

# Number of outlier rows, number of all rows.
print(f'{len(outliers.Time)}, {len(df_results_overall.Time)}')
sns.boxplot(data=df_results_overall, y='Time')

#### Miscellaneous

How did the people find themselves in comparison to others?

In [None]:
# Percentage of participants per Classmates answer.
df1 = (
    df_personal_information.Classmates
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

print(df1)

# sns.catplot is a figure-level function and always creates its own figure.
# A preceding plt.figure(...) call would only produce an additional empty
# figure, so none is opened here (use catplot's height/aspect to resize).
sns.catplot(x='Classmates',
            data=df1,
            kind='bar',
            y='percent',
            )

# NOTE(review): relabels tick positions 0..4; assumes all five answer
# levels occur in the data - confirm.
plt.xticks(np.arange(5), ['Very Inexperienced', 'Inexperienced', 'Average', 'Experienced', 'Very Experienced'])
plt.show()

For OverallExperience

In [None]:
# Percentage of participants per OverallExperience answer.
df1 = (
    df_personal_information.OverallExperience
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)

print(df1)

# sns.catplot is a figure-level function and always creates its own figure.
# A preceding plt.figure(...) call would only produce an additional empty
# figure, so none is opened here (use catplot's height/aspect to resize).
sns.catplot(x='OverallExperience',
            data=df1,
            kind='bar',
            y='percent',
            )

# NOTE(review): relabels tick positions 0..4; assumes all five answer
# levels occur in the data - confirm.
plt.xticks(np.arange(5), ['Very Inexperienced', 'Inexperienced', 'Average', 'Experienced', 'Very Experienced'])
plt.show()

Years Programming

In [None]:
# Number of participants per YearsProgramming value.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df_personal_information, x='YearsProgramming', ax=ax)
plt.show()

In [None]:
# Descriptive statistics of the Linearity column (cell output).
linearity_values = df_results_overall['Linearity']
linearity_values.describe()

All Snippets with their mean and SD with correctness

In [None]:
# Mean and standard deviation of the time per (Task, TypeAnnotation).
rows_by_task_and_annotation = df_results_overall.groupby(['Task', 'TypeAnnotation'])
df = rows_by_task_and_annotation.agg({'Time': ['mean', 'std']})
df

## Currently unnecessary and unused data

In [None]:
# Percentage of each CorrectAnswer value within every
# (TypeAnnotation, Meaningful) group.
df1 = (
    df_results_overall
    .groupby(['TypeAnnotation', 'Meaningful'])['CorrectAnswer']
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
)
print(df1)

# sns.catplot is a figure-level function and always creates its own figure.
# A preceding plt.figure(...) call would only produce an additional empty
# figure, so none is opened here (use catplot's height/aspect to resize).
sns.catplot(x='Meaningful',
            hue='TypeAnnotation',
            data=df1,
            kind='bar',
            y='percent',
            )
plt.show()

#### Table for Type Annotations and Meaningful

In [None]:
# Aggregations per column: time statistics and the sum / count of CorrectAnswer.
table_data: dict[str, list[str] | str] = {
    'Time': ['mean', 'std'],
    'CorrectAnswer': ['sum', 'count'],
}

# One row per (Task, Meaningful, TypeAnnotation) combination (cell output).
rows_by_condition = df_results_overall.groupby(['Task', 'Meaningful', 'TypeAnnotation'])
rows_by_condition.agg(table_data)

### Task Plots

In [None]:
# Alphabetically sorted task names (also used by later cells).
all_tasks: list[str] = sorted(df_results_overall['Task'].unique())

# Box plot of the time of every task.
fig, ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=df_results_overall, x='Task', y='Time', ax=ax)
ax.set_title('Mean Time per Task')
plt.xticks(rotation=90)
plt.show()

The mean time per task by Type Annotation

In [None]:
all_tasks: list[str] = sorted(df_results_overall['Task'].unique())

# Two panels sharing the y axis: left with, right without type annotations.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 10), sharey=True)
fig.suptitle('Mean Time per Task')

panels = [
    (axs[0], True, 'Mean Time per Task and True Type Annotation'),
    (axs[1], False, 'Mean Time per Task and False Type Annotation'),
]
for axis, annotated, panel_title in panels:
    panel_rows = df_results_overall[df_results_overall['TypeAnnotation'] == annotated]
    # One box per task, in the order of all_tasks.
    times_per_task = [panel_rows.loc[panel_rows['Task'] == task, 'Time'] for task in all_tasks]
    axis.boxplot(x=times_per_task, labels=all_tasks)
    axis.set_xticklabels(all_tasks, rotation=90)
    axis.set_title(panel_title)

plt.show()

The mean time per task by Identifier Name

In [None]:
all_tasks: list[str] = sorted(df_results_overall['Task'].unique())

# Two panels sharing the y axis: left Meaningful == True, right Meaningful == False.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(20, 10), sharey=True)
fig.suptitle('Mean Time per Task')

panels = [
    (axs[0], True, 'Mean Time per Task and Meaningful Identifier Names'),
    (axs[1], False, 'Mean Time per Task and Obfuscated Identifier Names'),
]
for axis, meaningful, panel_title in panels:
    panel_rows = df_results_overall[df_results_overall['Meaningful'] == meaningful]
    # One box per task, in the order of all_tasks.
    times_per_task = [panel_rows.loc[panel_rows['Task'] == task, 'Time'] for task in all_tasks]
    axis.boxplot(x=times_per_task, labels=all_tasks)
    axis.set_xticklabels(all_tasks, rotation=90)
    axis.set_title(panel_title)

plt.show()

The mean time per task for the combinations of Type Annotations and Identifier Names

In [None]:
all_tasks: list[str] = sorted(df_results_overall['Task'].unique())

# 2x2 grid sharing the y axis: rows = Meaningful True / False,
# columns = TypeAnnotation False / True.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 10), sharey=True)
fig.suptitle('Mean Time per Task')

# (axis, TypeAnnotation, Meaningful, title) for every panel.
panels = [
    (axs[0, 0], False, True,
     'Mean Time per Task without Type Annotations and with Meaningful Identifier Names'),
    (axs[0, 1], True, True,
     'Mean Time per Task with Type Annotations and with Meaningful Identifier Names'),
    (axs[1, 0], False, False,
     'Mean Time per Task without Type Annotations and with Obfuscated Identifier Names'),
    (axs[1, 1], True, False,
     'Mean Time per Task with Type Annotations and with Obfuscated Identifier Names'),
]
for axis, annotated, meaningful, panel_title in panels:
    in_condition = (df_results_overall['TypeAnnotation'] == annotated) & (
            df_results_overall['Meaningful'] == meaningful)
    panel_rows = df_results_overall[in_condition]
    # One box per task, in the order of all_tasks.
    times_per_task = [panel_rows.loc[panel_rows['Task'] == task, 'Time'] for task in all_tasks]
    axis.boxplot(x=times_per_task, labels=all_tasks)
    axis.set_xticklabels(all_tasks, rotation=90)
    axis.set_title(panel_title)

plt.show()

A plot to describe how long each task took overall and how this is distributed among the participants.

A plot that shows how long each participant took grouped by task.

Combine the Type Annotation plots in the top row into one bigger plot with comparison.

In [None]:
# Split violins of the time per task (one half per TypeAnnotation value)
# for the rows with Meaningful == True; tasks are ordered as in all_tasks.
meaningful_rows = df_results_overall[df_results_overall['Meaningful'] == True]

plt.figure(figsize=(20, 10))
sns.violinplot(data=meaningful_rows, x='Task', y='Time', hue='TypeAnnotation', order=all_tasks,
               split=True, gap=.1, inner="quart", cut=0)

plt.xticks(all_tasks, rotation=90)
plt.title('Mean Time per Task with Meaningful Identifier Names')
# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/timePerTaskrq21_meaningful.pdf', bbox_inches='tight')

plt.show()

In [None]:
# Split violins of the time per task (one half per TypeAnnotation value)
# for the rows with Meaningful == False; tasks are ordered as in all_tasks.
obfuscated_rows = df_results_overall[df_results_overall['Meaningful'] == False]

plt.figure(figsize=(20, 10))
sns.violinplot(data=obfuscated_rows, x='Task', y='Time', hue='TypeAnnotation', order=all_tasks,
               split=True, gap=.1, inner="quart", cut=0)

plt.xticks(all_tasks, rotation=90)
plt.title('Mean Time per Task with Obfuscated Identifier Names')

# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/timePerTaskrq21_obfuscated.pdf', bbox_inches='tight')
plt.show()

What were the Means and the Standard Deviation for meaningful and type annotation?

In [None]:
# Mean and standard deviation of the time, first per
# (Meaningful, TypeAnnotation) and then per Meaningful only.
time_statistics = {'Time': ['mean', 'std']}
for grouping in (['Meaningful', 'TypeAnnotation'], ['Meaningful']):
    print(df_results_overall.groupby(grouping).agg(time_statistics))


As for the correctness:

In [None]:
# Sum and count of CorrectAnswer per (Meaningful, TypeAnnotation),
# per Meaningful and per TypeAnnotation.
correctness_statistics = {'CorrectAnswer': ['sum', 'count']}
for grouping in (['Meaningful', 'TypeAnnotation'], ['Meaningful'], ['TypeAnnotation']):
    print(df_results_overall.groupby(grouping).agg(correctness_statistics))

Correlation heatmap and p-values for the time

In [None]:
# Query selecting the rows of each condition; the dict order is the column order.
condition_queries: dict[str, str] = {
    'M-TA': 'TypeAnnotation == True and Meaningful == True',
    'L-N': 'TypeAnnotation == False and Meaningful == False',
    'L-TA': 'TypeAnnotation == True and Meaningful == False',
    'M-N': 'TypeAnnotation == False and Meaningful == True',
}

# One column per condition holding the mean time of every task.
df_correlation_meantimes_snippets: DataFrame = DataFrame()
for column_name, condition in condition_queries.items():
    condition_rows = df_results_overall.query(condition)
    df_correlation_meantimes_snippets[column_name] = condition_rows.groupby('Task')['Time'].mean()

# Pairwise correlation of the condition columns and the matching p-values.
matrix = df_correlation_meantimes_snippets.corr()
print(f'P-Values:\n{calculate_pvalues(df_correlation_meantimes_snippets)}')

sns.heatmap(matrix, cmap="Greens", annot=True)
plt.title(f'Correlation Mean-Times')
# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/correlation_meantimesrq21.pdf', bbox_inches='tight')
plt.show()

Correlation and p-values for the correctness

In [None]:
# Query selecting the rows of each condition; the dict order is the column order.
condition_queries: dict[str, str] = {
    'M-TA': 'TypeAnnotation == True and Meaningful == True',
    'L-N': 'TypeAnnotation == False and Meaningful == False',
    'L-TA': 'TypeAnnotation == True and Meaningful == False',
    'M-N': 'TypeAnnotation == False and Meaningful == True',
}

# One column per condition holding, for every task, the sum of CorrectAnswer
# divided by the number of recorded answers.
df_correlation_correctness_snippets: DataFrame = DataFrame()
for column_name, condition in condition_queries.items():
    answers_per_task = df_results_overall.query(condition).groupby('Task')['CorrectAnswer']
    df_correlation_correctness_snippets[column_name] = answers_per_task.sum() / answers_per_task.count()

# Pairwise correlation of the condition columns and the matching p-values.
matrix = df_correlation_correctness_snippets.corr()
print(f'P-Values:\n{calculate_pvalues(df_correlation_correctness_snippets)}')

sns.heatmap(matrix, cmap="Greens", annot=True)
plt.title(f'Correlation Correctness')
# TODO: uncomment the following line to save the figure
# plt.savefig(f'{figure_path}/correlation_correctnessrq21.pdf', bbox_inches='tight')
plt.show()

Show the mean time with and without type annotations for each participant.

In [None]:
def _participant_time_boxplots(axis, participants, meaningful, panel_title):
    """Draw, per participant, one box without and one with type annotations on *axis*.

    Returns the two boxplot results as ``(without_annotations, with_annotations)``.
    """
    group_rows = df_results_overall[df_results_overall['Meaningful'] == meaningful]
    # Every participant gets a slot of width 2; the two boxes sit left/right of its centre.
    slot_centres = np.arange(len(participants)) * 2.0

    def times_of(participant, annotated):
        selected = (group_rows['ID'] == participant) & (group_rows['TypeAnnotation'] == annotated)
        return group_rows.loc[selected, 'Time']

    without_annotations = axis.boxplot(
        [times_of(participant, False) for participant in participants], labels=participants,
        positions=slot_centres + 0.35, widths=0.6)
    with_annotations = axis.boxplot(
        [times_of(participant, True) for participant in participants], labels=participants,
        positions=slot_centres - 0.35, widths=0.6)

    axis.set_title(panel_title)
    axis.set_xticks(np.arange(0, len(participants) * 2, 2), participants, rotation=90)
    define_box_properties(with_annotations, 'blue', 'TypeAnnotation', axis)
    define_box_properties(without_annotations, 'orange', 'No Type Annotation', axis)
    return without_annotations, with_annotations


fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(20, 10), sharey=True)

# left panel: participants of the meaningful group
plot_L_M, plot_TA_M = _participant_time_boxplots(
    axs[0], meaningful_participants, True, 'Mean Time per Participant with Meaningful Identifier Names')

# right panel: participants of the obfuscated group
plot_L_L, plot_TA_L = _participant_time_boxplots(
    axs[1], obfuscated_participants, False, 'Mean Time per Participant with Obfuscated Identifier Names')

plt.show()


How long people took overall.

In [None]:
# Bar chart of the summed time per value of Meaningful.
total_time = df_results_overall.groupby(['Meaningful']).agg({'Time': 'sum'})
total_time.unstack().T.plot(kind='bar', stacked=True, figsize=(15, 10))
plt.show()

Do `meaningful` snippets make participants faster on average?

In [None]:
# compute if meaningful snippets were faster on average than obfuscated snippets
df_results_overall.groupby(['Meaningful']).agg({'Time': 'mean'}).unstack().T.plot(kind='bar', stacked=False,
                                                                                  figsize=(15, 10))
plt.show()

Do `Type Annotations` make people faster on average?

- [ ] TODO: Apparently, this is not the case. Why could this be? More to read? Where do people look during this time? What are they focusing on? Are they more correct?

In [None]:
# compute if type annotated snippets were faster on average than non type annotated snippets
df_results_overall.groupby(['TypeAnnotation']).agg({'Time': 'mean'}).unstack().T.plot(kind='bar', stacked=True,
                                                                                      figsize=(15, 10))
plt.show()

Do `Type Annotations` make people faster when we differentiate between `Meaningful` snippets?

In [None]:
# Bar chart of the mean time per (Meaningful, TypeAnnotation) combination.
mean_time = df_results_overall.groupby(['Meaningful', 'TypeAnnotation']).agg({'Time': 'mean'})
mean_time.unstack().T.plot(kind='bar', figsize=(15, 10))
plt.show()

Are snippets with `Type Annotations` more correct?

In [None]:
# Groupings to plot: each factor on its own and the combination of both.
snippets_grouping: list[str | list[str]] = ['TypeAnnotation', 'Meaningful', ['TypeAnnotation', 'Meaningful'], ]

# 2x2 grid of axes; the loop below fills one axis per grouping (three of the four).
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

for i, x in enumerate(snippets_grouping):
    # Bar chart of the number of non-zero CorrectAnswer entries per group, drawn on
    # axis (i // 2, i % 2). The title is "<first> and <second>" when the grouping
    # has exactly two elements (the list), otherwise the grouping itself (the
    # single column names are longer than two characters).
    ax = df_results_overall.groupby(x).agg({'CorrectAnswer': np.count_nonzero}).unstack().T.plot(kind='bar',
                                                                                                 ax=axs[i // 2, i % 2],
                                                                                                 title=(lambda
                                                                                                            x: f'{x[0]} and {x[1]}' if len(
                                                                                                     x) == 2 else x)(x),
                                                                                                 ylabel='Number of Correct Answers',
                                                                                                 legend=False)

    # Relabel the x ticks as False / True.
    ax.set_xticklabels(['False', 'True'], rotation=0)

    # Only for the single-factor groupings: dashed horizontal lines at the number
    # of recorded answers per group.
    # NOTE(review): [0] / [1] presumably pick the False / True entry of the
    # unstacked counts - confirm the indexing.
    if len(x) != 2:
        ax.axhline(y=df_results_overall.groupby(x).agg({'CorrectAnswer': 'count'}).unstack()[0], color='orange',
                   linestyle='--', label='Max False')  # False
        ax.axhline(y=df_results_overall.groupby(x).agg({'CorrectAnswer': 'count'}).unstack()[1], color='green',
                   linestyle='--', label='Max True')  # True
        ax.legend(loc='lower center')

    # Write the y value of every line next to it (fixed offset of 280 points to the right).
    for line in ax.lines:
        ax.annotate(str(int(line.get_ydata()[0])), xy=(line.get_xdata()[0], line.get_ydata()[0]), xytext=(280, 2),
                    textcoords='offset points', ha='right', va='bottom', fontsize=8)

    # Write the height of every bar just above it.
    for p in ax.patches:
        ax.annotate(str(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center',
                    xytext=(0, 5), textcoords='offset points')

plt.show()

Are snippets that are `Meaningful` more correct?

In [None]:
# Number of recorded CorrectAnswer entries per TypeAnnotation (rows) and
# Meaningful (columns) - shown as the cell output.
answer_counts = df_results_overall.groupby(['TypeAnnotation', 'Meaningful']).agg({'CorrectAnswer': 'count'})
answer_counts.unstack()

A better representation of this:

In [None]:
def _correct_and_total(annotated, meaningful):
    """Return (sum, count) of CorrectAnswer for one TypeAnnotation / Meaningful combination."""
    in_condition = (df_results_overall['TypeAnnotation'] == annotated) & (
            df_results_overall['Meaningful'] == meaningful)
    answers = df_results_overall.loc[in_condition, 'CorrectAnswer']
    return answers.sum(), answers.count()


# TA / L prefix = with / without type annotations, M / L suffix = meaningful / obfuscated.
correct_TA_M, total_TA_M = _correct_and_total(True, True)
correct_TA_L, total_TA_L = _correct_and_total(True, False)
correct_L_M, total_L_M = _correct_and_total(False, True)
correct_L_L, total_L_L = _correct_and_total(False, False)

# Ratio per single condition.
print(f'Type Annotation and Meaningful: {correct_TA_M}/{total_TA_M} = {correct_TA_M / total_TA_M}')
print(f'Type Annotation and Obfuscated: {correct_TA_L}/{total_TA_L} = {correct_TA_L / total_TA_L}')
print(f'No Type Annotation and Meaningful: {correct_L_M}/{total_L_M} = {correct_L_M / total_L_M}')
print(f'No Type Annotation and Obfuscated: {correct_L_L}/{total_L_L} = {correct_L_L / total_L_L}')

# Ratio per factor level (the two matching conditions added up).
print(
    f'\nType Annotation: {correct_TA_M + correct_TA_L}/{total_TA_M + total_TA_L} = {(correct_TA_M + correct_TA_L) / (total_TA_M + total_TA_L)}')
print(
    f'No Type Annotation: {correct_L_M + correct_L_L}/{total_L_M + total_L_L} = {(correct_L_M + correct_L_L) / (total_L_M + total_L_L)}')
print(
    f'Meaningful: {correct_TA_M + correct_L_M}/{total_TA_M + total_L_M} = {(correct_TA_M + correct_L_M) / (total_TA_M + total_L_M)}')
print(
    f'Obfuscated: {correct_TA_L + correct_L_L}/{total_TA_L + total_L_L} = {(correct_TA_L + correct_L_L) / (total_TA_L + total_L_L)}')


Correct Answers per Snippet

In [None]:
snippet_correctness_df: DataFrame = DataFrame(columns=['Task', 'Correctness', 'Meaningful', 'TypeAnnotation'])

# (Meaningful, TypeAnnotation) combinations in the order their rows are written per task.
condition_order = [(True, True), (False, True), (True, False), (False, False)]

counter: int = 0

for task in all_tasks:
    task_rows = df_results_overall[df_results_overall['Task'] == task]

    # Sum and count of CorrectAnswer for every combination within this task.
    correct, total = {}, {}
    for meaningful, annotated in condition_order:
        in_condition = (task_rows['TypeAnnotation'] == annotated) & (task_rows['Meaningful'] == meaningful)
        answers = task_rows.loc[in_condition, 'CorrectAnswer']
        correct[meaningful, annotated] = answers.sum()
        total[meaningful, annotated] = answers.count()

    # Totals without / with type annotations (both Meaningful values added up).
    correct_without = correct[True, False] + correct[False, False]
    total_without = total[True, False] + total[False, False]
    correct_with = correct[True, True] + correct[False, True]
    total_with = total[True, True] + total[False, True]

    print(f'\nTask: {task}')

    print(
        f'\nNo Type Annotation: {correct_without}/{total_without} = {correct_without / total_without}')
    print(
        f'Type Annotation: {correct_with}/{total_with} = {correct_with / total_with}')

    # Four consecutive rows per task, one per combination.
    for offset, (meaningful, annotated) in enumerate(condition_order):
        ratio = correct[meaningful, annotated] / total[meaningful, annotated]
        snippet_correctness_df.loc[counter + offset] = [task, ratio, meaningful, annotated]

    counter += 4

Statistic for the participants:
- Number of Snippets
- Number of Correct / Incorrect Snippets
- Meaningful / Obfuscated Snippets
- Mean Time for Type Annotation / No Type Annotation
- Overall Time Taken in Minutes

We start with the participants in the obfuscated group.

In [None]:
def _print_participant_statistics(participant) -> None:
    """Print the correctness ratio, mean times and total time of one participant.

    The participant's rows are selected once and reused for every figure.
    NOTE(review): the rows are matched with ``ID == participant``; the previous
    queries partly compared against the quoted ID, so IDs are assumed to be
    strings - confirm.
    """
    rows = df_results_overall[df_results_overall['ID'] == participant]

    print(f'\nParticipant {participant}')
    number_of_snippets: int = rows['CorrectAnswer'].count()
    correct_snippets: int = rows['CorrectAnswer'].sum()
    print(
        f'Participant {participant} has {correct_snippets}/{number_of_snippets} correct snippets: {correct_snippets / number_of_snippets}')

    mean_time: float = rows['Time'].mean()
    print(f'Participant {participant} has a mean time of {mean_time} seconds')

    # Mean time with (TA) and without (L) type annotations.
    mean_time_TA: float = rows.loc[rows['TypeAnnotation'] == True, 'Time'].mean()
    mean_time_L: float = rows.loc[rows['TypeAnnotation'] == False, 'Time'].mean()
    print(
        f'Participant {participant} has a mean time of {mean_time_TA} seconds with Type Annotations and {mean_time_L} seconds without Type Annotations, thus the change is {mean_time_L - mean_time_TA} seconds')

    # Total time split into whole minutes and remaining seconds.
    overall_time: float = rows['Time'].sum()
    print(f'Participant {participant} took {overall_time // 60} minutes and {overall_time % 60} seconds in total')


print('Obfuscated Participants')
for participant in obfuscated_participants:
    _print_participant_statistics(participant)

print('\n\nMeaningful Participants')

for participant in meaningful_participants:
    _print_participant_statistics(participant)

Check the ratio of correctly answered snippets for each participant.

In [None]:
def _correctness_by_participant(participants) -> DataFrame:
    """Return the ratio of correct answers per participant and type-annotation condition.

    For every participant two rows are produced, in this order: one for the
    snippets with type annotations (``TypeAnnotation`` is True) and one for the
    snippets without (``TypeAnnotation`` is False). ``Correctness`` is the sum
    of ``CorrectAnswer`` divided by its count within that selection.

    :param participants: iterable of participant IDs to look up in ``df_results_overall``
    :return: DataFrame with the columns ``ID``, ``Correctness`` and ``TypeAnnotation``
    """
    rows: list[list] = []
    for participant in participants:
        # filter the participant's rows once; both conditions are taken from this subset
        participant_results = df_results_overall.query('ID == @participant')
        for type_annotation in (True, False):
            answers = participant_results.query(f'`TypeAnnotation` == {type_annotation}')['CorrectAnswer']
            rows.append([participant, answers.sum() / answers.count(), type_annotation])

    # build the frame in one go instead of growing it row by row with `.loc`
    return DataFrame(rows, columns=['ID', 'Correctness', 'TypeAnnotation'])


correctness_df_by_participant_meaningful: DataFrame = _correctness_by_participant(meaningful_participants)
correctness_df_by_participant_obfuscated: DataFrame = _correctness_by_participant(obfuscated_participants)

# One bar chart per group: one bar group per participant, one bar per type-annotation condition.
for plot_title, correctness_df in (
        ("Meaningful", correctness_df_by_participant_meaningful),
        ("Obfuscated", correctness_df_by_participant_obfuscated),
):
    correctness_df.groupby(['TypeAnnotation', 'ID']).agg(
        {'Correctness': 'mean'}).unstack().T.plot(kind='bar', figsize=(15, 10), title=plot_title)
plt.show()

Check for the time and correct answers per code snippet.

In [None]:
# Mean correctness for every (Meaningful, TypeAnnotation, Task) combination.
correctness_per_task = snippet_correctness_df.groupby(['Meaningful', 'TypeAnnotation', 'Task']).agg(
    {'Correctness': 'mean'})

# Move the tasks to the x-axis: one bar group per task, one bar per
# (Meaningful, TypeAnnotation) combination.
correctness_per_task.unstack().T.plot(kind='bar', figsize=(15, 10))

plt.show()

### Difficulty Rating

First, let's look at the difficulty rating of each task. (Open question: would a table present this better?)

In [None]:
# Box plot of the difficulty ratings per task, restricted to the snippets with
# meaningful identifier names and split by type annotation.
meaningful_results = df_results_overall.query('`Meaningful` == True')

plt.figure(figsize=(20, 10))
sns.boxplot(
    data=meaningful_results,
    x='Task',
    y='Difficulty',
    hue='TypeAnnotation',
    order=all_tasks,
    gap=.1,
)

plt.title('Difficulty per Task with Meaningful Identifier Names')
plt.xticks(all_tasks, rotation=90)
# y ticks at 1, 2, 3, 4, 5
plt.yticks(np.arange(1, 6))

plt.show()

And now with obfuscated identifier names.

In [None]:
# Split violin plot of the difficulty ratings per task, restricted to the
# snippets with obfuscated identifier names; each half of a violin shows one
# type-annotation condition.
plt.figure(figsize=(20, 10))
sns.violinplot(df_results_overall.query('`Meaningful` == False'), x='Task', y='Difficulty',
               hue='TypeAnnotation',
               gap=.1,
               inner="quart",
               cut=0,  # do not extend the density past the observed ratings
               order=all_tasks,
               split=True,
               )

plt.xticks(all_tasks, rotation=90)
plt.yticks(np.arange(1, 6, 1))
# Title fixed: it previously read "with and with Obfuscated Identifier Names".
# It now mirrors the title of the plot for the meaningful identifier names.
plt.title('Difficulty per Task with Obfuscated Identifier Names')

plt.show()

The difficulty ratings given by each participant.

In [None]:
def _difficulty_per_participant(participants, type_annotation: bool, meaningful: bool) -> list:
    """Collect one 'Difficulty' series per participant for the given condition."""
    condition = (f'`ID` == @participant and `TypeAnnotation` == {type_annotation} '
                 f'and `Meaningful` == {meaningful}')
    return [df_results_overall.query(condition)['Difficulty'] for participant in participants]


# Two panels sharing the y-axis: meaningful identifier names on the left,
# obfuscated ones on the right. Every participant gets a pair of boxes.
fig, axs = plt.subplots(ncols=2, nrows=1, figsize=(20, 10), sharey=True)

panels = (
    (axs[0], meaningful_participants, True,
     'Mean Difficulty per Participant with Meaningful Identifier Names'),
    (axs[1], obfuscated_participants, False,
     'Mean Difficulty per Participant with Obfuscated Identifier Names'),
)

for panel_ax, panel_participants, panel_meaningful, panel_title in panels:
    # participants are spaced 2 units apart; the two boxes of a pair sit 0.35 right / left of that centre
    centres = np.arange(len(panel_participants)) * 2.0

    boxes_without_TA = panel_ax.boxplot(
        _difficulty_per_participant(panel_participants, False, panel_meaningful),
        labels=panel_participants, positions=centres + 0.35, widths=0.6)
    boxes_with_TA = panel_ax.boxplot(
        _difficulty_per_participant(panel_participants, True, panel_meaningful),
        labels=panel_participants, positions=centres - 0.35, widths=0.6)

    # panel settings
    panel_ax.set_title(panel_title)
    panel_ax.set_xticks(np.arange(0, len(panel_participants) * 2, 2), panel_participants, rotation=90)
    panel_ax.set_yticks(np.arange(1, 6, 1))
    define_box_properties(boxes_with_TA, 'blue', 'Type Annotation', panel_ax)
    define_box_properties(boxes_without_TA, 'orange', 'No Type Annotation', panel_ax)

plt.show()


Correct Answer by Difficulty

In [None]:
# Mean correctness for every (Meaningful, TypeAnnotation) combination.
correctness_per_condition = snippet_correctness_df.groupby(['Meaningful', 'TypeAnnotation']).agg(
    {'Correctness': 'mean'})

# TypeAnnotation goes to the x-axis, one bar per value of Meaningful.
correctness_per_condition.unstack().T.plot(kind='bar', figsize=(15, 10))

plt.show()

Difficulty count by snippet

In [None]:
# One flat, very wide facet row per task, each holding a stacked histogram of
# the difficulty ratings split by type annotation.
sns.set_theme(style="white")
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)

facet_options = dict(row="Task", hue="Task", height=1, aspect=30, palette=pal)
g = sns.FacetGrid(df_results_overall, **facet_options)

histogram_options = dict(x="Difficulty", hue="TypeAnnotation", binwidth=1, binrange=(1, 6), multiple="stack")
g.map_dataframe(sns.histplot, **histogram_options)

# horizontal reference line at y=0 in every facet
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)


def label(x, color, label):
    """Write ``label`` in bold, in the given ``color``, near the left edge of the current axes.

    The text is placed at (0, 0.2) in axes coordinates. ``x`` is accepted but not used.
    """
    current_axes = plt.gca()
    current_axes.text(
        0, .2, label,
        fontweight="bold",
        color=color,
        ha="left",
        va="center",
        transform=current_axes.transAxes,
    )


# Write the label into every facet, then let the facet rows overlap.
g.map(label, "Task")
facet_figure = g.figure
facet_figure.subplots_adjust(hspace=-.5)

# Strip the per-facet decoration: no titles, no y-axis ticks or label, x ticks at 1..5,
# and no bottom / left spines.
g.set_titles("")
g.set(yticks=[], ylabel="", xticks=np.arange(1, 6, 1))
g.despine(bottom=True, left=True)

facet_figure.tight_layout()


## Display META Data

In [None]:
# Append a 'Total' row to the meta data unless it already exists (so re-running
# the cell does not add it a second time).
if 'Total' not in df_meta_data.index:
    summed_columns = [col for col in meta_data_columns if col not in unnecessary_columns_meta_data]
    df_meta_data.loc['Total'] = df_meta_data[summed_columns].sum()
    # For 'NumberOfMissingSnippets' the total is the number of IDs that have at
    # least one truthy entry. Summing the per-ID booleans counts the True values
    # and yields 0 when there are none, where `.value_counts()[True]` would
    # raise a KeyError.
    df_meta_data.loc['Total', 'NumberOfMissingSnippets'] = \
        df_meta_data.groupby('ID')['NumberOfMissingSnippets'].any().sum()
    df_meta_data = df_meta_data.fillna('')

# Set the background color of a cell to dark red if its value equals False.
# The comparison is deliberately `== False` rather than `not x`: the empty
# strings written by `fillna('')` are falsy but must not be highlighted.
# `Styler.applymap` was renamed to `Styler.map` in pandas 2.1; use whichever exists.
meta_data_styler = df_meta_data.style
highlight_cells = meta_data_styler.map if hasattr(meta_data_styler, 'map') else meta_data_styler.applymap
highlight_cells(lambda x: 'background-color: darkred' if x == False else '')

### Overview

The corrected p-values for the mixed linear regression models.

In [None]:

# Benjamini-Hochberg FDR correction of the p-values below at alpha = 0.05.
alpha = 0.05
p_values = np.array([0.43, 0.033, 0.056, 0.031, 0.33, 0.1, 0.55, 0.97])

# multipletests returns four values; only the first two (the reject flags and
# the corrected p-values) are used here
rejected, pvals_corrected = multipletests(p_values, alpha=alpha, method='fdr_bh')[:2]

# Output the results
for report_caption, report_values in (
        ("Original p-values:", p_values),
        ("Corrected p-values:", pvals_corrected),
        ("Rejected hypotheses:", rejected),
):
    print(report_caption, report_values)

In [None]:

# NOTE(review): this cell repeats the computation of the preceding cell with
# the same p-values — confirm whether different values were intended here.
p_values = np.array([0.43, 0.033, 0.056, 0.031, 0.33, 0.1, 0.55, 0.97])
alpha = 0.05

# Benjamini-Hochberg FDR correction; the last two return values are not needed
fdr_result = multipletests(p_values, alpha=alpha, method='fdr_bh')
rejected = fdr_result[0]
pvals_corrected = fdr_result[1]

# Output the results
print("Original p-values:", p_values)
print("Corrected p-values:", pvals_corrected)
print("Rejected hypotheses:", rejected)