<h1> Data Analysis for REYeker</h1>

In [1]:
# lib for dataframes
import pandas as pd

# lib for saving np images
from PIL import Image

# lib for plotting
import matplotlib.pyplot as plt

# lib for numerical computations
import numpy as np

# lib for regex
import re

# lib for creating paths
from pathlib import Path

# REYeker lib
import modules.rEYEkerAnalysis as rEYEker

# for t testing
from scipy import stats

# lib for better plotting
import seaborn as sns
sns.set_theme('paper')

# lib for differ calculation
import difflib

<h2>1. Configuration</h2>

<h5>Database configuration </h5>

In [2]:
# path to the datafile (read with pd.read_excel)
path_to_data = r'./data/Book4.xlsx'

# columns with visual stimulus data, one entry per algorithm
visual_stimulus_data = ['TR20_01', 'TI20_01', 'BR20_01', 'BI20_01']

# names of the algorithms, in the same order as the column lists in this cell
algo_names = ['TR_FIB', 'TI_FIB', 'BR_FIB', 'BI_FIB']

# columns with time data of visual stimulus (empty list: no timestamp columns are imported)
time_data = []

# columns with the given answers of the student
answer_fields = ['TR10_01', 'TI10_01', 'BR10_01', 'BI10_01']

# regex pattern for correct answer, one pattern per algorithm
right_answer_patterns = ['2', '2','2','2']

# columns of response time
response_time_data = ['TIME042', 'TIME008', 'TIME059', 'TIME023']

<h5>Configuration for REYEker data </h5>

In [3]:
# JSON file from which the rEYEker settings are loaded
settings_file = "data/example.json"

<h5>Configuration for saving images </h5>

In [4]:
# paths of the stimulus images, one per algorithm
image_paths = ['images/TR/TR_Fibonacci.png',
               'images/TI/TI_Fibonacci.png',
               'images/BR/BR_Fibonacci.png',
               'images/BI/BI_Fibonacci.png']

# sub-folder per algorithm where the heatmaps and sequence diagrams are saved
folder_prefix = [
    'TR/',
    'TI/',
    'BR/',
    'BI/']

# file name prefix per algorithm, used when saving heatmaps, sequence diagrams and plots
image_prefix = [
    'TR_Fibonacci_',
    'TI_Fibonacci_',
    'BR_Fibonacci_',
    'BI_Fibonacci_']

<h5>Configuration for Code Flow data import</h5>

In [5]:
# Excel workbooks in ./data/code_flow/, one per algorithm;
# the sheets "config" and "values" of every workbook are read later on
code_flow_data = ['TR_Fibonacci.xlsx',
                  'TI_Fibonacci.xlsx',
                  'BR_Fibonacci.xlsx',
                  'BI_Fibonacci.xlsx']  

<h5>Configuration for alpha value for t-test </h5>

In [6]:
# significance level of the t test: a p value above alpha keeps the null hypothesis
alpha = 0.05

<h5>Import the columns and create dataframe</h5>

In [7]:
total_colums = []

# collect, per algorithm, the column names that make up its dataframe
for i, stimulus_column in enumerate(visual_stimulus_data):
    header = [stimulus_column]

    # timestamp columns are optional
    if time_data:
        header.append(time_data[i])
    header += [answer_fields[i], response_time_data[i]]
    total_colums.append(header)

raw = pd.read_excel(path_to_data)

# one dataframe per algorithm: select its columns, skip the first data row
# and drop every row with a missing value
dataframes = [
    pd.DataFrame(raw, columns=header).iloc[1:].dropna()
    for header in total_colums
]

<h4>Splitting Dataframes in right and wrong answers.</h4>

In [8]:
dataframes_right = []
dataframes_wrong = []

# iter over every dataframe and split it by the correctness of the given answer
for idx, dataframe in enumerate(dataframes):
    # the pattern has to be picked with the index of the current dataframe
    regex = re.compile(right_answer_patterns[idx])
    answer_field = answer_fields[idx]

    # True for every row whose answer matches the pattern at the start of the string
    is_right = dataframe[answer_field].map(
        lambda answer: regex.match(str(answer)) is not None
    ).astype(bool)

    # boolean selection keeps index, columns and dtypes of the original rows
    dataframes_right.append(dataframe[is_right])
    dataframes_wrong.append(dataframe[~is_right])

<h4>Remove Outliers</h4>

In [9]:
def _remove_response_time_outliers(frames, lower_quantile=0.15, upper_quantile=0.85):
    """
    :brief keeps only the rows whose response time lies between the two quantiles (bounds included)
    :param frames:         array of dataframes, ordered like response_time_data
    :param lower_quantile: quantile below which a row counts as outlier
    :param upper_quantile: quantile above which a row counts as outlier
    :return: array with the filtered dataframes in the same order
    """
    cleared_frames = []
    for idx, frame in enumerate(frames):
        response_times = frame[response_time_data[idx]]
        in_range = response_times.between(response_times.quantile(lower_quantile),
                                          response_times.quantile(upper_quantile))
        cleared_frames.append(frame[in_range])
    return cleared_frames


tmp_dataframes = _remove_response_time_outliers(dataframes_right)
tmp_dataframes_wrong = _remove_response_time_outliers(dataframes_wrong)

# from here on `dataframes` only holds the correct answers without outliers
dataframes = tmp_dataframes
dataframes_wrong = tmp_dataframes_wrong

<h4>Import REYeker Settings</h4>

In [10]:
# load the click settings from the file configured in `settings_file`;
# the other two returned values are not used in this notebook
(_data, _times, click_setting) = rEYEker.load_data_from_json(settings_file)

<h4>Import Images Settings</h4>

In [11]:
# read in every configured image, keeping the order of image_paths
images = [rEYEker.load_image(image_path) for image_path in image_paths]

<h4> Cast Data to Valid format</h4>

Import the visual stimulus measured Data

In [12]:
visual_stimulus_data_array = []

# one list of measurements per dataframe
for idx, dataframe in enumerate(dataframes):
    visual_stimulus_row = visual_stimulus_data[idx]
    visual_stimulus_measurements = []

    # every row holds one measurement: "x-y" pairs separated by a single space
    for _idx, item in dataframe.iterrows():
        pairs = (chunk.split("-") for chunk in item[visual_stimulus_row].strip().split(" "))
        visual_stimulus_measurements.append([(int(pair[0]), int(pair[1])) for pair in pairs])

    visual_stimulus_data_array.append(visual_stimulus_measurements)

Import the Time Data

In [13]:
timestamps_data_array = []

# iter over every dataframe that has a configured timestamp column
for idx, dataframe in enumerate(dataframes):
    if len(time_data) <= idx:
        break
    time_measurement_row = time_data[idx]
    time_measurements = []

    # every row holds the timestamps of one measurement, separated by a single space
    for _idx, item in dataframe.iterrows():
        data_str = item[time_measurement_row].strip()
        time_measurements.append([int(timestamp) for timestamp in data_str.split(" ")])

    # store the parsed timestamps of this dataframe (not the coordinate data)
    timestamps_data_array.append(time_measurements)

<h4>Helper Functions</h4>

In [14]:
def save_images(ims, folder, image_name):
    """
    :brief writes every image of an array as png, using the position in the array as postfix
    :param ims:        array of images (np.ndarray), scaled by 255 and cast to uint8 before saving
    :param folder:     prefix of image/ folder location, created if it is missing
    :param image_name: prefix for the image
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    for idx, data in enumerate(ims):
        im = Image.fromarray(np.uint8(data * 255))
        im.save(folder + image_name + str(idx) + '.png')
        
def compare_for_h0(arr_1, arr_2, alpha):
    """
    :brief runs an independent two sample t test on both arrays
    :param arr_1: first sample
    :param arr_2: second sample
    :param alpha: threshold the p value is compared with
    :return: tuple (True if p is greater than alpha else False, t value, p value)
    """
    t, p = stats.ttest_ind(arr_1, arr_2)
    h0_holds = True if p > alpha else False
    return (h0_holds, t, p)
    
def is_in(value, tup):
    """
    :brief checks if a value lies inside a closed interval
    :param value: value to check
    :param tup:   indexable holding the lower bound at position 0 and the upper bound at position 1
    :return: result of the comparison tup[0] <= value <= tup[1]
    """
    return tup[0] <= value and value <= tup[1]

def get_0_offset(number):
    """
    :brief counts how often the integer part of a number can be divided by 10 until it reaches 0
    :param number: number to inspect, cast to int first
    :return: number of decimal digits of the integer part (0 for the value 0)
    """
    remaining = int(number)
    digits = 0
    while remaining:
        # truncating division, so negative numbers also run towards 0
        remaining = int(remaining / 10)
        digits += 1
    return digits

<h2>2. Create Single Heatmaps</h2>

create heatmaps

In [15]:
heatmaps_datasets_array = []

# one array of heatmaps per dataset
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    # timestamps are optional and only used if they exist for this dataset
    time_stamp_array = timestamps_data_array[dataset_idx] if len(timestamps_data_array) > dataset_idx else None

    heatmap_array = []

    # one heatmap per measurement of the dataset
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        has_times = time_stamp_array is not None and len(time_stamp_array) > visual_idx
        times = time_stamp_array[visual_idx] if has_times else None

        heatmap_array.append(
            rEYEker.draw_shape_heat_map(images[dataset_idx], stimulus_measurement, click_setting, times, should_copy=True)
        )

    heatmaps_datasets_array.append(heatmap_array)

save Heatmaps

In [16]:
# every dataset gets its own sub-folder below ./results/heatmaps/heatmaps/
for idx, heatmap_array in enumerate(heatmaps_datasets_array):
    target_folder = "./results/heatmaps/heatmaps/" + folder_prefix[idx]
    save_images(heatmap_array, target_folder, image_prefix[idx])

<h2>3. Create Average Heatmaps</h2>

create heatmaps

In [17]:
average_heatmaps = []

# one average heatmap per dataset
for idx, visual_measurements in enumerate(visual_stimulus_data_array):
    # timestamps are optional and only used if they exist for this dataset
    time_measurements = timestamps_data_array[idx] if len(timestamps_data_array) > idx else None

    average_heatmaps.append(
        rEYEker.draw_average_shape_heat_map_rel(images[idx], visual_measurements, click_setting, 1.0, .0, time_measurements, should_copy=True)
    )

save heatmaps

In [18]:
# all average heatmaps share one folder; save_images expects an array, so each one is wrapped in a list
average_heatmap_folder = "./results/heatmaps/average_heatmap/"
for idx, heatmap in enumerate(average_heatmaps):
    save_images([heatmap], average_heatmap_folder, image_prefix[idx])

<h2>4. Create Sequence diagrams</h2>

create sequence diagrams

In [19]:
sequence_diagrams_datasets_array = []

# iterate over all the datasets (timestamps are not used for the sequence diagrams)
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    sequence_diagram_array = []

    # iterate over all the measurements of the dataset
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        try:
            im = rEYEker.draw_vertical_line_diagram(images[dataset_idx], stimulus_measurement, should_copy=True)
        except Exception as error:
            # TODO: a measurement that can not be drawn is skipped; the error is reported
            # so that other failures are not mistaken for the "too many clicks" case
            print("W.I.P.:", end='')
            print("to many clicks for dataset " + str(dataset_idx) + " datset " + str(visual_idx) + " (" + repr(error) + ")")
            continue
        sequence_diagram_array.append(im)

    sequence_diagrams_datasets_array.append(sequence_diagram_array)

save sequence diagrams

In [20]:
# every dataset gets its own sub-folder below ./results/sequence_diagrams/
for idx, sequence_diagram_array in enumerate(sequence_diagrams_datasets_array):
    target_folder = "./results/sequence_diagrams/" + folder_prefix[idx]
    save_images(sequence_diagram_array, target_folder, image_prefix[idx])

<h2>5. Generate Code Flow diagram</h2>

<h4> Use rEYEke_COdeFlow.ipynb to create the corresponding excel sheets </h4>

<h2>7. Analyse average of Data</h2>

<h3>7.1 Helper Functions</h3>

In [21]:
def save_displots(folder, indexing_array, dataframes):
    """
    Saves one distribution plot (histogram with kde) per dataframe as png.

    folder:         prefix where to save, created if it is missing
    indexing_array: column to plot for each dataframe, in the same order as dataframes
    dataframes:     array of dataframes to plot, named after image_prefix
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    for idx, dataframe in enumerate(dataframes):
        column = indexing_array[idx]
        values = dataframe[column].values.astype(float)
        grid = sns.displot(data=values, kde=True)
        grid.savefig(folder + image_prefix[idx] + ".png")
        plt.close()

        
def save_combined_displot(folder, x_axis, dataframe):
    """
    Saves one kde plot with a curve per algorithm as "Combined_Displot.png".

    folder:         prefix where to save, created if it is missing
    x_axis:         value to use for x_axis
    dataframe:      dataframe with "Algorithm" field
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    grid = sns.displot(data=dataframe, x=x_axis, hue="Algorithm", kind="kde")
    grid.savefig(folder + "Combined_Displot.png")
    plt.close()

In [22]:
def save_barplot(folder, y_axis, dataframes):
    """
    Saves a barplot of the median per algorithm as "Combined_Barplot.png".

    folder:         prefix where to save, created if it is missing
    y_axis:         value to use for y_axis
    dataframes:     single dataframe with the y_axis column and an "Algorithm" field
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.barplot(y=y_axis, x='Algorithm', data=dataframes, hue='Algorithm', estimator=np.median)

    # the legend repeats the x axis; depending on the seaborn version none is drawn at all
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()

    sns_plot.figure.savefig(folder + "Combined_Barplot.png")
    # close exactly the figure that was drawn
    plt.close(sns_plot.figure)

In [32]:
def save_boxplot(folder, y_axis, dataframes):
    """
    Saves a boxplot with one box per algorithm as "Combined_Boxplot.png".

    folder:         prefix where to save, created if it is missing
    y_axis:         value to use for y_axis
    dataframes:     single dataframe with the y_axis column and an "Algorithm" field
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.boxplot(y=y_axis, x='Algorithm', data=dataframes, hue='Algorithm')

    # the legend repeats the x axis; depending on the seaborn version none is drawn at all
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()

    sns_plot.figure.savefig(folder + "Combined_Boxplot.png")
    # close exactly the figure that was drawn
    plt.close(sns_plot.figure)

In [33]:
def save_violinplot(folder, y_axis, dataframes):
    """
    Saves a violinplot with one violin per algorithm as "Combined_Violinplot.png".

    folder:         prefix where to save, created if it is missing
    y_axis:         value to use for y_axis
    dataframes:     single dataframe with the y_axis column and an "Algorithm" field
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.violinplot(y=y_axis, x='Algorithm', data=dataframes, hue='Algorithm')

    # the legend repeats the x axis; depending on the seaborn version none is drawn at all
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()

    sns_plot.figure.savefig(folder + "Combined_Violinplot.png")
    # close exactly the figure that was drawn
    plt.close(sns_plot.figure)

In [24]:
def save_implots(folder, x_df, x_axis, y_df, y_axis):
    """
    Saves one regression plot (sns.lmplot) per algorithm in algo_names.

    folder:  prefix where to save, created if it is missing
    x_df:    dataframe with the x_axis column and an "Algorithm" field
    x_axis:  column of x_df to use for the x axis
    y_df:    dataframe with the y_axis column; its rows are paired with x_df by index label
    y_axis:  column of y_df to use for the y axis
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    # pair the values of both dataframes label by label
    paired_rows = [
        [x_df[x_axis][pos], y_df[y_axis][pos], x_df['Algorithm'][pos]]
        for pos in range(len(x_df))
    ]
    paired = pd.DataFrame(paired_rows, columns=[x_axis, y_axis, 'Algorithm'])

    for idx, algo_name in enumerate(algo_names):
        subset = paired[paired["Algorithm"] == algo_name]
        grid = sns.lmplot(data=subset, x=x_axis, y=y_axis)
        grid.set(ylim=(0, None))
        grid.savefig(folder + algo_name + str(idx) + ".png")
        plt.close()

def save_combined_implot(folder, x_df, x_axis, y_df, y_axis):
    """
    Saves one regression plot (sns.lmplot) with a line per algorithm as "Combined_Implot.png".

    folder:  prefix where to save, created if it is missing
    x_df:    dataframe with the x_axis column and an "Algorithm" field
    x_axis:  column of x_df to use for the x axis
    y_df:    dataframe with the y_axis column; its rows are paired with x_df by index label
    y_axis:  column of y_df to use for the y axis
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    # pair the values of both dataframes label by label
    paired_rows = [
        [x_df[x_axis][pos], y_df[y_axis][pos], x_df['Algorithm'][pos]]
        for pos in range(len(x_df))
    ]
    paired = pd.DataFrame(paired_rows, columns=[x_axis, y_axis, 'Algorithm'])

    grid = sns.lmplot(data=paired, x=x_axis, y=y_axis, hue="Algorithm")
    grid.set(ylim=(0, None))
    grid.savefig(folder + "Combined_Implot.png")
    plt.close()

<h3>7.2 Response Time</h3>

Create new Dataframe which holds all the response time data

In [25]:
columns = ['Response Time', 'Algorithm']

# one row per participant: the response time and the name of the algorithm it belongs to
data = [
    [row[response_time_data[idx]], algo_names[idx]]
    for idx, dataframe in enumerate(dataframes)
    for _idx, row in dataframe.iterrows()
]
algo_df = pd.DataFrame(data, columns=columns)

Create and Save Displots

In [26]:
# one distribution plot of the response time per algorithm
save_displots("./results/responseTime/displots/", response_time_data, dataframes)

# one kde plot that holds the response times of all algorithms
save_combined_displot("./results/responseTime/displots/", "Response Time", algo_df)

In [27]:
#add joinplots when the outlier calculation is b4 the coordinates calculation

Create and save barplots

In [28]:
# median response time per algorithm
save_barplot("./results/responseTime/barplot/", 'Response Time', algo_df)

Create and save boxplot

In [29]:
# response time per algorithm as boxplot
save_boxplot("./results/responseTime/boxplot/", 'Response Time', algo_df)

Create and save violinplot

In [31]:
# response time per algorithm as violinplot
save_violinplot("./results/responseTime/violinplot/", 'Response Time', algo_df)

NameError: name 'save_violinplot' is not defined

<h3>7.3. Code Flow vs Visual Stimulus</h3>

Load dataframes

In [None]:
# read the "config" and the "values" sheet of every code flow workbook as int32
config_array = []
code_flow_array = []

for value in code_flow_data:
    workbook_path = './data/code_flow/' + value

    config_array.append(pd.read_excel(workbook_path, sheet_name="config").astype('int32'))
    code_flow_array.append(pd.read_excel(workbook_path, sheet_name="values").astype('int32'))

# transform stimulus data into code lines
visual_stimulus_code_flow_arrays = []

for idx1, visual_stimulus_dataset in enumerate(visual_stimulus_data_array):
    # rows of the config sheet: index of the row and the interval checked with is_in
    line_ranges = list(config_array[idx1].iterrows())
    converted_to_lines_array = []

    for dataset in visual_stimulus_dataset:
        converted_to_lines = []

        # only the y coordinate decides which row a click belongs to
        for (_x, y) in dataset:
            hits = [idx2 for idx2, tup in line_ranges if is_in(y, tup)]
            # the last matching row wins, -1 marks a click outside of every row
            converted_to_lines.append(hits[-1] if hits else -1)

        converted_to_lines_array.append(converted_to_lines)

    visual_stimulus_code_flow_arrays.append(converted_to_lines_array)

In [None]:
# gen sequence for visual stimulus flow: every pair of neighbouring lines becomes one number
visual_stimulus_code_flow_sequence = []

for idx, visual_stimulus_code_flow_datasets in enumerate(visual_stimulus_code_flow_arrays):
    # power of 10 derived from the row count of the config sheet, shifts the first line of a pair
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    # equal neighbours are kept, skipping them may be useful
    sequence_array = [
        [pre * multiplier_offset + post for pre, post in zip(lines, lines[1:])]
        for lines in visual_stimulus_code_flow_datasets
    ]

    visual_stimulus_code_flow_sequence.append(sequence_array)

In [None]:
# gen sequence for code flow: every pair of neighbouring lines becomes one number
code_flow_sequence_array = []

for idx, code_flow_dataset in enumerate(code_flow_array):
    # the multiplier has to be derived from the config sheet, exactly like for the
    # visual stimulus sequences - otherwise the same pair of lines is encoded as a
    # different number and the similarity comparison can not find any match
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    # plain python ints in the order of the sheet
    lines = code_flow_dataset['code flow'].tolist()

    # equal neighbours are kept, skipping them may be useful
    sequence = [pre * multiplier_offset + post for pre, post in zip(lines, lines[1:])]

    code_flow_sequence_array.append(sequence)

In [None]:
# similarity of every visual stimulus sequence to the code flow sequence of its algorithm
similarity_array = [
    [
        difflib.SequenceMatcher(None, code_flow_sequence, visual_stimulus_sequence).ratio()
        for visual_stimulus_sequence in visual_stimulus_code_flow_sequence[idx]
    ]
    for idx, code_flow_sequence in enumerate(code_flow_sequence_array)
]

# one combined dataframe with the score and the name of the algorithm
cols = ['Similarity', 'Algorithm']
data = [
    [score, algo_names[idx]]
    for idx in range(len(similarity_array))
    for score in similarity_array[idx]
]
df_similarity = pd.DataFrame(data, columns=cols)

# one dataframe per algorithm that only holds the scores
df_similarity_array = []
for idx in range(len(code_flow_sequence_array)):
    per_algorithm = df_similarity.loc[df_similarity['Algorithm'] == algo_names[idx]]
    df_similarity_array.append(pd.DataFrame(per_algorithm, columns=cols).drop('Algorithm', axis=1))

In [None]:
# one distribution plot of the similarity per algorithm
save_displots("./results/codeFlowSimilarity/displots/", ["Similarity"]*len(similarity_array), df_similarity_array)

# NOTE(review): the combined plot is written to ./results/displots/ while every other
# similarity plot goes below ./results/codeFlowSimilarity/ - confirm that this folder is intended
save_combined_displot("./results/displots/", "Similarity", df_similarity)

In [None]:
# median similarity per algorithm
save_barplot("./results/codeFlowSimilarity/barplot/", 'Similarity', df_similarity)

In [None]:
# similarity per algorithm as boxplot
save_boxplot("./results/codeFlowSimilarity/boxplot/", 'Similarity', df_similarity)

In [None]:
# similarity per algorithm as violinplot
save_violinplot("./results/codeFlowSimilarity/violinplot/", 'Similarity', df_similarity)

<h3>7.4. Regression Code Flow and Response time</h3>

In [None]:
# regression of the response time over the similarity: one plot per algorithm and one combined plot;
# the rows of df_similarity and algo_df are paired by their index
save_implots("./results/codeFlowSimilarity/implots/", df_similarity, "Similarity", algo_df, "Response Time")
save_combined_implot("./results/codeFlowSimilarity/implots/", df_similarity, "Similarity", algo_df, "Response Time")

<h3>7.5. Statistical Values</h3>

Create Excel sheet with data containing speedup and significance corresponding to response time

In [None]:
def create_statical_data(df_array, indexing_array, folder, name):
    """
    :brief compares every dataframe with every other one (t test and ratio of the means)
           and writes the results to an excel workbook with one sheet per statistic
    :param df_array:       array of dataframes, ordered like algo_names
    :param indexing_array: column to evaluate for each dataframe, in the same order as df_array
    :param folder:         prefix of the folder where the workbook is saved, created if it is missing
    :param name:           file name of the workbook without the '.xlsx' ending
    """
    def highlight_signficant(s):
        # s is one column of the ratio table; the label column '#' is never highlighted
        if s.name != '#':
            is_sig = df_significance[s.name]
            return ['background-color: green' if v else '' for v in is_sig]
        else:
            return ['' for v in s]

    mean_array = []
    is_significant = []
    t_value_response_time = []
    p_value_response_time = []
    ratio = []

    for idx1, df1 in enumerate(df_array):
        values_1 = df1[indexing_array[idx1]]

        # every table row starts with the name of the algorithm it belongs to
        different_tmp = [algo_names[idx1]]
        t_tmp = [algo_names[idx1]]
        p_tmp = [algo_names[idx1]]
        ratio_tmp = [algo_names[idx1]]

        for idx2, df2 in enumerate(df_array):
            values_2 = df2[indexing_array[idx2]]

            # both samples are handed over as plain float arrays
            (h0_holds, t, p) = compare_for_h0(values_1.values.astype(float), values_2.values.astype(float), alpha)

            # the difference is significant if the null hypothesis does not hold
            different_tmp.append(not h0_holds)
            t_tmp.append(t)
            p_tmp.append(p)
            ratio_tmp.append(values_1.mean() / values_2.mean())

        is_significant.append(different_tmp)
        t_value_response_time.append(t_tmp)
        p_value_response_time.append(p_tmp)
        ratio.append(ratio_tmp)
        mean_array.append(values_1.mean())

    table_header = ['#'] + algo_names
    df_significance = pd.DataFrame(is_significant, columns=table_header)

    df_t = pd.DataFrame(t_value_response_time, columns=table_header)
    df_p = pd.DataFrame(p_value_response_time, columns=table_header)
    df_ratio = pd.DataFrame(ratio, columns=table_header)
    df_ratio = df_ratio.style.apply(highlight_signficant)
    df_mean = pd.DataFrame([mean_array], columns=algo_names)

    Path(folder).mkdir(parents=True, exist_ok=True)

    # the context manager writes and closes the workbook, even if one sheet fails
    # (ExcelWriter.save() no longer exists in current pandas versions)
    with pd.ExcelWriter(folder + name + '.xlsx', engine='xlsxwriter') as writer:
        df_ratio.to_excel(writer, sheet_name='ratio col row', index=False)
        df_mean.to_excel(writer, sheet_name='mean value', index=False)
        df_significance.to_excel(writer, sheet_name='statistical difference', index=False)
        df_p.to_excel(writer, sheet_name='p values', index=False)
        df_t.to_excel(writer, sheet_name='t values', index=False)

Create Excel sheet with data containing speedup and significance corresponding to len of measured visual stimulus data

In [None]:
# `len_dataframes` is not created by any earlier cell, so it is built here:
# one dataframe per algorithm with the number of measured coordinates per participant
# NOTE(review): derived from the heading of this cell and the 'len' column used below - confirm
len_dataframes = [
    pd.DataFrame({'len': [len(stimulus_measurement) for stimulus_measurement in stimulus_dataset]})
    for stimulus_dataset in visual_stimulus_data_array
]

create_statical_data(dataframes, response_time_data, './results/excel/', 'TotalResponseTime')
create_statical_data(len_dataframes, ['len']*len(len_dataframes), './results/excel/', 'NumberOfMeasurements')
create_statical_data(df_similarity_array, ['Similarity']*len(df_similarity_array), './results/excel/', 'SimilarityToCodeFlow')

<h2>8. Areas of Interest </h2>