<h1> Data Analysis for REYeker</h1>

In [1]:
# lib for dataframes
import pandas as pd

# lib for saving np images
from PIL import Image

# lib for plotting
import matplotlib.pyplot as plt

# lib for numerical computations
import numpy as np

# lib for regex
import re

# lib for creating paths
from pathlib import Path

# REYeker lib
import modules.rEYEkerAnalysis as rEYEker

# for t testing
from scipy import stats

# lib for better plotting
import seaborn as sns
sns.set_theme('paper')

# lib for differ calculation
import difflib

<h2>1. Configuration</h2>

<h5>Database configuration </h5>

In [2]:
# Excel workbook that holds the raw study data
path_to_data = r'./data/Book4.xlsx'

# every list below has one entry per condition, in the order TR, TI, BR, BI

# columns with the visual stimulus data
visual_stimulus_data = [prefix + '20_01' for prefix in ('TR', 'TI', 'BR', 'BI')]

# display names of the algorithms
algo_names = [prefix + '_FIB' for prefix in ('TR', 'TI', 'BR', 'BI')]

# columns with the time data of the visual stimulus (empty: no such columns configured)
time_data = list()

# columns with the answers given by the students
answer_fields = [prefix + '10_01' for prefix in ('TR', 'TI', 'BR', 'BI')]

# regex pattern of the correct answer, one per condition
right_answer_patterns = ['2'] * 4

# columns with the response times
response_time_data = ['TIME042', 'TIME008', 'TIME059', 'TIME023']

<h5>Configuration for REYEker data </h5>

In [3]:
# path of the JSON file that holds the rEYEker settings
settings_file = "data/example.json"

<h5>Configuration for saving images </h5>

In [1]:
# stimulus images that are loaded, one per condition (TR, TI, BR, BI)
image_paths = ['images/{0}/{0}_Fibonacci.png'.format(tag) for tag in ('TR', 'TI', 'BR', 'BI')]

# sub folders in which the heatmaps and sequence diagrams are saved
folder_prefix = [tag + '/' for tag in ('TR', 'TI', 'BR', 'BI')]

# file name prefixes used when saving the heatmaps and sequence diagrams
image_prefix = [tag + '_Fibonacci_' for tag in ('TR', 'TI', 'BR', 'BI')]

<h5>Configuration for Code Flow data import</h5>

In [39]:
# Excel workbooks inside data/code_flow, one per condition (TR, TI, BR, BI)
code_flow_data = [tag + '_Fibonacci.xlsx' for tag in ('TR', 'TI', 'BR', 'BI')]

<h5>Configuration for alpha value for t-test </h5>

In [5]:
# significance level of the t-tests: the null hypothesis is kept while p > alpha
alpha = 0.05

<h5>Import the columns and create dataframe</h5>

In [6]:
# column names that belong to each condition
total_colums = []

for i, stimulus_column in enumerate(visual_stimulus_data):
    # the time column is only part of the selection when time columns are configured
    optional_time_column = [time_data[i]] if len(time_data) != 0 else []
    total_colums.append(
        [stimulus_column] + optional_time_column + [answer_fields[i], response_time_data[i]]
    )

raw = pd.read_excel(path_to_data)

# one dataframe per condition: pick its columns, drop the first row
# (NOTE(review): presumably a description row of the export - confirm)
# and discard every row that has a missing value
dataframes = [
    pd.DataFrame(raw, columns=data_set).iloc[1:].dropna()
    for data_set in total_colums
]

<h4>Splitting Dataframes in right and wrong answers.</h4>

In [7]:
dataframes_right = []
dataframes_wrong = []

# split the dataframe of every condition into rows with a right and a wrong answer
for idx, dataframe in enumerate(dataframes):
    # use the index of THIS loop; the previous code read the stale variable `i`
    # left over from an earlier cell and therefore always picked the last pattern
    right_answer_pattern = right_answer_patterns[idx]
    answer_field = answer_fields[idx]

    # Series.str.match anchors the pattern at the start of the string like re.match,
    # so an answer counts as right when it STARTS with the pattern
    is_right = (
        dataframe[answer_field]
        .astype(str)
        .str.match(right_answer_pattern)
        .astype(bool)
    )

    # boolean masks replace the row-by-row DataFrame.append, which is quadratic
    # and no longer exists in pandas >= 2.0; index and dtypes are preserved
    dataframes_right.append(dataframe[is_right])
    dataframes_wrong.append(dataframe[~is_right])

<h4>Remove Outliers</h4>

In [8]:
def _drop_response_time_outliers(dataframe, column, lower=.15, upper=.85):
    """
    :brief keeps only the rows whose value in `column` lies between two quantiles
    :param dataframe: dataframe to filter
    :param column:    name of the column the quantiles are calculated on
    :param lower:     lower quantile, rows below it are dropped
    :param upper:     upper quantile, rows above it are dropped
    :return: filtered dataframe (both bounds are inclusive)
    """
    data = dataframe[column]
    return dataframe[data.between(data.quantile(lower), data.quantile(upper))]


# remove the outliers of the right answers
tmp_dataframes = [
    _drop_response_time_outliers(dataframe, response_time_data[idx])
    for idx, dataframe in enumerate(dataframes_right)
]

# remove the outliers of the wrong answers
tmp_dataframes_wrong = [
    _drop_response_time_outliers(dataframe, response_time_data[idx])
    for idx, dataframe in enumerate(dataframes_wrong)
]

# from here on `dataframes` only holds the right answers without outliers
dataframes = tmp_dataframes

# the cleaned wrong answers were previously stored under the misspelled name
# `dataframes_wong`, which left `dataframes_wrong` untrimmed
dataframes_wrong = tmp_dataframes_wrong

# NOTE(review): alias kept in case a later cell still reads the misspelled name;
# remove it once that is confirmed not to be the case
dataframes_wong = dataframes_wrong

<h4>Import REYeker Settings</h4>

In [9]:
# load the rEYEker settings from the file configured in `settings_file` instead of
# repeating the path as a literal; only the click setting is used afterwards
(_data, _times, click_setting) = rEYEker.load_data_from_json(settings_file)

<h4>Import Images Settings</h4>

In [10]:
# load the stimulus image of every condition, in the order of image_paths
images = [rEYEker.load_image(image_path) for image_path in image_paths]

<h4> Cast Data to Valid format</h4>

Import the visual stimulus measured Data

In [32]:
def _parse_coordinates(data_str):
    """
    :brief converts a string of the form "x1-y1 x2-y2 ..." into a list of (x, y) int tuples
    :param data_str: whitespace separated coordinate pairs, x and y joined by "-"
    :return: list of (x, y) tuples, empty when the string contains no pair
    """
    coordinates = []

    # split() without argument ignores leading/trailing and repeated whitespace;
    # split(" ") produced empty tokens in those cases, on which int('') raised
    for coordinate_str in data_str.split():
        coordinate = coordinate_str.split("-")
        coordinates.append((int(coordinate[0]), int(coordinate[1])))

    return coordinates


visual_stimulus_data_array = []

# iter over every dataframe
for idx, dataframe in enumerate(dataframes):
    visual_stimulus_row = visual_stimulus_data[idx]

    # one list of coordinates per row of the dataframe
    visual_stimulus_measurements = [
        _parse_coordinates(data_str) for data_str in dataframe[visual_stimulus_row]
    ]

    visual_stimulus_data_array.append(visual_stimulus_measurements)

Import the Time Data

In [12]:
timestamps_data_array = []

# iter over every dataframe
for idx, dataframe in enumerate(dataframes):
    # time columns are optional, stop as soon as none is configured for this index
    if len(time_data) <= idx:
        break
    time_measurement_row = time_data[idx]

    # one list of int timestamps per row; the previous code appended to the
    # column name (a string) instead of the list of measurements
    time_measurements = [
        [int(timestamp) for timestamp in data_str.split()]
        for data_str in dataframe[time_measurement_row]
    ]

    # store the timestamps of this dataset; the previous code appended the
    # visual stimulus measurements here
    timestamps_data_array.append(time_measurements)

<h4>Helper Functions</h4>

In [125]:
def save_images(ims, folder, image_name):
    """
    :brief writes every image of an array as <folder><image_name><index>.png, the folder
           is created when it does not exist yet
    :param ims:        array of images (np.ndarray)
    :param folder:     prefix of image/ folder location
    :param image_name: prefix for the image
    """

    # make sure the target folder exists before the first image is written
    Path(folder).mkdir(parents=True, exist_ok=True)

    for number, pixels in enumerate(ims):
        # scale by 255 and convert to 8 bit (assumes values in 0..1 - TODO confirm)
        scaled = np.uint8(pixels * 255)
        Image.fromarray(scaled).save(f"{folder}{image_name}{number}.png")
        
def compare_for_h0(arr_1, arr_2, alpha):
    """
    :brief runs an independent two sample t-test on both arrays
    :param arr_1: first sample
    :param arr_2: second sample
    :param alpha: significance level the p value is compared with
    :return: tuple (h0_holds, t, p), h0_holds is True when p > alpha
    """
    t, p = stats.ttest_ind(arr_1, arr_2)
    h0_holds = bool(p > alpha)
    return (h0_holds, t, p)
    
def is_in(value, tup):
    """
    :brief checks if a value lies inside a closed interval
    :param value: value to check
    :param tup:   indexable whose first two entries are the lower and upper bound
    :return: True when tup[0] <= value <= tup[1]
    """
    above_lower = tup[0] <= value
    return above_lower and value <= tup[1]

def get_0_offset(number):
    """
    :brief counts the decimal digits of the integer part of a number
    :param number: number to inspect, it is truncated to an int and its sign is ignored
    :return: number of decimal digits, 0 for the number 0
    """
    # integer division keeps the count exact; the float division used before
    # rounded large values up and could report one digit too many
    remaining = abs(int(number))
    digits = 0
    while remaining != 0:
        remaining //= 10
        digits += 1
    return digits

<h2>2. Create Single Heatmaps</h2>

create heatmaps

In [14]:
heatmaps_datasets_array = []

# one list of heatmaps per dataset, one heatmap per measurement
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    # timestamps are optional, None is passed on when the dataset has none
    if dataset_idx < len(timestamps_data_array):
        time_stamp_array = timestamps_data_array[dataset_idx]
    else:
        time_stamp_array = None

    heatmap_array = []

    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        has_times = time_stamp_array is not None and visual_idx < len(time_stamp_array)
        times = time_stamp_array[visual_idx] if has_times else None

        heatmap_array.append(
            rEYEker.draw_shape_heat_map(images[dataset_idx], stimulus_measurement,
                                        click_setting, times, should_copy=True)
        )

    heatmaps_datasets_array.append(heatmap_array)

save Heatmaps

In [15]:
# the heatmaps of every dataset go into their own sub folder
for idx in range(len(heatmaps_datasets_array)):
    target_folder = "./results/heatmaps/" + folder_prefix[idx]
    save_images(heatmaps_datasets_array[idx], target_folder, image_prefix[idx])

<h2>3. Create Average Heatmaps</h2>

create heatmaps

In [16]:
average_heatmaps = []

# one average heatmap per dataset
for idx in range(len(visual_stimulus_data_array)):
    # timestamps are optional, None is passed on when the dataset has none
    time_measurements = timestamps_data_array[idx] if idx < len(timestamps_data_array) else None

    average_heatmaps.append(
        rEYEker.draw_average_shape_heat_map_rel(images[idx], visual_stimulus_data_array[idx],
                                                click_setting, 1.0, .0, time_measurements,
                                                should_copy=True)
    )

save heatmaps

In [17]:
# every average heatmap is saved as a list of one image, so its file index is always 0
for idx in range(len(average_heatmaps)):
    save_images([average_heatmaps[idx]], "./results/average_heatmap/", image_prefix[idx])

<h2>4. Create Sequence Diagrams</h2>

create sequence diagrams

In [18]:
sequence_diagrams_datasets_array = []

# one list of sequence diagrams per dataset; the timestamps are not used here
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    sequence_diagram_array = []

    # iterate over all the measurements of the dataset
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        try:
            im = rEYEker.draw_vertical_line_diagram(images[dataset_idx], stimulus_measurement, should_copy=True)
        except Exception as error:
            # best effort: a measurement that can not be drawn is skipped. Only
            # Exception is caught so that KeyboardInterrupt/SystemExit still stop
            # the cell, and the cause is printed instead of being hidden
            #TODO
            print("W.I.P.:", end='')
            print("to many clicks for dataset " + str(dataset_idx) + " datset " + str(visual_idx))
            print("    cause: " + repr(error))
        else:
            sequence_diagram_array.append(im)

    sequence_diagrams_datasets_array.append(sequence_diagram_array)

save sequence diagrams

In [19]:
# the sequence diagrams of every dataset go into their own sub folder
for idx in range(len(sequence_diagrams_datasets_array)):
    target_folder = "./results/sequence_diagrams/" + folder_prefix[idx]
    save_images(sequence_diagrams_datasets_array[idx], target_folder, image_prefix[idx])

<h2>5. Generate Code Flow Diagram</h2>

<h4> Use rEYEke_COdeFlow.ipynb to create the corresponding Excel sheets </h4>

<h2>6. Compare Sequence Diagram and Code Flow</h2>

Load dataframes

In [105]:
# read data: every workbook has a "config" and a "values" sheet
config_array = []
code_flow_array = []

for value in code_flow_data:
    workbook_path = './data/code_flow/' + value

    sheet_config = pd.read_excel(workbook_path, sheet_name="config").astype('int32')
    sheet_code_flow = pd.read_excel(workbook_path, sheet_name="values").astype('int32')

    config_array.append(sheet_config)
    code_flow_array.append(sheet_code_flow)

# transform stimulus data into code lines
visual_stimulus_code_flow_arrays = []

for idx1, visual_stimulus_dataset in enumerate(visual_stimulus_data_array):
    # (row label, row values) of every config row, extracted once per dataset.
    # The rows are plain lists, so is_in reads the bounds by position and no longer
    # depends on the deprecated positional fallback of Series[int].
    # NOTE(review): assumes the first two config columns are the lower and upper
    # y bound of a code line - confirm against the workbook
    line_ranges = list(zip(config_array[idx1].index, config_array[idx1].to_numpy().tolist()))

    converted_to_lines_array = []

    for dataset in visual_stimulus_dataset:
        converted_to_lines = []

        for (_x, y) in dataset:
            # -1 marks a click that lies in no configured range
            num = -1

            # no break on purpose: like before, the last matching range wins
            for line_idx, bounds in line_ranges:
                if is_in(y, bounds):
                    num = line_idx

            converted_to_lines.append(num)

        converted_to_lines_array.append(converted_to_lines)

    visual_stimulus_code_flow_arrays.append(converted_to_lines_array)

In [101]:
# gen sequence for visual stimulus flow: every pair of consecutive code lines
# (pre, post) is encoded as the single number pre * multiplier_offset + post

visual_stimulus_code_flow_sequence = []

for idx, visual_stimulus_code_flow_datasets in enumerate(visual_stimulus_code_flow_arrays):
    # power of ten with as many zeros as the number of config rows has digits
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    sequence_array = [
        [
            pre * multiplier_offset + post
            for pre, post in zip(visual_stimulus_code_flow_dataset,
                                 visual_stimulus_code_flow_dataset[1:])
        ]
        for visual_stimulus_code_flow_dataset in visual_stimulus_code_flow_datasets
    ]

    visual_stimulus_code_flow_sequence.append(sequence_array)

In [126]:
# gen sequence for code flow: every pair of consecutive code lines (pre, post)
# is encoded as the single number pre * multiplier_offset + post
code_flow_sequence = []

for idx, code_flow_dataset in enumerate(code_flow_array):
    # use the same multiplier as the visual stimulus sequences (derived from the
    # number of config rows). It was derived from the number of code flow steps
    # before, so the same transition got a different number in both sequences
    # and they could not be compared.
    # NOTE(review): assumes every 'code flow' value is smaller than this
    # multiplier - confirm against the workbooks
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    flow = code_flow_dataset['code flow'].tolist()

    # potential skip if pre and post is equal, may be useful
    sequence = [pre * multiplier_offset + post for pre, post in zip(flow, flow[1:])]

    code_flow_sequence.append(sequence)

In [130]:
# NOTE(review): both arguments are code_flow_sequence[0], so the ratio is 1.0 by
# construction; presumably one side should be a participant sequence from
# visual_stimulus_code_flow_sequence - confirm the intended comparison
difflib.SequenceMatcher(None, code_flow_sequence[0], code_flow_sequence[0]).ratio()

1.0

<h2>7. Analyse average of Data</h2>

Create new Dataframe which holds all the response time data

In [43]:
columns = ['Response Time', 'Algorithm']

# long format: one row per response time, labelled with the name of its algorithm
data = [
    [response_time, algo_names[idx]]
    for idx, dataframe in enumerate(dataframes)
    for response_time in dataframe[response_time_data[idx]]
]
algo_df = pd.DataFrame(data, columns=columns)


Calculate Mean, SD, Median

In [21]:
mean_response_times = []
standard_deviation_response_times = []
median_response_times = []

# mean, standard deviation and median of the response time of every dataframe
for idx, dataframe in enumerate(dataframes):
    response_times = dataframe[response_time_data[idx]]

    mean_response_times.append(response_times.mean())
    standard_deviation_response_times.append(response_times.std())
    median_response_times.append(response_times.median())

Create and Save Displots

In [22]:
folder = "./results/displots/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one distribution plot with kde curve per dataframe
for idx, dataframe in enumerate(dataframes):
    values = dataframe[response_time_data[idx]].values.astype(float)
    sns_plot = sns.displot(data=values, kde=True)
    sns_plot.savefig(f"{folder}{image_prefix[idx]}.png")
    # close the figure so that the figures of the loop do not pile up
    plt.close()

# kde curves of all algorithms combined in a single plot
sns_plot = sns.displot(data=algo_df, x="Response Time", hue="Algorithm", kind="kde")
sns_plot.savefig(f"{folder}Combined_Displot.png")
plt.close()

In [23]:
# TODO: add jointplots once the outlier calculation runs before the coordinates calculation

Create and save barplots

In [24]:
folder = "./results/barplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# bars show the median response time of every algorithm, hue only colours them
sns_plot = sns.barplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm', estimator=np.median)

# the legend would only repeat the x axis labels. Depending on the seaborn version
# no legend is created at all for a hue that equals x, so guard the removal
# instead of calling remove() on None
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Barplot.png")
plt.close()

Create and save boxplot

In [25]:
folder = "./results/boxplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one box per algorithm, hue only colours the boxes
sns_plot = sns.boxplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm')

# the legend would only repeat the x axis labels. Depending on the seaborn version
# no legend is created at all for a hue that equals x, so guard the removal
# instead of calling remove() on None
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Boxplot.png")
plt.close()

Create and save violinplot

In [26]:
folder = "./results/violinplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one violin per algorithm, hue only colours the violins
sns_plot = sns.violinplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm')

# the legend would only repeat the x axis labels. Depending on the seaborn version
# no legend is created at all for a hue that equals x, so guard the removal
# instead of calling remove() on None
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Violinplot.png")
plt.close()

Create Excel sheet with data containing speedup and significance corresponding to response time

In [27]:
def highlight_max(s):
    """
    :brief Styler callback: colours the cells of a column green where the column of the
           same name in the global df_significance holds a truthy value (despite its
           name it does not look for a maximum)
    :param s: column handed in by Styler.apply
    :return: list of css strings, one per cell, all empty for the label column '#'
    """
    if s.name == '#':
        return ['' for _ in s]

    # df_significance is looked up when the style is rendered, not when it is declared
    return ['background-color: green' if flag else '' for flag in df_significance[s.name]]

mean_array = []
is_significant = []
t_value_response_time = []
p_value_response_time = []
speed_ratio = []

# compare the response times of every algorithm with every algorithm (itself included);
# each result row starts with the name of the algorithm of the outer loop
for idx1, df1 in enumerate(dataframes):
    times_1 = df1[response_time_data[idx1]]

    different_tmp = [algo_names[idx1]]
    t_tmp = [algo_names[idx1]]
    p_tmp = [algo_names[idx1]]
    speed_tmp = [algo_names[idx1]]

    for idx2, df2 in enumerate(dataframes):
        times_2 = df2[response_time_data[idx2]]

        # compare_for_h0 reports True when H0 is kept, so a significant
        # difference is the negation of its first result
        (h0_holds, t, p) = compare_for_h0(times_1.values, times_2, alpha)
        different_tmp.append(not h0_holds)
        t_tmp.append(t)
        p_tmp.append(p)
        speed_tmp.append(times_1.mean() / times_2.mean())

    is_significant.append(different_tmp)
    t_value_response_time.append(t_tmp)
    p_value_response_time.append(p_tmp)
    speed_ratio.append(speed_tmp)
    mean_array.append(times_1.mean())

header = ['#'] + algo_names

df_significance = pd.DataFrame(is_significant, columns=header)
df_t = pd.DataFrame(t_value_response_time, columns=header)
df_p = pd.DataFrame(p_value_response_time, columns=header)

# the speed ratios get a green background where the difference is significant
df_speed = pd.DataFrame(speed_ratio, columns=header)
df_speed = df_speed.style.apply(highlight_max)
df_mean = pd.DataFrame([mean_array], columns=algo_names)


folder = "./results/excel/"
Path(folder).mkdir(parents=True, exist_ok=True)

# the context manager saves and closes the workbook, also when a sheet fails to
# write; ExcelWriter.save() does not exist anymore in pandas >= 2.0
with pd.ExcelWriter(folder + 'response_time.xlsx', engine='xlsxwriter') as writer:
    df_speed.to_excel(writer, sheet_name='speed ratio', index=False)
    df_mean.to_excel(writer, sheet_name='mean response time', index=False)
    df_significance.to_excel(writer, sheet_name='has statistical difference', index=False)
    df_p.to_excel(writer, sheet_name='p values', index=False)
    df_t.to_excel(writer, sheet_name='t values', index=False)

Create Excel sheet with data containing speedup and significance corresponding to len of measured visual stimulus data

In [28]:
def highlight_max(s):
    """
    :brief Styler callback: colours the cells of a column green where the column of the
           same name in the global df_significance holds a truthy value (despite its
           name it does not look for a maximum)
    :param s: column handed in by Styler.apply
    :return: list of css strings, one per cell, all empty for the label column '#'

    NOTE(review): identical to the highlight_max defined in the response time cell;
    this second definition silently shadows the first one
    """
    if s.name == '#':
        return ['' for _ in s]

    # df_significance is looked up when the style is rendered, not when it is declared
    return ['background-color: green' if flag else '' for flag in df_significance[s.name]]
    
# number of recorded coordinates (clicks) of every measurement, grouped by dataset
len_of_visual = [
    [len(dataset) for dataset in dataset_row]
    for dataset_row in visual_stimulus_data_array
]

# one dataframe with the single column 'len' per dataset
len_dataframes = [pd.DataFrame(len_array, columns=['len']) for len_array in len_of_visual]

mean_array = []
is_significant = []
t_value_response_time = []
p_value_response_time = []
click_ratio = []

# compare the click counts of every algorithm with every algorithm (itself included);
# each result row starts with the name of the algorithm of the outer loop
for idx1, df1 in enumerate(len_dataframes):
    different_tmp = [algo_names[idx1]]
    t_tmp = [algo_names[idx1]]
    p_tmp = [algo_names[idx1]]
    click_tmp = [algo_names[idx1]]

    for idx2, df2 in enumerate(len_dataframes):
        # compare_for_h0 reports True when H0 is kept, so a significant
        # difference is the negation of its first result
        (h0_holds, t, p) = compare_for_h0(df1['len'].values, df2['len'], alpha)
        different_tmp.append(not h0_holds)
        t_tmp.append(t)
        p_tmp.append(p)
        click_tmp.append(df1['len'].mean() / df2['len'].mean())

    is_significant.append(different_tmp)
    t_value_response_time.append(t_tmp)
    p_value_response_time.append(p_tmp)
    click_ratio.append(click_tmp)
    mean_array.append(df1['len'].mean())

header = ['#'] + algo_names

df_significance = pd.DataFrame(is_significant, columns=header)
df_t = pd.DataFrame(t_value_response_time, columns=header)
df_p = pd.DataFrame(p_value_response_time, columns=header)

# the click ratios get a green background where the difference is significant
df_ratio = pd.DataFrame(click_ratio, columns=header)
df_ratio = df_ratio.style.apply(highlight_max)
df_mean = pd.DataFrame([mean_array], columns=algo_names)


folder = "./results/excel/"
Path(folder).mkdir(parents=True, exist_ok=True)

# the context manager saves and closes the workbook, also when a sheet fails to
# write; ExcelWriter.save() does not exist anymore in pandas >= 2.0
with pd.ExcelWriter(folder + 'click_len.xlsx', engine='xlsxwriter') as writer:
    df_ratio.to_excel(writer, sheet_name='click ratio', index=False)
    # NOTE(review): this sheet holds the mean click count, the sheet name looks
    # like a copy of the response time export; kept unchanged for existing readers
    df_mean.to_excel(writer, sheet_name='mean response time', index=False)
    df_significance.to_excel(writer, sheet_name='has statistical difference', index=False)
    df_p.to_excel(writer, sheet_name='p values', index=False)
    df_t.to_excel(writer, sheet_name='t values', index=False)

<h2>8. Areas of Interest </h2>