<h1> Data Analysis for REYeker</h1>

In [105]:
# lib for dataframes
import pandas as pd

# lib for saving np images
from PIL import Image

# lib for plotting
import matplotlib.pyplot as plt

# lib for numerical computations
import numpy as np

# lib for regex
import re

# lib for creating paths
from pathlib import Path

# REYeker lib
import modules.rEYEkerAnalysis as rEYEker

# lib for better plotting
import seaborn as sns
# use seaborn's 'paper' plotting context for every figure created below
sns.set_theme(context='paper')

<h2>1. Configuration</h2>

<h5>Database configuration </h5>

In [60]:
# Excel workbook that contains the exported study results
path_to_data = r'./data/Book4.xlsx'

# one column per code snippet: the recorded visual stimulus data
visual_stimulus_data = [
    'TR20_01',
    'TI20_01',
    'BR20_01',
    'BI20_01',
]

# one column per code snippet: timestamps of the visual stimulus
# (leave the list empty when no timestamps are available)
time_data = []

# one column per code snippet: the answer given by the student
answer_fields = [
    'TR10_01',
    'TI10_01',
    'BR10_01',
    'BI10_01',
]

# one regex per code snippet; an answer counts as right when the pattern
# matches at the beginning of the answer text
right_answer_patterns = [
    '2',
    '2',
    '2',
    '2',
]

# one column per code snippet: the response time
response_time_data = [
    'TIME042',
    'TIME008',
    'TIME059',
    'TIME023',
]

<h5>Configuration for REYEker data </h5>

In [None]:
# path to the JSON file that holds the REYeker data/settings
settings_file = "data/example.json"

<h5>Configuration for saving images </h5>

In [None]:
# image of each code snippet; loaded later and used as drawing background
image_paths = [
    'images/TR/TR_Fibonacci.png',
    'images/TI/TI_Fibonacci.png',
    'images/BR/BR_Fibonacci.png',
    'images/BI/BI_Fibonacci.png',
]

# per-snippet sub-folder (including the trailing slash) for saved results
folder_prefix = ['TR/', 'TI/', 'BR/', 'BI/']

# per-snippet file-name prefix for saved results
image_prefix = ['TR_Fibonacci_', 'TI_Fibonacci_', 'BR_Fibonacci_', 'BI_Fibonacci_']

<h5>Import the columns and create dataframe</h5>

In [61]:
total_colums = []

# collect, per code snippet, the names of the columns that belong together
for i in range(len(visual_stimulus_data)):
    column_group = [visual_stimulus_data[i]]

    # the timestamp column is optional and only added when one is configured
    if len(time_data) > 0:
        column_group.append(time_data[i])
    column_group.extend([answer_fields[i], response_time_data[i]])
    total_colums.append(column_group)

dataframes = []
raw = pd.read_excel(path_to_data)

# cut one dataframe per column group out of the raw sheet
for data_set in total_colums:
    # the first row is skipped (presumably a description row of the export -
    # TODO confirm) and rows with missing values are dropped
    dataframe = pd.DataFrame(raw, columns=data_set).iloc[1:].dropna()
    dataframes.append(dataframe)

<h4>Splitting Dataframes into right and wrong answers.</h4>

In [62]:
dataframes_right = []
dataframes_wrong = []

# split every dataframe into the rows with a right and with a wrong answer
for idx, dataframe in enumerate(dataframes):
    # use the pattern that belongs to *this* dataframe (idx); relying on a loop
    # variable left over from an earlier cell picks the wrong pattern as soon
    # as the patterns differ
    regex = re.compile(right_answer_patterns[idx])
    answer_field = answer_fields[idx]

    # True where the answer (converted to text) matches the pattern at its start
    is_right = pd.Series(
        [regex.match(str(answer)) is not None for answer in dataframe[answer_field]],
        index=dataframe.index,
        dtype=bool,
    )

    # select the rows with the mask instead of appending them one by one:
    # DataFrame.append is quadratic and no longer exists in pandas >= 2.0
    dataframes_right.append(dataframe[is_right])
    dataframes_wrong.append(dataframe[~is_right])

<h4>Remove Outliers</h4>

In [63]:
def _drop_response_time_outliers(frames, time_columns, lower=.15, upper=.85):
    """
    :brief keeps only the rows whose response time lies between two quantiles
    :param frames:       list of dataframes
    :param time_columns: name of the response time column for every dataframe
    :param lower:        lower quantile, rows below it are dropped
    :param upper:        upper quantile, rows above it are dropped
    :return:             list with one filtered dataframe per input dataframe
    """
    kept_frames = []
    for idx, frame in enumerate(frames):
        times = frame[time_columns[idx]]
        # between() includes both bounds
        in_range = times.between(times.quantile(lower), times.quantile(upper))
        kept_frames.append(frame[in_range])
    return kept_frames


# remove the outliers of the right and of the wrong answers separately
tmp_dataframes = _drop_response_time_outliers(dataframes_right, response_time_data)
tmp_dataframes_wrong = _drop_response_time_outliers(dataframes_wrong, response_time_data)

# from here on `dataframes` only holds the right answers without outliers
dataframes = tmp_dataframes

# the cleaned wrong answers were previously stored under the misspelled name
# `dataframes_wong`, which left `dataframes_wrong` unfiltered.
# NOTE: re-run the splitting cell before re-running this cell, otherwise the
# wrong answers are trimmed a second time.
dataframes_wrong = tmp_dataframes_wrong
dataframes_wong = dataframes_wrong  # legacy alias for the old, misspelled name

<h4>Import REYeker Settings</h4>

In [65]:
# load the REYeker data from the file chosen in the configuration section
# (instead of repeating the path here); only click_setting is used by the
# cells visible in this notebook
(_data, _times, click_setting) = rEYEker.load_data_from_json(settings_file)

<h4>Import Images Settings</h4>

In [66]:
# load the image of every code snippet, in the order of image_paths
images = [rEYEker.load_image(path) for path in image_paths]

<h4> Cast Data to Valid format</h4>

Import the visual stimulus measured Data

In [67]:
def _parse_xy(pair_text):
    """
    :brief converts one "x-y" token into a tuple of two ints
    :param pair_text: text of the form "<x>-<y>"
    :return:          (x, y)
    """
    parts = pair_text.split("-")
    return (int(parts[0]), int(parts[1]))


visual_stimulus_data_array = []

# one list per dataframe, holding one coordinate list per row
for idx, dataframe in enumerate(dataframes):
    visual_stimulus_row = visual_stimulus_data[idx]

    # a cell contains space separated "x-y" tokens
    visual_stimulus_measurements = [
        [_parse_xy(token) for token in item[visual_stimulus_row].strip().split(" ")]
        for _idx, item in dataframe.iterrows()
    ]

    visual_stimulus_data_array.append(visual_stimulus_measurements)

Import the Time Data

In [68]:
timestamps_data_array = []

# iter over every dataframe
for idx, dataframe in enumerate(dataframes):
    # no (further) timestamp column configured -> nothing left to import
    if len(time_data) <= idx:
        break
    time_measurement_row = time_data[idx]
    time_measurements = []

    # iter over every row; a cell contains space separated integer timestamps
    for _idx, item in dataframe.iterrows():
        data_str = item[time_measurement_row].strip()
        timestamps = [int(timestamp) for timestamp in data_str.split(" ")]
        # collect in the list of this dataframe (the column name is a string
        # and cannot be appended to)
        time_measurements.append(timestamps)

    # store the timestamps, not the coordinates parsed in the previous cell
    timestamps_data_array.append(time_measurements)

<h4>Helper Functions</h4>

In [69]:
def save_images(ims, folder, image_name):
    """
    :brief writes every image of a list as PNG file; the position of the image
           in the list is appended to the file name
    :param ims:        list of images (np.ndarray); every image is multiplied by
                       255 and cast to uint8 before saving, so values are
                       presumably expected in the range 0..1 (TODO confirm)
    :param folder:     target folder, is created if missing; it is joined with
                       image_name by plain string concatenation, so it has to
                       end with a path separator
    :param image_name: file name prefix of the images
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    prefix = folder + image_name

    for number, picture in enumerate(ims):
        target = prefix + str(number) + '.png'
        Image.fromarray(np.uint8(picture * 255)).save(target)

<h2>2. Create Single Heatmaps</h2>

create heatmaps

In [11]:
def _entry_or_none(sequence, position):
    """
    :brief safe element access
    :param sequence: list or None
    :param position: index of the wanted element
    :return:         sequence[position], or None if sequence is None or too short
    """
    if sequence is not None and len(sequence) > position:
        return sequence[position]
    return None


heatmaps_datasets_array = []

# one list of heatmaps per dataset, one heatmap per measurement
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    # timestamps are optional, both per dataset and per measurement
    time_stamp_array = _entry_or_none(timestamps_data_array, dataset_idx)

    heatmap_array = [
        rEYEker.draw_shape_heat_map(
            images[dataset_idx],
            stimulus_measurement,
            click_setting,
            _entry_or_none(time_stamp_array, visual_idx),
            should_copy=True,
        )
        for visual_idx, stimulus_measurement in enumerate(stimulus_dataset)
    ]

    heatmaps_datasets_array.append(heatmap_array)

save Heatmaps

In [73]:
heatmap_root = "./results/heatmaps/"

# every dataset gets its own sub-folder and file name prefix
for idx, heatmap_array in enumerate(heatmaps_datasets_array):
    target_folder = heatmap_root + folder_prefix[idx]
    save_images(heatmap_array, target_folder, image_prefix[idx])

<h2>3. Create Average Heatmaps</h2>

create heatmaps

In [77]:
average_heatmaps = []

# one average heatmap per dataset, built from all measurements of the dataset
for idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    # timestamps are optional
    if idx < len(timestamps_data_array):
        time_measurements = timestamps_data_array[idx]
    else:
        time_measurements = None

    average_heatmaps.append(
        rEYEker.draw_average_shape_heat_map_rel(
            images[idx],
            stimulus_dataset,
            click_setting,
            1.0,
            .0,
            time_measurements,
            should_copy=True,
        )
    )

save heatmaps

In [83]:
# save every average heatmap under the prefix of its own dataset; a single call
# with image_prefix[idx] would label all of them with whatever value idx had
# after an earlier loop. save_images always appends the list position, so the
# files are named <prefix>0.png
for idx, average_heatmap in enumerate(average_heatmaps):
    save_images([average_heatmap], "./results/average_heatmap/", image_prefix[idx])

<h2>4. Create Sequence Diagrams</h2>

create sequence diagrams

In [18]:
sequence_diagrams_datasets_array = []

# iterate over all the datasets (timestamps are not used for the diagrams)
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_array):
    sequence_diagram_array = []

    # iterate over all the measurements of the dataset
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        try:
            im = rEYEker.draw_vertical_line_diagram(images[dataset_idx], stimulus_measurement, should_copy=True)
            sequence_diagram_array.append(im)
        except Exception as error:
            # best effort: skip measurements that cannot be drawn, but do not
            # swallow KeyboardInterrupt/SystemExit and show the real cause
            #TODO
            print("W.I.P.:", end='')
            print("to many clicks for dataset " + str(dataset_idx) + " datset " + str(visual_idx))
            print("    cause: " + repr(error))
    sequence_diagrams_datasets_array.append(sequence_diagram_array)

save sequence diagrams

In [84]:
sequence_diagram_folder = "./results/sequence_diagrams/"

# all datasets share one folder; the file name prefix tells them apart
for idx, sequence_diagram_array in enumerate(sequence_diagrams_datasets_array):
    save_images(sequence_diagram_array, sequence_diagram_folder, image_prefix[idx])

<h2>5. Generate Code Flow Diagram</h2>

TODO

set regions of code and somehow construct the codeflow

<h2>6. Compare Sequence Diagram and Code Flow</h2>

TODO

Maybe use the Needleman-Wunsch algorithm, or count how many jumps are equal and identify which jumps those are.

<h2>7. Analyse average of Data</h2>

Create new Dataframe which holds all the response time data

In [None]:
columns = ['Response Time', 'Algorithm']

# one record per row of every dataframe: the response time together with the
# name of the visual stimulus column, which serves as label of the algorithm
data = [
    [row[response_time_data[idx]], visual_stimulus_data[idx]]
    for idx, dataframe in enumerate(dataframes)
    for _idx, row in dataframe.iterrows()
]

algo_df = pd.DataFrame(data, columns=columns)

Calculate Mean, SD, Median

In [23]:
mean_response_times = []
standard_deviation_response_times = []
median_response_times = []

# one value per dataframe, computed on its response time column
for idx, dataframe in enumerate(dataframes):
    response_times = dataframe[response_time_data[idx]]

    mean_response_times.append(response_times.mean())
    standard_deviation_response_times.append(response_times.std())
    median_response_times.append(response_times.median())

Create and Save Displots

In [106]:
folder = "./results/displots/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one distribution plot (histogram with kde curve) per dataframe
for idx, dataframe in enumerate(dataframes):
    response_times = dataframe[response_time_data[idx]].values.astype(float)
    sns_plot = sns.displot(data=response_times, kde=True)
    target = folder + image_prefix[idx] + ".png"
    sns_plot.savefig(target)
    # close the figure so that open figures do not pile up
    plt.close()

# kde curves of all algorithms combined in a single plot
sns_plot = sns.displot(data=algo_df, x="Response Time", hue="Algorithm", kind="kde")
sns_plot.savefig(folder + "Combined_Displot.png")
plt.close()

In [None]:
# TODO: add jointplots once the outlier calculation runs before the coordinates calculation

Create and save barplots

In [114]:
folder = "./results/barplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# bar height is the median response time per algorithm
sns_plot = sns.barplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm', estimator=np.median)

# hue repeats the x axis, so a legend adds no information; depending on the
# seaborn version no legend is drawn at all in this case, so only remove it
# when it exists
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Barplot.png")
plt.close()

Create and save boxplot

In [49]:
folder = "./results/boxplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one box per algorithm showing the distribution of the response times
sns_plot = sns.boxplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm')

# hue repeats the x axis, so a legend adds no information; depending on the
# seaborn version no legend is drawn at all in this case, so only remove it
# when it exists
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Boxplot.png")
plt.close()

Create and save violinplot

In [115]:
folder = "./results/violinplot/"
Path(folder).mkdir(parents=True, exist_ok=True)

# one violin per algorithm showing the distribution of the response times
sns_plot = sns.violinplot(y='Response Time', x='Algorithm', data=algo_df, hue='Algorithm')

# hue repeats the x axis, so a legend adds no information; depending on the
# seaborn version no legend is drawn at all in this case, so only remove it
# when it exists
legend = sns_plot.get_legend()
if legend is not None:
    legend.remove()

sns_plot.figure.savefig(folder + "Combined_Violinplot.png")
plt.close()

<h2>8. Areas of Interest </h2>