<h1> Data Analysis for REYeker</h1>

In [1]:
# lib for dataframes
import pandas as pd

# lib for saving np images
from PIL import Image

# lib for plotting
%matplotlib inline
import matplotlib.pyplot as plt

# lib for numerical computations
import numpy as np

# lib for regex
import re

# lib for creating paths
from pathlib import Path

# REYeker lib
import modules.rEYEkerAnalysis as rEYEker

# for t testing
from scipy import stats

# lib for better plotting
import seaborn as sns
sns.set_theme('paper')

# lib for differ calculation
import difflib

<h2>1. Configuration</h2>

<h5>Database configuration </h5>

In [9]:
# path to the Excel data sheet (loaded with pd.read_excel)
config_datasheet_path = r'./Book4.xlsx'

# columns with the visual stimulus data, one per snippet;
# every cell holds space-separated "x-y" click coordinates
config_visual_stimulus_variable_array = ['TR20_01', 'TI20_01', 'BR20_01', 'BI20_01']

# display names of the algorithms (same order as the column lists);
# used as the 'Algorithm' label in plots and as headers in the Excel results
config_algo_names = ['TR_FIB', 'TI_FIB', 'BR_FIB', 'BI_FIB']

# columns with the time data of the visual stimulus (space-separated integers);
# leave empty if no timestamps are available
config_time_variable_array = []

# columns with the answers given by the students
config_answer_variable_array = ['TR10_01', 'TI10_01', 'BR10_01', 'BI10_01']

# regex pattern for the correct answer of every snippet
# (applied with re.match, i.e. anchored at the start of the answer only)
config_answer_pattern_array = ['2', '2','2','2']

# columns with the response time
config_response_time_variable_array = ['TIME042', 'TIME008', 'TIME059', 'TIME023']

<h5>Configuration for REYEker data </h5>

In [10]:
# JSON file from which the rEYEker settings (click_setting) are loaded
config_reyeker_settings_path = "data/example.json"

<h5>Configuration for saving images </h5>

In [11]:
# image files, one per snippet (loaded with rEYEker.load_image);
# they are the base images for heatmaps, sequence diagrams and AOI masks
config_image_path_array = ['images/TR/TR_Fibonacci.png',
               'images/TI/TI_Fibonacci.png',
               'images/BR/BR_Fibonacci.png',
               'images/BI/BI_Fibonacci.png']

# sub-folder per snippet (with trailing slash) that is appended to the
# result folders of heatmaps, sequence diagrams and AOI plots
config_folder_prefix_array = [
    'TR/',
    'TI/',
    'BR/',
    'BI/']

# file-name prefix per snippet for all saved images
config_image_prefix_array = [
    'TR_Fibonacci_',
    'TI_Fibonacci_',
    'BR_Fibonacci_',
    'BI_Fibonacci_']

<h5>Configuration for Code Flow data import</h5>

In [12]:
# Excel workbooks in ./data/code_flow/, one per snippet;
# the sheets "config" and "values" are read from each of them
config_code_flow_datasheet_array = ['TR_Fibonacci.xlsx',
                  'TI_Fibonacci.xlsx',
                  'BR_Fibonacci.xlsx',
                  'BI_Fibonacci.xlsx']  

<h5>Configuration for AOI data import</h5>

In [13]:
# Excel workbooks in ./data/aoi_categorized/, one per snippet, with the columns
# startHeight, stopHeight, startWidth, stopWidth and Name
config_aoi_datasheet_array = ['AOI_TR_Fibonacci.xlsx',
                  'AOI_TI_Fibonacci.xlsx',
                  'AOI_BR_Fibonacci.xlsx',
                  'AOI_BI_Fibonacci.xlsx']  

<h5>Configuration for alpha value for t-test </h5>

In [14]:
# significance level (alpha) of the t-tests:
# a difference counts as significant when p <= alpha
config_alpha = 0.05

<h5>Import the columns and create dataframe</h5>

In [17]:
# column names per snippet: stimulus, (optional) timestamps, answer, response time
needed_columns_array = []
for snippet_pos, stimulus_column in enumerate(config_visual_stimulus_variable_array):
    snippet_columns = [stimulus_column]
    if len(config_time_variable_array) != 0:
        snippet_columns.append(config_time_variable_array[snippet_pos])
    snippet_columns += [
        config_answer_variable_array[snippet_pos],
        config_response_time_variable_array[snippet_pos],
    ]
    needed_columns_array.append(snippet_columns)

raw = pd.read_excel(config_datasheet_path)

# one dataframe per snippet: select its columns, skip the first data row
# (presumably a description row - TODO confirm) and drop incomplete rows
df_array = [
    pd.DataFrame(raw, columns=snippet_columns).iloc[1:].dropna()
    for snippet_columns in needed_columns_array
]

<h4>Splitting Dataframes in right and wrong answers.</h4>

In [9]:
df_array_right = []
df_array_wrong = []

# split every dataframe into the rows with a right and the rows with a wrong answer
for idx, dataframe in enumerate(df_array):
    regex = re.compile(config_answer_pattern_array[idx])
    answer_field = config_answer_variable_array[idx]

    # re.match anchors at the start only, so the pattern '2' also accepts e.g. '2.0'
    is_right = dataframe[answer_field].map(
        lambda answer: regex.match(str(answer)) is not None
    ).astype(bool)

    # boolean indexing replaces the row-wise DataFrame.append (removed in pandas 2.0);
    # it keeps the columns and dtypes of the source dataframe
    df_array_right.append(dataframe[is_right])
    df_array_wrong.append(dataframe[~is_right])

<h4>Remove Outliers</h4>

In [10]:
tmp_df_array = []
tmp_df_array_wrong = []

# right answers: drop rows whose response time is 3 or more standard deviations off the mean
for idx, dataframe in enumerate(df_array_right):
    data = dataframe[config_response_time_variable_array[idx]].astype(float)
    if len(data) < 2 or data.std(ddof=0) == 0:
        # the z-score is NaN for a single value or for identical values, which
        # would silently discard every row, so the dataframe is kept unchanged
        cleared_dataframe = dataframe
    else:
        cleared_dataframe = dataframe[np.abs(stats.zscore(data)) < 3]
    tmp_df_array.append(cleared_dataframe)

# wrong answers: keep the rows between the 15 % and the 85 % quantile of the response time
for idx, dataframe in enumerate(df_array_wrong):
    # cast to float, the column may be stored with object dtype
    data = dataframe[config_response_time_variable_array[idx]].astype(float)
    cleared_dataframe = dataframe[data.between(data.quantile(.15), data.quantile(0.85))]
    tmp_df_array_wrong.append(cleared_dataframe)

# NOTE: from here on df_array only holds the outlier-free rows with right answers
df_array = tmp_df_array
# fixed: the result was assigned to the misspelled name `df_array_wong`,
# so df_array_wrong kept its outliers
df_array_wrong = tmp_df_array_wrong

<h4>Import REYeker Settings</h4>

In [11]:
# load the rEYEker settings; only click_setting is used in this notebook,
# the first two return values are ignored
(_data, _times, click_setting) = rEYEker.load_data_from_json(config_reyeker_settings_path)

<h4>Import Images Settings</h4>

In [12]:
# load the image of every snippet, in the order of config_image_path_array
image_array = [rEYEker.load_image(image_path) for image_path in config_image_path_array]

<h4> Cast Data to Valid format</h4>

Import the visual stimulus measured Data

In [13]:
visual_stimulus_data_matrix = []

# one list of measurements per dataframe, one list of (x, y) tuples per row
for snippet_pos, snippet_df in enumerate(df_array):
    stimulus_column = config_visual_stimulus_variable_array[snippet_pos]
    snippet_measurements = []

    for _row_label, measurement_row in snippet_df.iterrows():
        # the cell holds space-separated "x-y" pairs
        raw_pairs = measurement_row[stimulus_column].strip().split(" ")
        pair_parts = [raw_pair.split("-") for raw_pair in raw_pairs]
        snippet_measurements.append([(int(parts[0]), int(parts[1])) for parts in pair_parts])

    visual_stimulus_data_matrix.append(snippet_measurements)

Import the Time Data

In [14]:
timestamps_data_matrix = []

# one list of timestamp lists per dataframe; stays empty if no time columns are configured
for idx, dataframe in enumerate(df_array):
    if len(config_time_variable_array) <= idx:
        break
    time_measurements = []
    time_measurement_column = config_time_variable_array[idx]

    # iter over every row
    for _idx, item in dataframe.iterrows():
        # the cell holds space-separated integer timestamps
        data_str = item[time_measurement_column].strip()
        timestamps = [int(timestamp) for timestamp in data_str.split(" ")]
        # fixed: the timestamps were appended to the column-name string instead of the result list
        time_measurements.append(timestamps)

    # fixed: the undefined name `visual_stimulus_measurements` was appended here
    timestamps_data_matrix.append(time_measurements)

<h4>Helper Functions</h4>

In [15]:
def save_images(image_array, folder, image_name):
    """
    :brief saves an array of images as PNG files, the running index is used as postfix
    :param image_array: array of images (np.ndarray); the values are scaled by 255,
                        so they are assumed to lie in [0, 1]
    :param folder:      folder to save into, it is created if it does not exist
    :param image_name:  prefix for the file names, the result is <image_name><index>.png
    """
    target_folder = Path(folder)
    target_folder.mkdir(parents=True, exist_ok=True)

    for idx, data in enumerate(image_array):
        # scale to 8 bit; clip first so that values outside [0, 1] saturate
        # instead of wrapping around in the uint8 cast
        data = np.uint8(np.clip(data * 255, 0, 255))
        im = Image.fromarray(data)
        # join with the folder path, so a `folder` without trailing slash
        # no longer puts the files next to the folder
        im.save(target_folder / (image_name + str(idx) + '.png'))
        
def compare_for_h0(arr_1, arr_2, alpha):
    """
    :brief runs an independent two-sample t-test (scipy.stats.ttest_ind) on two samples
    :param arr_1: first sample
    :param arr_2: second sample
    :param alpha: significance level the p value is compared against
    :return: (h0_holds, t, p); h0_holds is True when p > alpha, i.e. when the
             null hypothesis of equal means is not rejected
    """
    statistic, p_value = stats.ttest_ind(arr_1, arr_2)
    h0_holds = True if p_value > alpha else False
    return h0_holds, statistic, p_value
    
def is_in(value, tup):
    """
    :brief checks whether a value lies inside a closed interval
    :param value: value to check
    :param tup:   interval as (lower, upper); a tuple/list or a pandas Series
                  (e.g. a row from DataFrame.iterrows) whose first two entries are the bounds
    :return: True if lower <= value <= upper
    """
    # rows from iterrows carry column labels; plain [0]/[1] on them relies on the
    # deprecated positional fallback of Series.__getitem__, so use .iloc if it exists
    bounds = tup.iloc if hasattr(tup, "iloc") else tup
    return bounds[0] <= value <= bounds[1]

def get_0_offset(number):
    """
    :brief counts the decimal digits of the integer part of a number
    :param number: number to inspect, it is truncated with int(); the sign is ignored
    :return: number of digits, 0 for the value 0
    """
    remaining = abs(int(number))
    digits = 0
    # integer division: the former float division (number / 10) lost precision
    # for large values and could count one digit too many
    while remaining != 0:
        remaining //= 10
        digits += 1
    return digits

<h2>2. Create Single Heatmaps</h2>

create heatmaps

In [16]:
heatmaps_matrix = []

# one list of heatmaps per dataset, one heatmap per measurement
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_matrix):
    # timestamps are optional, use them only if they exist for this dataset
    has_time_stamps = len(timestamps_data_matrix) > dataset_idx
    time_stamp_array = timestamps_data_matrix[dataset_idx] if has_time_stamps else None

    heatmap_array = []
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        if time_stamp_array is not None and len(time_stamp_array) > visual_idx:
            times = time_stamp_array[visual_idx]
        else:
            times = None

        heatmap_array.append(
            rEYEker.draw_shape_heat_map(
                image_array[dataset_idx],
                stimulus_measurement,
                click_setting,
                time_stamps=times,
                should_copy=True,
            )
        )

    heatmaps_matrix.append(heatmap_array)

save Heatmaps

In [17]:
# every dataset gets its own sub-folder below ./results/heatmaps/heatmaps/
for idx in range(len(heatmaps_matrix)):
    target_folder = "./results/heatmaps/heatmaps/" +  config_folder_prefix_array[idx]
    save_images(heatmaps_matrix[idx], target_folder, config_image_prefix_array[idx])

<h2>3. Create Average Heatmaps</h2>

create heatmaps

In [18]:
average_heatmap_array = []
# (the misspelled name is kept, later cells read it)
avergae_heatmask_array = []

# one average heatmap and one heat mask per dataset
for idx in range(len(visual_stimulus_data_matrix)):
    # timestamps are optional
    time_measurements = timestamps_data_matrix[idx] if len(timestamps_data_matrix) > idx else None
    im, mask = rEYEker.draw_average_shape_heat_map_rel(
        image_array[idx],
        visual_stimulus_data_matrix[idx],
        click_setting,
        1.0,
        .0,
        time_measurements,
        should_copy=True,
    )
    average_heatmap_array.append(im)
    avergae_heatmask_array.append(mask)

save heatmaps

In [19]:
# all average heatmaps share one folder; save_images appends the index 0 to every file name
for idx in range(len(average_heatmap_array)):
    single_heatmap = [average_heatmap_array[idx]]
    save_images(single_heatmap, "./results/heatmaps/average_heatmap/", config_image_prefix_array[idx])

<h2>4. Create Sequence diagrams</h2>

create sequence diagrams

In [20]:
sequence_diagrams_matrix = []

# iterate over all the datasets
for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_matrix):
    sequence_diagram_array = []

    # iterate over all the measurements of the dataset (timestamps are not used here)
    for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
        try:
            im = rEYEker.draw_vertical_line_diagram(image_array[dataset_idx], stimulus_measurement, should_copy=True)
            sequence_diagram_array.append(im)
        except Exception as error:
            # TODO: a failed diagram is skipped, so the index in the file names of the
            # following diagrams no longer matches the measurement index.
            # Exception instead of a bare except: KeyboardInterrupt still stops the
            # cell, and the real cause is printed next to the original hint.
            print("W.I.P.:", end='')
            print("to many clicks for dataset " + str(dataset_idx) + " datset " + str(visual_idx))
            print("    caused by " + repr(error))
    sequence_diagrams_matrix.append(sequence_diagram_array)

W.I.P.:to many clicks for dataset 3 datset 14


save sequence diagrams

In [21]:
# every dataset gets its own sub-folder below ./results/sequence_diagrams/
for idx in range(len(sequence_diagrams_matrix)):
    target_folder = "./results/sequence_diagrams/"  +  config_folder_prefix_array[idx]
    save_images(sequence_diagrams_matrix[idx], target_folder, config_image_prefix_array[idx])

<h2>5. Generate Code Flow diagram</h2>

<h4> Use rEYEke_COdeFlow.ipynb to create the corresponding Excel sheets </h4>

<h2>7. Analyse average of Data</h2>

<h3>7.1 Helper Functions</h3>

In [22]:
def save_displots(folder, indexing_array, df_array):
    """
    saves one distribution plot (histogram with kde) per dataframe,
    named after the matching entry of config_image_prefix_array

    folder:         prefix where to save
    indexing_array: column to plot, one entry per dataframe
    df_array:       array of dataframes to plot
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    for position in range(len(df_array)):
        column = indexing_array[position]
        samples = df_array[position][column].values.astype(float)
        facet_grid = sns.displot(data=samples, kde=True)
        facet_grid.savefig(folder + config_image_prefix_array[position] + ".png")
        plt.close()

        
def save_combined_displot(folder, x_axis, dataframe):
    """
    saves one kde plot with a curve per algorithm as Combined_Displot.png

    folder:         prefix where to save
    x_axis:         value to use for x_axis
    dataframe:      dataframe with "Algorithm" field
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    facet_grid = sns.displot(kind="kde", hue="Algorithm", x=x_axis, data=dataframe)
    target_file = folder + "Combined_Displot.png"
    facet_grid.savefig(target_file)
    plt.close()

In [23]:
def save_barplot(folder, y_axis, df_array):
    """
    saves a bar plot of the median of y_axis per algorithm as Combined_Barplot.png

    folder:         prefix where to save
    y_axis:         value to use for y_axis
    df_array:       a single dataframe (despite the name) with the y_axis and "Algorithm" columns
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.barplot(y=y_axis, x='Algorithm', data=df_array, hue='Algorithm', estimator=np.median)
    # hue repeats x, so the legend is redundant; depending on the seaborn version
    # no legend is drawn at all, so remove it only if it exists
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()
    sns_plot.figure.savefig(folder + "Combined_Barplot.png")
    plt.close()

In [24]:
def save_boxplot(folder, y_axis, df_array):
    """
    saves a box plot of y_axis per algorithm as Combined_Boxplot.png

    folder:         prefix where to save
    y_axis:         value to use for y_axis
    df_array:       a single dataframe (despite the name) with the y_axis and "Algorithm" columns
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.boxplot(y=y_axis, x='Algorithm', data=df_array, hue='Algorithm')
    # hue repeats x, so the legend is redundant; depending on the seaborn version
    # no legend is drawn at all, so remove it only if it exists
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()
    sns_plot.figure.savefig(folder + "Combined_Boxplot.png")
    plt.close()

In [25]:
def save_violinplot(folder, y_axis, df_array):
    """
    saves a violin plot of y_axis per algorithm as Combined_Violinplot.png

    folder:         prefix where to save
    y_axis:         value to use for y_axis
    df_array:       a single dataframe (despite the name) with the y_axis and "Algorithm" columns
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    sns_plot = sns.violinplot(y=y_axis, x='Algorithm', data=df_array, hue='Algorithm')
    # hue repeats x, so the legend is redundant; depending on the seaborn version
    # no legend is drawn at all, so remove it only if it exists
    legend = sns_plot.get_legend()
    if legend is not None:
        legend.remove()
    sns_plot.figure.savefig(folder + "Combined_Violinplot.png")
    plt.close()

In [26]:
def save_implots(folder, x_df, x_axis, y_df, y_axis):
    """
    saves one regression plot (sns.lmplot) of y_axis over x_axis per algorithm

    folder: prefix where to save
    x_df:   dataframe with the x_axis column and an "Algorithm" column
    x_axis: column of x_df to use for the x axis
    y_df:   dataframe with the y_axis column; its rows are matched to x_df
            through the index labels 0 .. len(x_df)-1
    y_axis: column of y_df to use for the y axis
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    paired_rows = [
        [x_df[x_axis][row], y_df[y_axis][row], x_df['Algorithm'][row]]
        for row in range(len(x_df))
    ]
    df_tmp = pd.DataFrame(paired_rows, columns=[x_axis, y_axis, 'Algorithm'])

    for idx, algo_name in enumerate(config_algo_names):
        algo_rows = df_tmp[df_tmp["Algorithm"] == algo_name]
        sns_plot = sns.lmplot(data=algo_rows, x=x_axis, y=y_axis)
        sns_plot.set(ylim=(0, None))
        sns_plot.savefig(folder + algo_name + str(idx) + ".png")
        plt.close()

def save_combined_implot(folder, x_df, x_axis, y_df, y_axis):
    """
    saves one regression plot (sns.lmplot) with a line per algorithm as Combined_Implot.png

    folder: prefix where to save
    x_df:   dataframe with the x_axis column and an "Algorithm" column
    x_axis: column of x_df to use for the x axis
    y_df:   dataframe with the y_axis column; its rows are matched to x_df
            through the index labels 0 .. len(x_df)-1
    y_axis: column of y_df to use for the y axis
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    paired_rows = [
        [x_df[x_axis][row], y_df[y_axis][row], x_df['Algorithm'][row]]
        for row in range(len(x_df))
    ]
    df_tmp = pd.DataFrame(paired_rows, columns=[x_axis, y_axis, 'Algorithm'])

    sns_plot = sns.lmplot(hue="Algorithm", y=y_axis, x=x_axis, data=df_tmp)
    sns_plot.set(ylim=(0, None))
    sns_plot.savefig(folder + "Combined_Implot.png")
    plt.close()

<h3>7.2 Response Time</h3>

Create new Dataframe which holds all the response time data

In [27]:
columns = ['Response Time', 'Algorithm']
# one [response time, algorithm name] record per row of every dataframe
data = [
    [row[config_response_time_variable_array[idx]], config_algo_names[idx]]
    for idx, dataframe in enumerate(df_array)
    for _idx, row in dataframe.iterrows()
]
algo_df = pd.DataFrame(data, columns=columns)

Create and Save Displots

In [28]:
# one distribution plot of the response time per dataframe in df_array
save_displots("./results/responseTime/displots/", config_response_time_variable_array, df_array)

# all algorithms in one kde plot
save_combined_displot("./results/responseTime/displots/", "Response Time", algo_df)

Create and save barplots

In [29]:
# median response time per algorithm
save_barplot("./results/responseTime/barplot/", 'Response Time', algo_df)

Create and save boxplot

In [30]:
# response time per algorithm as box plot
save_boxplot("./results/responseTime/boxplot/", 'Response Time', algo_df)

Create and save violinplot

In [31]:
# response time per algorithm as violin plot
save_violinplot("./results/responseTime/violinplot/", 'Response Time', algo_df)

<h3>7.3. Code Flow vs Visual Stimulus</h3>

Load dataframes

In [32]:
# read data: per workbook the sheet "config" and the sheet "values", both cast to int32
config_array = []
code_flow_array = []

for workbook_name in config_code_flow_datasheet_array:
    workbook_path = './data/code_flow/' + workbook_name

    # read both sheets before storing anything, so a missing sheet leaves both lists in sync
    sheet_config = pd.read_excel(workbook_path, sheet_name="config").astype('int32')
    sheet_code_flow = pd.read_excel(workbook_path, sheet_name="values").astype('int32')

    config_array.append(sheet_config)
    code_flow_array.append(sheet_code_flow)

# transform stimulus data into code lines
visual_stimulus_code_flow_matrix = []

for snippet_pos in range(len(visual_stimulus_data_matrix)):
    converted_to_lines_array = []

    for clicks in visual_stimulus_data_matrix[snippet_pos]:
        converted_to_lines = []

        for (_x, y) in clicks:
            # index of every config row whose range contains y
            hits = [
                line_idx
                for line_idx, line_range in config_array[snippet_pos].iterrows()
                if is_in(y, line_range)
            ]
            # the last matching row wins, -1 marks a click outside of every row
            converted_to_lines.append(hits[-1] if hits else -1)

        converted_to_lines_array.append(converted_to_lines)

    visual_stimulus_code_flow_matrix.append(converted_to_lines_array)

XLRDError: No sheet named <'values'>

In [None]:
# gen sequence for visual stimulus flow
visual_stimulus_flow_matrix = []

for idx, visual_stimulus_code_flow_datasets in enumerate(visual_stimulus_code_flow_matrix):
    # power of ten that moves the previous line number in front of the next one
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    # every pair of consecutive line numbers becomes one number: pre * offset + post
    sequence_array = [
        [pre * multiplier_offset + post for pre, post in zip(line_sequence, line_sequence[1:])]
        for line_sequence in visual_stimulus_code_flow_datasets
    ]

    visual_stimulus_flow_matrix.append(sequence_array)

In [None]:
# gen sequence for code flow
code_flow_sequence_array = []

for idx, code_flow_dataset in enumerate(code_flow_array):
    sequence = []
    # Use the same factor as the visual stimulus sequences (digits of the number of
    # config rows). It was derived from the length of the code flow before, so both
    # encodings could differ and equal transitions never matched in the comparison.
    multiplier_offset = 10**get_0_offset(len(config_array[idx]))

    for start in range(len(code_flow_dataset)-1):
        # plain ints, so the products cannot overflow the int32 of the sheet
        pre = int(code_flow_dataset['code flow'][start])
        post = int(code_flow_dataset['code flow'][start+1])
        #potential skip if pre and post is equal, may be useful
        num = pre * multiplier_offset + post
        sequence.append(num)

    code_flow_sequence_array.append(sequence)

In [None]:
#create dataframes
# similarity (difflib ratio) of every visual stimulus sequence to the code flow of its snippet
similarity_matrix = [
    [
        difflib.SequenceMatcher(None, code_flow_sequence, visual_stimulus_sequence).ratio()
        for visual_stimulus_sequence in visual_stimulus_flow_matrix[idx]
    ]
    for idx, code_flow_sequence in enumerate(code_flow_sequence_array)
]

cols = ['Similarity', 'Algorithm']
data = [
    [score, config_algo_names[idx]]
    for idx, scores in enumerate(similarity_matrix)
    for score in scores
]

df_similarity = pd.DataFrame(data, columns=cols)

# one dataframe per algorithm with only the 'Similarity' column
df_similarity_array = []
for idx in range(len(code_flow_array)):
    algo_rows = df_similarity.loc[df_similarity['Algorithm'] == config_algo_names[idx]]
    df = pd.DataFrame(algo_rows, columns=cols).drop('Algorithm', axis=1)
    df_similarity_array.append(df)

In [None]:
# one distribution plot of the similarity per algorithm
save_displots("./results/codeFlowSimilarity/displots/", ["Similarity"]*len(similarity_matrix), df_similarity_array)

# all algorithms in one kde plot
save_combined_displot("./results/codeFlowSimilarity/displots/", "Similarity", df_similarity)

In [None]:
# median similarity per algorithm
save_barplot("./results/codeFlowSimilarity/barplot/", 'Similarity', df_similarity)

In [None]:
# similarity per algorithm as box plot
save_boxplot("./results/codeFlowSimilarity/boxplot/", 'Similarity', df_similarity)

In [None]:
# similarity per algorithm as violin plot
save_violinplot("./results/codeFlowSimilarity/violinplot/", 'Similarity', df_similarity)

<h3>7.4. Regression Code Flow and Response time</h3>

In [None]:
# regression of the response time over the similarity; the rows of df_similarity and
# algo_df are paired by their index, so both must list the measurements in the same order
save_implots("./results/codeFlowSimilarity/implots/", df_similarity, "Similarity", algo_df, "Response Time")
save_combined_implot("./results/codeFlowSimilarity/implots/", df_similarity, "Similarity", algo_df, "Response Time")

<h3>7.5. Statistical Values</h3>

Create Excel sheet with data containing speedup and significance corresponding to response time

In [None]:
def create_statical_data(df_array, indexing_array, folder, name):
    """
    compares every dataframe with every dataframe (t-test and ratio of the means)
    and writes the results into the Excel workbook <folder><name>.xlsx

    df_array:       array of dataframes, one per entry of config_algo_names
    indexing_array: column to evaluate, one entry per dataframe
    folder:         prefix where to save
    name:           file name of the workbook without extension
    """
    def highlight_signficant(s):
        # colour the cells of a ratio column green where the difference is significant;
        # the label column '#' is never highlighted
        if s.name != '#':
            is_sig = df_significance[s.name]
            return ['background-color: green' if v else '' for v in is_sig]
        else:
            return ['' for v in s]

    mean_array = []
    is_significant = []
    t_value_response_time = []
    p_value_response_time = []
    ratio = []

    for idx1, df1 in enumerate(df_array):
        # every result row starts with the algorithm name (column '#')
        different_tmp = [config_algo_names[idx1]]
        t_tmp = [config_algo_names[idx1]]
        p_tmp = [config_algo_names[idx1]]
        ratio_tmp = [config_algo_names[idx1]]

        for idx2, df2 in enumerate(df_array):
            (h0_holds, t, p) = compare_for_h0(df1[indexing_array[idx1]].values, df2[indexing_array[idx2]],  config_alpha)
            # compare_for_h0 returns True when p > alpha, i.e. for "no significant difference"
            different_tmp.append(not h0_holds)
            t_tmp.append(t)
            p_tmp.append(p)
            ratio_tmp.append(df1[indexing_array[idx1]].mean() / df2[indexing_array[idx2]].mean())

        is_significant.append(different_tmp)
        t_value_response_time.append(t_tmp)
        p_value_response_time.append(p_tmp)
        ratio.append(ratio_tmp)
        mean_array.append(df1[indexing_array[idx1]].mean())

    df_significance = pd.DataFrame(is_significant, columns=['#'] + config_algo_names)

    df_t = pd.DataFrame(t_value_response_time, columns=['#'] + config_algo_names)
    df_p = pd.DataFrame(p_value_response_time, columns=['#'] + config_algo_names)
    df_ratio = pd.DataFrame(ratio, columns=['#'] + config_algo_names)
    df_ratio = df_ratio.style.apply(highlight_signficant)
    df_mean = pd.DataFrame([mean_array], columns=config_algo_names)

    Path(folder).mkdir(parents=True, exist_ok=True)

    # the context manager saves and closes the workbook, also if a sheet fails;
    # ExcelWriter.save() no longer exists in pandas 2.0
    with pd.ExcelWriter(folder + name + '.xlsx', engine='xlsxwriter') as writer:
        df_ratio.to_excel(writer, sheet_name='ratio col row', index=False)
        df_mean.to_excel(writer, sheet_name='mean value', index=False)
        df_significance.to_excel(writer, sheet_name='statistical difference', index=False)
        df_p.to_excel(writer, sheet_name='p values', index=False)
        df_t.to_excel(writer, sheet_name='t values', index=False)

In [None]:
# number of clicks of every measurement, grouped by dataset
len_of_visual = [
    [len(dataset) for dataset in dataset_row]
    for dataset_row in visual_stimulus_data_matrix
]

# one dataframe with the single column 'len' per dataset
len_df_array = [pd.DataFrame(len_array, columns=['len']) for len_array in len_of_visual]

Create Excel sheet with data containing speedup and significance corresponding to len of measured visual stimulus data

In [None]:
# one workbook per measure: response time, number of clicks, similarity to the code flow
create_statical_data(df_array, config_response_time_variable_array, './results/excel/', 'TotalResponseTime')
create_statical_data(len_df_array, ['len']*len(len_df_array), './results/excel/', 'NumberOfMeasurements')
create_statical_data(df_similarity_array, ['Similarity']*len(df_similarity_array), './results/excel/', 'SimilarityToCodeFlow')

<h2>8. Areas of Interest </h2>

<h3>8.1. Helper</h3>

In [33]:
from matplotlib.patches import Rectangle

def save_aoi_sequences(folder, image, image_prefix, aoi_mask, aoi_names, visual_measurement_array):
    """
    saves one plot per measurement: a count plot of the visited AOIs with the
    order of the visits drawn on top as a line

    folder:                   prefix where to save
    image:                    not used, kept so that existing calls keep working
    image_prefix:             prefix of the file names
    aoi_mask:                 2D integer array, indexed [y, x], with the AOI index of every pixel
    aoi_names:                name of every AOI, indexed by the values of aoi_mask
    visual_measurement_array: array of measurements, each a list of (x, y) tuples
    return:                   float32 array with the RGB bar colours of the last plot,
                              an empty list if there is no measurement
    """
    Path(folder).mkdir(parents=True, exist_ok=True)

    color = []

    for idx, visual_measurement in enumerate(visual_measurement_array):
        # AOI index of every click
        cats = [int(aoi_mask[y,x]) for (x,y) in visual_measurement]

        # every AOI name is listed once in front of the visited ones, so each AOI gets a
        # bar in the order of aoi_names; every count is therefore one too high
        classes = list(aoi_names) + [aoi_names[cat] for cat in cats]
        df_classes = pd.DataFrame(classes, columns=["aois"])

        # spread the visits evenly between 1 and the height of the highest bar
        incremental_step = float(df_classes['aois'].value_counts().max()-1)/float(max(len(cats) - 1,1))
        y_val = [1+i*incremental_step for i in range(len(cats))]
        x_val = cats

        fig, ax1 = plt.subplots(figsize=(6,6))

        sns.countplot(x="aois", data=df_classes, ax=ax1)
        ax1.plot(x_val, y_val, '-o', linewidth=2, color='black')
        ax1.set_xticklabels(ax1.get_xticklabels(),rotation=35, horizontalalignment='right')

        ax1.figure.savefig(folder + image_prefix + "aoi_sequence_" + str(idx) + ".png")

        # read the bar colours back from the plot; the last Rectangle is skipped
        # (presumably the background patch of the axes - TODO confirm)
        bars = [r for r in ax1.get_children() if type(r)==Rectangle]
        colors = [c.get_facecolor() for c in bars[:-1]]
        colors = [[r,g,b] for (r,g,b,a) in colors]
        color = np.array(colors).astype(np.float32)
        plt.close(fig)

    return color

def apply_aoi_mask(image, colors, aoi_mask):
    """
    blends every pixel of an image with the colour of the AOI it belongs to

    image:    image array of shape (height, width, channels), it is not modified
    colors:   one colour per AOI, indexed by the values of aoi_mask
    aoi_mask: 2D integer array of shape (height, width) with the AOI index of every pixel
    return:   copy of the image, every pixel is 40 % image and 60 % AOI colour
    """
    im = image.copy()
    # vectorised form of the former per-pixel Python loop; the assignment through [...]
    # keeps the dtype of the image, as the element-wise assignment did
    im[...] = 0.4 * im + 0.6 * np.asarray(colors)[aoi_mask]
    return im

<h3>8.1. Areas of Interest supervised </h3>

In [34]:
import modules.HeatmapHelpers as heathelpers

aoi_df_array = []
columns = ["startHeight", "stopHeight", "startWidth", "stopWidth", "Name"]

# one dataframe with the AOI rectangles per snippet
for aoi_sheet in config_aoi_datasheet_array:
    raw = pd.read_excel("./data/aoi_categorized/" + aoi_sheet)
    df = pd.DataFrame(raw, columns=columns)
    aoi_df_array.append(df)

aoi_mask_array = []
aoi_region_name_matrix = []

# one mask (AOI index per pixel) and one list of AOI names per snippet
for idx in range(len(aoi_df_array)):
    image_shape = image_array[idx].shape[0], image_array[idx].shape[1]
    # NOTE(review): pixels outside of every rectangle keep the value 0 and therefore
    # count as the first AOI - confirm that the sheets cover the whole image
    aoi_mask = np.zeros(image_shape)
    aoi_names = []
    for inner_idx, row in aoi_df_array[idx].iterrows():
        # slice bounds must be integers, values from Excel may arrive as floats
        start_height = int(row["startHeight"])
        stop_height = int(row["stopHeight"])
        start_width = int(row["startWidth"])
        stop_width = int(row["stopWidth"])
        name = row["Name"]
        # rectangles with the same name share one index (position of the first occurrence)
        if name not in aoi_names:
            aoi_names.append(name)
        aoi_mask[start_height:stop_height, start_width:stop_width] = aoi_names.index(name)

    aoi_mask_array.append(aoi_mask.astype(int))
    aoi_region_name_matrix.append(aoi_names)

# (a stray expression statement `visual_stimulus_data_matrix[idx]` without effect was removed here)
used_color_matrix = []
for idx in range(len(aoi_df_array)):
    colors = save_aoi_sequences("./results/aois/supervised/" + config_folder_prefix_array[idx], image_array[idx], config_image_prefix_array[idx], aoi_mask_array[idx], aoi_region_name_matrix[idx], visual_stimulus_data_matrix[idx])
    used_color_matrix.append(colors)

In [35]:
# AOI index of every click, grouped by snippet and measurement
aoi_visual_stimulus_matrix = [
    [
        [aoi_mask_array[idx][y,x] for (x,y) in visual_stimulus]
        for visual_stimulus in visual_stimulus_array
    ]
    for idx, visual_stimulus_array in enumerate(visual_stimulus_data_matrix)
]

# one matrix per snippet: entry [source, target] counts the steps from AOI source to AOI target
aoi_transition_tensor = []

for idx, aoi_visual_stimulus_array in enumerate(aoi_visual_stimulus_matrix):
    aoi_count = len(aoi_region_name_matrix[idx])
    aoi_transition_matrix = np.zeros((aoi_count, aoi_count))

    for aoi_sequence in aoi_visual_stimulus_array:
        for source, target in zip(aoi_sequence, aoi_sequence[1:]):
            aoi_transition_matrix[source, target] += 1

    aoi_transition_tensor.append(aoi_transition_matrix)

In [52]:
from chord import Chord


In [51]:
# show the AOI transition counts of the snippet at index 1 with the chord package
Chord(aoi_transition_tensor[1].astype(int).tolist() ,aoi_region_name_matrix[1]).show()

In [None]:
# NOTE(review): a top-level `return` is a SyntaxError; presumably it is used on purpose to
# stop "Run All" before the unfinished section below - confirm, and prefer a clear message
return

<h3>8.2. Areas of Interest unsupervised</h3>

How to analyze areas of interest?

1. [x] find max heat values or values above a threshold
    1. [x] split into large areas by a certain threshold
    2. [x] maybe create a bounding box of them and analyze them by priority for better computation time
        1. [x] fix AOIs to lines, maybe to columns if this seems useful for an algorithm

2. [x] analyse how to jump between different regions of interest
    1. [x] categorize just hot areas as AOIs
    2. [x] categorize everything as AOIs
    %% 3. [ ] list how the code flow jumps between these regions

3. [ ] analyse all the data for a mean and statistical significance
    1. [ ] maybe implement generalized string mean algorithm for medium jumps (NP-complete)
    2. [ ] maybe use the same as in the code flow [#todo also implement generalized string mean for code flow]

In [None]:
import modules.HeatmapHelpers as heathelpers

# heat values are rounded to multiples of `treshhold`;
# rounded values below `upper_limit` are set to 0
treshhold = 0.25
chunk_width = 12
value = 1.0 / treshhold
upper_limit = 0.2

shape = average_heatmap_array[0].shape[:2]

data = np.reshape(np.array(avergae_heatmask_array[0]), shape)
data = np.rint(data * value) / value
below_limit = data < upper_limit
data[below_limit] = 0.0

# draw the quantized heat on a copy of the first image
im = image_array[0].copy()
tmp = data.flatten().tolist()
heathelpers.draw_heat(im, tmp)

imgplot0 = plt.imshow(im)

In [None]:
shape = average_heatmap_array[0].shape[:2]
horizontal_data = np.zeros(shape)

# in every row, each chunk of `chunk_width` pixels takes the maximum heat value of the chunk
for val in range(0, shape[1], chunk_width):
    chunk = slice(val, val + chunk_width)
    horizontal_data[:, chunk] = data[:, chunk].max(axis=1, keepdims=True)

img = image_array[0].copy()
heat_mask = horizontal_data.flatten().tolist()
heathelpers.draw_heat(img, heat_mask)

imgplot0 = plt.imshow(img)

In [None]:
shape = average_heatmap_array[0].shape[:2]
line_data = np.zeros(shape)

# every block (rows start..stop of a config row, `chunk_width` pixels wide)
# takes the maximum heat value inside the block
for _idx, row in config_array[0].iterrows():
    line_rows = slice(row["start"], row["stop"] + 1)
    for val in range(0, shape[1], chunk_width):
        block = (line_rows, slice(val, val + chunk_width))
        line_data[block] = data[block].max()

im = image_array[0].copy()
heat_mask = line_data.flatten().tolist()
heathelpers.draw_heat(im, heat_mask)

imgplot0 = plt.imshow(im)

In [None]:
# click coordinates of the first measurement of the first snippet
scanpath = visual_stimulus_data_matrix[0][0]
x = [click_x for (click_x, _click_y) in scanpath]
y = [click_y for (_click_x, click_y) in scanpath]

# draw the clicks as a dotted path on top of the current `im`
fig, ax = plt.subplots(figsize=(6,6))
ax.imshow(im)
ax.plot(x, y, ':o', linewidth=1, color='black');

In [None]:
from matplotlib import animation
from IPython.display import HTML

fig, ax = plt.subplots(figsize=(8,8));
ax.set_ylim((0, 416))
ax.set_xlim((0, 870))
ax.invert_yaxis()

# a single image artist, its data is replaced in every frame
heat_image = ax.imshow(image_array[0])
line, = ax.plot([], [], "k-x", lw=2)

def init():
    """resets the line and returns the artists that are redrawn by the animation"""
    line.set_data([], [])
    return (heat_image, line)


x_glob = np.array([x for (x,y) in visual_stimulus_data_matrix[0][0]])
y_glob = np.array([y for (x,y) in visual_stimulus_data_matrix[0][0]])

# one frame per click of the first measurement
len_value = len(x_glob)

def animate(i):
    """draws the heat map for the window low..i, passed to rEYEker as min_idx/max_idx"""
    low = max(0, i-16)

    image = rEYEker.draw_shape_heat_map(image_array[0], visual_stimulus_data_matrix[0][0], click_setting, min_idx=low, max_idx=i, should_copy=True)
    # update the existing artist: ax.imshow was called here before, which stacked one more
    # image on the axes per frame and returned only the unchanged line for blitting
    heat_image.set_data(image)

    return (heat_image, line)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=len_value, interval=100, blit=True)
plt.close()

In [None]:
# render the animation as HTML5 video and display it as cell output
HTML(anim.to_html5_video())

In [None]:
from queue import Queue

# category of every pixel of the first image, -1 marks "not assigned yet"
shape = image_array[0].shape[:2]
categorize_matrix = np.full(shape, -1)

def get_adjacent(x, y, shape):
    """
    returns the direct neighbours (4-neighbourhood) of a cell that lie inside the matrix

    x:      index along the first axis
    y:      index along the second axis
    shape:  shape of the matrix, (size of first axis, size of second axis)
    return: list of (x, y) tuples in the order x-1, x+1, y-1, y+1
    """
    candidates = [
        (x-1 >= 0, (x-1, y)),
        (x+1 < shape[0], (x+1, y)),
        (y-1 >= 0, (x, y-1)),
        (y+1 < shape[1], (x, y+1)),
    ]
    return [cell for is_inside, cell in candidates if is_inside]


def bfs_fill(start_x, start_y, cat_matrix, heat_mask, cat_value):
    """
    flood fill (breadth first): assigns cat_value to the start cell and to every
    unassigned cell that is connected to it through cells with the same heat value

    start_x:    index of the start cell along the first axis
    start_y:    index of the start cell along the second axis
    cat_matrix: category matrix, -1 marks unassigned cells; it is modified in place
    heat_mask:  matrix with the heat values, same shape as cat_matrix
    cat_value:  category to assign
    """
    # local import, so the cell does not depend on a changed import list
    from collections import deque

    value = heat_mask[start_x][start_y]
    cat_matrix[start_x][start_y] = cat_value
    # deque is the plain FIFO queue; queue.Queue adds locking that is only needed between threads
    pending = deque([(start_x, start_y)])

    while pending:
        (x, y) = pending.popleft()
        for (ad_x, ad_y) in get_adjacent(x, y, cat_matrix.shape):
            if cat_matrix[ad_x][ad_y] == -1 and heat_mask[ad_x][ad_y] == value:
                # mark before queueing, so no cell is queued twice
                cat_matrix[ad_x][ad_y] = cat_value
                pending.append((ad_x, ad_y))
    
# give every connected region of equal heat value its own running category number.
# The loop variables are not called x/y any more: they overwrote the global x/y click
# coordinates that the last cell plots.
cat_value = 0
for row_idx in range(shape[0]):
    for col_idx in range(shape[1]):
        if categorize_matrix[row_idx][col_idx] == -1:
            bfs_fill(row_idx, col_idx, categorize_matrix, line_data, cat_value)
            cat_value += 1

aoi_mask = categorize_matrix.astype(int)
# the categories have no names, so their numbers are used
aoi_names = [str(idx) for idx in range(cat_value)]

colors = save_aoi_sequences("./results/aois/unsupervised/" + config_folder_prefix_array[0], image_array[0], config_image_prefix_array[0], aoi_mask, aoi_names, visual_stimulus_data_matrix[0])

In [None]:
fig, ax = plt.subplots(figsize=(8,8))

# blend the first image with the colours of the unsupervised AOIs
# (same blending as the former inline per-pixel loop)
im = apply_aoi_mask(image_array[0], colors, categorize_matrix)

# take the click coordinates from the data again: the global x/y may have been
# overwritten with plain integers by the loops of the category fill
scanpath = visual_stimulus_data_matrix[0][0]
x = [click_x for (click_x, _click_y) in scanpath]
y = [click_y for (_click_x, click_y) in scanpath]

ax.imshow(im)
ax.plot(x, y, ':', linewidth=2, color='black');