# Areas of Interest Data Analysis

In [6]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import re

import modules.rEYEkerAnalysis as rEYEker

from scipy import stats

from chord import Chord

In [120]:
config_prefix = ['BR', 'BI', 'TR', 'TI']

config_algo_names = ['BinarySearch', 'BubbleSort', 'Factorial', 'Fibonacci', 'IntegerBinary', 'MultiplyMatrix', 'PrimeFactors', 'ReverseString']

config_id_variable = "CASE"

config_click_variables = [
    ['A319_01', 'A420_01', 'A220_01', 'A118_01'],
    ['BR18_01', 'BI18_01', 'TR18_01', 'TI18_01'],
    ['BR19_01', 'BI19_01', 'TR19_01', 'TI19_01'],
    ['BR20_01', 'BI20_01', 'TR20_01', 'TI20_01'],
    ['A418_01', 'A317_01', 'A117_01', 'A218_01'],
    ['A219_01', 'A120_01', 'A320_01', 'A417_01'],
    ['A119_01', 'A217_01', 'A419_01', 'A318_01'],
    ['BR21_01', 'BI21_01', 'TR21_01', 'TI21_01']
]

config_answer_variables = [
    ['A310_01', 'A414_01', 'A214_01', 'A106_01'],
    ['BR02_01', 'BI02_01', 'TR02_01', 'TI02_01'],
    ['BR06_01', 'BI06_01', 'TR06_01', 'TI06_01'],
    ['BR10_01', 'BI10_01', 'TR10_01', 'TI10_01'],
    ['A406_01', 'A302_01', 'A102_01', 'A206_01'],
    ['A210_01', 'A114_01', 'A314_01', 'A402_01'],
    ['A110_01', 'A202_01', 'A410_01', 'A306_01'],
    ['BR14_01', 'BI14_01', 'TR14_01', 'TI14_01'],
]

config_time_variables = [
    ['TIME049','TIME070','TIME036','TIME015'],
    ['TIME021','TIME055','TIME005','TIME038'],
    ['TIME006','TIME040','TIME025','TIME061'],
    ['TIME059','TIME023','TIME042','TIME008'],
    ['TIME066','TIME051','TIME017','TIME032'],
    ['TIME034','TIME019','TIME053','TIME064'],
    ['TIME013','TIME030','TIME068','TIME047'],
    ['TIME044','TIME010','TIME057','TIME027'],
]


config_answer_patterns = [
    ['3', '3', '3', '3'],
    ['3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536'],
    ['6','6','120','120'],
    ['2', '2','2','2'],
    ['1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1'],
    ['6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18'],
    ['3.*5','3.*5','3.*5','3.*5'],
    ['gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]']
]


config_reyeker_settings_path = "data/example.json"

config_datasheet = r'./Book5.xlsx'

config_alpha = 0.05

In [97]:
image_paths = []

for algo_name in config_algo_names:
    image_array = []
    for prefix in config_prefix:
        image_array.append('images/' + prefix + '/' + prefix + '_' + algo_name + '.png')
    image_paths.append(image_array)
    
aoi_datasheets = []
for algo_name in config_algo_names:
    datasheet_array = []
    for prefix in config_prefix:
        datasheet_array.append('data/aoi_categorized/AOI_' + prefix + '_' + algo_name + '.xlsx')
    aoi_datasheets.append(datasheet_array)

In [122]:
df_matrix = []
raw = pd.read_excel(config_datasheet)

# load data
for row, _stuff in enumerate(config_algo_names):
    df_array = []
    for col, _stuff in enumerate(config_prefix):
        df = pd.DataFrame(raw, columns = [config_id_variable, config_click_variables[row][col], config_answer_variables[row][col], config_time_variables[row][col]])
        df = df.iloc[1:]
        df = df.dropna()
        df_array.append(df)
    df_matrix.append(df_array)

In [123]:
df_matrix[0][0]

Unnamed: 0,CASE,A319_01,A310_01,TIME049
2,638,136-143 182-47 475-78 284-96 190-147 437-143 1...,3,296
16,712,149-33 282-94 218-194 513-176 224-204 487-235 ...,3,432
33,750,464-76 144-124 406-176 534-235 307-273 541-370...,3,760
51,784,176-32 413-60 216-110 426-77 209-116 286-166 5...,3,438
73,829,197-66 457-40 232-133 383-167 152-140 194-198 ...,-1,449
80,841,195-40 450-92 201-104 196-137 535-171 208-217 ...,2341040,123
88,857,100-35 364-56 165-89 385-87 152-108 176-142 20...,3,467
109,938,200-69 374-73 230-115 182-155 385-177 238-190 ...,3,575


In [165]:
['BR', 'BI', 'TR', 'TI']
cols = ["id", "response_time", "correctness", "flag"]
df_BU_R = pd.DataFrame(columns=cols)
df_BU_I = pd.DataFrame(columns=cols)
df_TD_R = pd.DataFrame(columns=cols)
df_TD_I = pd.DataFrame(columns=cols)

for row_idx, df_row in enumerate(df_matrix):
    for col, df in enumerate(df_row):
        
        pattern = config_answer_patterns[row_idx][col]
        regex = re.compile(pattern)
        
        for index, row in df.iterrows():

            id_value =  row["CASE"]
            response_time = int(row[config_time_variables[row_idx][col]])
            answer = row[config_answer_variables[row_idx][col]]
            result = regex.match(str(answer))
            correctness = False
            
            if result is not None:
                correctness = True
                
            flag = ""
            data = pd.DataFrame([[id_value, response_time, correctness, flag]], columns=cols)
                  
            if col == 0:
                df_BU_R = df_BU_R.append(data)
            elif col == 1:
                df_BU_I = df_BU_I.append(data)
            elif col == 2:
                df_TD_R = df_TD_R.append(data)
            else:
                df_TD_I = df_TD_I.append(data)   

In [166]:
import math 

def std(df_with_response):
    n = len(df_with_response)
    
    mean = df_with_response["response_time"].mean()
    
    deviations = [(x["response_time"] - mean) ** 2 for idx, x in df_with_response.iterrows()]
    
    variance = sum(deviations)/n
    return math.sqrt(variance)

In [167]:
df_BU_R
df_BU_I
df_TD_R
df_TD_I

df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]
df_means = [df[["response_time"]].mean() for df in df_array_independent]
df_stds = [df[["response_time"]].std() for df in df_array_independent]

In [168]:
for idx, df in enumerate(df_array_independent):
    mean = float(df_means[idx])
    stddev = float(df_stds[idx])
    for idx, row in df.iterrows():
        response_time = row["response_time"]
        if not(30 < response_time < mean+2*stddev):
            row["flag"] = "outlier"

In [170]:
cols = ["id", "response_time", "correctness", "flag"]

for idx1, df in enumerate(df_array_independent):
    for idx2, row in df.iterrows():
        id_value = row["id"]
        entries_id_BU_R = df_BU_R.loc[df_BU_R["id"] == id_value]
        entries_id_BU_I = df_BU_I.loc[df_BU_I["id"] == id_value] 
        entries_id_TD_R = df_TD_R.loc[df_TD_R["id"] == id_value]
        entries_id_TD_I = df_TD_R.loc[df_TD_R["id"] == id_value]
        
        df_per_participant = entries_id_BU_R
        df_per_participant = df_per_participant.append(entries_id_BU_I)
        df_per_participant = df_per_participant.append(entries_id_TD_R)
        df_per_participant = df_per_participant.append(entries_id_TD_I)
        
        rest = (4-len(df_per_participant) % 4) % 4
    
        if 0 < rest <= 2:
            
            len_BU_R = len(entries_id_BU_R)
            len_BU_I = len(entries_id_BU_I)
            len_TD_R = len(entries_id_TD_R)
            len_TD_I = len(entries_id_TD_I)
            
            if len_BU_R < len_BU_I or len_BU_R < len_TD_R or len_BU_R < len_TD_I:
                # Missing in BU_R
                tmp_df = pd.DataFrame([id_value, df_means[0], True, "inserted"], columns=cols)
                df_BU_R = df_BU_R.append(tmp_df)
            
            #if len_BU_I < len_BU_R or len_BU_I < len_TD_R or len_BU_I < len_TD_I:
            #    # Missing in BU_I
            #    tmp_df = pd.DataFrame([id_value, df_means[1], True, "inserted"], columns=cols)
            #    df_BU_R = df_BU_R.append(tmp_df)
            #
            #if len_TD_R < len_BU_R or len_TD_R < len_BU_I or len_TD_R < len_TD_I:
            #    # Missing in TD_R
            #    tmp_df = pd.DataFrame([id_value, df_means[2], True, "inserted"], columns=cols)
            #    df_TD_R = df_TD_R.append(tmp_df)
            #
            #if len_TD_I < len_BU_R or len_TD_I < len_BU_I or len_TD_I < len_TD_R:
            #    # Missing in TD_I
            #    tmp_df = pd.DataFrame([id_value, df_means[3], True, "inserted"], columns=cols)
            #    df_TD_I = df_TD_I.append(tmp_df)
            
        elif rest > 2:
            #drop it
            pass

ValueError: Shape of passed values is (4, 1), indices imply (4, 4)

In [162]:
df_array_independent[0]

Unnamed: 0,id,response_time,correctness,flag
0,638,296,True,2
0,712,432,True,0
0,750,760,True,0
0,784,438,True,0
0,829,449,False,0
...,...,...,...,...
0,889,69,True,0
0,895,69,True,0
0,938,739,True,0
0,948,174,True,0


In [None]:


for row, df_row in enumerate(df_matrix):
    tmp_df_array = []
    
    for col, df in enumerate(df_row):
        time_field = config_time_variables[row][col]
        data = df[time_field]
        
        cleared_df = None
        if len(data) == 1:
            cleared_dataframe = df
        else:
            cleared_dataframe = df[(np.abs(stats.zscore(data.astype(float))) <= 2)]
            
        tmp_df_array.append(cleared_dataframe)
        
    tmp_df_matrix.append(tmp_df_array)

In [5]:
tmp_df_matrix = []

#filter for right
for row, df_row in enumerate(df_matrix):
    tmp_df_array = []
    for col, df in enumerate(df_row):
        right_answer_pattern = config_answer_patterns[row][col]
        regex = re.compile(right_answer_pattern)
        answer_field = config_answer_variables[row][col]
        
        df_right = pd.DataFrame(columns = [config_click_variables[row][col], config_answer_variables[row][col], config_time_variables[row][col]])
        
        for _idx, df_row in df.iterrows():
            result = regex.match(str(df_row[answer_field]))
            if result is not None:
                df_right = df_right.append(df_row)
                
        tmp_df_array.append(df_right)
    tmp_df_matrix.append(tmp_df_array)
                
df_matrix = tmp_df_matrix

In [6]:
tmp_df_matrix = []

# filter for outliers
for row, df_row in enumerate(df_matrix):
    tmp_df_array = []
    
    for col, df in enumerate(df_row):
        time_field = config_time_variables[row][col]
        data = df[time_field]
        
        cleared_df = None
        if len(data) == 1:
            cleared_dataframe = df
        else:
            cleared_dataframe = df[(np.abs(stats.zscore(data.astype(float))) <= 2)]
            
        tmp_df_array.append(cleared_dataframe)
        
    tmp_df_matrix.append(tmp_df_array)
                
df_matrix = tmp_df_matrix

In [8]:
for idx, name in enumerate(config_prefix):
    df = pd.DataFrame()
    for algo in config_algo_names:
        pass
    

In [7]:
image_matrix = []
for row in range(len(config_algo_names)):
    img_array = []
    for col in range(len(config_prefix)):
        img = rEYEker.load_image(image_paths[row][col])
        img_array.append(img)
    image_matrix.append(img_array)

In [8]:
#cast data to valid format

visual_stimulus_tensor = []

for row in range(len(config_algo_names)):
    visual_stimulus_matrix = []
    
    for col in range(len(config_prefix)):
        visual_stimulus_array = []
        df = df_matrix[row][col]
        variable_name = config_click_variables[row][col]
        
        for _idx, item in df.iterrows():
            data_str = item[variable_name]
            data_str = data_str.strip()
            
            coordinate_str_array = data_str.split(" ")
            coordinates = []
            for coordinate_str in coordinate_str_array:
                coordinate = coordinate_str.split("-")
                coordinate = (int(coordinate[1]))
                coordinates.append(coordinate)
                
            visual_stimulus_array.append(coordinates)
        visual_stimulus_matrix.append(visual_stimulus_array)
    visual_stimulus_tensor.append(visual_stimulus_matrix)

In [11]:
iterative = ['none', 'main', 'Iterative definition', 'Pre calculation', 'Iteration Condition', 'Iteration Step', 'Return Result']
recursive = ['none','main', 'Recursive definition', 'Pre calculation', 'Recursive Condition', 'Recursive Step', 'Return Result']
order = ['0', '1', '2', '3', '4', '5', '6']

In [12]:
def is_in(df, y):
    for _idx, row in df.iterrows():
        if row["startHeight"] <= y <= row["stopHeight"]:
            return row["Name"]
    return "none"

In [13]:
cols = ["startHeight", "stopHeight", "startWidth", "stopWidth", "Name"]

relation_array = []
for row in range(len(config_prefix)):
    names = None
    if row % 2 == 0:
        names = recursive
    else:
        names = iterative
    
    relation = []
    for i in range(len(names)):
        relation.append([0]*len(names))
    
    for col in range(len(config_algo_names)):
        datasheet = aoi_datasheets[col][row]
        df = pd.read_excel(datasheet)
        
        for data_set in visual_stimulus_tensor[col][row]:
            for idx in range(len(data_set)-1):
                from_field = is_in(df, data_set[idx])
                to_field = is_in(df, data_set[idx+1])
        
                from_field = names.index(from_field)
                to_field = names.index(to_field)
                
                relation[from_field][to_field] = 1 + relation[from_field][to_field] 
                
    relation_array.append(relation)

In [14]:
for matrix in relation_array:
    for idx in range(len(matrix)):
        matrix[idx][idx] = 0

In [15]:
Chord(relation_array[0], order, padding=0.1).to_html("BR_general.html")
Chord(relation_array[1], order, padding=0.1).to_html("BI_general.html")

In [17]:
Chord(relation_array[2], order, padding=0.1).to_html("TR_general.html")
Chord(relation_array[3], order, padding=0.1).to_html("TI_general.html")

In [16]:
from pathlib import Path

file = Path('BR_general.html')
file.write_text(file.read_text().replace('True', 'true'))

file = Path('BI_general.html')
file.write_text(file.read_text().replace('True', 'true'))

file = Path('TR_general.html')
file.write_text(file.read_text().replace('True', 'true'))

file = Path('TI_general.html')
file.write_text(file.read_text().replace('True', 'true'))


559

In [24]:
names = ["Action", "Adventure", "Comedy", "Drama", "Fantasy", "Thriller"]

matrix = [
    [0, 5, 6, 4, 7, 4],
    [5, 0, 5, 4, 6, 5],
    [6, 5, 0, 4, 5, 5],
    [4, 4, 4, 0, 5, 5],
    [7, 6, 5, 5, 0, 4],
    [4, 5, 5, 5, 4, 0]
]

Chord(matrix, names).to_html()