In [81]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import re

import modules.rEYEkerAnalysis as rEYEker

from scipy import stats

from chord import Chord

In [96]:
config_prefix = ['BR', 'BI', 'TR', 'TI']

config_algo_names = ['BinarySearch', 'BubbleSort', 'Factorial', 'Fibonacci', 'IntegerBinary', 'MultiplyMatrix', 'PrimeFactors', 'ReverseString']

config_id_variable = "CASE"

config_answer_variables = [
    ['A310_01', 'A414_01', 'A214_01', 'A106_01'],
    ['BR02_01', 'BI02_01', 'TR02_01', 'TI02_01'],
    ['BR06_01', 'BI06_01', 'TR06_01', 'TI06_01'],
    ['BR10_01', 'BI10_01', 'TR10_01', 'TI10_01'],
    ['A406_01', 'A302_01', 'A102_01', 'A206_01'],
    ['A210_01', 'A114_01', 'A314_01', 'A402_01'],
    ['A110_01', 'A202_01', 'A410_01', 'A306_01'],
    ['BR14_01', 'BI14_01', 'TR14_01', 'TI14_01'],
]

config_time_variables = [
    ['TIME049','TIME070','TIME036','TIME015'],
    ['TIME021','TIME055','TIME005','TIME038'],
    ['TIME006','TIME040','TIME025','TIME061'],
    ['TIME059','TIME023','TIME042','TIME008'],
    ['TIME066','TIME051','TIME017','TIME032'],
    ['TIME034','TIME019','TIME053','TIME064'],
    ['TIME013','TIME030','TIME068','TIME047'],
    ['TIME044','TIME010','TIME057','TIME027'],
]


config_answer_patterns = [
    ['3', '3', '3', '3'],
    ['3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536'],
    ['6','6','120','120'],
    ['2', '2','2','2'],
    ['1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1'],
    ['6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18'],
    ['3.*5','3.*5','3.*5','3.*5'],
    ['gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]']
]


config_datasheet = r'./Book5.xlsx'

In [97]:
df_matrix = []
raw = pd.read_excel(config_datasheet)

# load data
for row, _stuff in enumerate(config_algo_names):
    df_array = []
    for col, _stuff in enumerate(config_prefix):
        df = pd.DataFrame(raw, columns = [config_id_variable, config_answer_variables[row][col], config_time_variables[row][col]])
        df = df.iloc[1:]
        df = df.dropna()
        df_array.append(df)
    df_matrix.append(df_array)

In [98]:
['BR', 'BI', 'TR', 'TI']
cols = ["id", "response_time", "correctness", "flag"]
df_BU_R = pd.DataFrame(columns=cols)
df_BU_I = pd.DataFrame(columns=cols)
df_TD_R = pd.DataFrame(columns=cols)
df_TD_I = pd.DataFrame(columns=cols)

for row_idx, df_row in enumerate(df_matrix):
    for col, df in enumerate(df_row):
        
        pattern = config_answer_patterns[row_idx][col]
        regex = re.compile(pattern)
        
        for index, row in df.iterrows():

            id_value =  row["CASE"]
            response_time = int(row[config_time_variables[row_idx][col]])
            answer = row[config_answer_variables[row_idx][col]]
            result = regex.match(str(answer))
            correctness = False
            
            if result is not None:
                correctness = True
                
            flag = ""
            data = pd.DataFrame([[id_value, response_time, correctness, flag]], columns=cols)
                  
            if col == 0:
                df_BU_R = df_BU_R.append(data)
            elif col == 1:
                df_BU_I = df_BU_I.append(data)
            elif col == 2:
                df_TD_R = df_TD_R.append(data)
            else:
                df_TD_I = df_TD_I.append(data)  

In [99]:
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]
df_means = [df[["response_time"]].mean() for df in df_array_independent]
df_stds = [df[["response_time"]].std() for df in df_array_independent]
df_len_b4 = [len(df) for df in df_array_independent]

In [100]:
for idx, df in enumerate(df_array_independent):
    mean = float(df_means[idx])
    stddev = float(df_stds[idx])
    for idx, row in df.iterrows():
        response_time = row["response_time"]
        if not(30 <= response_time <= mean+2*stddev):
            row["flag"] = "outlier"

In [101]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[0].loc[df_array_independent[0]["flag"]=="outlier"])

    id response_time correctness     flag
0  750           760        True  outlier
0  785           786       False  outlier
0  815           729        True  outlier
0  737            28        True  outlier
0  794            15        True  outlier
0  842             7       False  outlier
0  883          1287        True  outlier
0  958            25        True  outlier
0  821            10       False  outlier
0  872            20       False  outlier
0  703            22       False  outlier
0  942          1554        True  outlier
0  785          1243       False  outlier
0  854           866       False  outlier
0  760            17       False  outlier
0  852           746       False  outlier
0  764            16       False  outlier
0  850             4       False  outlier
0  938           739        True  outlier


In [102]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[1].loc[df_array_independent[1]["flag"]=="outlier"])

    id response_time correctness     flag
0  717            28       False  outlier
0  821            17       False  outlier
0  850             9        True  outlier
0  890             3        True  outlier
0  712          1277        True  outlier
0  727          1094        True  outlier
0  842             9       False  outlier
0  886           924       False  outlier
0  815           877        True  outlier
0  716           824        True  outlier
0  773           948        True  outlier
0  794            13        True  outlier
0  842            21       False  outlier
0  852          1726       False  outlier
0  894            24        True  outlier


In [103]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[2].loc[df_array_independent[2]["flag"]=="outlier"])

    id response_time correctness     flag
0  785           799       False  outlier
0  699            17       False  outlier
0  700            22        True  outlier
0  727            14       False  outlier
0  734             4       False  outlier
0  737            18        True  outlier
0  760            18       False  outlier
0  767            18       False  outlier
0  794             7        True  outlier
0  825            25       False  outlier
0  837            22       False  outlier
0  842             4       False  outlier
0  859            24       False  outlier
0  883            14       False  outlier
0  886             6        True  outlier
0  894             3        True  outlier
0  946            20       False  outlier
0  958             8        True  outlier
0  890             4       False  outlier
0  712           706        True  outlier
0  850             5        True  outlier
0  889            24       False  outlier
0  842            16        True  

In [104]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[3].loc[df_array_independent[3]["flag"]=="outlier"])

    id response_time correctness     flag
0  727          1053        True  outlier
0  638          1994       False  outlier
0  712          1147       False  outlier
0  850            10       False  outlier
0  868          1967       False  outlier
0  703            27       False  outlier
0  803            23        True  outlier
0  821            13       False  outlier
0  867            20        True  outlier
0  872            25       False  outlier
0  883          3735       False  outlier
0  772            19       False  outlier


In [107]:
for idx, df in enumerate(df_array_independent):
    tmp_df = df.loc[df["flag"] != "outlier"]
    df_array_independent[idx] = tmp_df

In [108]:
df_means = [df[["response_time"]].mean() for df in df_array_independent]
df_stds = [df[["response_time"]].std() for df in df_array_independent]
df_len_after = [len(df) for df in df_array_independent]

df_deleted_amount = [df_len_b4[idx]-df_len_after[idx] for idx in range(len(df_len_after))]
df_deleted_amount

[19, 15, 36, 12]

In [109]:
cols = ["id", "response_time", "correctness", "flag"]
inserted = [] 

for idx1, df in enumerate(df_array_independent):
    for idx2, row in df.iterrows():
        id_value = row["id"]
        entries_id_BU_R = df_BU_R.loc[df_BU_R["id"] == id_value]
        entries_id_BU_I = df_BU_I.loc[df_BU_I["id"] == id_value] 
        entries_id_TD_R = df_TD_R.loc[df_TD_R["id"] == id_value]
        entries_id_TD_I = df_TD_R.loc[df_TD_R["id"] == id_value]
        
        df_per_participant = entries_id_BU_R
        df_per_participant = df_per_participant.append(entries_id_BU_I)
        df_per_participant = df_per_participant.append(entries_id_TD_R)
        df_per_participant = df_per_participant.append(entries_id_TD_I)
        
        rest = (4-len(df_per_participant) % 4) % 4
    
        if 0 < rest <= 2:
            inserted.append((id_value, rest))
            
            len_BU_R = len(entries_id_BU_R)
            len_BU_I = len(entries_id_BU_I)
            len_TD_R = len(entries_id_TD_R)
            len_TD_I = len(entries_id_TD_I)
            
            
            if len_BU_R < len_BU_I or len_BU_R < len_TD_R or len_BU_R < len_TD_I:
                # Missing in BU_R
                tmp_df = pd.DataFrame([[id_value, float(df_means[0]), False, "inserted"]], columns=cols)
                df_BU_R = df_BU_R.append(tmp_df)
            
            if len_BU_I < len_BU_R or len_BU_I < len_TD_R or len_BU_I < len_TD_I:
                # Missing in BU_I
                tmp_df = pd.DataFrame([[id_value, float(df_means[1]), False, "inserted"]], columns=cols)
                df_BU_I = df_BU_I.append(tmp_df)
            
            if len_TD_R < len_BU_R or len_TD_R < len_BU_I or len_TD_R < len_TD_I:
                # Missing in TD_R
                tmp_df = pd.DataFrame([[id_value, float(df_means[2]), False, "inserted"]], columns=cols)
                df_TD_R = df_TD_R.append(tmp_df)
            
            if len_TD_I < len_BU_R or len_TD_I < len_BU_I or len_TD_I < len_TD_R:
                # Missing in TD_I
                tmp_df = pd.DataFrame([[id_value, float(df_means[3]), False, "inserted"]], columns=cols)
                df_TD_I = df_TD_I.append(tmp_df)
                
        elif rest > 2:
            df_BU_R = df_BU_R.loc[df_BU_R["id"]!=id_value]
            df_BU_I = df_BU_I.loc[df_BU_I["id"]!=id_value]
            df_TD_R = df_TD_R.loc[df_TD_R["id"]!=id_value]
            df_TD_I = df_TD_I.loc[df_TD_I["id"]!=id_value]
            
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]

In [110]:
for id, missing in inserted:
    missing = str(missing) + " where missing"
    df_BU_R.loc[(df_BU_R["id"] == id) & (df_BU_R["flag"] == ""), "flag"] = missing
    df_BU_I.loc[(df_BU_I["id"] == id) & (df_BU_I["flag"] == ""), "flag"] = missing
    df_TD_R.loc[(df_TD_R["id"] == id) & (df_TD_R["flag"] == ""), "flag"] = missing
    df_TD_I.loc[(df_TD_I["id"] == id) & (df_TD_I["flag"] == ""), "flag"] = missing

In [111]:
df_len_after2 = [len(df) for df in df_array_independent]
df_deleted_amount = [df_len_b4[idx]-df_len_after2[idx] for idx in range(len(df_len_after2))]
df_deleted_amount

[-8, -8, -6, -6]

In [113]:
total_answers = [len(df.loc[df["correctness"]==True]) for df in df_array_independent]
total_correct = [len(df.loc[df["correctness"]==True]) for df in df_array_independent]
total_correct

[83, 98, 84, 104]