In [1]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import re

import modules.rEYEkerAnalysis as rEYEker

from scipy import stats

from chord import Chord

In [16]:
config_prefix = ['BR', 'BI', 'TR', 'TI']

config_algo_names = ['BinarySearch', 'BubbleSort', 'Factorial', 'Fibonacci', 'IntegerBinary', 'MultiplyMatrix', 'PrimeFactors', 'ReverseString']

config_id_variable = "CASE"

config_answer_variables = [
    ['A310_01', 'A414_01', 'A214_01', 'A106_01'],
    ['BR02_01', 'BI02_01', 'TR02_01', 'TI02_01'],
    ['BR06_01', 'BI06_01', 'TR06_01', 'TI06_01'],
    ['BR10_01', 'BI10_01', 'TR10_01', 'TI10_01'],
    ['A406_01', 'A302_01', 'A102_01', 'A206_01'],
    ['A210_01', 'A114_01', 'A314_01', 'A402_01'],
    ['A110_01', 'A202_01', 'A410_01', 'A306_01'],
    ['BR14_01', 'BI14_01', 'TR14_01', 'TI14_01'],
]

config_time_variables = [
    ['TIME049','TIME070','TIME036','TIME015'],
    ['TIME021','TIME055','TIME005','TIME038'],
    ['TIME006','TIME040','TIME025','TIME061'],
    ['TIME059','TIME023','TIME042','TIME008'],
    ['TIME066','TIME051','TIME017','TIME032'],
    ['TIME034','TIME019','TIME053','TIME064'],
    ['TIME013','TIME030','TIME068','TIME047'],
    ['TIME044','TIME010','TIME057','TIME027'],
]


config_answer_patterns = [
    ['3', '3', '3', '3'],
    ['3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536','3.*16.*23.*42.*61.*75.*536'],
    ['6','6','120','120'],
    ['2', '2','2','2'],
    ['1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1','1.*0.*0.*0.*1'],
    ['6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18','6.*6.*6.*12.*12.*12.*18.*18.*18'],
    ['3.*5','3.*5','3.*5','3.*5'],
    ['gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]','gnikcar[tT].*ey[eE]']
]


config_datasheet = r'./Book5.xlsx'

In [17]:
df_matrix = []
raw = pd.read_excel(config_datasheet)

# load data
for row, _stuff in enumerate(config_algo_names):
    df_array = []
    for col, _stuff in enumerate(config_prefix):
        df = pd.DataFrame(raw, columns = [config_id_variable, config_answer_variables[row][col], config_time_variables[row][col]])
        df = df.iloc[1:]
        df = df.dropna()
        df_array.append(df)
    df_matrix.append(df_array)

In [18]:
['BR', 'BI', 'TR', 'TI']
cols = ["id", "response_time", "correctness", "flag"]
df_BU_R = pd.DataFrame(columns=cols)
df_BU_I = pd.DataFrame(columns=cols)
df_TD_R = pd.DataFrame(columns=cols)
df_TD_I = pd.DataFrame(columns=cols)

for row_idx, df_row in enumerate(df_matrix):
    for col, df in enumerate(df_row):
        
        pattern = config_answer_patterns[row_idx][col]
        regex = re.compile(pattern)
        
        for index, row in df.iterrows():

            id_value =  row["CASE"]
            response_time = int(row[config_time_variables[row_idx][col]])
            answer = row[config_answer_variables[row_idx][col]]
            result = regex.match(str(answer))
            correctness = False
            
            if result is not None:
                correctness = True
                
            flag = ""
            data = pd.DataFrame([[id_value, response_time, correctness, flag]], columns=cols)
                  
            if col == 0:
                df_BU_R = df_BU_R.append(data)
            elif col == 1:
                df_BU_I = df_BU_I.append(data)
            elif col == 2:
                df_TD_R = df_TD_R.append(data)
            else:
                df_TD_I = df_TD_I.append(data)  

In [19]:
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]
df_means = [df[["response_time"]].mean() for df in df_array_independent]
df_stds = [df[["response_time"]].std() for df in df_array_independent]
df_len_b4 = [len(df) for df in df_array_independent]

In [20]:
for idx, df in enumerate(df_array_independent):
    mean = float(df_means[idx])
    stddev = float(df_stds[idx])
    for idx, row in df.iterrows():
        response_time = row["response_time"]
        if not(30 <= response_time <= mean+2*stddev):
            row["flag"] = "outlier"

In [39]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[0].loc[df_array_independent[0]["flag"]=="outlier"])

    id response_time correctness     flag
0  750           195        True  outlier
0  815           195        True  outlier
0  854           195       False  outlier
0  852           195       False  outlier
0  764           195       False  outlier


In [40]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[1].loc[df_array_independent[1]["flag"]=="outlier"])

    id response_time correctness     flag
0  717           198       False  outlier
0  815           198        True  outlier
0  716           198        True  outlier
0  773           198        True  outlier
0  852           198       False  outlier


In [41]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[2].loc[df_array_independent[2]["flag"]=="outlier"])

    id response_time correctness     flag
0  699           149       False  outlier
0  700           149        True  outlier
0  734           149       False  outlier
0  767           149       False  outlier
0  825           149       False  outlier
0  837           149       False  outlier
0  859           149       False  outlier
0  946           149       False  outlier
0  889           149       False  outlier
0  784           149       False  outlier
0  735           149       False  outlier
0  941           149       False  outlier
0  804           149        True  outlier
0  851           149       False  outlier


In [42]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(df_array_independent[3].loc[df_array_independent[3]["flag"]=="outlier"])

    id response_time correctness     flag
0  638           193       False  outlier
0  868           193       False  outlier
0  803           193        True  outlier
0  867           193        True  outlier
0  772           193       False  outlier


In [25]:
df_means = [df.loc[df["flag"]!="outlier"][["response_time"]].mean() for df in df_array_independent]
df_stds = [df.loc[df["flag"]!="outlier"][["response_time"]].std() for df in df_array_independent]
df_len_after = [len(df.loc[df["flag"]!="outlier"]) for df in df_array_independent]

df_deleted_amount = [df_len_b4[idx]-df_len_after[idx] for idx in range(len(df_len_after))]
df_means

[response_time    195.423611
 dtype: float64,
 response_time    198.418919
 dtype: float64,
 response_time    149.170543
 dtype: float64,
 response_time    193.550633
 dtype: float64]

In [26]:
for idx, df in enumerate(df_array_independent):
    df.loc[df["flag"] == "outlier", "response_time"] = int(df_means[idx])

In [28]:
cols = ["id", "response_time", "correctness", "flag"]
inserted = [] 

for idx1, df in enumerate(df_array_independent):
    for idx2, row in df.iterrows():
        id_value = row["id"]
        entries_id_BU_R = df_BU_R.loc[(df_BU_R["id"] == id_value) & (df_BU_R["flag"] == "outlier")]
        entries_id_BU_I = df_BU_I.loc[(df_BU_I["id"] == id_value) & (df_BU_I["flag"] == "outlier")] 
        entries_id_TD_R = df_TD_R.loc[(df_TD_R["id"] == id_value) & (df_TD_R["flag"] == "outlier")]
        entries_id_TD_I = df_TD_R.loc[(df_TD_R["id"] == id_value) & (df_TD_R["flag"] == "outlier")]
        
        df_per_participant = entries_id_BU_R
        df_per_participant = df_per_participant.append(entries_id_BU_I)
        df_per_participant = df_per_participant.append(entries_id_TD_R)
        df_per_participant = df_per_participant.append(entries_id_TD_I)
        
        number_of_outliers_per_participant = len(df_per_participant)
        
        if number_of_outliers_per_participant > 2:
            df_BU_R = df_BU_R.loc[df_BU_R["id"]!=id_value]
            df_BU_I = df_BU_I.loc[df_BU_I["id"]!=id_value]
            df_TD_R = df_TD_R.loc[df_TD_R["id"]!=id_value]
            df_TD_I = df_TD_I.loc[df_TD_I["id"]!=id_value]
            
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]

In [29]:
for id, missing in inserted:
    missing = str(missing) + " where missing"
    df_BU_R.loc[(df_BU_R["id"] == id) & (df_BU_R["flag"] == ""), "flag"] = missing
    df_BU_I.loc[(df_BU_I["id"] == id) & (df_BU_I["flag"] == ""), "flag"] = missing
    df_TD_R.loc[(df_TD_R["id"] == id) & (df_TD_R["flag"] == ""), "flag"] = missing
    df_TD_I.loc[(df_TD_I["id"] == id) & (df_TD_I["flag"] == ""), "flag"] = missing

In [30]:
df_len_after2 = [len(df) for df in df_array_independent]
df_deleted_amount = [df_len_b4[idx]-df_len_after2[idx] for idx in range(len(df_len_after2))]
df_deleted_amount

[29, 27, 27, 28]

In [37]:
total_answers = [len(df) for df in df_array_independent]
total_correct = [len(df.loc[df["correctness"]==True]) for df in df_array_independent]
percentage = [float(correct)/float(total) for correct, total in zip(total_correct, total_answers)]
percentage

[0.44776119402985076,
 0.5514705882352942,
 0.4492753623188406,
 0.6197183098591549]

In [43]:
total_correct

[60, 75, 62, 88]

In [44]:
from scipy.stats import chisquare
chisquare(total_correct)

Power_divergenceResult(statistic=7.112280701754386, pvalue=0.06840381915557886)

In [45]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

response_times = []

model = ols('height ~ C(water) + C(sun) + C(water):C(sun)', data=df).fit()
sm.stats.anova_lm(model, typ=2)