In [2]:
import pandas as pd

import numpy as np

import re

In [3]:
# ---------------------------------------------------------------------------
# Configuration
#
# Every 2-D table below is indexed as table[algorithm][condition]:
#   * rows follow the order of config_algo_names
#   * columns follow the order of config_prefix
# ---------------------------------------------------------------------------

# Condition prefixes; their position is the column index used everywhere else.
config_prefix = ['BR', 'BI', 'TR', 'TI']

# Algorithm names; their position is the row index used everywhere else.
config_algo_names = [
    'BinarySearch',
    'BubbleSort',
    'Factorial',
    'Fibonacci',
    'IntegerBinary',
    'MultiplyMatrix',
    'PrimeFactors',
    'ReverseString',
]

# Column that identifies one participant / case in the datasheet.
config_id_variable = "CASE"

# Datasheet columns holding the free-text answers.
# Some rows are simply "<condition prefix> + fixed suffix".
config_answer_variables = [
    ['A310_01', 'A414_01', 'A214_01', 'A106_01'],
    [prefix + '02_01' for prefix in config_prefix],
    [prefix + '06_01' for prefix in config_prefix],
    [prefix + '10_01' for prefix in config_prefix],
    ['A406_01', 'A302_01', 'A102_01', 'A206_01'],
    ['A210_01', 'A114_01', 'A314_01', 'A402_01'],
    ['A110_01', 'A202_01', 'A410_01', 'A306_01'],
    [prefix + '14_01' for prefix in config_prefix],
]

# Datasheet columns holding the time values (converted with int() downstream).
config_time_variables = [
    ['TIME049', 'TIME070', 'TIME036', 'TIME015'],
    ['TIME021', 'TIME055', 'TIME005', 'TIME038'],
    ['TIME006', 'TIME040', 'TIME025', 'TIME061'],
    ['TIME059', 'TIME023', 'TIME042', 'TIME008'],
    ['TIME066', 'TIME051', 'TIME017', 'TIME032'],
    ['TIME034', 'TIME019', 'TIME053', 'TIME064'],
    ['TIME013', 'TIME030', 'TIME068', 'TIME047'],
    ['TIME044', 'TIME010', 'TIME057', 'TIME027'],
]

# Datasheet columns holding the click data.
config_click_variables = [
    ['A319_01', 'A420_01', 'A220_01', 'A118_01'],
    [prefix + '18_01' for prefix in config_prefix],
    [prefix + '19_01' for prefix in config_prefix],
    [prefix + '20_01' for prefix in config_prefix],
    ['A418_01', 'A317_01', 'A117_01', 'A218_01'],
    ['A219_01', 'A120_01', 'A320_01', 'A417_01'],
    ['A119_01', 'A217_01', 'A419_01', 'A318_01'],
    [prefix + '21_01' for prefix in config_prefix],
]

# Regular expressions an answer has to match to count as correct.
# Most algorithms share one pattern across all conditions; Factorial expects
# a different value in the last two conditions than in the first two.
config_answer_patterns = [
    ['.*3.*'] * len(config_prefix),
    ['.*3.*16.*23.*42.*61.*75.*536.*'] * len(config_prefix),
    ['.*6.*', '.*6.*', '.*120.*', '.*120.*'],
    ['.*2.*'] * len(config_prefix),
    ['.*1.*0.*0.*0.*1.*'] * len(config_prefix),
    ['.*6.*6.*6.*12.*12.*12.*18.*18.*18.*'] * len(config_prefix),
    ['.*3.*5.*'] * len(config_prefix),
    ['.*gnikcar[tT].*ey[eE].*'] * len(config_prefix),
]

# Spreadsheet that contains the raw data.
config_datasheet = r'./Book5.xlsx'

In [26]:
# Mean and standard deviation of the PD04 and PD05 columns of the datasheet.
raw = pd.read_excel(config_datasheet)

# Keep only the two columns, drop incomplete rows, then discard the first
# remaining row.
# NOTE(review): the data-loading loop later in this notebook slices off the
# first row *before* calling dropna(); here the order is reversed. If the
# first sheet row is a non-data row that contains NaN in these columns, this
# order removes one real observation instead - confirm which is intended.
df = pd.DataFrame(raw, columns=["PD04", "PD05"]).dropna().iloc[1:]

from scipy.stats import ttest_ind

# Printed in the order: PD04 mean, PD04 std, PD05 mean, PD05 std.
for column_name in ("PD04", "PD05"):
    print(df[column_name].mean())
    print(df[column_name].std())

3.3846153846153846
1.2988466624355215
3.8547008547008548
1.268075906469415


In [19]:
# df_matrix[algorithm][condition] -> frame with the id, answer, time and click
# columns of that algorithm/condition pair.
df_matrix = []
raw = pd.read_excel(config_datasheet)

# load data
for row in range(len(config_algo_names)):
    df_array = []
    for col in range(len(config_prefix)):
        wanted_columns = [
            config_id_variable,
            config_answer_variables[row][col],
            config_time_variables[row][col],
            config_click_variables[row][col],
        ]
        # Skip the first sheet row, then keep only rows in which all four
        # selected columns are filled in.
        df = pd.DataFrame(raw, columns=wanted_columns).iloc[1:].dropna()
        df_array.append(df)
    df_matrix.append(df_array)

In [20]:
# Score every answer against its expected pattern and collect the results in
# one frame per condition column (df_BU_R, df_BU_I, df_TD_R, df_TD_I).
#
# Rows are gathered in plain Python lists and each frame is built once at the
# end. The former row-by-row `DataFrame.append` was quadratic and no longer
# exists in pandas >= 2.0. As a side effect the resulting frames now carry a
# regular 0..n-1 index instead of an index consisting only of zeros.
cols = ["id", "response_time", "correctness", "flag", "algo_name", "click_data"]

# One list of row records per target frame, in the order
# df_BU_R, df_BU_I, df_TD_R, df_TD_I.
condition_rows = [[], [], [], []]

for row_idx, df_row in enumerate(df_matrix):
    algo_name = config_algo_names[row_idx]
    for col, df in enumerate(df_row):
        # Per (algorithm, condition) lookups are loop-invariant for the rows.
        regex = re.compile(config_answer_patterns[row_idx][col])
        time_variable = config_time_variables[row_idx][col]
        answer_variable = config_answer_variables[row_idx][col]
        click_variable = config_click_variables[row_idx][col]

        # Columns 0-2 have their own bucket; any further column ends up in
        # the last one, mirroring the original if/elif/else chain.
        bucket = condition_rows[min(col, 3)]

        for index, row in df.iterrows():
            # Use the configured id column rather than a hard-coded "CASE" so
            # this cell stays in sync with the loading cell.
            id_value = row[config_id_variable]
            response_time = int(row[time_variable])
            answer = row[answer_variable]
            click_data = row[click_variable]

            # re.match anchors at the start of the string; the patterns all
            # begin with ".*", so the expected tokens may appear anywhere on
            # the first line of the answer.
            # NOTE(review): "." does not match newlines, so multi-line answers
            # only match when everything is on the first line - confirm.
            correctness = regex.match(str(answer)) is not None

            flag = ""
            bucket.append([id_value, response_time, correctness, flag, algo_name, click_data])

df_BU_R, df_BU_I, df_TD_R, df_TD_I = (
    pd.DataFrame(rows, columns=cols) for rows in condition_rows
)

In [21]:
# The four per-condition result frames, in the same order as config_prefix.
df_array_independent = [
    df_BU_R,
    df_BU_I,
    df_TD_R,
    df_TD_I,
]

In [22]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Column layout of the exported long-format table; must match the order in
# which fill() assembles each row.
cols = [
    "Subject",
    "ProgrammingStyle",
    "Comprehension",
    "ResponseTime",
    "Algorithm",
    "Flag",
    "Correctness",
    "ClickData",
]

def fill(list_data, dataframe, coding_style, comprehension_style):
    """Append one long-format record per row of ``dataframe`` to ``list_data``.

    Parameters
    ----------
    list_data : list
        Output list, extended in place.
    dataframe : pandas.DataFrame
        Frame providing the columns ``id``, ``response_time``, ``algo_name``,
        ``flag``, ``correctness`` and ``click_data``.
    coding_style, comprehension_style
        Constant labels written into every record produced by this call.

    Returns
    -------
    None
        Each appended record is the list
        ``[id, coding_style, comprehension_style, response_time, algo_name,
        flag, correctness, click_data]``.
    """
    for _, record in dataframe.iterrows():
        list_data.append([
            record["id"],
            coding_style,
            comprehension_style,
            record["response_time"],
            record["algo_name"],
            record["flag"],
            record["correctness"],
            record["click_data"],
        ])

# Flatten the four condition frames into one long table, labelling every row
# with the two factor levels of the frame it came from, then export it.
data = []
for condition_frame, coding_style, comprehension_style in (
    (df_BU_R, "R", "BU"),
    (df_BU_I, "I", "BU"),
    (df_TD_R, "R", "TD"),
    (df_TD_I, "I", "TD"),
):
    fill(data, condition_frame, coding_style, comprehension_style)

df = pd.DataFrame(data, columns=cols)
# The ./results directory has to exist already; to_excel does not create it.
df.to_excel("./results/preprocessed.xlsx", index=False)