In [99]:
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np

from pathlib import Path

import re
import modules.rEYEkerAnalysis as rEYEker

from scipy import stats

from chord import Chord

In [126]:
config_prefix = ['BR', 'BI', 'TR', 'TI']

config_algo_names = ['BinarySearch', 'BubbleSort', 'Factorial', 'Fibonacci', 'IntegerBinary', 'MultiplyMatrix', 'PrimeFactors', 'ReverseString']

config_id_variable = "CASE"

config_answer_variables = [
    ['A310_01', 'A414_01', 'A214_01', 'A106_01'],
    ['BR02_01', 'BI02_01', 'TR02_01', 'TI02_01'],
    ['BR06_01', 'BI06_01', 'TR06_01', 'TI06_01'],
    ['BR10_01', 'BI10_01', 'TR10_01', 'TI10_01'],
    ['A406_01', 'A302_01', 'A102_01', 'A206_01'],
    ['A210_01', 'A114_01', 'A314_01', 'A402_01'],
    ['A110_01', 'A202_01', 'A410_01', 'A306_01'],
    ['BR14_01', 'BI14_01', 'TR14_01', 'TI14_01'],
]

config_time_variables = [
    ['TIME049','TIME070','TIME036','TIME015'],
    ['TIME021','TIME055','TIME005','TIME038'],
    ['TIME006','TIME040','TIME025','TIME061'],
    ['TIME059','TIME023','TIME042','TIME008'],
    ['TIME066','TIME051','TIME017','TIME032'],
    ['TIME034','TIME019','TIME053','TIME064'],
    ['TIME013','TIME030','TIME068','TIME047'],
    ['TIME044','TIME010','TIME057','TIME027'],
]

config_click_variables = [
    ['A319_01', 'A420_01', 'A220_01', 'A118_01'],
    ['BR18_01', 'BI18_01', 'TR18_01', 'TI18_01'],
    ['BR19_01', 'BI19_01', 'TR19_01', 'TI19_01'],
    ['BR20_01', 'BI20_01', 'TR20_01', 'TI20_01'],
    ['A418_01', 'A317_01', 'A117_01', 'A218_01'],
    ['A219_01', 'A120_01', 'A320_01', 'A417_01'],
    ['A119_01', 'A217_01', 'A419_01', 'A318_01'],
    ['BR21_01', 'BI21_01', 'TR21_01', 'TI21_01']
]


config_answer_patterns = [
    ['.*3.*', '.*3.*', '.*3.*', '.*3.*'],
    ['.*3.*16.*23.*42.*61.*75.*536.*','.*3.*16.*23.*42.*61.*75.*536.*','.*3.*16.*23.*42.*61.*75.*536.*','.*3.*16.*23.*42.*61.*75.*536.*'],
    ['.*6.*','.*6.*','.*120.*','.*120.*'],
    ['.*2.*', '.*2.*','.*2.*','.*2.*'],
    ['.*1.*0.*0.*0.*1.*','.*1.*0.*0.*0.*1.*','.*1.*0.*0.*0.*1.*','.*1.*0.*0.*0.*1.*'],
    ['.*6.*6.*6.*12.*12.*12.*18.*18.*18.*','.*6.*6.*6.*12.*12.*12.*18.*18.*18.*','.*6.*6.*6.*12.*12.*12.*18.*18.*18.*','.*6.*6.*6.*12.*12.*12.*18.*18.*18.*'],
    ['.*3.*5.*','.*3.*5.*','.*3.*5.*','.*3.*5.*'],
    ['.*gnikcar[tT].*ey[eE].*','.*gnikcar[tT].*ey[eE].*','.*gnikcar[tT].*ey[eE].*','.*gnikcar[tT].*ey[eE].*']
]


config_datasheet = r'./Book5.xlsx'

In [127]:
df_matrix = []
raw = pd.read_excel(config_datasheet)

# load data
for row, _stuff in enumerate(config_algo_names):
    df_array = []
    for col, _stuff in enumerate(config_prefix):
        df = pd.DataFrame(raw, columns = [config_id_variable, 
                                          config_answer_variables[row][col], 
                                          config_time_variables[row][col],
                                          config_click_variables[row][col],])
        df = df.iloc[1:]
        df = df.dropna()
        df_array.append(df)
    df_matrix.append(df_array)

In [128]:
cols = ["id", "response_time", "correctness", "flag", "algo_name", "click_data"]
df_BU_R = pd.DataFrame(columns=cols)
df_BU_I = pd.DataFrame(columns=cols)
df_TD_R = pd.DataFrame(columns=cols)
df_TD_I = pd.DataFrame(columns=cols)

for row_idx, df_row in enumerate(df_matrix):
    algo_name = config_algo_names[row_idx]
    for col, df in enumerate(df_row): 
        pattern = config_answer_patterns[row_idx][col]
        regex = re.compile(pattern)
        
        for index, row in df.iterrows():

            id_value =  row["CASE"]
            response_time = int(row[config_time_variables[row_idx][col]])
            answer = row[config_answer_variables[row_idx][col]]
            click_data = row[config_click_variables[row_idx][col]]
            result = regex.match(str(answer))
            correctness = False
            
            if result is not None:
                correctness = True
                
            flag = ""
            data = pd.DataFrame([[id_value, response_time, correctness, flag, algo_name, click_data]], columns=cols)
                  
            if col == 0:
                df_BU_R = df_BU_R.append(data)
            elif col == 1:
                df_BU_I = df_BU_I.append(data)
            elif col == 2:
                df_TD_R = df_TD_R.append(data)
            else:
                df_TD_I = df_TD_I.append(data)

In [129]:
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]
df_means = [df[["response_time"]].mean() for df in df_array_independent]
df_stds = [df[["response_time"]].std() for df in df_array_independent]
df_len_b4 = [len(df) for df in df_array_independent]

corects = sum([len(df) for df in df_array_independent])
print("Before Outlier removal: " + str(sum(df_len_b4)))

Before Outlier removal: 650


In [130]:
for idx, df in enumerate(df_array_independent):
    mean = float(df_means[idx])
    stddev = float(df_stds[idx])
    for idx, row in df.iterrows():
        response_time = row["response_time"]
        if not(30 < response_time < mean+2*stddev):
            row["flag"] = "outlier"

In [131]:
dfs_length = [len(df.loc[df["flag"]=="outlier"]) for df in df_array_independent]
print("Number of Outliers: " + str(sum(dfs_length)))

Number of Outliers: 78


In [132]:
df_means = [float(df.loc[df["flag"]!="outlier"][["response_time"]].mean()) for df in df_array_independent]
df_stds = [float(df.loc[df["flag"]!="outlier"][["response_time"]].std()) for df in df_array_independent]
df_len_after = [len(df.loc[df["flag"]!="outlier"]) for df in df_array_independent]

df_deleted_amount = [df_len_b4[idx]-df_len_after[idx] for idx in range(len(df_len_after))]
print("means without outliers: " + str(df_means))

means without outliers: [198.0921985815603, 200.3082191780822, 150.1015625, 194.59235668789808]


In [133]:
for idx, df in enumerate(df_array_independent):
    df.loc[df["flag"] == "outlier", "response_time"] = int(df_means[idx])

In [134]:
cols = ["id", "response_time", "correctness", "flag"]

for idx1, df in enumerate(df_array_independent):
    for idx2, row in df.iterrows():
        id_value = row["id"]
        entries_id_BU_R = df_BU_R.loc[(df_BU_R["id"] == id_value) & (df_BU_R["flag"] == "outlier")]
        entries_id_BU_I = df_BU_I.loc[(df_BU_I["id"] == id_value) & (df_BU_I["flag"] == "outlier")] 
        entries_id_TD_R = df_TD_R.loc[(df_TD_R["id"] == id_value) & (df_TD_R["flag"] == "outlier")]
        entries_id_TD_I = df_TD_R.loc[(df_TD_R["id"] == id_value) & (df_TD_R["flag"] == "outlier")]
        
        df_per_participant = entries_id_BU_R
        df_per_participant = df_per_participant.append(entries_id_BU_I)
        df_per_participant = df_per_participant.append(entries_id_TD_R)
        df_per_participant = df_per_participant.append(entries_id_TD_I)
        
        number_of_outliers_per_participant = len(df_per_participant)
        
        if number_of_outliers_per_participant > 3:
            print("Participant with the following number got deleted: " + str(id_value) + ". Because " + str(number_of_outliers_per_participant) + " where missing.")
            df_BU_R = df_BU_R.loc[df_BU_R["id"]!=id_value]
            df_BU_I = df_BU_I.loc[df_BU_I["id"]!=id_value]
            df_TD_R = df_TD_R.loc[df_TD_R["id"]!=id_value]
            df_TD_I = df_TD_I.loc[df_TD_I["id"]!=id_value]
            
df_array_independent = [df_BU_R, df_BU_I, df_TD_R, df_TD_I]

Participant with the following number got deleted: 712. Because 5 where missing.
Participant with the following number got deleted: 785. Because 4 where missing.
Participant with the following number got deleted: 794. Because 4 where missing.
Participant with the following number got deleted: 842. Because 7 where missing.
Participant with the following number got deleted: 821. Because 4 where missing.
Participant with the following number got deleted: 880. Because 4 where missing.


In [135]:
df_len_after2 = [len(df) for df in df_array_independent]
df_deleted_amount = [df_len_b4[idx]-df_len_after2[idx] for idx in range(len(df_len_after2))]
print("Deleted per Style: " + str(df_deleted_amount))

Deleted per Style: [10, 9, 10, 9]


In [136]:
total_answers = [len(df) for df in df_array_independent]
total_correct = [len(df.loc[df["correctness"]==True]) for df in df_array_independent]
percentage = [float(correct)/float(total) for correct, total in zip(total_correct, total_answers)]
print("numer of total correct answers: " + str(total_correct))
print("numer of total answers: " + str(total_answers))
print("percentages of correct answers: " + str(percentage))

numer of total correct answers: [83, 91, 89, 107]
numer of total answers: [149, 151, 153, 159]
percentages of correct answers: [0.5570469798657718, 0.6026490066225165, 0.5816993464052288, 0.6729559748427673]


In [137]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
cols = ["Subject","ProgrammingStyle", "Comprehension", "ResponseTime", "Algorithm", "Flag", "Correctness", "ClickData"]

def fill(list_data, dataframe, coding_style, comprehension_style):
    for idx, row in dataframe.iterrows():
        response_time = row["response_time"]
        algorithm = row["algo_name"]
        flag = row["flag"]
        click_data = row["click_data"]
        correctness = row["Correctness"]
        id = row["id"]
        list_data.append([id,coding_style, comprehension_style, response_time, algorithm, flag, correctness, click_data])

data = []
fill(data, df_BU_R, "R", "BU")
fill(data, df_BU_I, "I", "BU")
fill(data, df_TD_R, "R", "TD")
fill(data, df_TD_I, "I", "TD")
df = pd.DataFrame(data, columns=cols)
df.to_excel("preprocessed.xlsx", index=False)

In [138]:
df_BU_R

Unnamed: 0,id,response_time,correctness,flag,algo_name,click_data
0,638,296,True,,BinarySearch,136-143 182-47 475-78 284-96 190-147 437-143 1...
0,750,198,True,outlier,BinarySearch,464-76 144-124 406-176 534-235 307-273 541-370...
0,784,438,True,,BinarySearch,176-32 413-60 216-110 426-77 209-116 286-166 5...
0,829,449,False,,BinarySearch,197-66 457-40 232-133 383-167 152-140 194-198 ...
0,841,123,True,,BinarySearch,195-40 450-92 201-104 196-137 535-171 208-217 ...
...,...,...,...,...,...,...
0,889,69,True,,ReverseString,77-54 266-109 54-19 125-87 411-91 256-128 480-...
0,895,69,True,,ReverseString,150-61 147-29 170-60 432-82 301-105 397-136 21...
0,938,198,True,outlier,ReverseString,124-23 143-87 401-105 200-130 389-150 435-102 ...
0,948,174,True,,ReverseString,152-82 402-73 388-125 322-141 329-61 331-113 3...
