In [1]:
# Import relevant packages
import pandas as pd
import numpy as np
import scipy.stats as stats
import statistics

In [3]:
# Do security patterns really help designers?

# FDR correction for the six per-task correctness p-values reported by the authors.
# The p-values are listed in descending order; each is compared with rank/6 * 0.05.
# 0.691 -> 6/6*0.05 = 0.05
# 0.367 -> 5/6*0.05 = 0.042
# 0.185 -> 4/6*0.05 = 0.033
# 0.101 -> 3/6*0.05 = 0.025
# 0.037 -> 2/6*0.05 = 0.017 (not significant)
# 0.001 -> 1/6*0.05 = 0.008 (only this one is significant)

# Correctness and number of covered misuse cases
data = pd.read_csv('Do security patterns really help designers/security.csv',sep=',',decimal='.')

# Task identifiers that are analysed one by one further down.
TASK_IDS = ['B', 'C', 'D', 'E', 'F', 'G']

# --- Per-participant totals -------------------------------------------------
# Split by treatment, drop the treatment column and sum every remaining column
# per participant code.
group1 = data[data.patterns == 'Y'].drop('patterns', axis=1).reset_index(drop=True)
group1AggSum = group1.groupby(['Code']).sum()

group2 = data[data.patterns == 'N'].drop('patterns', axis=1).reset_index(drop=True)
group2AggSum = group2.groupby(['Code']).sum()

# NOTE(review): scipy.stats.ranksums does not handle ties; ncovered and
# correctness look like count data, so ties are likely. scipy.stats.mannwhitneyu
# is the tie-aware alternative. The call is left as-is so the numbers stay
# comparable with earlier runs -- confirm which test is intended.
print('Mean Number of Caught Misuse Cases (with patterns):',statistics.mean(group1AggSum.ncovered))
print('Mean Number of Caught Misuse Cases (no patterns):',statistics.mean(group2AggSum.ncovered))
print('Significance test (Wilcoxon):',stats.ranksums(group1AggSum.ncovered, group2AggSum.ncovered))
print("\n")
print('Mean Number of Correctness (with patterns):',statistics.mean(group1AggSum.correctness))
print('Mean Number of Correctness (no patterns):',statistics.mean(group2AggSum.correctness))
print('Significance test (Wilcoxon):',stats.ranksums(group1AggSum.correctness, group2AggSum.correctness))
print("\n")

# --- Per-task analysis (rerun of the authors' analysis) ---------------------
# The per-task frames keep their historical names (group1B ... group2G) so that
# anything else in the notebook that refers to them keeps working; they are
# built in one pass instead of one hand-written line per task.
group1 = data[data.patterns == 'Y']
group1B, group1C, group1D, group1E, group1F, group1G = (
    group1[group1.taskID == task].reset_index(drop=True) for task in TASK_IDS
)

group2 = data[data.patterns == 'N']
group2B, group2C, group2D, group2E, group2F, group2G = (
    group2[group2.taskID == task].reset_index(drop=True) for task in TASK_IDS
)

# (task id, with-patterns frame, no-patterns frame) for each task, in task order.
per_task = list(zip(
    TASK_IDS,
    (group1B, group1C, group1D, group1E, group1F, group1G),
    (group2B, group2C, group2D, group2E, group2F, group2G),
))

for task, with_patterns, no_patterns in per_task:
    print(f'Significance test (Wilcoxon) {task} (misuse cases):',stats.ranksums(with_patterns.ncovered, no_patterns.ncovered))
print("\n")
for task, with_patterns, no_patterns in per_task:
    print(f'Significance test (Wilcoxon) {task} (correctness):',stats.ranksums(with_patterns.correctness, no_patterns.correctness))

Mean Number of Caught Misuse Cases (with patterns): 8.03125
Mean Number of Caught Misuse Cases (no patterns): 6.625
Significance test (Wilcoxon): RanksumsResult(statistic=1.8663743904652719, pvalue=0.06198900352317222)


Mean Number of Correctness (with patterns): 4.09375
Mean Number of Correctness (no patterns): 3
Significance test (Wilcoxon): RanksumsResult(statistic=3.5246278956628334, pvalue=0.00042407804151509186)


Significance test (Wilcoxon) B (misuse cases): RanksumsResult(statistic=1.557461211615052, pvalue=0.1193610229091287)
Significance test (Wilcoxon) C (misuse cases): RanksumsResult(statistic=1.1495120012027238, pvalue=0.2503449209783549)
Significance test (Wilcoxon) D (misuse cases): RanksumsResult(statistic=0.9251872672604672, pvalue=0.3548685054869629)
Significance test (Wilcoxon) E (misuse cases): RanksumsResult(statistic=0.18844459036110225, pvalue=0.8505281477251081)
Significance test (Wilcoxon) F (misuse cases): RanksumsResult(statistic=0.5887963117081294, pvalue=

In [6]:
# Total of the per-participant `ncovered` sums held in group1AggSum.
# This relies on group1AggSum as built from security.csv (patterns == 'Y');
# the time.csv cell later rebinds the same name, so run this before that cell.
sum(group1AggSum['ncovered'])

257

In [93]:
# Do security patterns really help designers?
# Time (our paper), resp. Productivity (original paper)
data = pd.read_csv('Do security patterns really help designers/time.csv',sep=',',decimal='.')

# Task identifiers that are analysed one by one further down.
TASK_IDS = ['B', 'C', 'D', 'E', 'F', 'G']


def _finite_times(frame):
    """Return the `time` column of `frame` restricted to finite values.

    NaN and +/-inf entries are dropped, so the normality check and the
    rank-sum test for a task are computed on exactly the same observations.
    """
    times = frame.time
    return times[np.isfinite(times)]


# --- Per-participant totals -------------------------------------------------
# Split by treatment, drop the treatment column and sum every remaining column
# per participant code.
# NOTE(review): groupby(...).sum() skips NaN. The per-task checks below filter
# non-finite times, which suggests some are missing; a participant with a
# missing task time would then get a total covering fewer tasks -- confirm
# this is acceptable for the aggregated comparison.
group1 = data[data.patterns == 'Y'].drop('patterns', axis=1).reset_index(drop=True)
group1AggSum = group1.groupby(['Code']).sum()

group2 = data[data.patterns == 'N'].drop('patterns', axis=1).reset_index(drop=True)
group2AggSum = group2.groupby(['Code']).sum()

print('Time (with patterns):',statistics.mean(group1AggSum.time))
print('Time (no patterns):',statistics.mean(group2AggSum.time))

print('Normality Check:',stats.shapiro(group1AggSum.time))
print('Normality Check:',stats.shapiro(group2AggSum.time))

print('Significance test (t-test):',stats.ttest_ind(group1AggSum.time, group2AggSum.time))

# --- Per-task analysis (rerun of the authors' analysis) ---------------------
# The per-task frames keep their historical names (group1B ... group2G) so that
# anything else in the notebook that refers to them keeps working.
group1 = data[data.patterns == 'Y']
group1B, group1C, group1D, group1E, group1F, group1G = (
    group1[group1.taskID == task].reset_index(drop=True) for task in TASK_IDS
)
with_patterns_frames = (group1B, group1C, group1D, group1E, group1F, group1G)
for task, frame in zip(TASK_IDS, with_patterns_frames):
    print(f'Normality Check Y/{task}:',stats.shapiro(_finite_times(frame)))
print("\n")

group2 = data[data.patterns == 'N']
group2B, group2C, group2D, group2E, group2F, group2G = (
    group2[group2.taskID == task].reset_index(drop=True) for task in TASK_IDS
)
no_patterns_frames = (group2B, group2C, group2D, group2E, group2F, group2G)
for task, frame in zip(TASK_IDS, no_patterns_frames):
    print(f'Normality Check N/{task}:',stats.shapiro(_finite_times(frame)))
print("\n")

# The rank-sum test uses the same finite-only observations as the normality
# checks above. Passing the raw column would let NaN entries either enter the
# ranking or turn the whole result into NaN, depending on the SciPy version.
for task, with_patterns, no_patterns in zip(TASK_IDS, with_patterns_frames, no_patterns_frames):
    print(f'Significance test (Wilcoxon) {task}:',stats.ranksums(_finite_times(with_patterns), _finite_times(no_patterns)))

Time (with patterns): 10102.125
Time (no patterns): 8716.34375
Normality Check: (0.9436916708946228, 0.09532733261585236)
Normality Check: (0.9610158801078796, 0.2927635908126831)
Significance test (t-test): Ttest_indResult(statistic=1.2421792065484774, pvalue=0.21884861096861943)
