Calculate statistics by merge

In [1]:
%matplotlib inline
import pandas as pd
from scipy import stats
import numpy as np

import rpy2
import rpy2.robjects as ro
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.vectors import StrVector
from rpy2.rinterface import FloatSexpVector
from rpy2.robjects.conversion import localconverter
# Activate the pandas <-> R conversion layer and grab R's core packages.
pandas2ri.activate()
base = importr('base')
utils = importr('utils')

# Pick the first CRAN mirror in the list for any downloads below.
utils.chooseCRANmirror(ind=1)
packnames = ['orddom', 'dplyr', 'nortest', 'effsize']

# Install only the R packages that are not installed yet.
names_to_install = [name for name in packnames if not rpackages.isinstalled(name)]
if names_to_install:
    utils.install_packages(StrVector(names_to_install))

In [2]:
# Expose the R packages to Python. `rstats`, `nortest` and `effsize` are the
# handles used by the statistics helpers defined later in this notebook.
# NOTE(review): `orddom` and `dplyr` are imported but not referenced in the
# visible cells — confirm whether they are still needed.
orddom = importr('orddom')
dplyr = importr('dplyr')
rstats = importr('stats')
nortest = importr('nortest')
effsize = importr('effsize')

In [3]:
# Read the dataset and display its number of rows.
df = pd.read_excel("../Dataset-SBES2020.xlsx")
df.shape[0]

182273

In [4]:
# Partition the rows on the "Conflicts" column: exactly "YES" vs. anything else.
with_conflicts = df.loc[df["Conflicts"] == "YES"]
without_conflicts = df.loc[df["Conflicts"] != "YES"]
with_conflicts.shape[0], without_conflicts.shape[0]

(17819, 164454)

In [5]:
# Percentage of rows in `df` whose "Conflicts" value is "YES".
len(with_conflicts) / len(df) * 100

9.775995347637883

In [6]:
def unpaired_t_test(rx, ry, x, y):
    """Run R's unpaired ``t.test`` on two samples.

    Args:
        rx, ry: the two samples as R vectors, passed to ``rstats.t_test``.
        x, y: the same samples as Python objects; unused here, accepted so
            every test function shares one call signature.

    Returns:
        Tuple ``(statistic, p_value)`` taken from the R result.
    """
    outcome = rstats.t_test(rx, ry, paired=False)
    statistic = outcome[outcome.names.index('statistic')][0]
    p_value = outcome[outcome.names.index('p.value')][0]
    return statistic, p_value

def mann_whitney(rx, ry, x, y):
    """Run R's two-sided ``wilcox.test`` on two samples.

    Args:
        rx, ry: the two samples as R vectors, passed to ``rstats.wilcox_test``.
        x, y: the same samples as Python objects; unused here, accepted so
            every test function shares one call signature.

    Returns:
        Tuple ``(statistic, p_value)`` taken from the R result.
    """
    outcome = rstats.wilcox_test(rx, ry, alternative="two.sided")
    statistic = outcome[outcome.names.index('statistic')][0]
    p_value = outcome[outcome.names.index('p.value')][0]
    return statistic, p_value

def _magnitude_label(magnitude):
    """Return the label of the first element of an R factor.

    An R factor stores 1-based integer codes that index into its ``levels``.
    ``levels[0]`` is only the first *possible* label, so the observed label
    must be looked up through the element's code.

    Returns ``None`` when the code does not point at a level (for example
    when R reports a missing value).
    """
    levels = magnitude.levels
    code = magnitude[0]
    if not 1 <= code <= len(levels):
        return None
    return levels[code - 1]


def cohend(rx, ry, x, y):
    """Compute Cohen's d for two unpaired samples with R's ``effsize``.

    Args:
        rx, ry: the two samples as R vectors, passed to ``effsize.cohen_d``.
        x, y: the same samples as Python objects; unused here, accepted so
            every effect-size function shares one call signature.

    Returns:
        Tuple ``(estimate, magnitude_label)`` taken from the R result.
    """
    e = effsize.cohen_d(rx, ry, paired=False)
    names = e.names
    return e[names.index('estimate')][0], _magnitude_label(e[names.index('magnitude')])


def cliffsdelta(rx, ry, x, y):
    """Compute Cliff's delta for two unpaired samples with R's ``effsize``.

    Args:
        rx, ry: the two samples as R vectors, passed to ``effsize.cliff_delta``.
        x, y: the same samples as Python objects; unused here, accepted so
            every effect-size function shares one call signature.

    Returns:
        Tuple ``(estimate, magnitude_label)`` taken from the R result.
    """
    e = effsize.cliff_delta(rx, ry, paired=False)
    names = e.names
    return e[names.index('estimate')][0], _magnitude_label(e[names.index('magnitude')])



# Threshold that p-values are compared against in `calculate_row`.
P = 0.05

# Keyed by the boolean normality decision made in `calculate_row`; each value is
# (test name, test function, effect-size name, effect-size function).
UNPAIRED_TESTS = {
    True: ("Unpaired T test", unpaired_t_test, "Cohen's D", cohend),
    False: ("Mann-Whitney", mann_whitney, "Cliff's Delta", cliffsdelta),
}


# Column names for the rows built by `calculate_row`, in the same order.
HEADER = [
    # Identification
    "Project",
    "Attribute",
    # Means per group
    "W/ C Mean",
    "W/O C Mean",
    # scipy normaltest p-values and the resulting normality flags
    "W/ C Kurtosis",
    "W/O C Kurtosis",
    "W/ C Normal Kurtosis",
    "W/O C Normal Kurtosis",
    # Anderson-Darling p-values and the resulting normality flags
    "W/ C Anderson-Darling",
    "W/O C Anderson-Darling",
    "W/ C Normal Anderson-Darling",
    "W/O C Normal Anderson-Darling",
    "Inconsistencies",
    # Hypothesis test and effect size
    "Test",
    "P-value",
    "Effect Size",
    "Delta",
    "Meaning",
]

def calculate_row(project, attr, with_conflicts, without_conflicts, P=P, TESTS=UNPAIRED_TESTS):
    """Build one statistics row (see ``HEADER``) comparing two groups on ``attr``.

    Args:
        project: label stored in the first column of the row.
        attr: name of the column to compare in both frames.
        with_conflicts: frame holding the "with conflicts" group.
        without_conflicts: frame holding the "without conflicts" group.
        P: threshold applied to every p-value (normality checks and the
            hypothesis test).
        TESTS: mapping from the boolean "both groups look normal" to
            ``(test name, test function, effect name, effect function)``.

    Returns:
        A list with one value per ``HEADER`` column. When the hypothesis test
        is not significant (``pvalue >= P``), the three effect-size columns
        are ``"No"``, ``'-'`` and ``'-'``.
    """
    wc_attr = with_conflicts[attr]
    wo_attr = without_conflicts[attr]

    # R-side copies of both samples for the rpy2-backed tests.
    with localconverter(ro.default_converter + pandas2ri.converter):
        rwc_attr = ro.conversion.py2ri(wc_attr)
        rwo_attr = ro.conversion.py2ri(wo_attr)

    # Normality according to scipy's normaltest (the "Kurtosis" columns).
    wc_kurtosis_p = stats.normaltest(wc_attr).pvalue
    wo_kurtosis_p = stats.normaltest(wo_attr).pvalue
    wc_kurtosis_normal = wc_kurtosis_p >= P
    wo_kurtosis_normal = wo_kurtosis_p >= P

    # Normality according to R's Anderson-Darling test; element [1] of the
    # result is read as the p-value.
    wc_ad_p = pandas2ri.ri2py_floatvector(nortest.ad_test(rwc_attr)[1])[0]
    wo_ad_p = pandas2ri.ri2py_floatvector(nortest.ad_test(rwo_attr)[1])[0]
    wc_ad_normal = wc_ad_p >= P
    wo_ad_normal = wo_ad_p >= P

    # True when the two normality checks disagree for either group.
    inconsistent = wc_ad_normal != wc_kurtosis_normal or wo_ad_normal != wo_kurtosis_normal

    # Each flag is derived from its own group's p-value; named locals avoid
    # the shifting negative indices that come with appending to the row.
    # NOTE(review): the test is selected from the normaltest flags of both
    # groups — confirm this is the intended normality criterion.
    both_normal = bool(wc_kurtosis_normal and wo_kurtosis_normal)
    test_name, test, effect_name, effect = TESTS[both_normal]

    # The "without conflicts" sample is passed first, as in the effect size.
    s, pvalue = test(rwo_attr, rwc_attr, wo_attr, wc_attr)

    if pvalue < P:
        estimate, meaning = effect(rwo_attr, rwc_attr, wo_attr, wc_attr)
        effect_columns = [effect_name, estimate, meaning]
    else:
        effect_columns = ["No", '-', '-']

    return [
        project, attr,
        wc_attr.mean(), wo_attr.mean(),
        wc_kurtosis_p, wo_kurtosis_p,
        wc_kurtosis_normal, wo_kurtosis_normal,
        wc_ad_p, wo_ad_p,
        wc_ad_normal, wo_ad_normal,
        inconsistent,
        test_name, pvalue,
    ] + effect_columns
    
def calculate_attributes(result, project, attributes, with_conflicts, without_conflicts, P=P, TESTS=UNPAIRED_TESTS):
    """Append one ``calculate_row`` result per attribute to ``result``.

    ``result`` is mutated in place; nothing is returned.
    """
    result.extend(
        calculate_row(project, attr, with_conflicts, without_conflicts, P=P, TESTS=TESTS)
        for attr in attributes
    )
    


In [7]:
# Columns compared between the with-conflicts and without-conflicts groups.
# The "<all>" statistics are not computed here: the next cell resets `result`
# and computes them itself, so doing it in this cell only repeated the work.
attributes = ["Branching-duration", "Total-duration", "Commits B1", "Commits B2", "Committers B1", "Committers B2", "Changed Files B1", "Changed Files B2"]

In [8]:
# Rows for the whole dataset first, then one batch of rows per project.
result = []
calculate_attributes(
    result, "<all>", attributes, with_conflicts, without_conflicts,
    P=P, TESTS=UNPAIRED_TESTS,
)
for project in df["Project"].unique():
    p_wc = with_conflicts.loc[with_conflicts["Project"] == project]
    p_wo = without_conflicts.loc[without_conflicts["Project"] == project]
    calculate_attributes(
        result, project, attributes, p_wc, p_wo,
        P=P, TESTS=UNPAIRED_TESTS,
    )

len(result)

  "anyway, n=%i" % int(n))


648

In [9]:
# Turn the accumulated rows into a table whose columns are named by HEADER.
new_df = pd.DataFrame(result, columns=HEADER)

In [10]:
# Write the table to `statistics.csv` in the current working directory.
new_df.to_csv("statistics.csv")

In [11]:
# Print every project where either group has fewer than 20 rows,
# along with the size of both groups.
for project in df["Project"].unique():
    p_wc = with_conflicts.loc[with_conflicts["Project"] == project]
    p_wo = without_conflicts.loc[without_conflicts["Project"] == project]
    if min(len(p_wc), len(p_wo)) < 20:
        print(project, len(p_wc), len(p_wo))

qbittorrent 10 1310


In [12]:
 # Number of distinct values in the "Project" column.
 len(df["Project"].unique())

80

This notebook creates the `statistics.csv` file.