In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

In [3]:
# Read the two separately classified label files; df1 holds the first file,
# df2 the second.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/mauczka_label_finished.csv',
        '../data/mauczka_label_finished2.csv',
    )
)

In [4]:
# merge data
# Suffix the label columns with 1 (df1) / 2 (df2) so that both sets of labels
# end up side by side in the merged frame.
#
# rename()/drop() return new frames here instead of mutating in place, and
# drop() tolerates columns that are already gone: re-running this cell yields
# the same result instead of failing with a KeyError on the second drop.
df1 = df1.rename(columns={
    'label_internal_quality': 'label_internal_quality1',
    'label_external_quality': 'label_external_quality1',
})
df2 = (
    df2
    .rename(columns={
        'label_internal_quality': 'label_internal_quality2',
        'label_external_quality': 'label_external_quality2',
    })
    # presumably these columns are identical in df1, so only df1's copy is kept
    .drop(
        columns=['project_url', 'message', 'internal_quality', 'external_quality', 'sw_adaptive', 'project', 'has_label'],
        errors='ignore',
    )
)
# inner join: only revisions present in both files are kept
dfj = df1.merge(df2, on='revision_hash', how='inner')

In [6]:
# set consensus label to NaN
# (np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
# and raises AttributeError there)
dfj['label_internal_quality'] = np.nan
dfj['label_external_quality'] = np.nan

In [7]:
# Export the merged frame for the consensus labeling session
# (the index is not written to the file).
dfj.to_csv(path_or_buf='../data/mauczka_label_consensus.csv', index=False)

In [8]:
# Read the finished consensus file back in as `cons`.
cons = pd.read_csv(filepath_or_buffer='../data/mauczka_label_consensus_finished2.csv')

In [10]:
# Merge the labels back together. A row either went through a committee
# decision, in which case label_internal/external_quality already holds a bool,
# or it is still NaN there and can be overwritten with the raters' consensus.
# idx: rows on which both raters gave the same internal AND the same external label.
same_internal = cons['label_internal_quality1'] == cons['label_internal_quality2']
same_external = cons['label_external_quality1'] == cons['label_external_quality2']
idx = cons[same_internal & same_external].index

# Both raters agree on these rows, so rater 1's value is the shared label.
for consensus_column, rater_column in (
    ('label_internal_quality', 'label_internal_quality1'),
    ('label_external_quality', 'label_external_quality1'),
):
    cons.loc[idx, consensus_column] = cons.loc[idx, rater_column]

In [14]:
# Persist the consensus frame, now with labels filled in (index not written).
cons.to_csv(path_or_buf='../data/mauczka_label_two_authors.csv', index=False)

In [2]:
# Reload the labeled consensus data as `cons`.
cons = pd.read_csv(filepath_or_buffer='../data/mauczka_label_two_authors.csv')

In [5]:
# Export, for manual inspection of guideline differences, the rows where
# `internal_quality` differs from `label_internal_quality` and, separately,
# the rows where `external_quality` differs from `label_external_quality`.
internal_mismatch = cons['internal_quality'] != cons['label_internal_quality']
cons.loc[
    internal_mismatch,
    ['message', 'internal_quality', 'label_internal_quality'],
].to_csv('../data/internal.csv')

external_mismatch = cons['external_quality'] != cons['label_external_quality']
cons.loc[
    external_mismatch,
    ['message', 'external_quality', 'label_external_quality'],
].to_csv('../data/external.csv')

In [3]:
# set identified guideline differences
#
# The numbers below are index labels of `cons`, grouped by the reason the row
# was identified as a guideline difference; the trailing (n) is the group size.
# NOTE(review): with the default index from read_csv these are row positions in
# the CSV, so they presumably need re-checking if that file's row order changes.
GUIDELINE_DIFFERENCE_ROWS = {
    # guideline differences for corrective
    'test addition or changes labeled as bug': [
        201, 210, 213, 244, 284, 316,
    ],  # (6)
    # guideline differences for perfective
    'license changes labeled as perfective': [
        0, 1, 2, 3, 4, 5, 6, 57, 61, 71, 75, 109, 124, 221, 249,
    ],  # (15)
    'bugfix labeled as perfective': [
        18, 19, 20, 21, 25, 28, 193, 197,
    ],  # (8)
    'repository work, merging, tagging labeled as perfective': [
        24, 37, 38, 152, 160, 161, 162, 163, 164, 165, 166, 167, 168,
    ],  # (13)
    'build configuration not labeled as perfective and release repository work': [
        103, 104, 108, 182, 281, 282, 318, 322, 323,
    ],  # (9)
    'empty commit message': [
        66, 128,
    ],  # (2)
}
guideline_rows = [row for rows in GUIDELINE_DIFFERENCE_ROWS.values() for row in rows]

# A scalar `cons.loc[label, column] = True` on a label that does not exist
# appends a new, otherwise empty row instead of failing, so verify the labels
# up front and fail loudly if the data no longer matches this table.
unknown_rows = sorted(set(guideline_rows).difference(cons.index))
if unknown_rows:
    raise KeyError(f'guideline difference rows not found in cons: {unknown_rows}')

cons['guideline_differences'] = False
cons.loc[guideline_rows, 'guideline_differences'] = True

In [4]:
# Combined disagreement: among the rows not flagged as guideline differences,
# count those where `internal_quality` or `external_quality` differs from the
# corresponding `label_*` column, and print the resulting agreement percentage.
consg = cons.loc[cons['guideline_differences'].eq(False)].copy()
internal_differs = consg['internal_quality'].ne(consg['label_internal_quality'])
external_differs = consg['external_quality'].ne(consg['label_external_quality'])
disagreements = consg.loc[internal_differs | external_differs]

n_rows = len(consg)
n_disagreements = len(disagreements)
print('combined disagreements', n_disagreements, '/', n_rows)
print('agreement percent', (n_rows - n_disagreements) * 100 / n_rows)

combined disagreements 70 / 286
agreement percent 75.52447552447552


In [11]:
# Independent copy of the four quality columns, restricted to rows that are
# not flagged as guideline differences.
tmp = cons.loc[
    cons['guideline_differences'].eq(False),
    ['internal_quality', 'external_quality', 'label_internal_quality', 'label_external_quality'],
].copy()
def cat1(row):
    """Collapse a row's `internal_quality` / `external_quality` flags into one category.

    The internal flag takes precedence: a truthy `internal_quality` gives
    'perfective' without reading `external_quality`; otherwise a truthy
    `external_quality` gives 'corrective'; otherwise the result is 'neither'.

    NOTE(review): plain truthiness is used, so a missing value (NaN) counts as
    set and is mapped to the corresponding category.
    """
    precedence = (
        ('internal_quality', 'perfective'),
        ('external_quality', 'corrective'),
    )
    for column, category in precedence:
        if row[column]:
            return category
    return 'neither'

def cat2(row):
    """Collapse a row's `label_internal_quality` / `label_external_quality` flags into one category.

    The internal flag takes precedence: a truthy `label_internal_quality` gives
    'perfective' without reading `label_external_quality`; otherwise a truthy
    `label_external_quality` gives 'corrective'; otherwise the result is 'neither'.

    NOTE(review): plain truthiness is used, so a missing value (NaN) counts as
    set and is mapped to the corresponding category.
    """
    precedence = (
        ('label_internal_quality', 'perfective'),
        ('label_external_quality', 'corrective'),
    )
    for column, category in precedence:
        if row[column]:
            return category
    return 'neither'

# Derive one category per row from each pair of flag columns (cat1 reads the
# *_quality columns, cat2 the label_*_quality columns), then compute Cohen's
# kappa between the two resulting category columns.
for category_column, categorize in (('cat1_label', cat1), ('cat2_label', cat2)):
    tmp[category_column] = tmp.apply(categorize, axis=1)
cohen_kappa_score(
    tmp['cat1_label'],
    tmp['cat2_label'],
    labels=['neither', 'corrective', 'perfective'],
)

0.6250698849067879

In [22]:
# show inter-rater agreement
def user1(row):
    """Collapse a row's `label_internal_quality1` / `label_external_quality1` flags into one category.

    The internal flag takes precedence: a truthy `label_internal_quality1` gives
    'perfective' without reading `label_external_quality1`; otherwise a truthy
    `label_external_quality1` gives 'corrective'; otherwise the result is 'neither'.

    NOTE(review): plain truthiness is used, so a missing value (NaN) counts as
    set and is mapped to the corresponding category.
    """
    precedence = (
        ('label_internal_quality1', 'perfective'),
        ('label_external_quality1', 'corrective'),
    )
    for column, category in precedence:
        if row[column]:
            return category
    return 'neither'

def user2(row):
    """Collapse a row's `label_internal_quality2` / `label_external_quality2` flags into one category.

    The internal flag takes precedence: a truthy `label_internal_quality2` gives
    'perfective' without reading `label_external_quality2`; otherwise a truthy
    `label_external_quality2` gives 'corrective'; otherwise the result is 'neither'.

    NOTE(review): plain truthiness is used, so a missing value (NaN) counts as
    set and is mapped to the corresponding category.
    """
    precedence = (
        ('label_internal_quality2', 'perfective'),
        ('label_external_quality2', 'corrective'),
    )
    for column, category in precedence:
        if row[column]:
            return category
    return 'neither'

# Work on an independent copy of the two raters' label columns (all rows of
# `cons`), derive one category per rater and row, and compute Cohen's kappa
# between the two raters' categories.
rater_columns = [
    'label_internal_quality1',
    'label_external_quality1',
    'label_internal_quality2',
    'label_external_quality2',
]
cdf = cons[rater_columns].copy()
for rater_label_column, categorize in (('user1_label', user1), ('user2_label', user2)):
    cdf[rater_label_column] = cdf.apply(categorize, axis=1)

cohen_kappa_score(
    cdf['user1_label'],
    cdf['user2_label'],
    labels=['neither', 'corrective', 'perfective'],
)

0.621105144072186