In [1]:
import pandas as pd
import numpy as np
from os.path import join as PJOIN

In [2]:
PREFIX = "comments"
PROJECT_NAME = "libpng"
SUFFIX = "all_marked"
FILE_NAME = PREFIX + "_" + PROJECT_NAME + "_" + SUFFIX + ".xlsx"
FILE_PATH = PJOIN("DATA","ANNOTATED",FILE_NAME)
OUTPUT_FILE_PATH = PJOIN("DATA","GENERATED",FILE_NAME)
MAP = {'U':'U', 'PU':'P', 'NU':'N'}

# Rules

In [3]:
# c is a vector of size 31 [comment text, C1, C2, ......., C30]
def get_label(c):
    #IF C18 OR C19 OR C20  OR  C21  OR C22 OR C28 OR C29 THEN  U
    if c[18] or c[19] or c[20] or c[21] or c[22] or c[28] or c[29]:
        return 'U'
    #IF C9 AND C3 THEN U
    if c[9] and c[3]:
        return 'U'
    #IF C11 AND C3 THEN U
    if c[11] and c[3]:
        return 'U'
    #IF (C25 OR C23 OR C26 OR C27 ) AND C3 THEN U
    if (c[25] or c[23] or c[26] or c[27]) and c[3]:
        return 'U'
    #IF (C25 OR C23 OR C26 OR C27) AND C4 AND (C9  OR C11) THEN U
    if (c[25] or c[23] or c[26] or c[27]) and c[4] and (c[9] or c[11]):
        return 'U'
    #IF C10 AND C15  AND C3 THEN U
    if c[10] and c[15] and c[3]:
        return 'U'
    #IF C8 AND C15  AND C3 THEN U
    if c[8] and c[15] and c[3]:
        return 'U'
    #IF (C18 OR C19 OR C20 OR C21) AND C17 THEN U
    if (c[18] or c[19] or c[20] or c[21]) and c[17]:
        return 'U'
    
    #IF C9 AND (C4 OR C5) THEN PU
    if c[9] and (c[4] or c[5]):
        return 'PU'
    #IF C11 AND (C4  OR C5) THEN PU
    if c[11] and (c[4] or c[5]):
        return 'PU'
    #IF C10 AND C14  AND C3 THEN PU
    if c[1] and c[14] and c[3]:
        return 'PU'
    #IF C8 AND C14  AND C3 THEN PU
    if c[8] and c[14] and c[3]:
        return 'PU'
    #IF C17 AND NOT (C18 OR C19 OR C20 OR C21)  THEN PU
    if c[17] and not((c[18] or c[19] or c[20] or c[21])):
        return 'PU'
    
    #IF C12 OR C16 THEN  NU
    if c[12] or c[16]:
        return 'NU'
    #IF (C8 OR C9)  AND C1 THEN NU
    if (c[8] or c[9]) and c[1]:
        return 'NU'
    #IF (C10 OR C11) AND C1 THEN NU
    if (c[10] or c[11]) and c[1]:
        return 'NU'
    #IF C10 AND C15  AND C1 THEN U
    if c[10] and c[15] and c[1]:
        return 'U'
    #IF C8 AND C15  AND C1 THEN U
    if c[8] and c[15] and c[1]:
        return 'U'
    
    #Low Problem Domain AND Low Scope AND Concepts don't match Symbols = U
    if c[10] and c[15] and c[3]:
        return 'U'
    #Low Problem Domain AND Low Scope AND Concepts Partially Match = U
    elif c[10] and c[15] and c[4]:
        return 'U'
    #Low  Problem Domain AND Concepts Partially Match = PU
    elif c[10] and c[4]:
        return 'PU'
    #Low Program Domain AND Concepts Partially Match = PU
    elif c[8] and c[4]:
        return 'PU'
    else:
        return 'NU'
    
    print("###NOMATCH")
    return 'NU'
    


In [4]:
exl_file = pd.read_excel(FILE_PATH)
exl_file.head()

Unnamed: 0,Filename,Comment text,Start line,End line,No. of words,Program Domain Concepts,Problem Domain Concepts,Copyright/License,Bug/Fix/Patch/Version,Build,...,C22,C23,C24,C25,C26,C27,C28,C29,C30,Score
0,repos/libpng-code/pngset.c,pngset.c - storage of image information into i...,2,17,116,"{'storage': ['storag', 'Time Complexity / Spac...","['image', 'libpng']",True,False,False,...,,,,,,,1.0,,,U
1,repos/libpng-code/pngset.c,override with app values,62,61,4,"{'values': ['valu', 'Data-Structure and its Co...",[],False,False,False,...,,,,,,,,,,P
2,repos/libpng-code/pngset.c,FLOATING_POINT,133,133,1,{},[],False,False,False,...,,,,,,,,,,N
3,repos/libpng-code/pngset.c,cHRM,135,135,1,{},[],False,False,False,...,,,,,,,,,,P
4,repos/libpng-code/pngset.c,eXIf,182,182,1,{},[],False,False,False,...,,,,,,,,,,P


In [5]:
exl_np = np.array(exl_file)

In [6]:
classes = exl_np[:,[1]+list(range(16,46))]
#classes[0]

In [7]:
for c in classes:
    for i in range(1,31):
        if np.isnan(c[i]):
            c[i] = False
        elif int(c[i]) == 1:
            c[i] = True
        else:
            c[i] = False

In [8]:
#classes[0]

In [9]:
labels = []
for c in classes:
    labels.append(MAP[get_label(c)])

In [10]:
exl_file['Calculated Score'] = labels

In [12]:
exl_file.to_excel(OUTPUT_FILE_PATH, index=False)

# Analysis

In [10]:
intuitive_labels = exl_np[:,-1]

In [11]:
lables_np = np.array(labels)

0.15689738236711095

In [35]:
num_nan = np.sum(np.array([x !=x for x in intuitive_labels]))

In [36]:
sum(lables_np == intuitive_labels)/(len(lables_np) - num_nan)

0.4865537848605578

In [17]:
losses = {}

In [18]:
for i in range(len(intuitive_labels)):
    if intuitive_labels[i]!=intuitive_labels[i]:
        continue
    li = intuitive_labels[i]
    lc = lables_np[i]
    if li not in losses:
        losses[li] = {}
    if lc not in losses[li]:
        losses[li][lc] = 0
    losses[li][lc] += 1

In [19]:
losses

{'N': {'N': 161, 'P': 1, 'U': 121},
 'P': {'N': 272, 'P': 6, 'U': 131},
 'U': {'N': 414, 'P': 92, 'U': 810}}