In [2]:
import pandas as pd
import regex as re
import ast

In [3]:
# Load the variable catalogue (tab-separated file). Later cells look variables up
# in this frame through its "ID", "Name", "Scale", "Aggregation" and "Mapping" columns.
variables_overview = pd.read_csv('../input/variables_overview.csv', sep='\t')
# Preview the first rows as the cell output.
variables_overview.head()

Unnamed: 0,ID,Name,Scale,Collection,Aggregation,Note,Description,Mapping,Project exclusive,Incomparable,Project dependent
0,F1,Bug ID,Nominal,automated,,,Identifier of the bug within the issue trackin...,,,True,
1,F2,Bug severity,Nominal,automated,max,Project exclusivity (only available in NOVA),"Severity of the bug, e.g., major or minor",,True,True,
2,F3,Bug labels,List of nominals,automated,,,"Additional labels of the bug, e.g., components...",,,True,True
3,F4,# Bug discussants,Integer,automated,,,Number of people involved in the reporting and...,,,True,
4,F4.1,Bug discussant roles,List of nominals,automated,,,,,,True,


In [4]:
# Load the per-bug table (tab-separated file). Later cells read its "Bug_ID",
# "BIC_IntroducingIssue"/"CG_IntroducingIssue" and "BIC_CodeReview"/"CG_CodeReview" columns.
all_information = pd.read_csv('../input/all_information.csv', sep='\t')
# Preview the first rows as the cell output.
all_information.head()

Unnamed: 0,Bug_ID,Project,Notes,Duplicate_Bug_IDs,BIC,BFC,BIC_CodeReview,BIC_IntroducingIssue,BIC_Files,BIC_Files_ManualMatch,BIC_BUG_ML,BIC_II_ML,BIC_OtherMedia,CGC,CG_CodeReview,CG_IntroducingIssue,CG_OtherMedia,CG_II_ML
0,Elastic Search 1134,elasticsearch,,[],61ad8b614a94dabf8a263cf1edd35faa50ede36d,cbb1c35f94a36e8871301fce435d516db3cd4256,[],[1104],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],10660d390d760cde78fb115107c04e1cf104a4da,[],[],[],[]
1,Elastic Search 1154,elasticsearch,,[],b3337c312765e51cec7bde5883bbc0a08f56fb65,31ea01bbc68f64cd7787b97479cb5deba1b529b9,[],[],[ modules/elasticsearch/src/main/java/org/elas...,"{ ""modules/elasticsearch/src/main/java/org/ela...",[],[],[],7a38e384c9ff783f17e9db92a730bdf654dbda6b,[],[],[],[]
2,Elastic Search 1162,elasticsearch,,[],d4547c629f53ad76ea463dc0acb1f26f0a2b784b,b70694ce631d7b55be6edd7b9049237456a6e4b4,[],[],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],e5b041c8efd408fdc71fd2f2c84439e5a4985244,[],[],[],[]
3,Elastic Search 12193,elasticsearch,,[],15a62448343fd24f8e63f43b1e4b16f50005e4a5,2ea45fd753b89c12431dab08c4827835c616cc1b,[],[],[ core/pom.xml ],{},[],[],[],e88535a67e9594f3135465b5021ba9b502fef950,[],[],[],[]
4,Elastic Search 1380,elasticsearch,,[],adc3dc0e994ababa7917c81a61cc93f4690060d4,bd87f8de3ac84eb408d5ada0976664545c9228a0,[],[873],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],16a046f686c26309ee0041df8987a4d2ffedd956,[],[],[],[]


In [5]:
# Every non-missing bug id contributes two entries: the id itself and a
# "<id> - CG" counterpart appended after all plain ids.
all_bug_ids = list(all_information['Bug_ID'].dropna().values)
all_bug_ids.extend([f'{bug_id} - CG' for bug_id in all_bug_ids])

# Sanity check: the rest of the notebook expects exactly 142 ids.
if not len(all_bug_ids) == 142:
    raise ValueError(f'Number of bug ids is not 142 ({len(all_bug_ids)})')

In [6]:
# Coded values that get_bool_or_none / get_any_bool_or_none map to True and False.
# Membership is tested with `in`, i.e. by equality against these exact entries
# (string matches are therefore case-sensitive).
true_values = ['yes', 'true', True]
false_values = ['no', 'false', False]

def get_bool_or_none(x: pd.Series) -> bool:
    """Reduce a group that should hold exactly one coded value to a boolean.

    Parameters
    ----------
    x : pd.Series
        The values of one group.

    Returns
    -------
    bool or None
        True if the single value is listed in ``true_values``, False if it is
        listed in ``false_values``. None if the group is empty, if it holds
        more than one value (a warning is printed in that case), or if the
        value appears in neither list.
    """
    group_size = len(x)

    # An empty group has nothing to inspect; x.iloc[0] would raise IndexError.
    if group_size == 0:
        return None

    if group_size > 1:
        print(f'--- Warning ---\nExpected one boolean but found {len(x)}\n--- xxxxxxx ---')
        return None

    y = x.iloc[0]

    if y in true_values:
        return True
    if y in false_values:
        return False
    return None

def get_any_bool_or_none(x: pd.Series) -> bool:
    """Aggregate a group of coded values with "any" semantics.

    Returns True as soon as one value is listed in ``true_values``; otherwise
    False if at least one value is listed in ``false_values``; otherwise None.
    """
    # First pass: a single affirmative value decides the whole group.
    for y in x:
        if y in true_values:
            return True

    # Second pass: without any affirmative value, one negative value is enough.
    for y in x:
        if y in false_values:
            return False

    return None

def str_to_list(s: str) -> list:
    """Split the textual form of a list (e.g. ``'[a, b]'``) into its items.

    Newlines are removed, leading/trailing square brackets are stripped, the
    remainder is split on commas, and each piece is trimmed of surrounding
    spaces. Empty pieces are dropped, so ``'[]'`` yields ``[]``.
    """
    without_newlines = s.replace('\n', '')
    inner = without_newlines.strip('[]')

    items = []
    for piece in inner.split(','):
        piece = piece.strip(' ')
        if piece != '':
            items.append(piece)
    return items

def first_element_of_str_list(s: str) -> str:
    """Return the first item of a list given in textual form, or None if it has no items.

    The string is parsed with ``str_to_list``.
    """
    items = str_to_list(s)
    return items[0] if items else None

def get_main_tuples(variable_group_id):
    """Build the (bug id, main target) pairs for a variable group.

    Each bug id from ``all_information`` is paired with its target twice: once
    under the plain id and once under ``'<id> - CG'``.

    * ``'f'``: the target is the last space-separated token of the bug id;
      rows without a bug id are skipped.
    * ``'i'``: the target is the first entry of ``BIC_IntroducingIssue`` /
      ``CG_IntroducingIssue``.
    * ``'r'``: the target is the first entry of ``BIC_CodeReview`` /
      ``CG_CodeReview``.

    For ``'i'`` and ``'r'`` rows whose list cell is missing are skipped; an
    empty list yields a target of None.

    Raises
    ------
    ValueError
        For any other ``variable_group_id``.
    """
    if variable_group_id == 'f':
        # Main equals bug_id
        known_ids = [bug_id for bug_id in all_information['Bug_ID'] if pd.notna(bug_id)]
        plain_pairs = [(bug_id, bug_id.split(' ')[-1]) for bug_id in known_ids]
        cg_pairs = [(f'{bug_id} - CG', bug_id.split(' ')[-1]) for bug_id in known_ids]
        return plain_pairs + cg_pairs

    if variable_group_id == 'i':
        # Choose the introducing issue in order of list in all_information. In cases where
        # there are multiple, the first one will be I2 feature, followed by I2 improvement.
        # In the only case where I2 bugs are involved, all introducing issues are bugs.
        bic_column, cg_column = 'BIC_IntroducingIssue', 'CG_IntroducingIssue'
    elif variable_group_id == 'r':
        # Choose the code review for the main branch (first one in list from all_information)
        bic_column, cg_column = 'BIC_CodeReview', 'CG_CodeReview'
    else:
        raise ValueError(f'get_main_tuples not implemented for {variable_group_id}')

    candidates = [(bug_id, raw_list) for bug_id, raw_list in all_information[['Bug_ID', bic_column]].values]
    candidates += [(f'{bug_id} - CG', raw_list) for bug_id, raw_list in all_information[['Bug_ID', cg_column]].values]

    return [(bug_id, first_element_of_str_list(raw_list)) for bug_id, raw_list in candidates if pd.notna(raw_list)]

In [7]:
def get_variable_title(variable_id, variable_aggregation=pd.NA):
    """Build the column title ``'<ID> - <Name>'`` for a variable.

    The name is looked up in ``variables_overview`` by ID (first match). When
    ``variable_aggregation`` is not missing, ``' [<aggregation>]'`` is appended.
    """
    matching_rows = variables_overview[variables_overview["ID"] == variable_id]
    title = f'{variable_id} - {matching_rows["Name"].values[0]}'

    if pd.notna(variable_aggregation):
        title += f' [{variable_aggregation}]'
    return title

In [8]:
def _join_non_null(row):
    """Join the non-null entries of one row into a single '_'-separated string."""
    row = row.dropna()
    return '_'.join(row.astype(str))


def get_manual_variable_df(variable_group_id):
    """Collect the manually labelled variables of one variable group.

    Reads ``manual_labeling/output/disagreements_resolved/<group>_variables.xlsx``,
    where each sheet holds one variable. Rows whose ``Agreement`` cell equals
    ``'_____'`` or ``False`` are discarded. The remaining rows are coded by
    joining their ``Target type``, ``Topic`` and ``Action`` cells, and then
    aggregated per ``Bug_ID`` according to the variable's ``Scale`` and
    ``Aggregation`` entries in ``variables_overview``:

    * Scale starting with ``List``: list of the unique codes.
    * Scale starting with ``Boolean``: ``get_any_bool_or_none`` when the
      aggregation is ``"any"``, otherwise ``get_bool_or_none``; the result is
      stored as a nullable boolean rendered as string.
    * anything else: with aggregation ``"main"`` only rows whose
      ``(Bug_ID, Target)`` pair is returned by ``get_main_tuples`` are kept;
      with no aggregation all rows are kept.

    Parameters
    ----------
    variable_group_id : str
        Group identifier used in the workbook file name.

    Returns
    -------
    tuple
        ``(variable_group, manual_variable_ids)``: a frame keyed by
        ``'F1 - Bug ID'`` with one column per variable, and the list of
        variable IDs that were read. If the workbook does not exist a warning
        is printed and ``(frame with only the id column, [])`` is returned.

    Raises
    ------
    ValueError
        If a non-list, non-boolean variable has an aggregation other than
        ``"main"``, or if the aggregated frame contains a bug id twice.
    """
    variable_group = pd.DataFrame({'F1 - Bug ID': all_bug_ids})
    manual_variable_ids = []

    filename = f'manual_labeling/output/disagreements_resolved/{variable_group_id}_variables.xlsx'

    try:
        xls = pd.ExcelFile(filename)
    except FileNotFoundError:
        print(f'--- Warning: File {filename} not found')
        return variable_group, []

    # The context manager closes the workbook handle even if a sheet raises.
    with xls:
        # each variable is in its own sheet
        for variable_id in xls.sheet_names:
            sheet_df: pd.DataFrame = xls.parse(variable_id)
            sheet_df = sheet_df[sheet_df['Agreement'] != '_____']
            # .copy(): a column is added below, so work on an independent frame
            # instead of a filtered view (avoids pandas' SettingWithCopyWarning).
            sheet_df = sheet_df[sheet_df['Agreement'] != False].copy()

            # A sheet may be named after the "Mapping" entry of a variable instead of its ID.
            variable_id_mapping = variables_overview[variables_overview["Mapping"] == variable_id]['ID']
            if len(variable_id_mapping) and pd.notna(variable_id_mapping.values[0]):
                variable_id = variable_id_mapping.values[0]

            if variable_id not in variables_overview['ID'].values:
                print(f'--- Warning: Variable {variable_id} not found in variables_overview.csv')
                continue

            manual_variable_ids.append(variable_id)
            overview_row = variables_overview[variables_overview["ID"] == variable_id]
            variable_type = str(overview_row["Scale"].values[0])
            variable_aggregation = overview_row["Aggregation"].values[0]
            variable_title = get_variable_title(variable_id, variable_aggregation)

            sheet_df[variable_title] = sheet_df[['Target type', 'Topic', 'Action']].apply(lambda x: _join_non_null(x), axis=1)

            is_boolean = variable_type.startswith('Boolean')
            rows_to_aggregate = sheet_df

            if variable_type.startswith('List'):
                agg_lambda = lambda x: list(x.dropna().unique())
            elif is_boolean:
                if variable_aggregation == "any":
                    agg_lambda = lambda x: get_any_bool_or_none(x)
                else:
                    agg_lambda = lambda x: get_bool_or_none(x)
            else:
                agg_lambda = lambda x: x
                if variable_aggregation == "main":
                    main_tuples = get_main_tuples(variable_group_id)
                    is_main = sheet_df[['Bug_ID', 'Target']].astype('string').apply(tuple, axis=1).isin(main_tuples)
                    rows_to_aggregate = sheet_df[is_main]
                elif pd.notna(variable_aggregation):
                    raise ValueError(f'Unhandled aggregation {variable_aggregation} in variable {variable_id}')

            variable_df = rows_to_aggregate.groupby('Bug_ID')[variable_title].agg(agg_lambda).to_frame().reset_index()

            if is_boolean:
                # Nullable boolean first (None -> <NA>), then its string rendering.
                variable_df = variable_df.astype({variable_title: 'boolean'}).astype({variable_title: 'string'})

            if variable_df['Bug_ID'].duplicated().any():
                raise ValueError(f'Variable {variable_id} has duplicates in F1 - Bug ID\n{variable_df[variable_df["Bug_ID"].duplicated()]}')

            variable_df = variable_df.rename(columns={'Bug_ID': 'F1 - Bug ID'})
            variable_group = variable_group.merge(variable_df, on='F1 - Bug ID', how='outer')

    return variable_group, manual_variable_ids

In [9]:
def simplify_approval_R8(x):
    '''Simplifies the approval codes for R8.

    Codes starting with one of the known prefixes are collapsed to
    'approval::submission' or 'not::approval::submission'; any other code is
    returned unchanged.
    '''
    # (prefix, simplified code) pairs, checked in this order.
    prefix_to_code = (
        ('approval::for::submitting', 'approval::submission'),
        ('not::approval::for::submitting', 'not::approval::submission'),
        ('approval::for::workflow', 'approval::submission'),
        ('not::approval::for::workflow', 'not::approval::submission'),
    )

    for prefix, code in prefix_to_code:
        if x.startswith(prefix):
            return code
    return x

def simplify_labels(x):
    """Map a raw issue label to a coarser label category.

    The rules are checked in order and the first match wins:
    a 'v' followed by digits/dots -> 'version'; contains 'stable' -> 'stable';
    leading ':' -> 'relevant component'; leading '>' -> the label without
    that character; contains 'backport' -> 'backport'; a known organisational
    label -> 'organization label'; a known issue type -> 'issue type'.
    Every remaining label is treated as 'relevant component'.
    """
    organization_labels = (
        'good first issue', 'help wanted', 'discuss', 'verification-done',
        'verification-needed', 'Team:Delivery', 'forward-port-needed', 'low-hanging-fruit',
    )
    issue_types = ('blocker', 'patch', 'documentation')

    if re.search(r'v([\d\.]+)', x):
        category = 'version'
    elif 'stable' in x:
        category = 'stable'
    elif x.startswith(':'):
        category = 'relevant component'
    elif x.startswith('>'):
        category = x[1:]
    elif 'backport' in x:
        category = 'backport'
    elif x in organization_labels:
        category = 'organization label'
    elif x in issue_types:
        category = 'issue type'
    else:
        # default for remaining labels
        category = 'relevant component'

    return category

def _simplify_list_cell(cell, simplify):
    """Parse one list-literal cell and return its simplified, de-duplicated, sorted items.

    Missing cells yield None.
    """
    if pd.isna(cell):
        return None
    # sorted() instead of list(set(...)): set iteration order for strings can
    # differ between interpreter runs, which made the exported cells unstable.
    return sorted({simplify(item) for item in ast.literal_eval(cell)})


def get_automatic_variable_df(variable_group_id, manual_variable_ids):
    """Load the automatically extracted variables of one variable group.

    Reads ``automatic_labeling/output/<group>_variables.csv`` with every column
    as string. The variable ID of a column is the part of its header before
    the first space. Columns whose ID is not listed in ``variables_overview``
    or is already covered by ``manual_variable_ids`` are dropped with a
    warning; all others are renamed to the title produced by
    ``get_variable_title``.

    The list-valued columns R8, F3 and I4 are parsed from their list literal
    and reduced to the unique simplified codes (``simplify_approval_R8`` for
    R8, ``simplify_labels`` for F3 and I4).

    Parameters
    ----------
    variable_group_id : str
        Group identifier used in the CSV file name.
    manual_variable_ids : list
        IDs already provided by manual labelling for this group.

    Returns
    -------
    tuple
        ``(automatic_variable_df, automatic_variable_ids)``. If the CSV does
        not exist a warning is printed and the frame only holds the
        ``'F1 - Bug ID'`` column built from ``all_bug_ids``.
    """
    try:
        automatic_variable_df = pd.read_csv(f'automatic_labeling/output/{variable_group_id}_variables.csv', sep=',', dtype=str)
    except FileNotFoundError:
        print(f'--- Warning: File {variable_group_id}_variables.csv not found')
        automatic_variable_df = pd.DataFrame({'F1 - Bug ID': all_bug_ids})

    expected_automatic_variable_ids = set(variables_overview['ID'].to_list()) - set(manual_variable_ids)
    automatic_variable_ids = []

    # Per-item simplifier for the list-valued columns.
    simplifiers = {
        'R8': simplify_approval_R8,
        'F3': simplify_labels,
        'I4': simplify_labels,
    }

    for col in automatic_variable_df.columns:
        variable_id = col.split(' ')[0]

        if variable_id not in expected_automatic_variable_ids:
            print(f'--- Warning: Unexpected variable {variable_id} in automatic variables')
            automatic_variable_df = automatic_variable_df.drop(columns=[col])
            continue

        automatic_variable_ids.append(variable_id)
        variable_aggregation = variables_overview[variables_overview["ID"] == variable_id]["Aggregation"].values[0]
        new_col_name = get_variable_title(variable_id, variable_aggregation)
        automatic_variable_df = automatic_variable_df.rename(columns={col: new_col_name})

        simplify = simplifiers.get(variable_id)
        if simplify is not None:
            # The R8 branch previously iterated over the non-existent `ast.ex`
            # instead of the parsed list and raised AttributeError when reached.
            automatic_variable_df[new_col_name] = automatic_variable_df[new_col_name].apply(
                lambda cell: _simplify_list_cell(cell, simplify)
            )

    return automatic_variable_df, automatic_variable_ids

In [10]:
# Variable groups to combine; each id selects one manual workbook and one automatic CSV.
variable_group_ids = ['b', 'c', 'ci', 'f', 'i', 'ml', 'o', 'r']

manual_variables_cnt = 0
automatic_variables_cnt = 0

df = pd.DataFrame({'F1 - Bug ID': all_bug_ids})

for variable_group_id in variable_group_ids:

    manual_variable_df, manual_variable_ids = get_manual_variable_df(variable_group_id)
    print(manual_variable_ids)
    automatic_variable_df, automatic_variable_ids = get_automatic_variable_df(variable_group_id, manual_variable_ids)

    df = df.merge(manual_variable_df, on='F1 - Bug ID', how='outer')
    df = df.merge(automatic_variable_df, on='F1 - Bug ID', how='outer')

    # The outer merges must not introduce bug ids outside all_bug_ids.
    if len(df) != 142:
        raise ValueError(f'Length of {variable_group_id} is {len(df)} instead of 142 (2*71)')

# sort columns by letter and then by number
regex_num = r'(\d+)'
# Letters only: with \w+ the prefix also swallowed digits (e.g. 'C10' -> 'C1').
regex_letter = r'^([A-Za-z]+)\d'


def _column_sort_key(title):
    """Return (letter prefix, first number) of a column title as a homogeneous sort key.

    Titles without a letter prefix fall back to the full title, titles without
    digits to 0, so keys never mix strings and numbers in the same position.
    """
    letters = re.findall(regex_letter, title)
    numbers = re.findall(regex_num, title)
    return (letters[0] if letters else title, int(numbers[0]) if numbers else 0)


# A single stable sort: columns sharing prefix and number keep their current order.
df = df[sorted(df.columns, key=_column_sort_key)]

# Flag the rows that belong to the "<bug id> - CG" half of the id list.
df['CG'] = df['F1 - Bug ID'].apply(lambda x: x.endswith('CG'))

df.to_excel('output/coded_dataset/variables.xlsx', index=False)

['B1', 'B2', 'B3']
['C2.1', 'C2.2', 'C2.3', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22']
[]
['F5', 'F6', 'F7']
['I2', 'I6', 'I8']
['ML1', 'ML3', 'ML4', 'ML6']
['O1', 'O2', 'O4', 'O5']
['R2', 'R7', 'R8', 'R9', 'R10', 'R11']
