# Init

In [21]:
import pandas as pd
import math
import itertools
import tqdm
import re
import regex

In [None]:
# Names of the two raters whose taggings are compared. get_constants uses the
# first character of a name to build that rater's workbook paths.
AUTHOR_L, AUTHOR_A = 'Lukas', 'Ana'
AUTHORS = [AUTHOR_L, AUTHOR_A]

# Folder that to_excel writes the disagreement workbooks into.
OUT_FOLDER = 'output/disagreements/'

def get_constants(author):
    """Return the workbook paths holding one rater's taggings.

    Args:
        author: Rater name; only its first character is used in the file names.

    Returns:
        Tuple ``(BUG_TAGS, CG_TAGS)`` with the path of the bug tagging workbook
        and the path of the CG tagging workbook.
    """
    initial = author[0]
    return (
        f'input/rater_files/Tagger_{initial}.xlsx',
        f'input/rater_files/CG_Tagger_{initial}.xlsx',
    )

In [None]:
# Tab-separated overview table with one row per bug. Default NA parsing is
# switched off and every column is cast to str, so list-like cells such as
# '[1104]' stay text until str_to_list parses them.
all_information = pd.read_csv(
    '../../input/all_information.csv',
    sep="\t",
    keep_default_na=False,
).astype(str)
all_information.head()

Unnamed: 0,Bug_ID,Project,Duplicate_Bug_IDs,BIC,BFC,BIC_CodeReview,BIC_IntroducingIssue,BIC_Files,BIC_Files_ManualMatch,BIC_BUG_ML,BIC_II_ML,BIC_OtherIssues,CGC,CG_CodeReview,CG_IntroducingIssue,CG_OtherIssues,CG_II_ML
0,Elastic Search 1134,elasticsearch,[],61ad8b614a94dabf8a263cf1edd35faa50ede36d,cbb1c35f94a36e8871301fce435d516db3cd4256,[],[1104],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],10660d390d760cde78fb115107c04e1cf104a4da,[],[],[],[]
1,Elastic Search 1154,elasticsearch,[],b3337c312765e51cec7bde5883bbc0a08f56fb65,31ea01bbc68f64cd7787b97479cb5deba1b529b9,[],[],[ modules/elasticsearch/src/main/java/org/elas...,"{ ""modules/elasticsearch/src/main/java/org/ela...",[],[],[],7a38e384c9ff783f17e9db92a730bdf654dbda6b,[],[],[],[]
2,Elastic Search 1162,elasticsearch,[],d4547c629f53ad76ea463dc0acb1f26f0a2b784b,b70694ce631d7b55be6edd7b9049237456a6e4b4,[],[],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],e5b041c8efd408fdc71fd2f2c84439e5a4985244,[],[],[],[]
3,Elastic Search 12193,elasticsearch,[],15a62448343fd24f8e63f43b1e4b16f50005e4a5,2ea45fd753b89c12431dab08c4827835c616cc1b,[],[],[ core/pom.xml ],{},[],[],[],e88535a67e9594f3135465b5021ba9b502fef950,[],[],[],[]
4,Elastic Search 1380,elasticsearch,[],adc3dc0e994ababa7917c81a61cc93f4690060d4,bd87f8de3ac84eb408d5ada0976664545c9228a0,[],[873],[ modules/elasticsearch/src/main/java/org/elas...,{},[],[],[],16a046f686c26309ee0041df8987a4d2ffedd956,[],[],[],[]


In [24]:
def str_to_list(s: str) -> list:
    """Parse the textual form of a list (e.g. ``"[6849, 6319]"``) into strings.

    Newlines and single quotes are removed, surrounding square brackets are
    stripped, and the remainder is split on commas. Each item is stripped of
    surrounding spaces; empty items are dropped.

    Args:
        s: Text such as ``"[]"``, ``"[1104]"`` or ``"['a', 'b']"``.

    Returns:
        The non-empty items as a list of strings.
    """
    flattened = s.replace('\n', '').replace('\'', '')

    items = []
    for raw_item in flattened.strip('[]').split(','):
        item = raw_item.strip(' ')
        if item != '':
            items.append(item)
    return items

In [25]:
# Values that count as "no label" when reading the rater workbooks: None and
# the usual textual spellings of a missing value, including the empty string.
none_variables = {
    None,
    'None', 'none',
    'NA', 'na',
    'nan', 'NaN',
    '',
    str(float('NaN')),
}
none_variables

{'', 'NA', 'NaN', None, 'None', 'na', 'nan', 'none'}

In [None]:
def get_sub_df(df, bug_id, filter_value = None, filter_column = None, sub_filter_sub_column=None):
    """Select the rows of a rater sheet that belong to one bug (and one target).

    Args:
        df: Rater sheet with two-level column headers; must contain the column
            ``'CLICK: Variables description'`` / ``'ID- Bug'``.
        bug_id: Value the ``'ID- Bug'`` column has to equal.
        filter_value: Optional target value. When falsy (``None``, ``''``) the
            rows are filtered by ``bug_id`` only.
        filter_column: Top-level header of the target column.
        sub_filter_sub_column: Second-level header of the target column.

    Returns:
        The matching rows of ``df``.
    """
    bug_ids = df['CLICK: Variables description']['ID- Bug']
    rows_for_bug = df[bug_ids == bug_id]

    if not filter_value:
        return rows_for_bug

    target_values = rows_for_bug[filter_column][sub_filter_sub_column]
    return rows_for_bug[target_values == filter_value]

def get_variable_labels(df, variable_name):
    """Collect the non-empty labels stored under one variable column.

    Args:
        df: Rows of a rater sheet. Assumes two-level column headers, so that
            ``df[variable_name]`` is itself a frame whose first column holds
            the labels.
        variable_name: Top-level column header of the variable.

    Returns:
        The labels as strings, skipping every value whose string form is in
        ``none_variables``. If the column is missing, a warning is printed and
        an empty list is returned.
    """
    if variable_name in df.columns:
        first_sub_column = df[variable_name].iloc[:, 0]
        return [text for text in map(str, first_sub_column) if text not in none_variables]

    print(f'WARNING: Variable {variable_name} not found in the data frame')
    return []

# Codebook workbook used by get_merged_variable to map each rater's own label
# onto the final merged code. Only sheets with 'final' in their name
# (case-insensitive) are considered.
merged_variables_xls = pd.ExcelFile('input/rater_files/merged_codebook.xlsx')
merged_variable_sheets = list(
    filter(lambda name: 'final' in name.lower(), merged_variables_xls.sheet_names)
)

# Filled by get_merged_variable: every lower-cased raw label it was asked
# about, and every merged code (or no-match marker) it produced.
all_variables = set()
all_merged_variables = set()

# Parsed codebook sheets, keyed by sheet name. Re-running this cell resets it.
_merged_sheet_cache = {}


def _parse_merged_sheet(sheet_name):
    """Return the parsed codebook sheet, reading it from the workbook only once."""
    if sheet_name not in _merged_sheet_cache:
        _merged_sheet_cache[sheet_name] = merged_variables_xls.parse(sheet_name)
    return _merged_sheet_cache[sheet_name]


def get_merged_variable(variable, variable_tag, author):
    """Translate one rater's label into the final merged code of the codebook.

    The label is lower-cased and recorded in ``all_variables``. It is then
    looked up in the codebook sheet ``Final_CB_{variable_tag}_V5``, in the
    column named after ``author`` (compared case-insensitively); the value of
    ``'Final Merged codes'`` in the first matching row is the result.

    Args:
        variable: Raw label as entered by the rater.
        variable_tag: Variable identifier (e.g. ``'F5'``) selecting the sheet.
        author: Rater name; must be a column of the codebook sheet.

    Returns:
        * the lower-cased label itself when no codebook sheet exists for
          ``variable_tag`` (nothing is added to ``all_merged_variables``);
        * the merged code when the label is found;
        * ``'!!NO::MATCH::(<label>)!!'`` when the sheet has no row for it.

    Raises:
        Exception: If the matching row has an empty merged code (its string
            form is in ``none_variables``).
    """
    variable = variable.lower()
    all_variables.add(variable)

    sheet_name = f'Final_CB_{variable_tag}_V5'
    if sheet_name not in merged_variable_sheets:
        return variable

    # This function runs once per label of every bug, so the sheet is parsed
    # once and reused instead of being re-read from the workbook on each call.
    merged_variables_df = _parse_merged_sheet(sheet_name)

    potential_matches = merged_variables_df[merged_variables_df[author].str.lower() == variable]['Final Merged codes']
    if len(potential_matches):
        merged_variable = potential_matches.iloc[0]

        if str(merged_variable) in none_variables:
            raise Exception(f'Merged variable for {variable} is {merged_variable}')
    else:
        merged_variable = f'!!NO::MATCH::({variable})!!'

    all_merged_variables.add(merged_variable)
    return merged_variable

In [27]:
def split_variable(variable):
    """Break a merged code into its underscore-separated components.

    A code looks like ``target_action(topic)`` with an optional middle
    ``sub_target`` part and an optional trailing counter such as ``_2`` or
    ``_3+``.

    Args:
        variable: The merged code.

    Returns:
        Tuple ``(target_type, sub_target_type, action, topic, number)``.

        * ``number`` is the trailing part if it is numeric once ``+`` is
          removed, otherwise ``None``.
        * With fewer than two remaining parts the whole remainder is returned
          as ``target_type`` and the other three fields are ``None``.
        * ``sub_target_type`` is only set when exactly three parts remain.
        * ``action`` is the last part up to the first ``(``, without
          surrounding ``:``; ``topic`` is the text inside the parentheses and
          only set when the last part contains exactly one ``(``.
    """
    tokens = variable.split('_')

    # An optional counter at the end is split off before anything else.
    has_counter = tokens[-1].replace('+', '').isnumeric()
    number = tokens.pop() if has_counter else None

    if len(tokens) < 2:
        return '_'.join(tokens), None, None, None, number

    sub_target_type = tokens[1] if len(tokens) == 3 else None

    action_pieces = tokens[-1].split('(')
    action = action_pieces[0].strip(':')
    topic = action_pieces[-1].strip('()') if len(action_pieces) == 2 else None

    return tokens[0], sub_target_type, action, topic, number

def to_excel(variables_dict, file_name):
    """Write the collected labels to ``{OUT_FOLDER}/{file_name}.xlsx``.

    One worksheet is written per variable; it is named after the part of the
    variable name before ``' - '``. Every merged code found for a bug/target
    becomes one row, showing the raw labels of both raters and whether both
    raters used that merged code. A separator row (``Agreement == '_____'``)
    is added per bug/target; rows are sorted so that it ends up last within
    its group.

    Args:
        variables_dict: Nested mapping
            ``variable name -> bug id group -> target -> merged code ->
            author -> set of raw labels`` as built by complete_variables_dict.
        file_name: Output file name without extension.

    Raises:
        Exception: If split_variable reports a sub target type for a merged
            code (that code layout is deprecated).
    """
    columns = [
        'Bug_ID', 'Target', 'Agreement', 'Note', 'Target type',
        'Topic', 'Action', 'Variable_L', 'Variable_A', 'Merged variable',
    ]

    with pd.ExcelWriter(f'{OUT_FOLDER}/{file_name}.xlsx', engine='xlsxwriter') as writer:
        workbook = writer.book
        format_difference = workbook.add_format({'bg_color': 'red'})
        format_neither_true_nor_false = workbook.add_format({'bg_color': '#afecff'})
        format_break = workbook.add_format({'bg_color': 'silver', 'font_color': 'white'})
        # 'align' takes the horizontal and 'valign' the vertical alignment.
        format_default = workbook.add_format({'align': 'left', 'valign': 'vcenter'})

        for variable_name in tqdm.tqdm(variables_dict):
            variable_list = []

            for bug_id_group, targets in variables_dict[variable_name].items():
                for sub_filter_value, merged_variables in targets.items():
                    for merged_variable, labels_by_author in merged_variables.items():
                        if merged_variable == 'DELETE::ME':
                            continue

                        set_l = labels_by_author.get(AUTHOR_L, set())
                        set_a = labels_by_author.get(AUTHOR_A, set())

                        target_type, sub_target_type, action, topic, _number = split_variable(merged_variable)

                        if sub_target_type is not None:
                            raise Exception(f'Variable {merged_variable} has sub_target_type {sub_target_type}. This is deprecated.')

                        variable_list.append({
                            'Bug_ID': bug_id_group,
                            'Target': sub_filter_value,
                            # Agreement means both raters used this merged code.
                            'Agreement': len(set_l) > 0 and len(set_a) > 0,
                            'Note': None,
                            'Target type': target_type,
                            'Topic': topic,
                            'Action': action,
                            # Sorted so the cell text does not depend on set
                            # iteration order and is identical between runs.
                            'Variable_L': '\n' + '\n'.join(sorted(set_l)) + '\n',
                            'Variable_A': '\n' + '\n'.join(sorted(set_a)) + '\n',
                            'Merged variable': merged_variable,
                        })

                    # Separator row closing this bug/target group.
                    variable_list.append({
                        'Bug_ID': bug_id_group,
                        'Target': sub_filter_value,
                        'Agreement': '_____',
                        'Note': None,
                        'Target type': None,
                        'Topic': None,
                        'Action': None,
                        'Variable_L': None,
                        'Variable_A': None,
                        'Merged variable': None,
                    })

            # Explicit columns keep the frame sortable and writable even when
            # no labels were collected for this variable.
            variable_df = pd.DataFrame(variable_list, columns=columns).sort_values(
                by=['Bug_ID', 'Target', 'Target type', 'Topic', 'Action']
            )

            sheet_name = variable_name.split(' - ')[0]
            variable_df.to_excel(writer, sheet_name=sheet_name, index=False)
            worksheet = writer.sheets[sheet_name]

            (max_row, max_col) = variable_df.shape

            if max_row:
                # Column C is 'Agreement': FALSE -> red, neither TRUE nor
                # FALSE -> light blue, separator rows -> silver.
                worksheet.conditional_format(1, 2, max_row, 2, {'type': 'formula', 'criteria': '=$C2=FALSE()', 'format': format_difference})
                worksheet.conditional_format(1, 2, max_row, 2, {'type': 'formula', 'criteria': '=AND($C2<>TRUE(), $C2<>FALSE())', 'format': format_neither_true_nor_false})
                # NOTE(review): this range starts at column 1 and ends at
                # max_col, i.e. it skips 'Bug_ID' and reaches one column past
                # the data -- confirm this is intended.
                worksheet.conditional_format(1, 1, max_row, max_col, {'type': 'formula', 'criteria': '=$C2="_____"', 'format': format_break})

            # Size every column to its longest cell text or its header.
            for col_idx, column in enumerate(variable_df.columns):
                cell_lengths = variable_df[column].astype(str).map(len)
                longest_cell = int(cell_lengths.max()) if len(cell_lengths) else 0
                worksheet.set_column(col_idx, col_idx, max(longest_cell, len(column)), cell_format=format_default)




In [28]:
def complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, sub_filter_values, sub_filter_column = None, sub_filter_sub_column='Value'):
    """Add one rater's labels for one bug and one variable to ``variables_dict``.

    NOTE: besides its parameters this function reads ``variable_name``,
    ``variable_column``, ``variable_tag`` and ``author`` from module scope;
    the calling cell has to bind them (it does so through its loop variables).

    Args:
        variables_dict: Nested mapping ``variable name -> bug id group ->
            target -> merged code -> author -> set of raw labels``. It must
            already contain the key ``variable_name``; it is updated in place.
        bug_id: Bug identifier used to select the rows of ``variable_df``.
        group_tag: Suffix appended to ``bug_id`` to form the group key.
        variable_df: Rater sheet, or ``None`` to skip (nothing is added).
        sub_filter_values: Targets to record labels under. An empty list is
            replaced by ``['?']``. Rows are only filtered by target when more
            than one target is given.
        sub_filter_column: Top-level header of the target column.
        sub_filter_sub_column: Second-level header of the target column.

    Returns:
        ``variables_dict`` (the same object that was passed in).
    """
    if variable_df is None:
        return variables_dict

    targets = sub_filter_values if len(sub_filter_values) else ['?']
    filter_by_target = len(targets) > 1

    group_key = bug_id + group_tag
    group_entry = variables_dict[variable_name].setdefault(group_key, {})

    for target in targets:
        if filter_by_target:
            rows = get_sub_df(variable_df, bug_id, target, sub_filter_column, sub_filter_sub_column)
        else:
            rows = get_sub_df(variable_df, bug_id)

        for label in get_variable_labels(rows, variable_column):
            merged = get_merged_variable(label, variable_tag, author)

            # A target only gets an entry once it has at least one label.
            labels_by_author = group_entry.setdefault(target, {}).setdefault(merged, {})
            labels_by_author.setdefault(author, set()).add(label)

    return variables_dict

# F variables

In [None]:
# Display name of each manually tagged F variable -> top-level column header
# under which it is looked up in the rater workbook. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
    'F5 - Aspects of the bug discussion': 'F5 - Aspects of the bug discussion',
    'F6 - Type of bug': 'F6 - Type of Bug',
    'F7 - Has duplicates': 'F7 - does the bug have any duplicates?',
    # 'F8 - Was re-opened (regression)': 'F8 - was reopened (regression)' automatic
}

# Worksheet read from the rater workbooks in this cell.
sheet_name = 'Fixed_Bug_(F_Variables)'
# Prefix of the output file name passed to to_excel.
variable_key = 'f'
# Top-level column handed to complete_variables_dict for filtering by target.
sub_filter_column = 'Bug ID if duplicate'
# Not passed to complete_variables_dict below, which therefore uses its own
# default for sub_filter_sub_column.
sub_filter_sub_column = None



# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# MODIFY
for (bug_id, duplicate_bug_ids) in tqdm.tqdm(all_information[['Bug_ID', 'Duplicate_Bug_IDs']].itertuples(index=False)):
    duplicate_bug_ids = str_to_list(duplicate_bug_ids)

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            # if CG_TAGS not in excel_cache:
            #     excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            # The CG workbook is never loaded in this cell, so this is None and
            # complete_variables_dict returns without adding ' - CG' entries.
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # Targets: the last space-separated token of the bug id, followed
            # by the ids of its duplicates (if any).
            # MODIFY
            bug_sub_filter_values = [bug_id.split(' ')[-1]] + duplicate_bug_ids if len(duplicate_bug_ids) else [bug_id.split(' ')[-1]]
            cg_sub_filter_values = []


            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems, sub_filter_column)


# Writes f_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------

0it [00:00, ?it/s]

71it [00:42,  1.68it/s]
100%|██████████| 3/3 [00:00<00:00, 19.94it/s]


# I variables

In [None]:
# Display name of each manually tagged I variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
    'I2 - Introducing issue types': 'I2',
    'I6 - Aspects of introducing issue discussion': 'I6 - Aspects of introducing issue discussion',
    'I8 - Does it have a wiki / spec': 'I8 - does it have a wiki? or does it have a spec?',
    'I9 - Introducing issue doubles as review': 'I9 - Introducing Issue doubles as review',
}

# NOTE(review): no closing parenthesis. The name is exactly 31 characters
# long, so it was presumably cut at Excel's sheet-name limit -- confirm it
# matches the workbook.
sheet_name = 'Introducing_Issues_(I_Variables'
# Prefix of the output file name passed to to_excel.
variable_key = 'i'
# Two-level header of the column holding the target (introducing issue id).
sub_filter_column = 'I1'
sub_filter_sub_column = 'Introducing issues ids'


# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# MODIFY
for (bug_id, bug_ii, cg_ii) in all_information[['Bug_ID', 'BIC_IntroducingIssue', 'CG_IntroducingIssue']].itertuples(index=False):
    bug_ii = str_to_list(bug_ii)
    cg_ii = str_to_list(cg_ii)

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            if CG_TAGS not in excel_cache:
                excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # The conditional applies to the whole concatenation: without
            # introducing issues the list is empty, and complete_variables_dict
            # then records everything under the target '?' without filtering.
            # MODIFY
            bug_sub_filter_values = [bug_id.split(' ')[-1]] + bug_ii if len(bug_ii) else []
            cg_sub_filter_values = [bug_id.split(' ')[-1]] + cg_ii if len(cg_ii) else []


            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems, sub_filter_column, sub_filter_sub_column)


# Writes i_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------

100%|██████████| 4/4 [00:00<00:00, 24.51it/s]


# ML variables

In [None]:
# Display name of each manually tagged ML variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
    'ML1 - ML bug discussion IDs': 'ML1',
    'ML3 - Aspects of ML bug discussion': 'ML3 - Aspects of ML bug discussion',
    'ML4 - ML introducing issue discussion IDs': 'ML4',
    'ML6 - Aspects of ML introducing issue discussion': 'ML6 - Aspects of ML bug introduction discussion',
}

# Worksheet read from the rater workbooks in this cell.
sheet_name = 'Mailing_list_(ML_Variables)'
# Prefix of the output file name passed to to_excel.
variable_key = 'ml'
# sub_filter_column = ''
# sub_filter_sub_column = ''

# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# 'Project' is selected only so each row unpacks into two values; it is unused.
# MODIFY
for (bug_id, _) in all_information[['Bug_ID', 'Project']].itertuples(index=False):

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            if CG_TAGS not in excel_cache:
                excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # No targets: complete_variables_dict records all labels of the bug
            # under the placeholder target '?' without filtering.
            # MODIFY
            bug_sub_filter_values = []
            cg_sub_filter_values = []

            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems)


# Writes ml_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------

100%|██████████| 4/4 [00:00<00:00, 41.49it/s]


# O variables

In [77]:
# Display name of each manually tagged O variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
    'O1 - Identifiers of related information in other media': 'O1',
    'O2 - Type of other media in which bug was discussed': 'O2',
    'O4 - Aspects of discussions in other media': 'O4 - Aspects of discussions in other media',
    'O5 - Buggy files present in the BIC': 'O5 - Files Involved in the BFC (tagged as bug files by Gema) are in BIC',
}

sheet_name = 'Other_source_(O_Variables)'
variable_key = 'o'
# sub_filter_column = ''
# sub_filter_sub_column = ''



# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
# Workbook path -> usable O1 identifiers of that sheet. Computed once when the
# workbook is loaded instead of once per bug, variable and author.
target_cache = {}
variables_dict = {}

# Variables that are not split by O1 identifier; they are recorded under the
# placeholder target '?'.
non_target_filter_variables = ['O1', 'O5 - Files Involved in the BFC (tagged as bug files by Gema) are in BIC']

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# MODIFY
for (bug_id, _) in all_information[['Bug_ID', 'Project']].itertuples(index=False):

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            for tags_file in (BUG_TAGS, CG_TAGS):
                if tags_file not in excel_cache:
                    excel_cache[tags_file] = pd.read_excel(tags_file, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
                    identifiers = excel_cache[tags_file]['O1']['Identifier of related information in other media']
                    # Unique identifiers of the whole sheet, minus empty values.
                    target_cache[tags_file] = [x for x in set(identifiers.to_list()) if x not in none_variables]

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # MODIFY
            uses_targets = variable_column not in non_target_filter_variables
            bug_sub_filter_values = target_cache[BUG_TAGS] if uses_targets else []
            cg_sub_filter_values = target_cache[CG_TAGS] if uses_targets else []

            for (variable_df, group_tag, sub_filter_values) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, sub_filter_values, 'O1', 'Identifier of related information in other media')


# Writes o_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------



100%|██████████| 4/4 [00:00<00:00, 50.72it/s]


In [76]:
# Inspect the distinct O1 identifiers of the CG sheet that the O-variables
# cell loaded last (relies on `cg_f_variables` still being bound).
o1_identifiers = cg_f_variables['O1']['Identifier of related information in other media']
list(set(o1_identifiers.to_list()))

['',
 'https://specs.openstack.org/openstack/nova-specs/specs/juno/implemented/pci-passthrough-sriov.html',
 'https://wiki.openstack.org/wiki/Nova/Juno-Specs',
 'https://wiki.openstack.org/wiki/ReleaseNotes/Liberty/en',
 'https://wiki.openstack.org/wiki/Meetings/Passthrough',
 'https://wiki.openstack.org/wiki/Nova-neutron-sriov',
 'https://specs.openstack.org/openstack/nova-specs/specs/juno/implemented/compute-manager-objects-juno.html',
 'https://etherpad.opendev.org/p/grizzly-nova-config-options',
 'NA',
 'https://specs.openstack.org/openstack/nova-specs/specs/kilo/approved/nova-api-policy.html']

# CI variables

In [35]:
# all automatic

# B variables

In [29]:
# Display name of each manually tagged B variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
    'B1 - Build tools': 'B1 - Build tools',
    'B2 - Dependency resolution': 'B2 - Dependency resolution',
    'B3 - Build practices': 'B3 - Build practices',
}

sheet_name = 'Building_Tools_(B_Variables)'
variable_key = 'b'
# sub_filter_column = ''
# sub_filter_sub_column = ''

# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# MODIFY
# Visit the bugs ordered by the first number in their id. The sort key is
# added to a derived frame only, so `all_information` itself stays unchanged
# for the other cells.
bugs_by_number = all_information[['Bug_ID', 'BIC', 'CGC']].assign(
    sorter=lambda frame: frame['Bug_ID'].str.extract(r'(\d+)', expand=False).astype(int)
).sort_values('sorter')

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
for (bug_id, bic, cgc, _) in bugs_by_number.itertuples(index=False):

    print(bug_id)

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            if CG_TAGS not in excel_cache:
                excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # A single target per group: the commit id only labels the group,
            # because complete_variables_dict filters by target only when
            # there is more than one.
            # MODIFY
            bug_sub_filter_values = [bic]
            cg_sub_filter_values = [cgc]

            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems)


# Writes b_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------

Elastic Search 200
Elastic Search 764
Elastic Search 864
Elastic Search 1134
Elastic Search 1154
Elastic Search 1162
Elastic Search 1380
Elastic Search 1626
Elastic Search 1725
Elastic Search 1814
Elastic Search 1948
Elastic Search 1960
Elastic Search 2566
Elastic Search 2608
Elastic Search 2991
Elastic Search 3242
Elastic Search 3267
Elastic Search 3560
Elastic Search 4581
Elastic Search 4814
Elastic Search 5021
Elastic Search 5048
Elastic Search 5165
Elastic Search 5948
Elastic Search 7623
Elastic Search 7686
Elastic Search 8125
Elastic Search 8438
Elastic Search 8507
Elastic Search 8526
Elastic Search 8580
Elastic Search 8893
Elastic Search 9317
Elastic Search 12193
Elastic Search 14782
Elastic Search 15858
Elastic Search 16246
Elastic Search 16790
NOVA 1294939
NOVA 1300788
NOVA 1307791
NOVA 1314677
NOVA 1336127
NOVA 1343080
NOVA 1367363
NOVA 1370177
NOVA 1370590
NOVA 1371677
NOVA 1375379
NOVA 1381468
NOVA 1392798
NOVA 1402535
NOVA 1402728
NOVA 1403544
NOVA 1419002
NOVA 1424647
NOVA

100%|██████████| 3/3 [00:00<00:00,  3.92it/s]


# C variables

In [None]:
# Display name of each manually tagged C variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
            'C2.1 - Commit type (current)': 'C2.1 - Commit type (Current) (Commit type (discussion with Steffen))',
            'C2.2 - Commit types (children)': 'C 2.2 - Commit Children(s) Type (Commit type (discussion with Steffen))',
            'C2.3 - Commit types (parents)': 'C 2.3 - Commit Parent(s) Type (Commit type (discussion with Steffen))',
            'C8 - Multiple concerns': 'C8 - Multiple Concerns',
            'C9 - Commit type Swanson': 'C9 - Commit Type Swason',
            'C10 - Test changes': 'C10 - Test Changes',
            'C11 - Bug-covering test changes': 'C11 - Bug-covering test changes',
            'C12 - Bug-covering test changes prior or subsequent commit': 'C12 - Bug Covering test changes prior or subsequent commit',
            'C13 - Test failures': 'C-13 - Test Failures',
            'C14 - Bug-covering test failures': 'C-14 - Bug Covering test failures',
            'C15 - Documentation changes': 'C-15 - Documentation Changes',
            'C16 - Bug affecting documentation changes': 'C-16 - Bug affecting documentation Changes',
            'C17 - Refactorings': 'C-17 - Refactorings',
            'C18 - Bug-introducing refactorings': 'C-18 - Bug Introducing Refactorings',
            'C19 - Design changes': 'C-19 - Design Changes',
            'C20 - Bug-introducing design changes': 'C-20 - Bug introducing Design Changes',
            'C21 - Changes of external dependencies': 'C-21 - Changes of external dependencies',
            # NOTE(review): the header starts with 'C-21' although the variable
            # is C22 -- confirm this matches the workbook's column header.
            'C22 - Bug-introducing changes of external dependencies': 'C-21 - Bug-introducing Changes of external dependencies',
            'X C2 OLD - Commit type of addition': 'C2 OLD - Commit type of addition',
            'X C22 - Commit has more than one revision': 'C22 - Commit has more than one revision (e.g., check the commit id in the search bar)',
        }

# Worksheet read from the rater workbooks in this cell.
sheet_name = 'BIC_(C_Variables)'
# Prefix of the output file name passed to to_excel.
variable_key = 'c'
# sub_filter_column = ''
# sub_filter_sub_column = ''

# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# MODIFY
for (bug_id, bic, cgc) in all_information[['Bug_ID', 'BIC', 'CGC']].itertuples(index=False):

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            if CG_TAGS not in excel_cache:
                excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # A single target per group: the commit id only labels the group,
            # because complete_variables_dict filters by target only when
            # there is more than one.
            # MODIFY
            bug_sub_filter_values = [bic]
            cg_sub_filter_values = [cgc]

            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems)


# Writes c_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------

100%|██████████| 20/20 [00:00<00:00, 32.29it/s]


# R variables

In [None]:
# Display name of each manually tagged R variable -> top-level column header
# under which it is looked up in the rater workbooks. The part of the display
# name before ' - ' becomes `variable_tag` below.
variable_column_map = {
            'R2 - Review tool': 'R2 - Review Tool',
            'R7 - Caused changes': 'R7',
            'R8 - Aspects of review comments': 'R8 - Aspects of reviews comments',
            'R9 - Bug covering review comments': 'R9 - Bug covering review comments',
            'R10 - Bug covering review changes': 'R10 - Bug covering review changes',
            # NOTE(review): the header starts with 'R 11' although the variable
            # is R12 -- confirm this matches the workbook's column header.
            'R12 - Review branch': 'R 11 - Where is done the review? (main branch or secondary branch)',
        }

# Worksheet read from the rater workbooks in this cell.
sheet_name = 'Review_(R_Variables)'
# Prefix of the output file name passed to to_excel.
variable_key = 'r'
# Top-level column handed to complete_variables_dict for filtering by target;
# the second-level header is left at that function's default.
sub_filter_column = 'PR ID'
# sub_filter_sub_column = ''


# ---------------

# Workbook path -> parsed sheet, so every workbook is read once per cell run.
excel_cache = {}
variables_dict = {}

# NOTE: complete_variables_dict reads `variable_name`, `variable_column`,
# `variable_tag` and `author` from module scope, so the loop variables below
# must keep exactly these names.
# MODIFY
for (bug_id, bic_codereviews, cg_codereviews) in all_information[['Bug_ID', 'BIC_CodeReview', 'CG_CodeReview']].itertuples(index=False):
    bic_codereviews = str_to_list(bic_codereviews)
    cg_codereviews = str_to_list(cg_codereviews)

    for (variable_name, variable_column) in variable_column_map.items():

        if variable_name not in variables_dict:
            variables_dict[variable_name] = {}

        variable_tag = variable_name.split(' - ')[0]

        for author in AUTHORS:
            BUG_TAGS, CG_TAGS = get_constants(author)

            # MODIFY
            if BUG_TAGS not in excel_cache:
                excel_cache[BUG_TAGS] = pd.read_excel(BUG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)
            if CG_TAGS not in excel_cache:
                excel_cache[CG_TAGS] = pd.read_excel(CG_TAGS, header=[0, 1], sheet_name=sheet_name, keep_default_na=False, dtype=str)

            bug_f_variables = excel_cache.get(BUG_TAGS, None)
            cg_f_variables = excel_cache.get(CG_TAGS, None)

            # Targets are the code review ids. complete_variables_dict filters
            # rows by target only when a group has more than one of them; an
            # empty list is recorded under the placeholder target '?'.
            # MODIFY
            bug_sub_filter_values = bic_codereviews
            cg_sub_filter_values = cg_codereviews

            # `ci_systems` holds the sub-filter values (targets) of each group.
            for (variable_df, group_tag, ci_systems) in [(bug_f_variables, '', bug_sub_filter_values), (cg_f_variables, ' - CG', cg_sub_filter_values)]:
                variables_dict = complete_variables_dict(variables_dict, bug_id, group_tag, variable_df, ci_systems, sub_filter_column)


# Writes r_variables.xlsx into OUT_FOLDER.
to_excel(variables_dict, f'{variable_key}_variables')

# ---------------



In [71]:
# Inspect the code review ids recorded for one bug.
is_bug_8580 = all_information['Bug_ID'] == 'Elastic Search 8580'
all_information.loc[is_bug_8580, ['Bug_ID', 'BIC_CodeReview', 'CG_CodeReview']]

Unnamed: 0,Bug_ID,BIC_CodeReview,CG_CodeReview
34,Elastic Search 8580,[],"[6849, 6319]"


In [72]:
# (number of distinct merged codes produced, number of distinct raw labels
# seen) by get_merged_variable so far.
tuple(len(seen) for seen in (all_merged_variables, all_variables))

(231, 1157)