In [6]:
import pandas as pd
import tqdm as tqdm

from nltk import agreement

def get_coder_df(df, column):
    """Collect, per ``Bug_ID``, the ``Merged`` labels of rows coded in *column*.

    The values of *column* are normalised (stringified, lower-cased, newlines
    removed); rows whose normalised value is the empty string are discarded.
    The remaining rows are grouped by ``Bug_ID`` and their non-empty
    ``Merged`` values are gathered into a list.

    The caller's DataFrame is left untouched: all work happens on a copy.

    Args:
        df: DataFrame with at least the columns ``Bug_ID``, ``Merged`` and
            *column*.
        column: Name of the coder column used to select rows.

    Returns:
        DataFrame with one row per ``Bug_ID`` and two columns: ``Bug_ID`` and
        *column*, the latter holding the list of ``Merged`` labels.
    """
    def _normalise(value):
        text = str(value).lower().replace('\n', '')
        return text if text != '' else None

    # Copy before assigning so the input frame is not modified in place.
    coded = df.copy()
    # NOTE(review): str() turns a missing cell (NaN) into the text 'nan',
    # which is not dropped below -- confirm this is the intended handling
    # of blank cells.
    coded[column] = coded[column].apply(_normalise)
    coded = coded.dropna(subset=[column])

    grouped = (
        coded[['Bug_ID', 'Merged']]
        .groupby('Bug_ID')
        .agg(lambda labels: [label for label in labels.tolist() if label != ''])
        .reset_index()
    )
    return grouped.rename(columns={'Merged': column})

def concat(x):
    """Join the non-missing entries of Series *x* with underscores.

    Missing entries are skipped; every remaining entry is converted to a
    string. A Series without any non-missing entry yields ``''``.
    """
    non_missing = x[x.notna()]
    parts = non_missing.astype(str).tolist()
    return '_'.join(parts)


def _labels_by_bug(coder_df, column):
    """Map each ``Bug_ID`` of *coder_df* to its label list stored in *column*."""
    if coder_df.empty:
        return {}
    return dict(zip(coder_df['Bug_ID'], coder_df[column]))


# Variable groups to process; each one is read from the workbook
# 'input/disagreements_resolved/<id>_variables.xlsx' (one sheet per variable).
variable_group_ids = ['b', 'c', 'ci', 'f', 'i', 'ml', 'o', 'r']
# Columns used to select the rows belonging to each of the two coders.
column_coder_1 = 'Variable_L'
column_coder_2 = 'Variable_A'
variables_overview = pd.read_csv('../../input/variables_overview.csv', sep='\t')

# One dict per processed variable: {'variable_id': ..., 'alpha': ...}.
results = []

p_bar = tqdm.tqdm(variable_group_ids, desc='Calculating Krippendorff\'s alpha')
for variable_group_id in p_bar:

    filename = f'input/disagreements_resolved/{variable_group_id}_variables.xlsx'
    try:
        xls = pd.ExcelFile(filename)
    except FileNotFoundError:
        print(f'--- Warning: File {filename} not found')
        continue

    # Use the workbook as a context manager so its file handle is closed
    # once every sheet has been processed.
    with xls:
        for variable_id in xls.sheet_names:
            p_bar.set_postfix_str(variable_id)
            variable_df = pd.read_excel(xls, variable_id)

            # Sheets may be named after a 'Mapping' alias instead of the
            # variable's 'ID'; resolve the alias or skip unknown sheets.
            if variable_id not in variables_overview['ID'].values:
                if variable_id in variables_overview['Mapping'].values:
                    variable_id_map = variables_overview[variables_overview['Mapping'] == variable_id]['ID'].values[0]
                    print(f'--- Log: Variable {variable_id_map} is a mapping of {variable_id}')
                    variable_id = variable_id_map
                else:
                    print(f'--- Warning: Variable {variable_id} not found in variables_overview.csv')
                    continue

            if len(variable_df) == 0:
                results.append({
                    'variable_id': variable_id,
                    'alpha': 'variable empty'
                })
                continue

            # Build one combined label per row from the three code columns.
            variable_df['Merged'] = variable_df[['Target type', 'Topic', 'Action']].apply(lambda x: concat(x), axis=1)

            variable_L_df = get_coder_df(variable_df, column_coder_1)
            variable_A_df = get_coder_df(variable_df, column_coder_2)

            labels_L = _labels_by_bug(variable_L_df, column_coder_1)
            labels_A = _labels_by_bug(variable_A_df, column_coder_2)

            annots = []

            # Iterate over the union of bug ids; a bug coded by only one
            # coder contributes a single annotation instead of raising.
            for bug_id in set(labels_L) | set(labels_A):
                merged_L = labels_L.get(bug_id, [])
                merged_A = labels_A.get(bug_id, [])

                if merged_L:
                    annots.append(['coder_1', bug_id, frozenset(merged_L)])
                if merged_A:
                    annots.append(['coder_2', bug_id, frozenset(merged_A)])

            # Passing data= already loads the annotations; loading them a
            # second time would count every record twice and distort alpha.
            task = agreement.AnnotationTask(data=annots)

            results.append({
                'variable_id': variable_id,
                'alpha': task.alpha()
            })

df_results = pd.DataFrame(results)
df_results.to_csv('output/interrater_agreement/krippendorffs_alpha.csv', index=False)


Calculating Krippendorff's alpha:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Krippendorff's alpha:  12%|█▎        | 1/8 [00:05<00:17,  2.44s/it, C13]    



Calculating Krippendorff's alpha:  12%|█▎        | 1/8 [00:07<00:17,  2.44s/it, C15]



Calculating Krippendorff's alpha:  25%|██▌       | 2/8 [00:09<00:29,  4.99s/it, X C22]   



Calculating Krippendorff's alpha: 100%|██████████| 8/8 [00:15<00:00,  1.99s/it, R13]  

--- Log: Variable R11 is a mapping of R13



