In [None]:
import pandas as pd

In [None]:
# Evaluation settings.
EXPERIMENT = 2  # which experiment to evaluate; one of [1, 2]
# Experiment 1: two-choice task, 2 annotators.
# Experiment 2: four-choice task, 3 annotators.


ANNOTATOR = '0'  # whose annotations to load; one of ['2', '1', '0', 'all']

In [None]:
def read_file(file_path: str) -> pd.DataFrame:
    """Load an annotation table from an Excel or CSV file.

    The format is chosen from the file extension, which is compared
    case-insensitively so that names such as ``data.CSV`` are accepted too.

    Args:
        file_path: Path to a ``.xlsx`` or ``.csv`` file. Path-like objects
            are accepted in addition to plain strings.

    Returns:
        The file contents as a DataFrame. CSV files are read with their
        first row as the header.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.csv``.
    """
    # Normalise only for the extension check; the path itself is passed on untouched.
    normalized = str(file_path).lower()
    if normalized.endswith('.xlsx'):
        return pd.read_excel(file_path)
    if normalized.endswith('.csv'):
        return pd.read_csv(file_path, header=0)
    raise ValueError(f'unsupported file: {file_path}')

In [None]:
# Annotators that took part in each experiment, listed in the order their
# files are concatenated when ANNOTATOR == 'all'. This follows the
# configuration comments: experiment 1 has two annotators, experiment 2 three.
ANNOTATOR_IDS_BY_EXPERIMENT = {
    1: ('1', '0'),
    2: ('1', '0', '2'),
}

if EXPERIMENT not in ANNOTATOR_IDS_BY_EXPERIMENT:
    raise ValueError(f'unsupported EXPERIMENT: {EXPERIMENT!r}')
annotator_ids = ANNOTATOR_IDS_BY_EXPERIMENT[EXPERIMENT]

if ANNOTATOR == 'all':
    # Load every annotator of this experiment and tag each row with its
    # annotator id so the source stays identifiable after concatenation.
    annotator_frames = []
    for annotator_id in annotator_ids:
        annotator_df = read_file(f'./Annotator_{annotator_id}_exp{EXPERIMENT}.csv')
        annotator_df['annotator'] = int(annotator_id)
        annotator_frames.append(annotator_df)
    df = pd.concat(annotator_frames)
elif ANNOTATOR in annotator_ids:
    df = read_file(f'./Annotator_{ANNOTATOR}_exp{EXPERIMENT}.csv')
else:
    # Fail here instead of leaving `df` undefined for the following cells.
    raise ValueError(
        f'unsupported ANNOTATOR {ANNOTATOR!r} for experiment {EXPERIMENT}; '
        f"expected one of {sorted(annotator_ids) + ['all']}"
    )


In [None]:
# Rows without a commit URL are spacer rows added for formatting; drop them.
has_commit_url = df['commit_url'].notna()
df_filtered = df[has_commit_url]
print(f'count: {len(df_filtered)}')

In [None]:
# Keep only the samples whose ground-truth label is 1; otherwise the accuracy
# on the four-choice task would be overestimated.
# `.assign()` and `.copy()` produce independent frames, so neither this cell
# nor later cells that add columns write into a slice of another DataFrame
# (which triggers pandas' SettingWithCopyWarning).
df_filtered = df_filtered.assign(true_label=df_filtered['true_label'].astype(int))
df_filtered = df_filtered[df_filtered['true_label'] == 1].copy()
print(f'count agter keep only gt label == 1: {len(df_filtered)}')  # number of rows left after filtering

In [None]:
# How many of the remaining rows fall under each value of the 'method' column
print('stats of different methods')
method_counts = df_filtered['method'].value_counts()
print(method_counts)

In [None]:
# Take an explicit copy before adding columns: df_filtered comes from boolean
# filtering, and assigning into such a slice raises SettingWithCopyWarning.
df_filtered = df_filtered.copy()

# An answer counts as right when the annotated label equals the ground truth.
# The 'is_patch(codeonly)' column is only present in some annotation files.
if 'is_patch(codeonly)' in df_filtered.columns:
    df_filtered['right(codeonly)'] = df_filtered['is_patch(codeonly)'] == df_filtered['true_label']
df_filtered['right'] = df_filtered['is_patch'] == df_filtered['true_label']

In [None]:
def contain_changelog(row):
    """Tell whether a row mentions 'changelog' in its commit message or code.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            'commit_msg_text' and 'code_text'.

    Returns:
        True if either text contains 'changelog', compared case-insensitively.
        Non-string values (such as NaN for an empty cell) are converted with
        ``str()`` first, so they never match and never raise.
    """
    return any(
        'changelog' in str(row[column]).lower()
        for column in ('commit_msg_text', 'code_text')
    )


def filter_change_log(df):
    """Drop every pair of consecutive rows in which either row mentions a changelog.

    Rows are paired by position: (0, 1), (2, 3), ... A pair is kept only when
    neither of its rows satisfies ``contain_changelog``. If the number of rows
    is odd, the last row has no partner and is dropped.

    Args:
        df: DataFrame with 'commit_msg_text' and 'code_text' columns.
            It is not modified.

    Returns:
        A new DataFrame holding the kept rows, with an extra boolean
        'contain_changelog' column. When nothing is kept the result is empty
        but still has all columns, so later column lookups keep working.
    """
    # Work on a copy so the caller's frame does not silently gain a column.
    flagged = df.copy()
    # A list comprehension (rather than df.apply(axis=1)) also behaves for an empty frame.
    flagged['contain_changelog'] = [contain_changelog(row) for _, row in df.iterrows()]

    flags = flagged['contain_changelog'].tolist()
    keep_positions = []
    # Stop at len - 1 so a trailing unpaired row is never visited.
    for first in range(0, len(flags) - 1, 2):
        if not flags[first] and not flags[first + 1]:
            keep_positions.extend((first, first + 1))
    # Positional selection keeps the original dtypes, index labels and columns.
    return flagged.iloc[keep_positions]

# To disable the changelog filtering, uncomment the next line:
# filter_change_log = lambda x: x

In [None]:
# Accuracy of the code-only answers, overall and per value of the 'method' column.
if 'is_patch(codeonly)' in df_filtered.columns:
    print('Accuracy codeonly only')

    codeonly_hits = df_filtered['right(codeonly)']
    total_accuracy = codeonly_hits.sum() / len(df_filtered)
    print(f'Total accuracy: {total_accuracy}')

    # NOTE(review): 'total' is not among the ANNOTATOR values listed in the
    # configuration cell ('0', '1', '2', 'all'), so the doubled-frame branch
    # looks unreachable -- confirm whether 'all' was intended.
    if ANNOTATOR == 'total':
        df_filtered1 = pd.concat([df_filtered, df_filtered])
        highlight_source = df_filtered1
    else:
        highlight_source = df_filtered

    anno_tfidf_part = highlight_source[highlight_source['method'] == 'tfidf']
    anno_lime_part = highlight_source[highlight_source['method'] == 'lime']
    # The 'none' subset is always taken from the un-doubled frame.
    anno_non_highlight_part = df_filtered[df_filtered['method'] == 'none']

    method_parts = (
        ('tfidf', anno_tfidf_part),
        ('lime', anno_lime_part),
        ('non_highlight', anno_non_highlight_part),
    )
    for label, part in method_parts:
        n_correct = part['right(codeonly)'].sum()
        print(f'{label} accuracy: {n_correct / len(part): .4f}, {n_correct}/{len(part)}')

In [None]:
# Accuracy of the 'is_patch' answers, overall and per value of the 'method' column.
print('Accuracy all')

all_hits = df_filtered['right']
total_accuracy = all_hits.sum() / len(df_filtered)
print(f'Total accuracy: {total_accuracy}')

# NOTE(review): 'total' is not among the ANNOTATOR values listed in the
# configuration cell ('0', '1', '2', 'all'), so the doubled-frame branch
# looks unreachable -- confirm whether 'all' was intended.
if ANNOTATOR == 'total':
    df_filtered1 = pd.concat([df_filtered, df_filtered])
    highlight_source = df_filtered1
else:
    highlight_source = df_filtered

anno_tfidf_part = highlight_source[highlight_source['method'] == 'tfidf']
anno_lime_part = highlight_source[highlight_source['method'] == 'lime']
# The 'none' subset is always taken from the un-doubled frame.
anno_non_highlight_part = df_filtered[df_filtered['method'] == 'none']

method_parts = (
    ('tfidf', anno_tfidf_part),
    ('lime', anno_lime_part),
    ('non_highlight', anno_non_highlight_part),
)
for label, part in method_parts:
    n_correct = part['right'].sum()
    print(f'{label} accuracy: {n_correct / len(part): .4f}, {n_correct}/{len(part)}')

In [None]:
# Mean helpfulness rating, overall and for the 'tfidf' / 'lime' methods,
# computed only on rows that were answered is_patch == 1 and answered correctly.
helpfulness_column = 'helpfulness (1 - 3, 3 is most helpful)'
if helpfulness_column in df_filtered.columns:
    print('helpfulness')  # no annotation

    # NOTE(review): 'total' is not among the ANNOTATOR values listed in the
    # configuration cell ('0', '1', '2', 'all'), so the doubled-frame branch
    # looks unreachable -- confirm whether 'all' was intended.
    if ANNOTATOR != 'total':
        helpfulness_source = df_filtered
    else:
        df_filtered1 = pd.concat([df_filtered, df_filtered])
        helpfulness_source = df_filtered1

    # The comparison must be parenthesised: `&` binds tighter than `==`, so
    # `a == 1 & b` is evaluated as `a == (1 & b)` and would also keep rows
    # where is_patch == 0 and the answer is wrong.
    is_correct_patch = (helpfulness_source['is_patch'] == 1) & helpfulness_source['right']
    # Copy so that any column added while filtering does not touch a slice.
    helpfulness_filtered = helpfulness_source[is_correct_patch].copy()

    helpfulness_filtered = filter_change_log(helpfulness_filtered)

    helpfulness_filtered_tfidf_part = helpfulness_filtered[helpfulness_filtered['method'] == 'tfidf']
    helpfulness_filtered_lime_part = helpfulness_filtered[helpfulness_filtered['method'] == 'lime']

    print(f'total helpfulness: {helpfulness_filtered[helpfulness_column].mean()}')
    print(f'tfidf helpfulness: {helpfulness_filtered_tfidf_part[helpfulness_column].mean()}')
    print(f'lime helpfulness: {helpfulness_filtered_lime_part[helpfulness_column].mean()}')
