In [1]:
import pandas as pd
import numpy as np
import altair as alt
import json

from utils import calculate_krippendorff_alpha, print_units_and_annotators

# Raise the row limit of Altair's default data transformer to 10,000 so that the
# full annotation table (8,325 rows, per the rating counts shown further down)
# can be passed directly to the charts in this notebook.
alt.data_transformers.enable('default', max_rows=10000)

DataTransformerRegistry.enable('default')

# Data processing   
This notebook implements the steps taken by the authors to process the annotation data and generate a high-quality dataset for the task of similarity detection in Spanish song lyrics. 

### 1. Load and process the annotation data
First, we load the dataset and the lyrics pairs from the Excel file and convert them to a JSON file. 

In [2]:
# Load the raw annotation export (worksheet 'Sheet1') into a DataFrame
df = pd.read_excel('../data/raw/DetailedResults.xlsx', sheet_name='Sheet1')

In [3]:
# Convert the DataFrame into a list of plain dicts (one per row) by round-tripping
# through JSON, so that every value ends up as a JSON-compatible Python object
data = json.loads(df.to_json(orient='records'))

In [4]:
# The 'Result' and 'SongPairInfo' fields hold JSON-encoded strings; decode them in
# place into nested Python objects. Values that are not strings (for instance because
# this cell has already been run once) are left untouched, so re-running the cell
# no longer raises a TypeError from json.loads.
for row in data:
    for field in ('Result', 'SongPairInfo'):
        if isinstance(row[field], str):
            row[field] = json.loads(row[field])

In [5]:
# Write the decoded records to disk as pretty-printed JSON (4-space indent)
with open('../data/raw/detailed_results.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [6]:
# Project every raw record onto the few fields used downstream:
# the annotator, the Likert label and its integer key, and the two song ids.
clean_data = [
    {
        'annotator_id': record['JobMemberId'],
        'value': record['Result']['LikertRating']['value'],
        'sim_rating': int(record['Result']['LikertRating']['key']),
        'id1': record['SongPairInfo']['id_1'],
        'id2': record['SongPairInfo']['id_2'],
    }
    for record in data
]

# Persist the reduced records as indented JSON
with open('../data/processed/clean_detailed_results.json', 'w') as json_file:
    json.dump(clean_data, json_file, indent=4)

### 2. Data Refinement
In this part, we will process the data to obtain high-quality annotations. We define a high-quality annotation as one in which at least 2/3 of the annotators agree on the same score for a given pair of lyrics. To further increase the quality of our data, if only 2/3 of the annotators agree on the same score, the third one has to be within 1 point of the other two (that is, within the `[-1, 1]` interval from the mode). In addition, a rating of 0 is only accepted when it is unanimous: a pair is discarded if the majority rating is 0 but one annotator disagrees, or if the majority rating is positive but one annotator gave a 0.

In [7]:
# Read the cleaned annotations back from disk and attach a string identifier for
# each lyrics pair, built as '<id1>_<id2>'
df = pd.read_json('../data/processed/clean_detailed_results.json').assign(
    pair_id=lambda frame: frame['id1'].astype('str') + '_' + frame['id2'].astype('str')
)

In [8]:
# Report the number of units and annotators in the unfiltered data
print_units_and_annotators(df)

Number of units: 2775
Number of annotators: 63


In [9]:
# Show how many annotations were given for each similarity rating (absolute counts only)
df['sim_rating'].value_counts()


0    3058
1    3014
2    1058
3     746
4     347
5     102
Name: sim_rating, dtype: int64

In [10]:
# Distribution of similarity ratings over all annotations.
# NOTE(review): mark_rect() with an aggregated count on y is unusual; mark_bar()
# (as used for the per-annotator chart later on) may be what was intended - confirm.
(
    alt.Chart(df.reset_index())
    .mark_rect()
    .encode(x=alt.X('sim_rating:O'), y='count()')
)

In [11]:
def apply_filtering_criteria(row, max_deviation=1):
    """Decide whether the annotations of one lyrics pair form a reliable consensus.

    Intended as the callable for ``DataFrame.groupby(...).filter``; extra keyword
    arguments given to ``filter`` (e.g. ``max_deviation=2``) are forwarded here.

    Rules, applied in order:

    1. More than two distinct ratings -> discard (complete disagreement).
    2. A single distinct rating -> keep (complete agreement).
    3. Exactly two distinct ratings:
       - no strict majority (the two ratings are tied) -> discard;
       - the majority rating is 0 -> discard (a 0 must be unanimous);
       - the majority rating is positive but someone rated 0 -> discard;
       - otherwise keep only if the dissenting rating is at most
         ``max_deviation`` points away from the majority rating.

    Parameters
    ----------
    row : pandas.DataFrame
        All annotations of a single pair (a whole group, despite the name).
        Only the integer ``sim_rating`` column is read.
    max_deviation : int, default 1
        Largest allowed absolute distance between the dissenting rating and the
        majority rating. 1 is the strict criterion; 2 corresponds to the lax one.

    Returns
    -------
    bool
        ``True`` to keep the pair, ``False`` to discard it.
    """
    ratings = row['sim_rating']
    # Computed once; every check below works on the same array of distinct ratings.
    distinct = ratings.unique()

    if len(distinct) > 2:
        # Complete disagreement, discard
        return False
    if len(distinct) == 1:
        # Complete agreement, keep
        return True

    # Partial agreement: exactly two distinct ratings.
    modes = ratings.mode()
    if len(modes) > 1:
        # Both ratings occur equally often (only possible for an even number of
        # annotations), so there is no majority rating to anchor on.
        return False
    mode = modes[0]

    if mode == 0:
        # In order to label a pair as completely dissimilar, all annotators must agree
        return False
    if mode > 0 and 0 in distinct:
        # And likewise, to label a pair as similar, all annotators must have given a positive rating
        return False

    # The single rating that differs from the majority
    other_val = distinct[distinct != mode][0]
    # other_val != mode by construction, so this equals membership in
    # [mode - max_deviation, ..., mode + max_deviation] without the mode itself.
    return bool(abs(other_val - mode) <= max_deviation)

# Keep only the pairs whose annotations pass apply_filtering_criteria; every
# annotation row of a retained pair is kept, and rejected pairs are dropped entirely
filt_df = df.groupby('pair_id').filter(apply_filtering_criteria)

### 3. Krippendorff's alpha
Krippendorff's alpha is a flexible measure of the quality of annotation data (as opposed to other measures, such as Fleiss' or Cohen's kappa, which focus on inter-annotator reliability). It provides a way to assess the degree of correspondence between the values assigned to a set of items by different raters. In general, Krippendorff's alpha is a generalization of Fleiss' kappa that can be applied to any kind of data annotated by more than two raters: it is robust to outliers and missing values, it can handle non-binary data, and it can be used to assess the reliability of data obtained from multiple sources. 

The alpha score has a maximum of 1, which indicates complete agreement, while 0 indicates no agreement beyond what would be expected by chance; negative values are possible and indicate systematic disagreement. The closer the score is to 1, the better the agreement between raters. As such, it's often used to evaluate the consistency and reliability of data obtained from multiple sources.

It's important to note that Krippendorff's alpha is a measure of the reliability of the data, not of the raters themselves. This means it primarily assesses the consistency of the values assigned to items by the raters, rather than the quality of specific raters or their individual abilities. By applying the consensus rules described above, we were able to increase the quality of our data and the reliability of the annotations to a score of 0.9, producing a high-quality dataset for the task at hand. 

In [14]:
# Report the size of the filtered dataset, then the Krippendorff's alpha computed on it
print_units_and_annotators(filt_df)
alpha = calculate_krippendorff_alpha(filt_df)
print(f"Krippendorffs' alpha: {alpha}")

Number of units: 676
Number of annotators: 58
Krippendorffs' alpha: 0.9010355393690141


In [17]:
# Persist the filtered annotations as CSV, without writing the DataFrame index
filt_df.to_csv('../data/processed/filtered_detailed_results.csv', index=False)

In [18]:
# Stacked bar chart (not a boxplot): number of retained annotations per annotator,
# coloured by similarity rating, with annotators sorted by descending bar height
(
    alt.Chart(filt_df.reset_index())
    .mark_bar()
    .encode(
        x=alt.X('annotator_id:N', sort='-y'),
        y=alt.Y('count()'),
        tooltip=['annotator_id', 'count()'],
        color=alt.Color('sim_rating:N'),
    )
)

In [19]:
# Boxplot of the similarity ratings given by each annotator in the filtered data.
# Annotators on the x axis are ordered by how many annotations they made, most first.
annotator_order = alt.EncodingSortField(
    field='sim_rating',
    op='count',
    order='descending',
)
(
    alt.Chart(filt_df.reset_index())
    .mark_boxplot()
    .encode(
        x=alt.X('annotator_id:N', sort=annotator_order),
        y=alt.Y('sim_rating:Q'),
    )
)