# Imports

In [1]:
import sklearn
import numpy as np
from sklearn.metrics import confusion_matrix

import pandas as pd
import ast

ModuleNotFoundError: No module named 'sklearn'

# Loading the Data

Here we load the reviewer files. Please note that the answers have been manually reviewed, and small changes/fix-ups were made where needed.

In [None]:
# Each workbook holds one set of full-text answers; the file names indicate
# reviewer 1, the GPT-4 reviewer and the resolution file respectively.
Reviewer1 = pd.read_excel(io='./results/FullText_reviewer_1.xlsx')
GPT = pd.read_excel(io='./results/FullText_reviewer_GPT4.xlsx')
Resolution = pd.read_excel(io='./results/FullText_resolution.xlsx')

# Calculating Inter-rater Agreement

We have four functions to calculate inter-rater agreement. `kappa_calculation` is the main function that calculates Cohen's Kappa for two lists containing 'yes'/'no' values. `kappa_boolean` and `kappa_non_boolean` re-format the answers from the reviewers into 'yes'/'no' lists and call on `kappa_calculation` to generate the agreement values. Finally, `Kappa` is the function that puts everything together, calling either `kappa_boolean` or `kappa_non_boolean` for each parameter.

## Kappa Calculation

The code for this Kappa Calculation was taken from this page: https://rowannicholls.github.io/python/statistics/agreement/cohens_kappa.html


In [None]:
def kappa_calculation(List1, List2):
    """ Function that calculates Cohen's Kappa coefficient for two lists that contain 'yes' or 'no' answers.
    Please note that the input lists should have the same length.

    Parameters:
    List1, List2 (list['yes'|'no']): lists of 'yes' or 'no' values.

    Returns:
    kappa, (lower, upper)
    A float representing the calculated Cohen's Kappa coefficient of the two lists and a tuple of two floats
    representing the lower and the upper ends of the confidence interval for the kappa coefficient
    (kappa +/- 1.96 standard errors). All three values are rounded to two decimal places.
    When kappa is undefined, all three values are NaN. This happens when there are no rated items, or
    when the chance agreement p_e equals 1 because both raters gave the same single answer to every item.
    """

    # Confusion matrix over the fixed label order ['yes', 'no']
    cm = confusion_matrix(List1, List2, labels=['yes', 'no'])

    # Sample size
    n = np.sum(cm)

    # Without any rated items every proportion below would be 0/0,
    # so report the undefined result explicitly instead of dividing.
    undefined = float('nan')
    if n == 0:
        return undefined, (undefined, undefined)

    # Expected matrix (product of the marginal totals of the two raters)
    sum0 = np.sum(cm, axis=0)
    sum1 = np.sum(cm, axis=1)
    expected = np.outer(sum0, sum1) / n

    # Number of classes
    n_classes = cm.shape[0]

    # Calculate p_o (the observed proportionate agreement) and
    # p_e (the probability of random agreement); the identity matrix
    # masks everything except the diagonal (agreement) cells.
    identity = np.identity(n_classes)
    p_o = np.sum((identity * cm) / n)
    p_e = np.sum((identity * expected) / n)

    # With p_e == 1 the denominator (1 - p_e) is zero: kappa is undefined.
    if p_e == 1:
        return undefined, (undefined, undefined)

    # Calculate Cohen's kappa
    kappa = (p_o - p_e) / (1 - p_e)

    # Confidence interval: kappa +/- 1.96 standard errors
    se = np.sqrt((p_o * (1 - p_o)) / (n * (1 - p_e)**2))
    lower = kappa - 1.96 * se
    upper = kappa + 1.96 * se

    # Keep only two decimal places
    kappa = float(f"{kappa:.2f}")
    lower = float(f"{lower:.2f}")
    upper = float(f"{upper:.2f}")

    return kappa, (lower, upper)

## Boolean columns

The following function is used to process values for columns that have boolean values (**llm**, **review** and **structured_data**).

Its main purpose is to convert all the YES/NO values in the list to their lowercase form because `kappa_calculation` needs consistent 'yes' or 'no' values.

**NOTE: Please make sure that all the values from the reviewers are either 'YES'/'yes' or 'NO'/'no', otherwise it may throw an error.**


In [None]:
def kappa_boolean(List1, List2):
    """ Function calculating Cohen's Kappa coefficient for two lists of boolean values. Reviewers were asked
    to answer "YES" or "NO", so those answers are normalized (surrounding whitespace removed, lowercased)
    before being handed to `kappa_calculation`.

    Parameters:
    List1, List2 (list[str]): Answers retrieved by the reviewers, should be "YES" or "NO" (any casing)

    Returns:
    kappa, (lower, upper)
    A float representing the calculated Cohen's Kappa coefficient of the two lists and a tuple of
    two floats representing the lower and the upper ends of the confidence interval for the kappa coefficient.

    Raises:
    ValueError: if an answer is not a yes/no string (for example an empty cell or a typo).
    """

    def normalize(answers):
        # Reject anything that is not yes/no up front: values outside the
        # 'yes'/'no' labels would otherwise be left out of the confusion
        # matrix without any warning, distorting the agreement value.
        cleaned = []
        for position, val in enumerate(answers):
            label = val.strip().lower() if isinstance(val, str) else val
            if label not in ('yes', 'no'):
                raise ValueError(
                    f"Expected 'YES' or 'NO' but got {val!r} at position {position}"
                )
            cleaned.append(label)
        return cleaned

    List1 = normalize(List1)
    List2 = normalize(List2)

    return kappa_calculation(List1, List2)


## Non Boolean Columns


This function will be used to process non boolean columns like **llm_name** and **list_of_medical_conditions**. The values in these columns represent lists of tokens separated by commas. We used a one-hot encoding with the union of all values for each entry to generate a list of 'yes'/'no' answers and calculate agreement with `kappa_calculation`. Here is an example of how the function works:

Say the responses to the first article are

    List1[0] = ['BERT', 'ClinicalBERT']
    List2[0] = ['BIOBERT', 'BERT'].

To create the one-hot encodings corresponding to the first article, we first make a union vector of all responses

    UNION = ['BERT','ClinicalBERT','BIOBERT']

Next, the encoding for each list will be a vector of same length as UNION with the i-th entry being 'yes' if UNION[i] is in the list and 'no' otherwise. For our example we'll have

    list1: ['yes', 'yes', 'no']
    list2: ['yes', 'no', 'yes']

This process is repeated for all articles and the one-hot-encodings are concatenated.

In [None]:
def remove_whitespace_and_capitalize(input_string):
    """ Helper function used to pre-process the text in a list of token data. This function is used to ensure
    that casing and white space are ignored when comparing answers from reviewers. For example, 'gpt3' will
    be equivalent to 'GPT3' and 'Clinical BERT' to 'clinicalBERT'.

    Parameters:
    input_string (str): the text to normalize.

    Returns:
    The input with every whitespace character removed and all letters converted to uppercase.
    """
    # str.split() without arguments splits on any whitespace (spaces, tabs,
    # newlines, non-breaking spaces), so joining the pieces drops all of it,
    # not only the plain ' ' character.
    no_whitespace = "".join(input_string.split())

    return no_whitespace.upper()

In [None]:
def kappa_non_boolean(List1, List2):
    """ Function that converts list-of-tokens responses for non boolean parameters into boolean vectors
    and calculates the inter-rater agreement.

    Parameters:
    List1, List2 (list[str]): lists representing the extracted values for non-boolean parameters. Please note that for non-boolean data
                              each entry may have several answers. For example, several LLMs might have been used in a single article.
                              In this case, the entry corresponding to that article would be a string enumeration of all the LLMs used.
                              ex: ['', 'BERT, ClinicalBERT' , 'Glove, BERT', '', ...]
                              Both lists must have the same length (one entry per article).

    Returns:
    kappa, (lower, upper)
    A float representing the calculated Cohen's Kappa coefficient of the two lists and a tuple of
    two floats representing the lower and the upper ends of the confidence interval for the kappa coefficient.

    Raises:
    ValueError: if the two lists do not have the same number of entries.

    NOTE: only tokens named by at least one rater are encoded, so the encoded vectors never
    contain a position where both raters say 'no'.
    """

    # Entries are compared position by position, so a length mismatch means
    # the two raters' rows cannot be paired up reliably.
    if len(List1) != len(List2):
        raise ValueError(
            f"Both raters must have the same number of entries, got {len(List1)} and {len(List2)}"
        )

    def tokenize(answer):
        # Ignore white space and casing, split on commas, and drop empty
        # strings as well as 'NAN' (the text form of a missing cell).
        normalized = remove_whitespace_and_capitalize(str(answer))
        return {item for item in normalized.split(',') if item not in ('', 'NAN')}

    one_hot_list1 = []
    one_hot_list2 = []

    for answer1, answer2 in zip(List1, List2):
        tokens1 = tokenize(answer1)
        tokens2 = tokenize(answer2)

        # Union of the tokens for this entry; sorted so that the encoding
        # order is reproducible between runs.
        union = sorted(tokens1 | tokens2)

        one_hot_list1.extend('yes' if token in tokens1 else 'no' for token in union)
        one_hot_list2.extend('yes' if token in tokens2 else 'no' for token in union)

    return kappa_calculation(one_hot_list1, one_hot_list2)


## Agreement between a pair of reviewers

This function calculates the inter-rater agreement between two reviewers for all the parameters of interest.


In [None]:
def Kappa(rater1, rater2):
    """ Compute the inter-rater agreement between `rater1` and `rater2` for every parameter of interest.

    Parameters:
    rater1, rater2: dataframes corresponding to the two reviewers. It is assumed that the dataframes have a column
                    for each parameter of interest (defined within the body of the function), that the number of rows
                    is equal and the answers follow expected formatting (YES/NO, list of tokens, etc.)

    Returns:
    dict {str: (float, (float, float))}
    A collection mapping each parameter to the calculated Cohen's Kappa and confidence interval.
    If the calculation for a parameter fails, the problem is printed and that parameter maps to
    the string 'error' instead.
    """

    boolean_columns = ['review','llm', 'structured_data']
    non_boolean_columns = ['llm_name', 'list_of_medical_conditions']
    result = {}

    # Boolean parameters are handled first, then the list-of-tokens ones,
    # so the result keeps that column order.
    for column_name in boolean_columns + non_boolean_columns:
        try:
            answers1 = rater1[column_name].to_list()
            answers2 = rater2[column_name].to_list()

            # Pick the calculation that matches the kind of column
            if column_name in boolean_columns:
                agreement = kappa_boolean(answers1, answers2)
            else:
                agreement = kappa_non_boolean(answers1, answers2)

            result[column_name] = agreement

        except Exception as e:
            # Best effort: a failing column must not prevent the others
            # from being evaluated.
            print('Error in column', column_name , e)
            result[column_name] = 'error'

    return result



## Running the code

Below we calculate and display the agreement values for each pair of reviewers and consensus.

In [None]:
# Agreement for each pair of answer sets: GPT vs. reviewer 1,
# GPT vs. the resolution, and reviewer 1 vs. the resolution.
gpt_reviewer1 = Kappa(rater1=GPT, rater2=Reviewer1)
gpt_resolution = Kappa(rater1=GPT, rater2=Resolution)
reviewer1_resolution = Kappa(rater1=Reviewer1, rater2=Resolution)


In [None]:
# Show the per-parameter agreement computed for GPT vs. Reviewer1
gpt_reviewer1

{'review': (0.0, (-1.92, 1.92)),
 'llm': (0.47, (0.05, 0.88)),
 'structured_data': (0.57, (0.23, 0.9)),
 'llm_name': (-0.35, (-0.9, 0.2)),
 'list_of_medical_conditions': (-0.24, (-0.34, -0.13))}

In [None]:
# Show the per-parameter agreement computed for GPT vs. Resolution
gpt_resolution

In [None]:
# Show the per-parameter agreement computed for Reviewer1 vs. Resolution
reviewer1_resolution