In [1]:
import pandas as pd
import sklearn.metrics
import numpy as np

## Data Prep

Import the screening results from Reviewer1, Reviewer2 and GPT, together with the resolution answers that are later used as the ground truth.

Each reviewer evaluated 100 abstracts and answered `YES` or `NO` to two parameters: **AI_ML** and **NLP**. We extract the answers for each parameter and reviewer as Python lists, drop entries marked `MISSING`, and convert `YES`/`NO` to `1` and `0` for easier manipulation.

In [2]:
def prepare_for_kappa(lst):
    """Turn raw screening answers into binary labels.

    Entries equal to "MISSING" are discarded. Of the remaining entries,
    "YES" is encoded as 1 and every other value as 0.

    Parameters:
    lst: list of raw answers as read from the spreadsheet column.

    Returns:
    A new list of 1s and 0s, in the original order, without the "MISSING" entries.
    """
    binary_answers = []
    for answer in lst:
        if answer != "MISSING":
            binary_answers.append(1 if answer == "YES" else 0)
    return binary_answers

In [3]:
df_reviewer1 = pd.read_excel('results/Screening_reviewer_1.xlsx')
# Only the first 102 spreadsheet rows are considered; prepare_for_kappa drops the "MISSING" ones
# (presumably 100 answers + 2 placeholder rows -- TODO confirm against the spreadsheet).
lst_reviewer1_aiml = prepare_for_kappa(df_reviewer1.iloc[:102]['AI_ML'].tolist())
lst_reviewer1_nlp = prepare_for_kappa(df_reviewer1.iloc[:102]['NLP'].tolist())
# assert that there are 100 answers for both parameters (fails loudly instead of only displaying a number)
assert len(lst_reviewer1_aiml) == 100, f"expected 100 AI_ML answers, got {len(lst_reviewer1_aiml)}"
assert len(lst_reviewer1_nlp) == 100, f"expected 100 NLP answers, got {len(lst_reviewer1_nlp)}"
len(lst_reviewer1_aiml)

100

In [4]:
df_reviewer2 = pd.read_excel('results/Screening_reviewer_2.xlsx')
# Only the first 102 spreadsheet rows are considered; prepare_for_kappa drops the "MISSING" ones
# (presumably 100 answers + 2 placeholder rows -- TODO confirm against the spreadsheet).
lst_reviewer2_aiml = prepare_for_kappa(df_reviewer2.iloc[:102]['AI_ML'].tolist())
lst_reviewer2_nlp = prepare_for_kappa(df_reviewer2.iloc[:102]['NLP'].tolist())
# assert that there are 100 answers for both parameters (fails loudly instead of only displaying a number)
assert len(lst_reviewer2_aiml) == 100, f"expected 100 AI_ML answers, got {len(lst_reviewer2_aiml)}"
assert len(lst_reviewer2_nlp) == 100, f"expected 100 NLP answers, got {len(lst_reviewer2_nlp)}"
len(lst_reviewer2_aiml)

100

In [5]:
df_reviewer_gpt = pd.read_excel('results/Screening_reviewer_GPT4.xlsx')
# Only the first 102 spreadsheet rows are considered; prepare_for_kappa drops the "MISSING" ones
# (presumably 100 answers + 2 placeholder rows -- TODO confirm against the spreadsheet).
lst_reviewer_gpt_aiml = prepare_for_kappa(df_reviewer_gpt.iloc[:102]['AI_ML'].tolist())
lst_reviewer_gpt_nlp = prepare_for_kappa(df_reviewer_gpt.iloc[:102]['NLP'].tolist())
# assert that there are 100 answers for both parameters (fails loudly instead of only displaying a number)
assert len(lst_reviewer_gpt_aiml) == 100, f"expected 100 AI_ML answers, got {len(lst_reviewer_gpt_aiml)}"
assert len(lst_reviewer_gpt_nlp) == 100, f"expected 100 NLP answers, got {len(lst_reviewer_gpt_nlp)}"
len(lst_reviewer_gpt_aiml)

100

In [6]:
df_resolution = pd.read_excel('results/Screening_resolution.xlsx')
# Only the first 102 spreadsheet rows are considered; prepare_for_kappa drops the "MISSING" ones
# (presumably 100 answers + 2 placeholder rows -- TODO confirm against the spreadsheet).
lst_resolution_aiml = prepare_for_kappa(df_resolution.iloc[:102]['AI_ML_Resolution'].tolist())
lst_resolution_nlp = prepare_for_kappa(df_resolution.iloc[:102]['NLP_Resolution'].tolist())
# assert that there are 100 answers for both parameters (fails loudly instead of only displaying a number)
assert len(lst_resolution_aiml) == 100, f"expected 100 AI_ML answers, got {len(lst_resolution_aiml)}"
assert len(lst_resolution_nlp) == 100, f"expected 100 NLP answers, got {len(lst_resolution_nlp)}"
len(lst_resolution_aiml)

100

## Calculating Inter-rater Agreement

We calculate inter-rater agreement as Cohen's Kappa coefficient, together with a 95% confidence interval, for every pair of graders.

In [7]:
def calculate_kappa_and_CI(list1, list2, z=1.96, clip=False):
  ''' Function that calculates agreement (Cohen's Kappa) between two graders whose answers are list1 and
      list2 respectively, together with a normal-approximation confidence interval. Source code:
      https://rowannicholls.github.io/python/statistics/agreement/cohens_kappa.html#confidence-intervals

      Parameters:
      list1, list2: lists of 1s and 0s, where 1 stands for a "YES" answer and 0 for a "NO" answer.
                    Both lists must describe the same items in the same order.
      z: critical value of the standard normal distribution that scales the standard error into the
         interval half-width. The default 1.96 gives a two-sided 95% interval.
      clip: when True, the interval bounds are truncated to [-1, 1], the range Kappa itself can take.
            The default False returns the raw bounds, which may fall outside that range when Kappa is
            close to -1 or 1.

      Returns:
      a tuple of three numerical values representing the Cohen's Kappa value, lower end of the confidence interval and
      upper end of the confidence interval, in this order. All three are NaN when Kappa is undefined, i.e. when there
      are no observations or when the chance agreement equals 1 (both graders used one and the same single answer).
  '''

  # Create confusion matrix (one row/column per answer category that occurs in either list)
  cm = sklearn.metrics.confusion_matrix(list1, list2)

  # Sample size
  n = np.sum(cm)
  if n == 0:
    return (np.nan, np.nan, np.nan)

  # Expected matrix under independence of the two graders (built from the marginal totals)
  sum0 = np.sum(cm, axis=0)
  sum1 = np.sum(cm, axis=1)
  expected = np.outer(sum0, sum1) / n

  # Calculate p_o (the observed proportionate agreement) and
  # p_e (the probability of random agreement); both only involve the diagonal cells
  p_o = np.sum(np.diag(cm) / n)
  p_e = np.sum(np.diag(expected) / n)

  # With p_e == 1 the Kappa formula is 0/0; report it as undefined instead of
  # letting numpy emit division warnings
  if p_e == 1:
    return (np.nan, np.nan, np.nan)

  # Calculate Cohen's kappa
  kappa = (p_o - p_e) / (1 - p_e)

  # Confidence interval: kappa +/- z standard errors
  se = np.sqrt((p_o * (1 - p_o)) / (n * (1 - p_e)**2))
  lower = kappa - z * se
  upper = kappa + z * se

  if clip:
    lower = np.maximum(lower, -1.0)
    upper = np.minimum(upper, 1.0)

  return (kappa, lower, upper)


In [8]:
# Grader labels; position i in `graders` matches position i in both answer collections below
graders = [
  "GPT",
  "Reviewer1",
  "Reviewer2",
  "Resolution",
]

# Binary answers per grader for the AI/ML parameter
aiml_lists = [
  lst_reviewer_gpt_aiml,
  lst_reviewer1_aiml,
  lst_reviewer2_aiml,
  lst_resolution_aiml,
]

# Binary answers per grader for the NLP parameter
nlp_lists = [
  lst_reviewer_gpt_nlp,
  lst_reviewer1_nlp,
  lst_reviewer2_nlp,
  lst_resolution_nlp,
]

In [9]:
# AI/ML Kappas: list every unordered pair of graders once, then report the agreement of each pair
n_graders = len(graders)
grader_pairs = [
  (first, second)
  for first in range(n_graders - 1)
  for second in range(first + 1, n_graders)
]

for grader1_idx, grader2_idx in grader_pairs:
  kappa, lower, upper = calculate_kappa_and_CI(aiml_lists[grader1_idx], aiml_lists[grader2_idx])

  # Kappa followed by its confidence interval, rounded to three decimals
  print(f"Agreement between {graders[grader1_idx]} and {graders[grader2_idx]} on AI/ML parameter:")
  print(f"{kappa:.3f}, ({lower:.3f}, {upper:.3f})\n")


Agreement between GPT and Reviewer1 on AI/ML parameter:
0.901, (0.806, 0.996)

Agreement between GPT and Reviewer2 on AI/ML parameter:
0.899, (0.801, 0.996)

Agreement between GPT and Resolution on AI/ML parameter:
0.925, (0.841, 1.009)

Agreement between Reviewer1 and Reviewer2 on AI/ML parameter:
0.899, (0.801, 0.996)

Agreement between Reviewer1 and Resolution on AI/ML parameter:
0.975, (0.926, 1.024)

Agreement between Reviewer2 and Resolution on AI/ML parameter:
0.923, (0.837, 1.009)



In [10]:
# NLP Kappas: list every unordered pair of graders once, then report the agreement of each pair
n_graders = len(graders)
grader_pairs = [
  (first, second)
  for first in range(n_graders - 1)
  for second in range(first + 1, n_graders)
]

for grader1_idx, grader2_idx in grader_pairs:
  kappa, lower, upper = calculate_kappa_and_CI(nlp_lists[grader1_idx], nlp_lists[grader2_idx])

  # Kappa followed by its confidence interval, rounded to three decimals
  print(f"Agreement between {graders[grader1_idx]} and {graders[grader2_idx]} on NLP parameter:")
  print(f"{kappa:.3f}, ({lower:.3f}, {upper:.3f})\n")

Agreement between GPT and Reviewer1 on NLP parameter:
0.917, (0.824, 1.010)

Agreement between GPT and Reviewer2 on NLP parameter:
0.971, (0.915, 1.027)

Agreement between GPT and Resolution on NLP parameter:
0.944, (0.866, 1.021)

Agreement between Reviewer1 and Reviewer2 on NLP parameter:
0.890, (0.785, 0.996)

Agreement between Reviewer1 and Resolution on NLP parameter:
0.973, (0.920, 1.026)

Agreement between Reviewer2 and Resolution on NLP parameter:
0.917, (0.824, 1.010)



## Calculating additional metrics

Here we calculate accuracy, sensitivity, specificity and F1 score of GPT4 as compared to the resolution answers.

In [11]:
def calculate_stats(GPT_answers, ground_truth):
  """ Calculates Accuracy, Sensitivity, Specificity and F1 score for a list of binary answers given the ground truth.

      Parameters:
      GPT_answers, ground_truth: lists of 1s and 0s. Must have the same length; position i of both
      lists must refer to the same item.

      Returns:
      Dictionary mapping each calculated statistic to its corresponding value. A statistic whose
      denominator is zero (e.g. Sensitivity when the ground truth contains no positives) is undefined
      and reported as NaN.

      Raises:
      ValueError: if the lists differ in length or contain anything other than 1s and 0s.
  """
  GPT_answers = list(GPT_answers)
  ground_truth = list(ground_truth)
  n = len(GPT_answers)

  # Validate up front: zip() below would silently truncate to the shorter list, and a
  # non-binary value would otherwise be counted as a false positive/negative.
  if n != len(ground_truth):
    raise ValueError(f'Lists must be of equal length, got {n} answers and {len(ground_truth)} ground-truth values')
  if any(value not in (0, 1) for value in (*GPT_answers, *ground_truth)):
    raise ValueError('Lists must be binary, i.e. contain only 1s and 0s')

  true_positives = 0
  true_negatives = 0
  false_positives = 0
  false_negatives = 0

  for given_answer, correct_answer in zip(GPT_answers, ground_truth):
    if given_answer == correct_answer:
      if correct_answer == 1:
        true_positives += 1
      else:
        true_negatives += 1
    elif correct_answer == 1: # then given_answer == 0
      false_negatives += 1
    else: # correct_answer == 0 and given_answer == 1
      false_positives += 1

  answers_by_category = (true_positives, true_negatives, false_positives, false_negatives)

  def _ratio(numerator, denominator):
    # An empty denominator means the statistic is undefined: report NaN instead of raising ZeroDivisionError
    return numerator / denominator if denominator else float('nan')

  precision = _ratio(true_positives, true_positives+false_positives)   # how many of classified positives are true positives
  sensitivity = _ratio(true_positives, true_positives+false_negatives) # how many of the actual positives are correctly identified as positives
  specificity = _ratio(true_negatives, true_negatives+false_positives) # how many of the actual negatives are identified as negatives
  accuracy = _ratio(true_positives+true_negatives, n)                  # how many of the examples are correctly classified
  f1_score = _ratio(2*precision*sensitivity, precision+sensitivity)    # harmonic mean of precision and sensitivity

  return {'Results by category (TP, TN, FP, FN)': answers_by_category,
          'Accuracy': accuracy,
          'Sensitivity': sensitivity,
          'Specificity (sensitivity excluded)': specificity,
          'F1 score': f1_score}


We calculate and report the performance metrics of GPT for the **AI/ML** and **NLP** parameters separately, and overall for the two answer lists concatenated.

In [12]:
# GPT versus the resolution answers: per parameter, then with both parameters pooled into one list
stats_GPT_consensus_AIML = calculate_stats(lst_reviewer_gpt_aiml, lst_resolution_aiml)
stats_GPT_consensus_NLP = calculate_stats(lst_reviewer_gpt_nlp, lst_resolution_nlp)
stats_GPT_consensus_overall = calculate_stats(
  lst_reviewer_gpt_aiml + lst_reviewer_gpt_nlp,
  lst_resolution_aiml + lst_resolution_nlp,
)

# Each report is a heading followed by one "name :  value" line per statistic
reports = [
  ("Results based only on AIML parameter: \n", stats_GPT_consensus_AIML),
  ("\n\nResults based only on NLP parameter: \n", stats_GPT_consensus_NLP),
  ("\n\nOverall results: \n", stats_GPT_consensus_overall),
]

for heading, stats in reports:
  print(heading)
  for key, val in stats.items():
    print(key, ": ", val)

Results based only on AIML parameter: 

Results by category (TP, TN, FP, FN) :  (26, 71, 2, 1)
Accuracy :  0.97
Sensitivity :  0.9629629629629629
Specificity (sensitivity excluded) :  0.9726027397260274
F1 score :  0.9454545454545454


Results based only on NLP parameter: 

Results by category (TP, TN, FP, FN) :  (22, 76, 0, 2)
Accuracy :  0.98
Sensitivity :  0.9166666666666666
Specificity (sensitivity excluded) :  1.0
F1 score :  0.9565217391304348


Overall results: 

Results by category (TP, TN, FP, FN) :  (48, 147, 2, 3)
Accuracy :  0.975
Sensitivity :  0.9411764705882353
Specificity (sensitivity excluded) :  0.9865771812080537
F1 score :  0.9504950495049505
