In [1]:
# Import packages

import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys
from zipfile import ZipFile

from sklearn.metrics import accuracy_score, f1_score, \
    precision_score, recall_score, classification_report, \
    confusion_matrix, cohen_kappa_score

from statistics import mean, median, StatisticsError
import math


In [2]:
# Import functions

from qualtrics_functions import get_ratings_df, get_inter_ratings_df, calculate_intrarater_kappas, calculate_kappas, \
    get_kappa_groups, kick_out_judges, everyone_agreed, get_f1_groups, get_annotation_matrix, kick_out_intra_judges


In [3]:
# Import ratings_df from Qualtrics export for internal evaluation

group1_df, ratings1_df, y1_model = get_ratings_df(survey_name = 'group1', sentence_pairs_csv = 'output/group1_df.csv', groupnum = 1)
group2_df, ratings2_df, y2_model = get_ratings_df(survey_name = 'group2', sentence_pairs_csv = 'output/group2_df.csv', groupnum = 2)

print("Rating dataframe for group 1 as example:")
ratings1_df


Rating dataframe for group 1 as example:


qid,6,46,36,18,29,17,21,22,38,37,...,47,41,53,26,52,50,10,34,35,8
2,2,1,2,2,2,1,2,1,1,1,...,2,2,2,2,2,2,2,1,1,2
3,1,1,2,2,2,1,2,1,1,1,...,2,2,2,1,2,2,1,1,1,2
4,1,1,2,2,1,1,1,1,2,1,...,2,2,2,1,2,1,2,1,2,2


In [4]:
# Calculate intrarater kappas on repeated sentences

intra_kappas1_df = calculate_intrarater_kappas(group1_df, ratings1_df)
intra_kappas2_df = calculate_intrarater_kappas(group2_df, ratings2_df)
print(f"Intrarater kappas for group1: {intra_kappas1_df}")
print(f"Intrarater kappas for group2: {intra_kappas1_df}")


Intrarater kappas for group1: {2: 1.0, 3: 1.0, 4: 0.5454545454545454}
Intrarater kappas for group2: {2: 1.0, 3: 1.0, 4: 0.5454545454545454}


In [5]:
# Create dataframes for interrater evaluation (remove repeated sentences)

inter_sent1_df, inter_rat1_df = get_inter_ratings_df(group1_df, ratings1_df)
inter_sent2_df, inter_rat2_df = get_inter_ratings_df(group2_df, ratings2_df)


In [6]:
# Calculate kappas for whole dataset

kappa_pairs1, mean_kappa1 = calculate_kappas(inter_rat1_df)
kappa_pairs2, mean_kappa2 = calculate_kappas(inter_rat2_df)
print(f"Mean kappa for group1: {mean_kappa1}")
print(f"Mean kappa for group2: {mean_kappa2}")


Mean kappa for group1: 0.3344419249181154
Mean kappa for group2: 0.1867981789886727


In [7]:
# Calculate kappa groups

kappa_groups1 = get_kappa_groups(inter_sent1_df, inter_rat1_df)
kappa_groups2 = get_kappa_groups(inter_sent2_df, inter_rat2_df)

print("Kappa model/source groups for rater group1:\n")
kappa_groups1


Kappa model/source groups for rater group1:



[['model',
  'gpt2',
  [[2, 3, 0.64], [2, 4, 0.09768637532133684], [3, 4, 0.38983050847457623]],
  0.37583896126530436],
 ['model',
  't5',
  [[2, 3, 0.5524861878453038],
   [2, 4, 0.14409221902017288],
   [3, 4, 0.17728531855955676]],
  0.2912879084750111],
 ['source',
  'cs_conf',
  [[2, 3, 0.4444444444444444], [2, 4, -0.11111111111111116], [3, 4, 0.4]],
  0.24444444444444444],
 ['source',
  'student',
  [[2, 3, 0.5068493150684932],
   [2, 4, 0.2588235294117647],
   [3, 4, 0.24096385542168663]],
  0.3355455666339815],
 ['source',
  'medium',
  [[2, 3, 0.7777777777777778],
   [2, 4, 0.16000000000000003],
   [3, 4, 0.2222222222222222]],
  0.38666666666666666]]

In [8]:

print("Kappa model/source groups for rater group2:\n")
kappa_groups2


Kappa model/source groups for rater group2:



[['model',
  't5',
  [[2, 3, 0.05714285714285705], [2, 4, -0.11428571428571432], [3, 4, 0.0]],
  -0.01904761904761909],
 ['model',
  'gpt2',
  [[2, 3, -0.10031347962382453],
   [2, 4, 0.33742331288343563],
   [3, 4, 0.1706484641638225]],
  0.13591943247447788],
 ['source',
  'cs_conf',
  [[2, 3, -0.4096385542168677], [2, 4, 0.02409638554216864], [3, 4, 0.55]],
  0.05481927710843367],
 ['source',
  'student',
  [[2, 3, 0.6666666666666667], [2, 4, 0.1428571428571429], [3, 4, 0.0]],
  0.2698412698412699],
 ['source',
  'medium',
  [[2, 3, 0.19999999999999996],
   [2, 4, 0.5384615384615384],
   [3, 4, 0.1428571428571429]],
  0.2937728937728938]]

In [9]:
# Create agreement_df for calculating F1-scores

agreement1_df = everyone_agreed(inter_sent1_df, inter_rat1_df)
agreement2_df = everyone_agreed(inter_sent2_df, inter_rat2_df)

print('Number of agreed sentences in group1:')
print(agreement1_df['model'].value_counts())
print(agreement1_df['source'].value_counts())
print('\nNumber of agreed sentences in group2:')
print(agreement2_df['model'].value_counts())
print(agreement2_df['source'].value_counts())


Number of agreed sentences in group1:
gpt2    14
t5      13
Name: model, dtype: int64
medium     10
student     9
cs_conf     8
Name: source, dtype: int64

Number of agreed sentences in group2:
gpt2    12
t5       9
Name: model, dtype: int64
medium     8
student    8
cs_conf    5
Name: source, dtype: int64


In [10]:
# Calculate F1 groups

f1_groups1 = get_f1_groups(agreement1_df)
f1_groups2 = get_f1_groups(agreement2_df)

print("F1-score the model/source groups for rater group1:")
f1_groups1


F1-score the model/source groups for rater group1:


[0.37209302325581395,
 ['model', 't5', 0.40909090909090906],
 ['model', 'gpt2', 0.3333333333333333],
 ['source', 'student', 0.4],
 ['source', 'medium', 0.4117647058823529],
 ['source', 'cs_conf', 0.2727272727272727]]

In [11]:

print("F1-score the model/source groups for rater group2:")
f1_groups2


F1-score the model/source groups for rater group2:


[0.3,
 ['model', 't5', 0.47058823529411764],
 ['model', 'gpt2', 0.07692307692307693],
 ['source', 'student', 0.2727272727272727],
 ['source', 'medium', 0.3333333333333333],
 ['source', 'cs_conf', 0.28571428571428575]]