In [1]:
# Import packages

import numpy as np
import pandas as pd
from pathlib import Path
import os
import sys
from zipfile import ZipFile

from sklearn.metrics import accuracy_score, f1_score, \
    precision_score, recall_score, classification_report, \
    confusion_matrix, cohen_kappa_score

from statistics import mean, median, StatisticsError
import math


In [2]:
# Load functions

from qualtrics_functions import get_ratings_df, get_inter_ratings_df, calculate_intrarater_kappas, calculate_kappas, \
    get_kappa_groups, kick_out_judges, everyone_agreed, get_f1_groups, get_annotation_matrix, kick_out_intra_judges



In [3]:
# Build ratings_df (together with sentence_pairs_df and y_model) from the
# Qualtrics export of the external evaluation survey

sentence_pairs_df, ratings_df, y_model = get_ratings_df(
    survey_name='writingassistant',
    sentence_pairs_csv='output/sentence_pairs_df.csv',
    groupnum=1,
    introblock=1,
    drop_na=True,
)


In [4]:
# Compute the intrarater kappa of every judge

intra_kappas = calculate_intrarater_kappas(
    sentence_pairs_df,
    ratings_df,
)
print("Intrarater kappas for the judges:")
intra_kappas


Intrarater kappas for the judges:


{2: 0.5454545454545454,
 3: 1.0,
 4: 0.11764705882352955,
 5: 0.16666666666666663,
 6: 0.16666666666666663,
 8: -0.36363636363636354,
 9: -0.36363636363636354,
 10: -0.6666666666666667,
 11: -0.15384615384615374,
 12: 0.16666666666666663,
 14: 0.5454545454545454,
 15: -0.15384615384615374,
 17: 0.0,
 18: 1.0,
 19: 0.5454545454545454,
 20: 0.5454545454545454,
 21: 0.2857142857142857,
 47: -0.4285714285714286}

In [5]:
# Drop the judges whose intrarater kappa is too low.
# The 0.5 kappa threshold corresponds to roughly getting 4/5 right.

filt_ratings_df = kick_out_intra_judges(
    sentence_pairs_df,
    ratings_df,
    kappa_threshold=0.5,
)
print("\nRatings table for the remaining judges:")
filt_ratings_df


6/18 judges left, kappa threshold: 0.5

Ratings table for the remaining judges:


qid,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,6.1,32,9.1,5.1,40
2,1,1,1,2,1,1,1,2,2,1,...,2,2,1,2,2,1,1,1,2,2
3,2,2,2,1,2,1,1,1,2,1,...,2,2,2,2,2,1,1,1,1,2
14,2,1,1,1,1,2,2,1,2,1,...,2,2,2,2,1,2,1,1,2,2
18,1,1,2,1,1,2,2,1,1,1,...,1,2,2,2,2,2,1,1,2,2
19,1,2,2,1,1,1,1,2,1,1,...,1,2,1,2,2,1,2,1,1,2
20,1,2,2,1,1,2,1,2,2,1,...,1,2,2,1,2,1,1,1,1,2


In [6]:
# Build the sentence and ratings dataframes used for the interrater
# evaluation, without the repeated sentences

inter_sent_df, inter_rat_df = get_inter_ratings_df(
    sentence_pairs_df,
    ratings_df,
)


In [7]:
# Keep only the judges that passed the intrarater filter
# (rows are selected by the index labels of filt_ratings_df)

inter_rat_df = inter_rat_df.loc[filt_ratings_df.index]


In [8]:
# Calculate kappa_pairs and the mean kappa over the whole dataset

kappa_pairs, mean_kappa = calculate_kappas(inter_rat_df)
print("Mean kappa for all the ratings: {}".format(mean_kappa))


Mean kappa for all the ratings: 0.1400131739809495


In [9]:
# Calculate kappa groups for the judges that passed the intrarater filter
# (inter_rat_df has already been restricted to filt_ratings_df's index)

get_kappa_groups(inter_sent_df, inter_rat_df)


[['model',
  't5',
  [[2, 3, 0.24960505529225907],
   [2, 14, 0.17061611374407581],
   [2, 18, -0.07655502392344515],
   [2, 19, 0.2448330683624801],
   [2, 20, 0.35897435897435903],
   [3, 14, 0.19786096256684493],
   [3, 18, 0.09688013136288998],
   [3, 19, 0.2411467116357504],
   [3, 20, 0.24960505529225907],
   [14, 18, 0.17898193760262726],
   [14, 19, -0.09612141652613837],
   [14, 20, 0.17061611374407581],
   [18, 19, -0.1345218800648298],
   [18, 20, 0.08293460925039864],
   [19, 20, 0.16534181240063595]],
  0.1400131739809495],
 ['source',
  'student',
  [[2, 3, 0.148606811145511],
   [2, 14, 0.12790697674418616],
   [2, 18, -0.16099071207430327],
   [2, 19, 0.13194444444444442],
   [2, 20, 0.27184466019417475],
   [3, 14, 0.24749163879598668],
   [3, 18, 0.025974025974025983],
   [3, 19, 0.148606811145511],
   [3, 20, 0.36305732484076436],
   [14, 18, 0.4147157190635452],
   [14, 19, -0.017441860465116088],
   [14, 20, 0.13249211356466872],
   [18, 19, -0.16099071207430327],


In [10]:
# Drop judges based on their interrater agreement

inter_rat_filt_df = kick_out_judges(
    inter_rat_df,
    kick_out_rate=0.3,
)
inter_rat_filt_df


4/6 judges left, mean_kappa: 0.1400131739809495 -> 0.25158434365962395


qid,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
2,1,1,1,2,1,1,1,2,2,1,...,2,1,2,2,1,2,2,1,2,2
3,2,2,2,1,2,1,1,1,2,1,...,2,2,1,2,2,2,2,2,2,2
19,1,2,2,1,1,1,1,2,1,1,...,2,2,2,2,2,1,2,1,2,2
20,1,2,2,1,1,2,1,2,2,1,...,2,1,1,2,1,1,2,2,1,2


In [11]:
# Calculate kappa groups again, this time on the ratings that remain after
# the interrater filtering (inter_rat_filt_df)

get_kappa_groups(inter_sent_df, inter_rat_filt_df)


[['model',
  't5',
  [[2, 3, 0.24960505529225907],
   [2, 19, 0.2448330683624801],
   [2, 20, 0.35897435897435903],
   [3, 19, 0.2411467116357504],
   [3, 20, 0.24960505529225907],
   [19, 20, 0.16534181240063595]],
  0.25158434365962395],
 ['source',
  'student',
  [[2, 3, 0.148606811145511],
   [2, 19, 0.13194444444444442],
   [2, 20, 0.27184466019417475],
   [3, 19, 0.148606811145511],
   [3, 20, 0.36305732484076436],
   [19, 20, 0.27184466019417475]],
  0.22265078532743005],
 ['source',
  'medium',
  [[2, 3, 0.2857142857142857],
   [2, 19, 0.18181818181818177],
   [2, 20, 0.4444444444444444],
   [3, 19, 0.18604651162790697],
   [3, 20, 0.13793103448275867],
   [19, 20, 0.0625]],
  0.21640907634792958]]

In [12]:
# To evaluate model performance with an F1-score, keep only the sentences
# on which the majority of the remaining judges agreed

agreement_df = everyone_agreed(
    inter_sent_df,
    inter_rat_filt_df,
    majority=True,
)

print("Number of sentences where majority is achieved:\n")
print(agreement_df["source"].value_counts())


Number of sentences where majority is achieved:

student    17
medium     16
Name: source, dtype: int64


In [13]:
# Kappa score for these sentences only
# NOTE(review): the rating columns are taken from inter_rat_df, not from the
# inter_rat_filt_df that agreement_df was computed on, so the result may also
# cover judges removed by kick_out_judges — confirm this is intended.

get_kappa_groups(agreement_df, inter_rat_df[agreement_df['qid']])


[['model',
  't5',
  [[2, 3, 0.6869070208728653],
   [2, 14, 0.17021276595744683],
   [2, 18, -0.04469273743016755],
   [2, 19, 0.6278195488721805],
   [2, 20, 0.6927374301675977],
   [3, 14, 0.34782608695652173],
   [3, 18, 0.011235955056179914],
   [3, 19, 0.31119544592030357],
   [3, 20, 0.3820224719101124],
   [14, 18, 0.125],
   [14, 19, 0.042553191489361764],
   [14, 20, 0.25],
   [18, 19, -0.04469273743016755],
   [18, 20, 0.022222222222222254],
   [19, 20, 0.3240223463687151]],
  0.2602912673955448],
 ['source',
  'student',
  [[2, 3, 0.5211267605633803],
   [2, 14, 0.16049382716049376],
   [2, 18, -0.19718309859154926],
   [2, 19, 0.48484848484848486],
   [2, 20, 0.7605633802816901],
   [3, 14, 0.31081081081081074],
   [3, 18, -0.18055555555555558],
   [3, 19, 0.04225352112676062],
   [3, 20, 0.29166666666666663],
   [14, 18, 0.31081081081081074],
   [14, 19, 0.16049382716049376],
   [14, 20, 0.31081081081081074],
   [18, 19, -0.19718309859154926],
   [18, 20, 0.05555555555555

In [14]:
# The majority-agreement subset still results in a very low kappa score,
# so filter to the sentences that all remaining judges agreed on.
# This reassigns agreement_df, replacing the majority-based result.

agreement_df = everyone_agreed(inter_sent_df, inter_rat_filt_df, majority=False)
# majority=False was requested, so the label must not say "majority"
print("Number of sentences where all remaining judges agreed:\n")
print(agreement_df['source'].value_counts())


Number of sentences where majority is achieved:

medium     10
student     7
Name: source, dtype: int64


In [15]:
# Get kappa score for these sentences
# NOTE(review): the rating columns are taken from inter_rat_df, not from the
# inter_rat_filt_df that agreement_df was computed on, so the result may also
# cover judges removed by kick_out_judges — confirm this is intended.

get_kappa_groups(agreement_df, inter_rat_df[agreement_df['qid']])


[['model',
  't5',
  [[2, 3, 1.0],
   [2, 14, 0.48484848484848486],
   [2, 18, 0.04225352112676062],
   [2, 19, 1.0],
   [2, 20, 1.0],
   [3, 14, 0.48484848484848486],
   [3, 18, 0.04225352112676062],
   [3, 19, 1.0],
   [3, 20, 1.0],
   [14, 18, 0.04225352112676062],
   [14, 19, 0.48484848484848486],
   [14, 20, 0.48484848484848486],
   [18, 19, 0.04225352112676062],
   [18, 20, 0.04225352112676062],
   [19, 20, 1.0]],
  0.5433774363351829],
 ['source',
  'student',
  [[2, 3, 1.0],
   [2, 14, 0.72],
   [2, 18, -0.16666666666666674],
   [2, 19, 1.0],
   [2, 20, 1.0],
   [3, 14, 0.72],
   [3, 18, -0.16666666666666674],
   [3, 19, 1.0],
   [3, 20, 1.0],
   [14, 18, 0.15999999999999992],
   [14, 19, 0.72],
   [14, 20, 0.72],
   [18, 19, -0.16666666666666674],
   [18, 20, -0.16666666666666674],
   [19, 20, 1.0]],
  0.5582222222222222],
 ['source',
  'medium',
  [[2, 3, 1.0],
   [2, 14, 0.21052631578947367],
   [2, 18, 0.09090909090909094],
   [2, 19, 1.0],
   [2, 20, 1.0],
   [3, 14, 0.210

In [16]:
# Calculate F1 groups
# agreement_df at this point is the most recent assignment, i.e. the result
# of everyone_agreed(..., majority=False).

f1_groups = get_f1_groups(agreement_df)
print("F1-score among the majority rating of the remaining judges:\n")
f1_groups


F1-score among the majority rating of the remaining judges:



[0.3928571428571429,
 ['model', 't5', 0.3928571428571429],
 ['source', 'student', 0.3],
 ['source', 'medium', 0.4444444444444445]]