In [38]:
import os, csv

data = []
observations = []
sentences = []
current_sentence = []
length = 0

for entry in os.scandir('.'):
    if entry.path.endswith('.csv'):
        
        with open(entry, 'r') as fin:
            reader = csv.reader(fin)
            for i, row in enumerate(reader):
                if i == 0: continue
                
                if row[0] == '':
                    if current_sentence:
                        sentences.append(current_sentence)
                        length = 0
                    current_sentence = []
                else:
                    if row[2]: data.append([row[0], row[2], row[3], len(sentences), length])
                    length += len(row[0])
                    current_sentence.append(row[0])
                    
                    if (row[1] and row[2]) or (row[5] and row[6]) or (row[10] and row[11]):
                        if row[3] == '': row[3] = row[2]
                        if row[7] == '': row[7] = row[6]
                        if row[12] == '': row[12] = row[11]
                        row = [x.lower() for x in row]
                        observations.append([row[2:4], row[6:8], row[11:13]])

        if current_sentence:
            sentences.append(current_sentence)
            
print(observations[:10])
print(len(observations))

[[['experiencer', 'agent'], ['experiencer', 'agent'], ['experiencer', 'agent']], [['theme', 'theme'], ['theme', 'theme'], ['gestalt', 'gestalt']], [['gestalt', 'gestalt'], ['gestalt', 'gestalt'], ['gestalt', 'gestalt']], [['endtime', 'interval'], ['duration', 'endtime'], ['duration', 'duration']], [['possessor', 'locus'], ['possessor', 'possessor'], ['possessor', 'locus']], [['purpose', 'purpose'], ['purpose', 'purpose'], ['purpose', 'purpose']], [['gestalt', 'gestalt'], ['gestalt', 'gestalt'], ['gestalt', 'gestalt']], [['topic', 'topic '], ['topic', 'topic'], ['topic', 'topic ']], [['recipient', 'recipient'], ['recipient', 'recipient'], ['recipient', 'recipient']], [['agent', 'ancillary'], ['agent', 'ancillary'], ['agent', 'ancillary']]]
3036


In [39]:
snacs = [
    'Agent',
    'Ancillary',
    'Approximator',
    'Beneficiary',
    'Causer',
    'Characteristic',
    'Circumstance',
    'ComparisonRef',
    'Cost',
    'Direction',
    'Duration',
    'EndTime',
    'Ensemble',
    'Experiencer',
    'Explanation',
    'Extent',
    'Frequency',
    'Gestalt',
    'Goal',
    'Identity',
    'Instrument',
    'Interval',
    'Locus',
    'Manner',
    'Means',
    'NONSNACS',
    'Org',
    'OrgMember',
    'Originator',
    'PartPortion',
    'Path',
    'Possession',
    'Possessor',
    'Purpose',
    'QuantityItem',
    'QuantityValue',
    'RateUnit',
    'Recipient',
    'SocialRel',
    'Source',
    'Species',
    'StartTime',
    'Stimulus',
    'Stuff',
    'Temporal',
    'Theme',
    'Time',
    'Topic',
    'Whole',
    '_',
    '`$',
    '`c',
    '`d',
    '`i',
    '`j',
    'Focus'
]
snacs = [x.lower() for x in snacs]

In [40]:
snacs[:3]

['agent', 'ancillary', 'approximator']

In [41]:
scene = [[y[0] for y in x] for x in observations]
print(scene[:5])
function = [[y[1] for y in x] for x in observations]
print(function[:5])

[['experiencer', 'experiencer', 'experiencer'], ['theme', 'theme', 'gestalt'], ['gestalt', 'gestalt', 'gestalt'], ['endtime', 'duration', 'duration'], ['possessor', 'possessor', 'possessor']]
[['agent', 'agent', 'agent'], ['theme', 'theme', 'gestalt'], ['gestalt', 'gestalt', 'gestalt'], ['interval', 'endtime', 'duration'], ['locus', 'possessor', 'locus']]


# Cohen's *κ*

The following has the calculations for interannotator agreement (Cohen's *κ* metric).

In [73]:
from sklearn.metrics import cohen_kappa_score

scene = [['??' if y not in snacs else y for y in x] for x in scene]
function = [['??' if y not in snacs else y for y in x] for x in function]

print(len(scene), len(function))
print('Scene Kappa', cohen_kappa_score([x[1] for x in scene], [x[2] for x in scene]))
print('Function Kappa', cohen_kappa_score([x[1] for x in function], [x[2] for x in function]))
print('Adj == Aryaman, != Nitin', len([1 for x in scene if x[0] == x[1] and x[0] != x[2]]) / len(scene))
print('Adj == Nitin, != Aryaman', len([1 for x in scene if x[0] == x[2] and x[0] != x[1]]) / len(scene))

print('\n+ Ignoring Focus and NONSNACS/`d (introduced only by 2nd annotator, all blank for 1st):')
scene2 = [x for x in scene if x[0] not in ('focus', 'nonsnacs', '`d')]
function2 = [x for x in function if x[0] not in ('focus', 'nonsnacs', '`d')]
print(len(scene2), len(function2))
print('Scene Kappa', cohen_kappa_score([x[1] for x in scene2], [x[2] for x in scene2]))
print('Function Kappa', cohen_kappa_score([x[1] for x in function2], [x[2] for x in function2]))
print('Adj == Aryaman, != Nitin', len([1 for x in scene2 if x[0] == x[1] and x[0] != x[2]]) / len(scene2))
print('Adj == Nitin, != Aryaman', len([1 for x in scene2 if x[0] == x[2] and x[0] != x[1]]) / len(scene2))

print('\n+ Only Adjudicated (adjudicated column == valid snacs label):')
scene2 = [x for x in scene2 if x[0] != '??']
function2 = [x for x in function2 if x[0] != '??']
print(len(scene2), len(function2))
print('Scene Kappa', cohen_kappa_score([x[1] for x in scene2], [x[2] for x in scene2]))
print('Function Kappa', cohen_kappa_score([x[1] for x in function2], [x[2] for x in function2]))
print('Adj == Aryaman, != Nitin', len([1 for x in scene2 if x[0] == x[1] and x[0] != x[2]]) / len(scene2))
print('Adj == Nitin, != Aryaman', len([1 for x in scene2 if x[0] == x[2] and x[0] != x[1]]) / len(scene2))

3036 3036
Scene Kappa 0.6014073337036525
Function Kappa 0.6539564674371567
Adj == Aryaman, != Nitin 0.10507246376811594
Adj == Nitin, != Aryaman 0.24044795783926218

+ Ignoring Focus and NONSNACS/`d (introduced only by 2nd annotator, all blank for 1st):
2571 2571
Scene Kappa 0.712475345191219
Function Kappa 0.775359140113864
Adj == Aryaman, != Nitin 0.1073512252042007
Adj == Nitin, != Aryaman 0.1306884480746791

+ Only Adjudicated (adjudicated column == valid snacs label):
2456 2473
Scene Kappa 0.7456958064309412
Function Kappa 0.8067077657563136
Adj == Aryaman, != Nitin 0.10586319218241043
Adj == Nitin, != Aryaman 0.10952768729641693


In [53]:
[x for x in scene if x[1] != x[2]][:20]

[['theme', 'theme', 'gestalt'],
 ['??', 'quantityitem', 'gestalt'],
 ['focus', '??', 'focus'],
 ['`d', '??', '`d'],
 ['`d', '??', '`d'],
 ['??', 'locus', 'whole'],
 ['gestalt', 'gestalt', 'org'],
 ['extent', '??', 'extent'],
 ['`d', '??', '`d'],
 ['`d', '??', '`d'],
 ['comparisonref', 'comparisonref', 'circumstance'],
 ['causer', 'explanation', 'causer'],
 ['focus', '??', 'focus'],
 ['topic', 'topic', 'gestalt'],
 ['comparisonref', 'time', 'comparisonref'],
 ['circumstance', 'manner', 'circumstance'],
 ['??', '??', '`d'],
 ['`d', '??', '`d'],
 ['focus', '??', 'focus'],
 ['focus', '??', 'focus']]

In [54]:
[x for x in scene2 if x[1] != x[2]][:20]

[['theme', 'theme', 'gestalt'],
 ['gestalt', 'gestalt', 'org'],
 ['extent', '??', 'extent'],
 ['comparisonref', 'comparisonref', 'circumstance'],
 ['causer', 'explanation', 'causer'],
 ['topic', 'topic', 'gestalt'],
 ['comparisonref', 'time', 'comparisonref'],
 ['circumstance', 'manner', 'circumstance'],
 ['characteristic', 'characteristic', 'experiencer'],
 ['stimulus', 'stimulus', 'topic'],
 ['originator', '??', 'originator'],
 ['topic', 'topic', 'theme'],
 ['originator', 'recipient', 'originator'],
 ['extent', '??', 'extent'],
 ['manner', '??', 'manner'],
 ['experiencer', 'experiencer', 'agent'],
 ['agent', 'possessor', 'experiencer'],
 ['explanation', 'theme', 'causer'],
 ['theme', '??', 'theme'],
 ['theme', 'theme', 'purpose']]

0.7154150197628458
0.8507905138339921
