In [65]:
import os
import pandas as pd
# Set pandas' chained-assignment option to None, which turns off the
# chained-assignment warning for the whole notebook.
# NOTE(review): this only hides the warning; writes of the form
# df['col'].loc[mask] = value may still land on a temporary copy. Prefer
# df.loc[mask, 'col'] = value so the option is not needed.
pd.set_option('chained_assignment', None)

In [8]:
# Directories with one annotation CSV per letter. A letter's gold-standard file
# and its spaCy file share the same file name.
gold_standard_dir = '../mitford_letters_gs'
spacy_dir = '../spacy_mitford'
# os.listdir returns entries in arbitrary order; sort them so that letters[0]
# and the processing order are the same on every run and every machine.
letters = sorted(os.listdir(spacy_dir))

In [58]:
def match_function(t1, b1, e1, t2, b2, e2, threshold=0.5):
    """Decide whether two entity records describe the same mention.

    Two records match when they have the same entity type and the number of
    characters they share, divided by the number of characters covered by
    either of them, is at least `threshold`.

    Parameters
    ----------
    t1 : entity type of record 1
    b1 : begin offset of record 1 (inclusive)
    e1 : end offset of record 1 (exclusive)
    t2 : entity type of record 2
    b2 : begin offset of record 2 (inclusive)
    e2 : end offset of record 2 (exclusive)
    threshold : minimum overlap ratio required for a match (default 0.5)

    Returns
    -------
    bool
        True if the records match, False otherwise. A pair in which both
        spans are empty never matches (there is nothing to overlap).
    """
    # Different entity types can never match, so skip the span arithmetic.
    if t1 != t2:
        return False

    # Length of each span; an inverted span (end <= begin) covers no characters,
    # exactly like range(begin, end) would.
    len1 = max(0, e1 - b1)
    len2 = max(0, e2 - b2)

    # Both spans are contiguous, so their shared part is the interval
    # [max(begins), min(ends)); no need to materialise character sets.
    overlap = max(0, min(e1, e2) - max(b1, b2)) if len1 and len2 else 0
    union = len1 + len2 - overlap

    # Two empty spans would otherwise divide by zero.
    if union == 0:
        return False

    return bool(overlap / union >= threshold)

In [66]:
# Score spaCy's NER output against the gold standard, one letter at a time.
# Rows are read positionally from .values; after reset_index() the layout is
# [0]=index, [1]=tag, [2]=begin offset, [3]=end offset.
letter_scores = []
for letter in letters:
    # Load spaCy NER
    spacy_df = pd.read_csv(os.path.join(spacy_dir, letter))
    spacy_df.reset_index(inplace=True)

    # Load gold standard NER (same file name, different directory)
    gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter))
    gs_df.reset_index(inplace=True)

    # Snapshot the rows once; building .values inside the loops is wasted work.
    gs_records = gs_df.values
    spacy_records = spacy_df.values

    # Recall: share of gold-standard entities that spaCy also found.
    gs_df['has_match'] = False
    for record1 in gs_records:
        for record2 in spacy_records:
            if match_function(record1[1], record1[2], record1[3], record2[1], record2[2], record2[3]):
                # Single .loc call on the frame itself: a chained
                # df['col'].loc[...] = ... may write to a temporary copy.
                gs_df.loc[gs_df['index'] == record1[0], 'has_match'] = True
                break  # one match is enough for this record
    # A letter without gold-standard entities has no defined recall.
    recall = gs_df['has_match'].sum() / len(gs_df) if len(gs_df) else float('nan')

    # Precision: share of spaCy entities that are confirmed by the gold standard.
    spacy_df['has_match'] = False
    for record1 in spacy_records:
        for record2 in gs_records:
            if match_function(record1[1], record1[2], record1[3], record2[1], record2[2], record2[3]):
                spacy_df.loc[spacy_df['index'] == record1[0], 'has_match'] = True
                break  # one match is enough for this record
    # A letter without spaCy entities has no defined precision.
    precision = spacy_df['has_match'].sum() / len(spacy_df) if len(spacy_df) else float('nan')

    # Keep every letter's scores instead of overwriting them on each iteration.
    letter_scores.append({'letter': letter, 'precision': precision, 'recall': recall})

# One row per letter with its precision and recall.
scores_df = pd.DataFrame(letter_scores, columns=['letter', 'precision', 'recall'])

In [76]:
# Look at the spaCy annotations of the first letter. reset_index() turns the
# row number into an explicit 'index' column.
letter = letters[0]
spacy_df = pd.read_csv(os.path.join(spacy_dir, letter)).reset_index()
spacy_df.head()

Unnamed: 0,index,tag,begin_offsets,end_offset,text
0,0,place,32,41,MarlowHow
1,1,person,115,125,John Keats
2,2,person,435,450,John Keats!--Do
3,3,person,573,580,Johnson
4,4,person,740,749,JohnsonMr


In [77]:
# Gold-standard annotations for the same letter, again with the row number
# exposed as an explicit 'index' column.
gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter)).reset_index()
gs_df.head()

Unnamed: 0,index,tag,begin_offset,end_offset,text,ref
0,0,place,13,26,Seymour Court,#SeymourCt
1,1,place,32,38,Marlow,#Marlow
2,2,person,115,125,John Keats,#Keats
3,3,person,435,445,John Keats,#Keats
4,4,person,569,580,Mr. Johnson,#Johnson_Mr


In [78]:
# Recall and precision for the single letter loaded above.
# Rows are read positionally from .values:
# [0]=index, [1]=tag, [2]=begin offset, [3]=end offset.
gs_records = gs_df.values
spacy_records = spacy_df.values

# Recall: share of gold-standard entities that spaCy also found.
gs_df['has_match'] = False
for record1 in gs_records:
    for record2 in spacy_records:
        if match_function(record1[1], record1[2], record1[3], record2[1], record2[2], record2[3]):
            # Single .loc call on the frame itself: a chained
            # df['col'].loc[...] = ... may write to a temporary copy.
            gs_df.loc[gs_df['index'] == record1[0], 'has_match'] = True
            break  # one match is enough for this record
print(gs_df)
print('Recall: {}'.format(gs_df['has_match'].sum() / len(gs_df)))

# Precision: share of spaCy entities that are confirmed by the gold standard.
spacy_df['has_match'] = False
for record1 in spacy_records:
    for record2 in gs_records:
        # Compare against the gold-standard record's own tag (record2[1]).
        # Passing record1[1] twice would make the entity-type check always
        # succeed and overstate precision.
        if match_function(record1[1], record1[2], record1[3], record2[1], record2[2], record2[3]):
            spacy_df.loc[spacy_df['index'] == record1[0], 'has_match'] = True
            break  # one match is enough for this record
print(spacy_df)
print('Precision: {}'.format(spacy_df['has_match'].sum() / len(spacy_df)))

    index     tag  begin_offset  end_offset           text              ref  \
0       0   place            13          26  Seymour Court       #SeymourCt   
1       1   place            32          38         Marlow          #Marlow   
2       2  person           115         125     John Keats           #Keats   
3       3  person           435         445     John Keats           #Keats   
4       4  person           569         580    Mr. Johnson      #Johnson_Mr   
5       5  person           639         645         Sister    #Johnson_Miss   
6       6  person           736         747    Mr. Johnson      #Johnson_Mr   
7       7  person           747         758    Mr. Johnson      #Johnson_Mr   
8       8  person           876         889  Mr. Northmore  #Northmore_Thos   
9       9  person          1656        1669  Mr. Northmore  #Northmore_Thos   
10     10  person          1675        1686    Mr. Johnson      #Johnson_Mr   
11     11   place          1829        1836        B