In [1]:
import os
import re
import pandas as pd
pd.set_option('chained_assignment', None)

In [2]:
# Directories of per-letter annotation CSVs: the gold standard and the
# output of each NER tool. Paths are relative to the notebook's location.
gold_standard_dir = '../mitford_letters_gs'
spacy_dir = '../spacy_mitford'
flair_dir = '../flair_mitford'
stanford_dir = '../stanford_parsed_mitford'
# os.listdir() returns names in arbitrary order; sort them so that
# letters[0] and the processing order are reproducible across runs/machines.
# NOTE(review): the file names found in spacy_dir are reused to look up the
# same letter in every other directory -- assumes all four directories
# contain identically named files; confirm.
letters = sorted(os.listdir(spacy_dir))

In [4]:
def match_function(t1, b1, e1, t2, b2, e2):
    """Decide whether two entity records describe the same mention.

    Parameters
    ----------
    t1, t2 : entity type of record 1 / record 2
    b1, b2 : begin character offset of record 1 / record 2 (inclusive)
    e1, e2 : end character offset of record 1 / record 2 (exclusive)

    Returns
    -------
    bool
        True when both records have the same entity type AND the
        intersection-over-union (Jaccard index) of their character spans
        is at least 0.5; False otherwise. Two empty spans never match.
    """
    # Different entity types can never match, so skip the span arithmetic.
    if t1 != t2:
        return False

    # Span lengths; an empty or reversed span (end <= begin) covers nothing,
    # which mirrors how range(b, e) behaves.
    len1 = max(0, e1 - b1)
    len2 = max(0, e2 - b2)

    # Number of character positions shared by the half-open intervals
    # [b1, e1) and [b2, e2).
    overlap = max(0, min(e1, e2) - max(b1, b2))
    union = len1 + len2 - overlap

    # Both spans empty: nothing to compare (previously a ZeroDivisionError).
    if union == 0:
        return False

    # bool() keeps the return type a plain Python bool even when the offsets
    # are numpy integers coming out of a DataFrame.
    return bool(overlap / union >= 0.5)

In [15]:
# Stanford NER
# Micro-averaged precision and recall over all letters: per-letter match
# counts are summed and divided once at the end.
precision_numerator = 0
precision_denominator = 0
recall_numerator = 0
recall_denominator = 0
for letter in letters:
    # Load Stanford NER output, dropping whitespace-only spans and bare
    # pronouns.
    # NOTE(review): presumably pronouns are removed because the gold standard
    # does not annotate them -- confirm.
    df = pd.read_csv(os.path.join(stanford_dir, letter))
    df = df.loc[~df['text'].str.isspace()]
    df = df.loc[~df['text'].isin(['he', 'she', 'him', 'her', 'his', 'hers',
                                 'they', 'us', 'it', 'we'])]
    df = df.reset_index()

    # Load gold standard NER
    gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter)).reset_index()

    # Rows are read positionally after reset_index():
    # [0] = index, [1] = entity type, [2] = begin offset, [3] = end offset.
    # NOTE(review): assumes every CSV stores its columns in that order.
    gold_rows = gs_df.values
    pred_rows = df.values

    # Recall: a gold record counts as found if any prediction matches it.
    # The column is assigned in one step instead of via chained
    # `frame['col'].loc[mask] = ...`, which is a silent no-op under pandas
    # Copy-on-Write.
    gs_df['has_match'] = pd.Series(
        [any(match_function(g[1], g[2], g[3], p[1], p[2], p[3]) for p in pred_rows)
         for g in gold_rows],
        index=gs_df.index, dtype=bool)
    recall_numerator += gs_df['has_match'].sum()
    recall_denominator += len(gs_df)

    # Precision: a prediction counts as correct if any gold record matches it.
    df['has_match'] = pd.Series(
        [any(match_function(p[1], p[2], p[3], g[1], g[2], g[3]) for g in gold_rows)
         for p in pred_rows],
        index=df.index, dtype=bool)
    precision_numerator += df['has_match'].sum()
    precision_denominator += len(df)

# These figures belong to Stanford NER (the labels previously said "Flair").
print('Stanford Precision: {}'.format(precision_numerator/precision_denominator))
print('Stanford Recall: {}'.format(recall_numerator/recall_denominator))

Flair Precision: 0.5259959486833221
Flair Recall: 0.4873341375150784


In [145]:
# Flair
# Micro-averaged precision and recall over all letters: per-letter match
# counts are summed and divided once at the end.
precision_numerator = 0
precision_denominator = 0
recall_numerator = 0
recall_denominator = 0
for letter in letters:
    # Load Flair NER output, dropping whitespace-only spans.
    flair_df = pd.read_csv(os.path.join(flair_dir, letter))
    flair_df = flair_df.loc[~flair_df['text'].str.isspace()]
    flair_df = flair_df.reset_index()

    # Load gold standard NER
    gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter)).reset_index()

    # Rows are read positionally after reset_index():
    # [0] = index, [1] = entity type, [2] = begin offset, [3] = end offset.
    # NOTE(review): assumes every CSV stores its columns in that order.
    gold_rows = gs_df.values
    pred_rows = flair_df.values

    # Recall: a gold record counts as found if any prediction matches it.
    # The column is assigned in one step instead of via chained
    # `frame['col'].loc[mask] = ...`, which is a silent no-op under pandas
    # Copy-on-Write.
    gs_df['has_match'] = pd.Series(
        [any(match_function(g[1], g[2], g[3], p[1], p[2], p[3]) for p in pred_rows)
         for g in gold_rows],
        index=gs_df.index, dtype=bool)
    recall_numerator += gs_df['has_match'].sum()
    recall_denominator += len(gs_df)

    # Precision: a prediction counts as correct if any gold record matches it.
    flair_df['has_match'] = pd.Series(
        [any(match_function(p[1], p[2], p[3], g[1], g[2], g[3]) for g in gold_rows)
         for p in pred_rows],
        index=flair_df.index, dtype=bool)
    precision_numerator += flair_df['has_match'].sum()
    precision_denominator += len(flair_df)

print('Flair Precision: {}'.format(precision_numerator/precision_denominator))
print('Flair Recall: {}'.format(recall_numerator/recall_denominator))

Flair Precision: 0.5352343493936415
Flair Recall: 0.5069360675512666


In [140]:
# spaCy
# Micro-averaged precision and recall over all letters: per-letter match
# counts are summed and divided once at the end.
precision_numerator = 0
precision_denominator = 0
recall_numerator = 0
recall_denominator = 0
for letter in letters:
    # Load spaCy NER output, dropping whitespace-only spans.
    spacy_df = pd.read_csv(os.path.join(spacy_dir, letter))
    spacy_df = spacy_df.loc[~spacy_df['text'].str.isspace()]
    spacy_df = spacy_df.reset_index()

    # Load gold standard NER
    gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter)).reset_index()

    # Rows are read positionally after reset_index():
    # [0] = index, [1] = entity type, [2] = begin offset, [3] = end offset.
    # NOTE(review): assumes every CSV stores its columns in that order.
    gold_rows = gs_df.values
    pred_rows = spacy_df.values

    # Recall: a gold record counts as found if any prediction matches it.
    # The column is assigned in one step instead of via chained
    # `frame['col'].loc[mask] = ...`, which is a silent no-op under pandas
    # Copy-on-Write.
    gs_df['has_match'] = pd.Series(
        [any(match_function(g[1], g[2], g[3], p[1], p[2], p[3]) for p in pred_rows)
         for g in gold_rows],
        index=gs_df.index, dtype=bool)
    recall_numerator += gs_df['has_match'].sum()
    recall_denominator += len(gs_df)

    # Precision: a prediction counts as correct if any gold record matches it.
    spacy_df['has_match'] = pd.Series(
        [any(match_function(p[1], p[2], p[3], g[1], g[2], g[3]) for g in gold_rows)
         for p in pred_rows],
        index=spacy_df.index, dtype=bool)
    precision_numerator += spacy_df['has_match'].sum()
    precision_denominator += len(spacy_df)

print('Spacy Precision: {}'.format(precision_numerator/precision_denominator))
print('Spacy Recall: {}'.format(recall_numerator/recall_denominator))

Spacy Precision: 0.47377384196185285
Spacy Recall: 0.43003618817852834


In [138]:
# Load the predicted entities for the first letter and display them.
# Whitespace-only spans are dropped; reset_index() keeps the original row
# number as an 'index' column.
# NOTE(review): the frame is named spacy_df but is read from flair_dir --
# confirm which tagger's output is meant to be inspected here.
letter = letters[0]
spacy_df = (
    pd.read_csv(os.path.join(flair_dir, letter))
    .loc[lambda frame: ~frame['text'].str.isspace()]
    .reset_index()
)
spacy_df

Unnamed: 0,index,tag,begin_offsets,end_offset,text
0,0,place,32,41,MarlowHow
1,1,person,115,125,John Keats
2,2,person,434,444,John Keats
3,3,person,570,577,Johnson
4,4,person,732,745,Mr. JohnsonMr
5,5,person,746,753,Johnson
6,6,person,871,884,Mr. Northmore
7,7,person,1651,1660,Northmore
8,8,person,1670,1677,Johnson
9,9,place,2135,2141,Marlow


In [130]:
# Gold-standard annotations for the file currently bound to `letter`;
# reset_index() exposes the row number as an 'index' column.
gs_df = pd.read_csv(os.path.join(gold_standard_dir, letter)).reset_index()
gs_df

Unnamed: 0,index,tag,begin_offset,end_offset,text,ref
0,0,place,13,26,Seymour Court,#SeymourCt
1,1,place,32,38,Marlow,#Marlow
2,2,person,115,125,John Keats,#Keats
3,3,person,435,445,John Keats,#Keats
4,4,person,569,580,Mr. Johnson,#Johnson_Mr
5,5,person,639,645,Sister,#Johnson_Miss
6,6,person,736,747,Mr. Johnson,#Johnson_Mr
7,7,person,747,758,Mr. Johnson,#Johnson_Mr
8,8,person,876,889,Mr. Northmore,#Northmore_Thos
9,9,person,1656,1669,Mr. Northmore,#Northmore_Thos


In [78]:
# Precision/recall for the single letter currently loaded in gs_df / spacy_df.
# Rows are read positionally:
# [0] = index, [1] = entity type, [2] = begin offset, [3] = end offset.
gold_rows = gs_df.values
pred_rows = spacy_df.values

# Recall: a gold record counts as found if any prediction matches it.
# The column is assigned in one step instead of via chained
# `frame['col'].loc[mask] = ...`, which is a silent no-op under pandas
# Copy-on-Write.
gs_df['has_match'] = pd.Series(
    [any(match_function(g[1], g[2], g[3], p[1], p[2], p[3]) for p in pred_rows)
     for g in gold_rows],
    index=gs_df.index, dtype=bool)
print(gs_df)
print('Recall: {}'.format(gs_df['has_match'].sum() / len(gs_df)))

# Precision: a prediction counts as correct if any gold record matches it.
# The gold record's own entity type (g[1]) is compared here; the previous
# code passed the prediction's type twice, so types were never checked.
spacy_df['has_match'] = pd.Series(
    [any(match_function(p[1], p[2], p[3], g[1], g[2], g[3]) for g in gold_rows)
     for p in pred_rows],
    index=spacy_df.index, dtype=bool)
print(spacy_df)
print('Precision: {}'.format(spacy_df['has_match'].sum() / len(spacy_df)))

    index     tag  begin_offset  end_offset           text              ref  \
0       0   place            13          26  Seymour Court       #SeymourCt   
1       1   place            32          38         Marlow          #Marlow   
2       2  person           115         125     John Keats           #Keats   
3       3  person           435         445     John Keats           #Keats   
4       4  person           569         580    Mr. Johnson      #Johnson_Mr   
5       5  person           639         645         Sister    #Johnson_Miss   
6       6  person           736         747    Mr. Johnson      #Johnson_Mr   
7       7  person           747         758    Mr. Johnson      #Johnson_Mr   
8       8  person           876         889  Mr. Northmore  #Northmore_Thos   
9       9  person          1656        1669  Mr. Northmore  #Northmore_Thos   
10     10  person          1675        1686    Mr. Johnson      #Johnson_Mr   
11     11   place          1829        1836        B