In [19]:
import pandas as pd

ner_class_map = {'Organization': 'ORG',
                 'Product': 'PRODUCT',
                 'Name': 'PERSON',
                 'Country': 'GPE',
                 'City': 'GPE'
                 }

exclude_ents = ['Email',
                'Card_number',
                'Phone_number',
                'Order_number',
                'URL'
                ]

def accuracy(tp, fp, ground_truth):
    """
    This metric represents the overall proportion of correct predictions.
    Calculates the accuracy of the predictions.

    Args:
        tp: Number of true positive predictions.
        fp: Number of false positive predictions.
        ground_truth: Number of positive instances in the ground truth.

    Returns:
        The accuracy as a float between 0 and 1.
    """
    total_positives = tp + fp
    # Handle division by zero
    if total_positives == 0: return 0  
    return (tp + ground_truth) / (total_positives + ground_truth)

def precision(tp, fp):
    """
    This metric measures the proportion of predicted 
    positives that are actually true positives.
    Calculates the precision of the predictions.

    Args:
        tp: Number of true positive predictions.
        fp: Number of false positive predictions.

    Returns:
        The precision as a float between 0 and 1.
    """
    total_positives = tp + fp
    # Handle division by zero
    if total_positives == 0: return 0  
    return tp / total_positives

def recall(tp, ground_truth):
    """
    This metric measures the proportion of actual positive 
    instances that were correctly identified as positive.
    Calculates the recall of the predictions.

    Args:
        tp: Number of true positive predictions.
        ground_truth: Number of positive instances in the ground truth.

    Returns:
        The recall as a float between 0 and 1.
    """
    # Handle division by zero (no positives in truth)
    if ground_truth == 0: return 1  
    return tp / ground_truth

def f1_score(precision, recall):
    """
    This is a widely used metric that is the harmonic 
    mean of precision and recall. It penalizes models 
    that favor high precision or high recall at the 
    expense of the other.
    Calculates the F1-score of the predictions.

    Args:
        precision: Precision of the predictions.
        recall: Recall of the predictions.

    Returns:
        The F1-score as a float between 0 and 1.
    """
    if precision + recall == 0:
        return 0  # Handle division by zero
    return 2 * (precision * recall) / (precision + recall)

df = pd.read_csv('data/input_data.csv')
human_items = df['ner_data'].to_list()
spacy_items = df['spacy_en_core_web_lg'].to_list()
uni_ner_items = df['universal_ner'].to_list()
data_items = []
for human_item, spacy_item, uni_ner_item in zip(human_items, spacy_items, uni_ner_items):
    human_ents = eval(human_item)
    spacy_ents = eval(spacy_item)
    uni_ner_ents = eval(uni_ner_item)
    human_possible_ents = len(human_ents)
    spacy_found_ents = 0
    spacy_false_positives = 0
    uni_ner_found_ents = 0
    uni_ner_false_positives = 0
    for human_ent in human_ents:
        if human_ent['entity_type'] not in exclude_ents:
            mapped_ent = ner_class_map[human_ent['entity_type']]
            human_ent['entity_type'] = mapped_ent
            
    for spacy_ent in spacy_ents:
        spacy_ent['span'] = (spacy_ent['span'][0], spacy_ent['span'][1] - 1)
        if spacy_ent in human_ents:
            spacy_found_ents += 1
        else:
            spacy_false_positives += 1

    for uni_ner_ent in uni_ner_ents:
        uni_ner_ent['span'] = (uni_ner_ent['span'][0], uni_ner_ent['span'][1] - 1)
        if uni_ner_ent in human_ents:
            uni_ner_found_ents += 1
        else:
            uni_ner_false_positives += 1

    spacy_accuracy = accuracy(spacy_found_ents, spacy_false_positives, human_possible_ents)
    spacy_precision = precision(spacy_found_ents, spacy_false_positives)
    spacy_recall = recall(spacy_found_ents, human_possible_ents)
    spacy_f1 = f1_score(spacy_precision, spacy_recall)
    uni_ner_accuracy = accuracy(uni_ner_found_ents, uni_ner_false_positives, human_possible_ents)
    uni_ner_precision = precision(uni_ner_found_ents, uni_ner_false_positives)
    uni_ner_recall = recall(uni_ner_found_ents, human_possible_ents)
    uni_ner_f1 = f1_score(uni_ner_precision, uni_ner_recall)

    data = {"human_possible_ents": human_possible_ents,
            "spacy_found_ents": spacy_found_ents,
            "spacy_false_positives": spacy_false_positives,
            "uni_ner_found_ents": uni_ner_found_ents,
            "uni_ner_false_positives": uni_ner_false_positives,
            "spacy_accuracy": spacy_accuracy,
            "spacy_precision": spacy_precision,
            "spacy_recall": spacy_recall,
            "spacy_f1": spacy_f1,
            "uni_ner_accuracy": uni_ner_accuracy,
            "uni_ner_precision": uni_ner_precision,
            "uni_ner_recall": uni_ner_recall,
            "uni_ner_f1": uni_ner_f1
            }
    data_items.append(data)
df2 = pd.DataFrame(data_items)
output_df = pd.concat([df, df2], axis = 1)
output_df.to_csv('ner_eval_output.csv', index = False)
output_df.head(5)
# Get averages:
print(round(output_df['spacy_accuracy'].mean(), 3))
print(round(output_df['spacy_precision'].mean(), 3))
print(round(output_df['spacy_recall'].mean(), 3))
print(round(output_df['spacy_f1'].mean(), 3))
print(round(output_df['uni_ner_accuracy'].mean(), 3))
print(round(output_df['uni_ner_precision'].mean(), 3))
print(round(output_df['uni_ner_recall'].mean(), 3))
print(round(output_df['uni_ner_f1'].mean(), 3)) 

Unnamed: 0,title,external_id,message,ner_data,spacy_en_core_web_lg,universal_ner,human_possible_ents,spacy_found_ents,spacy_false_positives,uni_ner_found_ents,uni_ner_false_positives,spacy_accuracy,spacy_precision,spacy_recall,spacy_f1,uni_ner_accuracy,uni_ner_precision,uni_ner_recall,uni_ner_f1
0,DM Feedback: BLT3 2 2.6.16 Mac OS X 10.13.6 Ne...,57158_7143758_incoming.txt,"I downloaded StreamSync to try it. However, th...","[{'entity_type': 'Product', 'entity': 'StreamS...","[{'entity_type': 'ORG', 'entity': 'StreamSync'...","[{'entity_type': 'PERSON', 'entity': 'Alex W.'...",2,0,2,2,0,0.5,0.0,0.0,0.0,1.0,1.0,1.0,1.0
1,Re: DM Feedback: BLT3 2 2.6.16 Mac OS X 10.13....,57158_7169606_incoming.txt,"Hi, that was my question. Do I have to connect...",[],[],"[{'entity_type': 'PRODUCT', 'entity': 'compute...",0,0,0,0,2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,BLT3 [Mac],57654_8598982_incoming.txt,I did a clean install and can't find my regist...,[],[],"[{'entity_type': 'PRODUCT', 'entity': 'registr...",0,0,0,0,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,BLT3 [Mac],58066_8513618_incoming.txt,"Once imported to my iPod, how do I change the ...","[{'entity_type': 'Product', 'entity': 'iPod', ...","[{'entity_type': 'PRODUCT', 'entity': 'iPod', ...","[{'entity_type': 'CARDINAL', 'entity': 'number...",1,1,0,1,4,1.0,1.0,1.0,1.0,0.333333,0.2,1.0,0.333333
4,RE: Risk Analysis issue [ ref:_00D80dhNH._5...,60647_7491265_incoming.txt,"Hello,\n\nThe customers order are going throug...",[],[],"[{'entity_type': 'MONEY', 'entity': 'charges',...",0,0,0,0,4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
