In [3]:
from flair.data import Sentence
from flair.models import SequenceTagger
import pandas as pd     
import ast
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, f2

In [4]:
# load tagger
tagger = SequenceTagger.load("flair/ner-english")

# make example sentence
sentence = Sentence("George Washington went to Washington")

# predict NER tags
tagger.predict(sentence)

print(sentence)

# predicted NER spans
print('The following NER tags are found:')
# iterate over entities 
for entity in sentence.get_spans('ner'):
    print(entity)

2023-12-01 15:40:11,058 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence[5]: "George Washington went to Washington" → ["George Washington"/PER, "Washington"/LOC]
The following NER tags are found:
Span[0:2]: "George Washington" → PER (0.9985)
Span[4:5]: "Washington" → LOC (0.9706)


In [5]:
model = SequenceTagger.load('flair/ner-english')

2023-12-01 15:40:12,897 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [6]:
def tag_text(text, model):
    sentence = Sentence(text)
    model.predict(sentence)
    return [(entity.text, entity.tag) for entity in sentence.get_spans('ner')]

In [7]:
df_english = pd.read_json("../../data/dataset_english.json")
df_english

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has..."
...,...,...,...,...,...,...
43496,"Hello [FIRSTNAME_1], your cognitive therapy ap...","Hello Nellie, your cognitive therapy appointme...","{'[FIRSTNAME_1]': 'Nellie', '[DATE_1]': '8/21'...","[[0, 6, O], [6, 12, FIRSTNAME_1], [12, 66, O],...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-DAT...","[hello, nellie, ,, your, cognitive, therapy, a..."
43497,"Dear [FIRSTNAME_1], we appreciate your active ...","Dear Jalon, we appreciate your active involvem...","{'[FIRSTNAME_1]': 'Jalon', '[CREDITCARDNUMBER_...","[[0, 5, O], [5, 10, FIRSTNAME_1], [10, 159, O]...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, ja, ##lon, ,, we, appreciate, your, act..."
43498,"Dear [SEX_1] at [ZIPCODE_1], we are raising fu...","Dear Female at 32363-2779, we are raising fund...","{'[SEX_1]': 'Female', '[ZIPCODE_1]': '32363-27...","[[0, 5, O], [5, 11, SEX_1], [11, 15, O], [15, ...","[O, B-SEX, O, B-ZIPCODE, I-ZIPCODE, I-ZIPCODE,...","[dear, female, at, 323, ##6, ##3, -, 277, ##9,..."
43499,"Hello [FIRSTNAME_1], we encourage you to pay t...","Hello Tito, we encourage you to pay the fees o...","{'[FIRSTNAME_1]': 'Tito', '[ETHEREUMADDRESS_1]...","[[0, 6, O], [6, 10, FIRSTNAME_1], [10, 137, O]...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O,...","[hello, tito, ,, we, encourage, you, to, pay, ..."


In [13]:
df_small = df_english.head(500)

In [14]:
# function to check if specific tags are present in a row.
def check_data_tags(row, tags):
    return any(tag in str(row) for tag in tags)

# 'PER_flag', 'LOC_flag', and 'ORG_flag' will be set to True if the respective tags are found in the 'span_labels' column
df_small['PER_flag'] = df_small['span_labels'].apply(lambda row: check_data_tags(row, ['FIRSTNAME_', 'LASTNAME_', 'MIDDLENAME_']))
df_small['LOC_flag'] = df_small['span_labels'].apply(lambda row: check_data_tags(row, ['STREET_', 'STATE_', 'CITY_', 'COUNTRY_']))
df_small['ORG_flag'] = df_small['span_labels'].apply(lambda row: check_data_tags(row, ['COMPANYNAME_']))

df_small[['span_labels', 'PER_flag', 'LOC_flag', 'ORG_flag']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['PER_flag'] = df_small['span_labels'].apply(lambda row: check_data_tags(row, ['FIRSTNAME_', 'LASTNAME_', 'MIDDLENAME_']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['LOC_flag'] = df_small['span_labels'].apply(lambda row: check_data_tags(row, ['STREET_', 'STATE_', 'CITY_', 'COUNTRY_']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

Unnamed: 0,span_labels,PER_flag,LOC_flag,ORG_flag
0,"[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...",False,False,False
1,"[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...",True,False,False
2,"[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...",True,False,False
3,"[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...",False,False,False
4,"[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...",False,False,False
...,...,...,...,...
495,"[[0, 110, O], [110, 118, JOBTYPE_1], [118, 162...",False,False,False
496,"[[0, 67, O], [67, 93, URL_1], [93, 133, O], [1...",False,False,False
497,"[[0, 39, O], [39, 48, USERNAME_1], [48, 59, O]...",False,False,False
498,"[[0, 98, O], [98, 124, JOBTITLE_1], [124, 226,...",False,False,False


In [15]:
df_small['tagged_entities'] = df_small['unmasked_text'].apply(lambda text: tag_text(text, model))
df_small

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['tagged_entities'] = df_small['unmasked_text'].apply(lambda text: tag_text(text, model))


Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,PER_flag,LOC_flag,ORG_flag,tagged_entities
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de...",False,False,False,[]
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ...",True,False,False,"[(78B5R2MVFAHJ48500, MISC)]"
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ...",True,False,False,"[(Kattie, PER)]"
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ...",False,False,False,[]
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has...",False,False,False,"[(Y2rWliOhf8Ir, MISC)]"
...,...,...,...,...,...,...,...,...,...,...
495,A webinar discussing the latest global health ...,A webinar discussing the latest global health ...,"{'[JOBTYPE_1]': 'Director', '[IPV6_1]': '0049:...","[[0, 110, O], [110, 118, JOBTYPE_1], [118, 162...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[a, web, ##ina, ##r, discussing, the, latest, ...",False,False,False,[]
496,"Due to recent policy changes, we request all u...","Due to recent policy changes, we request all u...","{'[URL_1]': 'https://constant-emery.com', '[US...","[[0, 67, O], [67, 93, URL_1], [93, 133, O], [1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-URL,...","[due, to, recent, policy, changes, ,, we, requ...",False,False,False,"[(constant-emery.com, MISC), (Bashirian, MISC)]"
497,Subject: Securities law violation\nUser [USERN...,Subject: Securities law violation\nUser Rashee...,"{'[USERNAME_1]': 'Rasheed28', '[ACCOUNTNAME_1]...","[[0, 39, O], [39, 48, USERNAME_1], [48, 59, O]...","[O, O, O, O, O, O, B-USERNAME, I-USERNAME, I-U...","[subject, :, securities, law, violation, user,...",False,False,False,[]
498,"Due to reports of cyberbullying, we have decid...","Due to reports of cyberbullying, we have decid...","{'[JOBTITLE_1]': 'Product Mobility Assistant',...","[[0, 98, O], [98, 124, JOBTITLE_1], [124, 226,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[due, to, reports, of, cyber, ##bu, ##lly, ##i...",False,False,False,[]


In [16]:
def check_model_tags(row, entity_type):
    #if the flag for this entity type is True in the current row
    if row[entity_type + '_flag']:
        # If flag is True -> check if the entity type is present in the tagged entities
        return any(entity_type in tag for _, tag in row['tagged_entities'])
    # If the flag is False, return False as this entity type is not expected in this row
    return False

# 'PER_tag', 'LOC_tag', 'ORG_tag' will be True if the respective entity type is correctly identified by the model
df_small['PER_tag'] = df_small.apply(lambda row: check_model_tags(row, 'PER'), axis=1)
df_small['LOC_tag'] = df_small.apply(lambda row: check_model_tags(row, 'LOC'), axis=1)
df_small['ORG_tag'] = df_small.apply(lambda row: check_model_tags(row, 'ORG'), axis=1)

df_small[['span_labels', 'PER_flag', 'LOC_flag', 'ORG_flag', 'tagged_entities', 'PER_tag', 'LOC_tag', 'ORG_tag']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['PER_tag'] = df_small.apply(lambda row: check_model_tags(row, 'PER'), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['LOC_tag'] = df_small.apply(lambda row: check_model_tags(row, 'LOC'), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small['ORG_tag'] = df_small.

Unnamed: 0,span_labels,PER_flag,LOC_flag,ORG_flag,tagged_entities,PER_tag,LOC_tag,ORG_tag
0,"[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...",False,False,False,[],False,False,False
1,"[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...",True,False,False,"[(78B5R2MVFAHJ48500, MISC)]",False,False,False
2,"[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...",True,False,False,"[(Kattie, PER)]",True,False,False
3,"[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...",False,False,False,[],False,False,False
4,"[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...",False,False,False,"[(Y2rWliOhf8Ir, MISC)]",False,False,False
...,...,...,...,...,...,...,...,...
495,"[[0, 110, O], [110, 118, JOBTYPE_1], [118, 162...",False,False,False,[],False,False,False
496,"[[0, 67, O], [67, 93, URL_1], [93, 133, O], [1...",False,False,False,"[(constant-emery.com, MISC), (Bashirian, MISC)]",False,False,False
497,"[[0, 39, O], [39, 48, USERNAME_1], [48, 59, O]...",False,False,False,[],False,False,False
498,"[[0, 98, O], [98, 124, JOBTITLE_1], [124, 226,...",False,False,False,[],False,False,False


In [29]:
def calculate_and_print_metrics(df, entity_type):
    y_true = df[entity_type + '_flag']
    y_pred = df[entity_type + '_tag']

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    f2 = (5 * precision * recall) / ((4 * precision) + recall) if (precision + recall) != 0 else 0


    print(f"Metrics for {entity_type}:")
    print(f"  Accuracy:  {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall:    {recall:.2f}")
    print(f"  F1 Score:  {f1:.2f}")
    print(f"  F2 Score:  {f2:.2f}")
    print("------------------------------------------------")

calculate_and_print_metrics(df_small, 'PER')
calculate_and_print_metrics(df_small, 'LOC')
calculate_and_print_metrics(df_small, 'ORG')

Metrics for PER:
  Accuracy:  0.93
  Precision: 1.00
  Recall:    0.82
  F1 Score:  0.90
  F2 Score:  0.85
------------------------------------------------
Metrics for LOC:
  Accuracy:  0.97
  Precision: 1.00
  Recall:    0.83
  F1 Score:  0.90
  F2 Score:  0.86
------------------------------------------------
Metrics for ORG:
  Accuracy:  0.98
  Precision: 1.00
  Recall:    0.63
  F1 Score:  0.78
  F2 Score:  0.68
------------------------------------------------
