In [1]:
# Sources: https://github.com/explosion/spacy-stanza
#          https://www.kaggle.com/code/curiousprogrammer/entity-extraction-and-classification-using-spacy

In [1]:
import stanza
import spacy_stanza
from spacy import displacy
from spacy import Language
import pandas as pd

In [2]:
# Language whose models are downloaded and loaded below.
language_code = "en"

# Make sure the stanza models for that language are available locally.
stanza.download(language_code)

# Build the pipeline object used by the rest of the notebook.
nlp = spacy_stanza.load_pipeline(language_code)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-07 16:47:55 INFO: Downloading default packages for language: en (English) ...
2023-12-07 16:47:56 INFO: File exists: C:\Users\Franziska\stanza_resources\en\default.zip
2023-12-07 16:48:02 INFO: Finished downloading models and saved to C:\Users\Franziska\stanza_resources.
2023-12-07 16:48:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-07 16:48:05 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-07 16:48:05 INFO: Using device: cpu
2023-12-07 16:48:05 INFO: Loading: tokenize
2023-12-07 16:48:05 INFO: Loading: pos
2023-12-07 16:48:06 INFO: Loading: lemma
2023-12-07 16:48:06 INFO: Loading: constituency
2023-12-07 16:48:06 INFO: Loading: depparse
2023-12-07 16:48:07 INFO: Loading: sentiment
2023-12-07 16:48:07 INFO: Loading: ner
2023-12-07 16:48:08 INFO: Done loading processors!


In [9]:
# Sample text run through the pipeline.
sample_text = "Barack Obama was born in Hawaii. He was elected president in 2008. Hi IBAN is 123678463. Obamas favourite City is New York."
# Alternative sample text:
#sample_text = "Your confirmation number is 67768945, and amount due is 0.001b. Contact us at @Francis.Schinner94.com for clarifications."
doc = nlp(sample_text)

# One line per token: text, pos_, ent_type_ (empty string when the token carries no entity label) and tag_.
for tok in doc:
    # Fuller variant: print(tok.text, tok.lemma_, tok.pos_, tok.dep_, tok.ent_type_, tok.tag_)
    print(tok.text, tok.pos_, tok.ent_type_, tok.tag_)

# Separator line, then the entity spans found in the document.
print("-------------------------------------------------------")
print(doc.ents)

Custom component called: Barack Obama was born in Hawaii. He was elected president in 2008. Hi IBAN is 123678463. Obamas favourite City is New York.
Barack PROPN PERSON NNP
Obama PROPN PERSON NNP
was AUX  VBD
born VERB  VBN
in ADP  IN
Hawaii PROPN GPE NNP
. PUNCT  .
He PRON  PRP
was AUX  VBD
elected VERB  VBN
president NOUN  NN
in ADP  IN
2008 NUM DATE CD
. PUNCT  .
Hi INTJ  UH
IBAN PROPN  NNP
is AUX  VBZ
123678463 NUM CARDINAL CD
. PUNCT  .
Obamas PROPN PERSON NNPS
favourite ADJ  JJ
City PROPN  NNP
is AUX  VBZ
New ADJ GPE NNP
York PROPN GPE NNP
. PUNCT  .
-------------------------------------------------------
(Barack Obama, Hawaii, 2008, 123678463, Obamas, New York)


In [4]:
# Access spaCy's lexical attributes
print([token.is_stop for token in doc])
print([token.like_num for token in doc])

# Visualize dependencies inline in the notebook
# (outside a notebook, displacy.serve is the usual alternative).
displacy.render(doc)

# Process texts with nlp.pipe
# A dedicated loop variable keeps the `doc` used above from being overwritten.
for piped_doc in nlp.pipe(["Lots of texts", "Even more texts", "..."]):
    print(piped_doc.text)

# Combine with your own custom pipeline components
@Language.component("custom_component")
def custom_component(doc):
    """Print the text of the document being processed and pass it through.

    Args:
        doc: Object handed to the component; only its ``text`` attribute is read.

    Returns:
        The same ``doc`` object that was passed in.
    """
    message = f"Custom component called: {doc.text}"
    print(message)
    return doc

# Add the component only if it is not in the pipeline yet: add_pipe raises when
# the name already exists, which would break this cell on a second run.
if "custom_component" not in nlp.pipe_names:
    nlp.add_pipe("custom_component")
doc = nlp("Some text")

# Serialize attributes to a numpy array
np_array = doc.to_array(['ORTH', 'LEMMA', 'POS'])
np_array

[False, False, True, False, True, False, False, True, True, False, False, True, False, False, False, False, True, False, False, False, False, False, True, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False]


Lots of texts
Even more texts
...
Custom component called: Some text


array([[14298532990736973729,  7000492816108906599,                   90],
       [15099781594404091470, 15099781594404091470,                   92]],
      dtype=uint64)

In [5]:
# Location of the English dataset, relative to the notebook.
dataset_path = "../data/dataset_english.json"

# Load the JSON file into a DataFrame and display it.
df = pd.read_json(dataset_path)
df

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has..."
...,...,...,...,...,...,...
43496,"Hello [FIRSTNAME_1], your cognitive therapy ap...","Hello Nellie, your cognitive therapy appointme...","{'[FIRSTNAME_1]': 'Nellie', '[DATE_1]': '8/21'...","[[0, 6, O], [6, 12, FIRSTNAME_1], [12, 66, O],...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-DAT...","[hello, nellie, ,, your, cognitive, therapy, a..."
43497,"Dear [FIRSTNAME_1], we appreciate your active ...","Dear Jalon, we appreciate your active involvem...","{'[FIRSTNAME_1]': 'Jalon', '[CREDITCARDNUMBER_...","[[0, 5, O], [5, 10, FIRSTNAME_1], [10, 159, O]...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, ja, ##lon, ,, we, appreciate, your, act..."
43498,"Dear [SEX_1] at [ZIPCODE_1], we are raising fu...","Dear Female at 32363-2779, we are raising fund...","{'[SEX_1]': 'Female', '[ZIPCODE_1]': '32363-27...","[[0, 5, O], [5, 11, SEX_1], [11, 15, O], [15, ...","[O, B-SEX, O, B-ZIPCODE, I-ZIPCODE, I-ZIPCODE,...","[dear, female, at, 323, ##6, ##3, -, 277, ##9,..."
43499,"Hello [FIRSTNAME_1], we encourage you to pay t...","Hello Tito, we encourage you to pay the fees o...","{'[FIRSTNAME_1]': 'Tito', '[ETHEREUMADDRESS_1]...","[[0, 6, O], [6, 10, FIRSTNAME_1], [10, 137, O]...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O,...","[hello, tito, ,, we, encourage, you, to, pay, ..."


In [11]:
# Remove the column-width limit so long cell values are shown in full
# instead of being shortened with "...".
pd.set_option("display.max_colwidth", None)

# Display the 'privacy_mask' column.
privacy_masks = df["privacy_mask"]
privacy_masks

0                                                                                                                                                                                                                                                                        {'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBAREA_1]': 'Optimization'}
1                                                                                                                                                                                                                                                                              {'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '78B5R2MVFAHJ48500'}
2                                                                                                                                                                                                                                 {'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '[GENDER_1]': 'Intersex person', '[HEIGHT_1]': '158centim

In [10]:
# Work on an independent copy of the first 500 rows. Without .copy() the result
# is a slice of df, and adding columns to it later triggers pandas'
# SettingWithCopyWarning.
df_small = df.head(500).copy()
df_small

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has..."
...,...,...,...,...,...,...
495,A webinar discussing the latest global health ...,A webinar discussing the latest global health ...,"{'[JOBTYPE_1]': 'Director', '[IPV6_1]': '0049:...","[[0, 110, O], [110, 118, JOBTYPE_1], [118, 162...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[a, web, ##ina, ##r, discussing, the, latest, ..."
496,"Due to recent policy changes, we request all u...","Due to recent policy changes, we request all u...","{'[URL_1]': 'https://constant-emery.com', '[US...","[[0, 67, O], [67, 93, URL_1], [93, 133, O], [1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-URL,...","[due, to, recent, policy, changes, ,, we, requ..."
497,Subject: Securities law violation\nUser [USERN...,Subject: Securities law violation\nUser Rashee...,"{'[USERNAME_1]': 'Rasheed28', '[ACCOUNTNAME_1]...","[[0, 39, O], [39, 48, USERNAME_1], [48, 59, O]...","[O, O, O, O, O, O, B-USERNAME, I-USERNAME, I-U...","[subject, :, securities, law, violation, user,..."
498,"Due to reports of cyberbullying, we have decid...","Due to reports of cyberbullying, we have decid...","{'[JOBTITLE_1]': 'Product Mobility Assistant',...","[[0, 98, O], [98, 124, JOBTITLE_1], [124, 226,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[due, to, reports, of, cyber, ##bu, ##lly, ##i..."


In [None]:
def check_data_tags(row, tags):
    """Report whether any of the given tags occurs in the text form of ``row``.

    Args:
        row: Any object; it is converted with ``str()`` before searching.
        tags: Iterable of strings to look for as substrings.

    Returns:
        True as soon as one tag is found in ``str(row)``, otherwise False
        (including when ``tags`` is empty).
    """
    for candidate in tags:
        if candidate in str(row):
            return True
    return False

# Map each flag column to the tags that switch it on. A flag is True for a row
# when any of its tags is found in that row's 'span_labels' value.
# NOTE(review): every entry from 'FAC_flag' onward checks only 'COMPANYNAME_',
# and 'NORP_flag' checks address-like tags. This looks like copy-paste
# placeholder data -- confirm the intended tag list for each entity type.
FLAG_TAGS = {
    'PERSON_flag': ['FIRSTNAME_', 'LASTNAME_', 'MIDDLENAME_'],
    'NORP_flag': ['STREET_', 'STATE_', 'CITY_', 'COUNTRY_'],
    'FAC_flag': ['COMPANYNAME_'],
    'ORG_flag': ['COMPANYNAME_'],
    'GPE_flag': ['COMPANYNAME_'],
    'LOC_flag': ['COMPANYNAME_'],
    'PRODUCT_flag': ['COMPANYNAME_'],
    'EVENT_flag': ['COMPANYNAME_'],
    'WORK_OF_ART_flag': ['COMPANYNAME_'],
    'LAW_flag': ['COMPANYNAME_'],
    'LANGUAGE_flag': ['COMPANYNAME_'],
    'DATE_flag': ['COMPANYNAME_'],
    'TIME_flag': ['COMPANYNAME_'],
    'PERCENT_flag': ['COMPANYNAME_'],
    'MONEY_flag': ['COMPANYNAME_'],
    'QUANTITY_flag': ['COMPANYNAME_'],
    'ORDINAL_flag': ['COMPANYNAME_'],
    'CARDINAL_flag': ['COMPANYNAME_'],
}

# assign() returns a new frame with the flag columns added (in the order listed
# above), so nothing is written into a slice and re-running the cell is safe.
df_small = df_small.assign(**{
    flag: df_small['span_labels'].apply(check_data_tags, args=(tags,))
    for flag, tags in FLAG_TAGS.items()
})

# Show the labels next to a few flags. The person flag column is named
# 'PERSON_flag'; there is no 'PER_flag' column, so selecting it raises KeyError.
df_small[['span_labels', 'PERSON_flag', 'LOC_flag', 'ORG_flag']]