In [20]:
'''
In this notebook, 3 steps are done to anonymize text data. First, we created a dataset from enron data for fine-
tuning on 2 NER models (en_core_web_sm and en_core_web_trf) and finally the one with better F1 score was chosen (en_core_web_trf)
The dataset is labelled manually then similar data points are generated to increase total data points. 
Once the NER model is finetuned on the dataset, its ready to extract entities from enron emails. Entities extracted
through NER are listed below in the code.(check results).

Enron email dataset is passed through predefined regex for email and password extracting as theres a recognizable pattern
that can be used. Then other entities are extracted through the finetuned NER model. Finally all the extracted entities 
are anonymized and replaced by place holder value. The code is further commented for each step.

'''

'\nIn this notebook, 3 steps are done to anonymize text data. First, we created a dataset from enron data for fine-\ntuning on 2 NER models (en_core_web_sm and en_core_web_trf) and finally the one with better F1 score was chosen (en_core_web_trf)\nThe dataset is labelled manually then similar data points are generated to increase total data points. \nOnce the NER model is finetuned on the dataset, its ready to extract entities from enron emails. Entities extracted\nthrough NER are listed below in the code.(check results).\n\nEnron email dataset is passed through predefined regex for email and password extracting as theres a recognizable pattern\nthat can be used. Then other entities are extracted through the finetuned NER model. Finally all the extracted entities \nare anonymized and replaced by place holder value. The code is further commented for each step.\n\n'

In [1]:
from operator import itemgetter

In [2]:
import numpy as np
from copy import deepcopy
from collections import Counter
from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import SpacyModel
import pandas as pd
import spacy

stanza and spacy_stanza are not installed
Flair is not installed by default
Flair is not installed


# 1 Finetune NER model

In [5]:
#train data for NER MODEL, This data is manually tagged by hand and sampled from the bigger enron dataset
dataset_name = "enron_train.json"
dataset = InputSample.read_dataset_json('/home/ammar/Desktop/LMU/Data Security NLP/{}'.format(dataset_name))


tokenizing input:   0%|                                                               | 0/1061 [00:00<?, ?it/s]

loading model en_core_web_sm


tokenizing input: 100%|████████████████████████████████████████████████████| 1061/1061 [00:12<00:00, 82.13it/s]


In [6]:
# load spacy model for NER tagging to later anonymize
nlp = spacy.load('en_core_web_trf')


In [7]:
# Select entities to anonymize other than email and password
model = SpacyModel(model=nlp, entities_to_keep=["PERSON","DATE",'MONEY', "GPE", "ORG", "TIME"])
evaluator = Evaluator(model=model)
evaluation_results = evaluator.evaluate_all(deepcopy(dataset))
results = evaluator.calculate_score(evaluation_results)

Mapping entity values using this dictionary: {'PERSON': 'PERSON', 'LOCATION': 'LOC', 'GPE': 'GPE', 'ORGANIZATION': 'ORG', 'DATE_TIME': 'DATE', 'NRP': 'NORP'}


Evaluating <class 'presidio_evaluator.models.spacy_model.SpacyModel'>: 100%|█| 1061/1061 [02:38<00:00,  6.70it/


In [8]:
print(results)

              Entity           Precision              Recall   Number of samples
                DATE              89.49%              83.59%                7417
                 GPE              34.02%              65.66%                 530
                 ORG              79.28%              60.88%                4696
              PERSON              92.53%              86.38%                4032
                 PII              85.82%              86.04%               16675
PII F measure: 86.01%


In [9]:
# Output results
entities, confmatrix = results.to_confusion_matrix()
print("Confusion matrix:")
print(pd.DataFrame(confmatrix, columns=entities, index=entities))
print("Precision and recall")
print(results)

Confusion matrix:
        DATE  GPE      O   ORG  PERSON
DATE    6200    0    160     0       0
GPE        1  348    161    19       1
O        702  670  61771   529     129
ORG       24    3   1659  2859     151
PERSON     1    2    347   199    3483
Precision and recall
              Entity           Precision              Recall   Number of samples
                DATE              89.49%              83.59%                7417
                 GPE              34.02%              65.66%                 530
                 ORG              79.28%              60.88%                4696
              PERSON              92.53%              86.38%                4032
                 PII              85.82%              86.04%               16675
PII F measure: 86.01%


# 2 Regex To EXTRACT and REPLACE Email and Password

In [10]:
enron = pd.read_csv("/home/ammar/Desktop/LMU/Data Security NLP/data/enron_clean.csv")

In [11]:
enron.shape

(30170, 3)

In [12]:
# Extracting Email IDs is easier through REGEX than NER model
email = enron["content"].str.extractall(r'([\w\.-]+@[\w\.-]+(?:\.[\w]+)+)')

In [13]:
# Theres a specific pattern found in the emails for password sharing, we detect that pattern and extract it to anonymize later
password = enron["content"].str.extractall(r'Password:[ \t]{0,50}(.{4,10})')

In [14]:
enron["pass"] = password.groupby(level=0)[0].apply(list)

In [15]:
enron["regex"] = email.groupby(level=0)[0].apply(list)

In [16]:
enron = enron.replace({'content': r'[\w\.-]+@[\w\.-]+(?:\.[\w]+)+'}, {'content': '<EMAIL>'}, regex=True)

In [17]:
enron['content'] = enron['content'].str.replace(r'(Password:[ \t]{0,50})(.{4,10})', lambda m: m.group(1)+'<PASSWORD>')


  enron['content'] = enron['content'].str.replace(r'(Password:[ \t]{0,50})(.{4,10})', lambda m: m.group(1)+'<PASSWORD>')


# 3 Apply NER  and Anonymize entities



In [18]:
# Randomly sample the dataset to anonymize (can be applied to whole dataset, 35k values but it takes a lot of time)

enron = enron.sample(n=1000)

In [19]:
enron["NER"] = None
for k,v in enron.iterrows():
    doc = nlp(v['content'])
    out = model.predict(InputSample(full_text=v['content']))
    enron["NER"].loc[k] = out



mismatch between input tokens and new tokens
mismatch between input tokens and new tokens


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enron["NER"].loc[k] = out


mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch between input tokens and new tokens
mismatch b

In [20]:
# replace each entity with its placeholder
it = 0
enron['anon'] = None
enron['original'] = None
for k,v in enron.iterrows():
    doc = nlp(v['content'])
    out = model.predict(InputSample(full_text=v['content']))
    print('##################################')
    ind = [i for i, e in enumerate(out) if e != "O"]
    #print(itemgetter(*ind)(doc))
    #print(itemgetter(*ind)(out))
    i=0
    anon=[]
    original=[]
    for i in range(len(out)):
        if out[i] == 'PERSON':
            anon.append('<PERSON>')
            original.append(str(doc[i]))
        elif out[i] == 'ORG':
            anon.append('<ORG>')
            original.append(str(doc[i]))
        elif out[i] == 'EVENT':
            anon.append('<EVENT>')
            original.append(str(doc[i]))
        elif out[i] == 'DATE':
            anon.append('<DATE>')
            original.append(str(doc[i]))
        elif out[i] == 'TIME':
            anon.append('<TIME>')
            original.append(str(doc[i]))
        elif out[i] == 'GPE':
            anon.append('<GPE>')
            original.append(str(doc[i]))
        elif out[i] == 'MONEY':
            anon.append('<MONEY>')
            original.append(str(doc[i]))
        else:
            anon.append(str(doc[i]))
            original.append(str(doc[i]))
    it+=1
    enron['anon'].loc[k] = anon
    enron['original'].loc[k] = original



mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enron['anon'].loc[k] = anon
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enron['original'].loc[k] = original


mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new tokens
##################################
mismatch between input tokens and new to

In [18]:
# save the anonymized data
enron.to_csv("anon_emails.csv")

In [19]:
# 10 samples from the dataset with orginal data
re=0
for k,v in enron.iterrows():
    print(" ".join(v["anon"]))
    print("------------------")
    print(v["content"])
    print()
    print("#####################")
    print()
    re+=1
    if re==10:
        break

	 screw the english jersey .   just wear red 

  -----Original Message----- 
 From : 	 " Eva Pao " < < EMAIL>>@ENRON [ mailto : IMCEANOTES-+22Eva+20Pao+22 + 20 + 3Cepao+40mba2002 + 2Ehbs+2Eedu+3E+<EMAIL > ] 
 Sent : 	 <DATE> <DATE> <DATE> <DATE> <DATE> <DATE> <TIME> <TIME> 
 To : 	 <PERSON> <PERSON> 
 Subject : 	 soccer gear 

 for <TIME> 's game , i 've got a <ORG> <ORG> jersey .   will people jeer 
 at me because it is english or because it looks like the <ORG> jersey ? 

 kid
------------------
	screw the english jersey.  just wear red

 -----Original Message-----
From: 	"Eva Pao" <<EMAIL>>@ENRON [mailto:IMCEANOTES-+22Eva+20Pao+22+20+3Cepao+40mba2002+2Ehbs+2Eedu+3E+<EMAIL>] 
Sent:	Wednesday, June 20, 2001 2:31 PM
To:	John Arnold
Subject:	soccer gear

for tonight's game, i've got a manchester united jersey.  will people jeer
at me because it is english or because it looks like the trinidad jersey?

kid

#####################

<PERSON> & <PERSON> <PERSON> 

 I received the below email

In [26]:
# User input to ANONYMIZE custom text
user_input = "My name is Ammar Ahmed, and Im studying in LMU. Ill make the transaction tomorrow "

In [33]:
def anonymize(user_input):
    doc = nlp(user_input)
    out = model.predict(InputSample(full_text=doc))
    anon=[]
    original=[]
    for i in range(len(out)):
        if out[i] == 'PERSON':
            anon.append('<PERSON>')
            original.append(str(doc[i]))
        elif out[i] == 'ORG':
            anon.append('<ORG>')
            original.append(str(doc[i]))
        elif out[i] == 'EVENT':
            anon.append('<EVENT>')
            original.append(str(doc[i]))
        elif out[i] == 'DATE':
            anon.append('<DATE>')
            original.append(str(doc[i]))
        elif out[i] == 'TIME':
            anon.append('<TIME>')
            original.append(str(doc[i]))
        elif out[i] == 'GPE':
            anon.append('<GPE>')
            original.append(str(doc[i]))
        elif out[i] == 'MONEY':
            anon.append('<MONEY>')
            original.append(str(doc[i]))
        else:
            anon.append(str(doc[i]))
            original.append(str(doc[i]))
    return anon,original

In [35]:
anon,original = anonymize(user_input)

mismatch between input tokens and new tokens


In [36]:
" ".join(original)

'My name is Ammar Ahmed , and I m studying in LMU . Ill make the transaction tomorrow'

In [37]:
" ".join(anon)

'My name is <PERSON> <PERSON> , and I m studying in <ORG> . Ill make the transaction <DATE>'

In [38]:
# User input to ANONYMIZE custom text
user_input = "Hey, this is Vladana for Data Science department. I need to make 150 euro payment for next semester on 2023-02-20"

In [39]:
anon,original = anonymize(user_input)

mismatch between input tokens and new tokens


In [40]:
" ".join(original)

'Hey , this is Vladana for Data Science department . I need to make 150 euro payment for next semester on 2023 - 02 - 20'

In [41]:
" ".join(anon)

'Hey , this is <PERSON> for Data Science department . I need to make <MONEY> <MONEY> payment for next semester on <DATE> <DATE> <DATE> <DATE> <DATE>'