# ----------- DistilBERT - finetuned_ai4privacy_v2 -----------

### DistilBERT:
Paper: https://arxiv.org/abs/1910.01108

Huggingface: https://huggingface.co/distilbert-base-uncased

Documentation: https://huggingface.co/docs/transformers/main/en/model_doc/distilbert

### ai4privacy:

Huggingface: https://huggingface.co/Isotonic/distilbert_finetuned_ai4privacy_v2

GitHub: https://github.com/Sripaad/ai4privacy

## Libraries

In [63]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import pandas as pd

In [32]:
model_name = "Isotonic/distilbert_finetuned_ai4privacy_v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [87]:
nlp = pipeline("token-classification", model=model, tokenizer=tokenizer)

text = "My name is Max and im from Frankfurt. I study at the University of Mannheim."
results = nlp(text)

results


[{'entity': 'B-FIRSTNAME',
  'score': 0.86211693,
  'index': 4,
  'word': 'max',
  'start': 11,
  'end': 14},
 {'entity': 'B-STATE',
  'score': 0.9750011,
  'index': 8,
  'word': 'frankfurt',
  'start': 27,
  'end': 36},
 {'entity': 'B-STATE',
  'score': 0.9271666,
  'index': 16,
  'word': 'mannheim',
  'start': 67,
  'end': 75}]

In [None]:
labels = model.config.id2label
print(labels)

In [53]:
def simplify_model_output(model_output):
    simplified_output = []
    for item in model_output:
        # Entfernen des "B-" oder "I-" Präfixes 
        simplified_entity = item['entity'][2:] 
        simplified_item = {**item, 'entity': simplified_entity}
        simplified_output.append(simplified_item)
    return simplified_output

In [85]:
data = pd.read_json("../../data/dataset_english.json")
data_small = data.head(10000)
data

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has..."
...,...,...,...,...,...,...
43496,"Hello [FIRSTNAME_1], your cognitive therapy ap...","Hello Nellie, your cognitive therapy appointme...","{'[FIRSTNAME_1]': 'Nellie', '[DATE_1]': '8/21'...","[[0, 6, O], [6, 12, FIRSTNAME_1], [12, 66, O],...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, B-DAT...","[hello, nellie, ,, your, cognitive, therapy, a..."
43497,"Dear [FIRSTNAME_1], we appreciate your active ...","Dear Jalon, we appreciate your active involvem...","{'[FIRSTNAME_1]': 'Jalon', '[CREDITCARDNUMBER_...","[[0, 5, O], [5, 10, FIRSTNAME_1], [10, 159, O]...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, ja, ##lon, ,, we, appreciate, your, act..."
43498,"Dear [SEX_1] at [ZIPCODE_1], we are raising fu...","Dear Female at 32363-2779, we are raising fund...","{'[SEX_1]': 'Female', '[ZIPCODE_1]': '32363-27...","[[0, 5, O], [5, 11, SEX_1], [11, 15, O], [15, ...","[O, B-SEX, O, B-ZIPCODE, I-ZIPCODE, I-ZIPCODE,...","[dear, female, at, 323, ##6, ##3, -, 277, ##9,..."
43499,"Hello [FIRSTNAME_1], we encourage you to pay t...","Hello Tito, we encourage you to pay the fees o...","{'[FIRSTNAME_1]': 'Tito', '[ETHEREUMADDRESS_1]...","[[0, 6, O], [6, 10, FIRSTNAME_1], [10, 137, O]...","[O, B-FIRSTNAME, O, O, O, O, O, O, O, O, O, O,...","[hello, tito, ,, we, encourage, you, to, pay, ..."


In [86]:
#  an empty list to store the results
model_results = []

# Iterate through the rows in the DataFrame
for index, row in data_small.iterrows():
    text = row['unmasked_text']
    # Apply the model to the text
    result = nlp(text)
    # simplify_model_output function
    simplified_result = simplify_model_output(result)
    # Append to the list
    model_results.append(simplified_result)

# results as a new column to the DataFrame
data_small['model_results'] = model_results

print(data_small['model_results'])

0       [{'entity': 'PHONEIMEI', 'score': 0.73580664, ...
1       [{'entity': 'FIRSTNAME', 'score': 0.9954371, '...
2       [{'entity': 'FIRSTNAME', 'score': 0.99750215, ...
3       [{'entity': 'ZIPCODE', 'score': 0.36478347, 'i...
4       [{'entity': 'AGE', 'score': 0.9643515, 'index'...
                              ...                        
9995    [{'entity': 'FIRSTNAME', 'score': 0.9967218, '...
9996    [{'entity': 'COMPANYNAME', 'score': 0.9955193,...
9997    [{'entity': 'FIRSTNAME', 'score': 0.99230766, ...
9998    [{'entity': 'IP', 'score': 0.37932673, 'index'...
9999    [{'entity': 'PHONEIMEI', 'score': 0.9932232, '...
Name: model_results, Length: 10000, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_small['model_results'] = model_results


In [88]:
def extract_entities(model_output):
    entities = [item['entity'] for item in model_output]
    return entities
data_small['model_entities'] = data_small['model_results'].apply(extract_entities)

data_small['model_entities']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_small['model_entities'] = data_small['model_results'].apply(extract_entities)


0       [PHONEIMEI, PHONEIMEI, PHONEIMEI, PHONEIMEI, P...
1       [FIRSTNAME, FIRSTNAME, SSN, SSN, DOB, DOB, DOB...
2       [FIRSTNAME, FIRSTNAME, AGE, GENDER, GENDER, GE...
3       [ZIPCODE, ZIPCODE, SSN, SSN, SSN, SSN, SSN, SS...
4       [AGE, BUILDINGNUMBER, BUILDINGNUMBER, BUILDING...
                              ...                        
9995    [FIRSTNAME, FIRSTNAME, LASTNAME, USERAGENT, US...
9996    [COMPANYNAME, COMPANYNAME, COMPANYNAME, FIRSTN...
9997    [FIRSTNAME, LASTNAME, LASTNAME, COMPANYNAME, C...
9998    [IP, IPV6, IPV6, IPV6, IPV6, IPV6, IPV6, IPV6,...
9999    [PHONEIMEI, PHONEIMEI, PHONEIMEI, PHONEIMEI, P...
Name: model_entities, Length: 10000, dtype: object

In [89]:
data_small

Unnamed: 0,masked_text,unmasked_text,privacy_mask,span_labels,bio_labels,tokenised_text,model_results,model_entities
0,A students assessment was found on device bear...,A students assessment was found on device bear...,"{'[PHONEIMEI_1]': '06-184755-866851-3', '[JOBA...","[[0, 57, O], [57, 75, PHONEIMEI_1], [75, 138, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-PHON...","[a, student, s, assessment, was, found, on, de...","[{'entity': 'PHONEIMEI', 'score': 0.73580664, ...","[PHONEIMEI, PHONEIMEI, PHONEIMEI, PHONEIMEI, P..."
1,"Dear [FIRSTNAME_1], as per our records, your l...","Dear Omer, as per our records, your license 78...","{'[FIRSTNAME_1]': 'Omer', '[VEHICLEVIN_1]': '7...","[[0, 5, O], [5, 9, FIRSTNAME_1], [9, 44, O], [...","[O, B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O...","[dear, om, ##er, ,, as, per, our, records, ,, ...","[{'entity': 'FIRSTNAME', 'score': 0.9954371, '...","[FIRSTNAME, FIRSTNAME, SSN, SSN, DOB, DOB, DOB..."
2,[FIRSTNAME_1] could you please share your reco...,Kattie could you please share your recomndatio...,"{'[FIRSTNAME_1]': 'Kattie', '[AGE_1]': '72', '...","[[0, 6, FIRSTNAME_1], [6, 75, O], [75, 77, AGE...","[B-FIRSTNAME, I-FIRSTNAME, O, O, O, O, O, O, O...","[kat, ##tie, could, you, please, share, your, ...","[{'entity': 'FIRSTNAME', 'score': 0.99750215, ...","[FIRSTNAME, FIRSTNAME, AGE, GENDER, GENDER, GE..."
3,Emergency supplies in [BUILDINGNUMBER_1] need ...,Emergency supplies in 16356 need a refill. Use...,"{'[BUILDINGNUMBER_1]': '16356', '[MASKEDNUMBER...","[[0, 22, O], [22, 27, BUILDINGNUMBER_1], [27, ...","[O, O, O, B-BUILDINGNUMBER, I-BUILDINGNUMBER, ...","[emergency, supplies, in, 1635, ##6, need, a, ...","[{'entity': 'ZIPCODE', 'score': 0.36478347, 'i...","[ZIPCODE, ZIPCODE, SSN, SSN, SSN, SSN, SSN, SS..."
4,"The [AGE_1] old child at [BUILDINGNUMBER_1], h...","The 88 old child at 5862, has showcased an unu...","{'[AGE_1]': '88', '[BUILDINGNUMBER_1]': '5862'...","[[0, 4, O], [4, 6, AGE_1], [6, 20, O], [20, 24...","[O, B-AGE, O, O, O, B-BUILDINGNUMBER, I-BUILDI...","[the, 88, old, child, at, 58, ##6, ##2, ,, has...","[{'entity': 'AGE', 'score': 0.9643515, 'index'...","[AGE, BUILDINGNUMBER, BUILDINGNUMBER, BUILDING..."
...,...,...,...,...,...,...,...,...
9995,"Dear [FIRSTNAME_1] [LASTNAME_1], We hope your ...","Dear Pinkie Boyle, We hope your e-learning exp...","{'[FIRSTNAME_1]': 'Pinkie', '[LASTNAME_1]': 'B...","[[0, 5, O], [5, 11, FIRSTNAME_1], [11, 12, O],...","[O, B-FIRSTNAME, I-FIRSTNAME, B-LASTNAME, O, O...","[dear, pink, ##ie, boyle, ,, we, hope, your, e...","[{'entity': 'FIRSTNAME', 'score': 0.9967218, '...","[FIRSTNAME, FIRSTNAME, LASTNAME, USERAGENT, US..."
9996,The literacy event is seeking sponsors. If you...,The literacy event is seeking sponsors. If you...,"{'[COMPANYNAME_1]': 'Fisher and Sons', '[FIRST...","[[0, 56, O], [56, 71, COMPANYNAME_1], [71, 126...","[O, O, O, O, O, O, O, O, O, O, B-COMPANYNAME, ...","[the, literacy, event, is, seeking, sponsors, ...","[{'entity': 'COMPANYNAME', 'score': 0.9955193,...","[COMPANYNAME, COMPANYNAME, COMPANYNAME, FIRSTN..."
9997,"Dear [FIRSTNAME_1] [LASTNAME_1], we noticed su...","Dear Keith Senger, we noticed suspicious activ...","{'[FIRSTNAME_1]': 'Keith', '[LASTNAME_1]': 'Se...","[[0, 5, O], [5, 10, FIRSTNAME_1], [10, 11, O],...","[O, B-FIRSTNAME, B-LASTNAME, I-LASTNAME, O, O,...","[dear, keith, sen, ##ger, ,, we, noticed, susp...","[{'entity': 'FIRSTNAME', 'score': 0.99230766, ...","[FIRSTNAME, LASTNAME, LASTNAME, COMPANYNAME, C..."
9998,Could you provide legal assistance for transac...,Could you provide legal assistance for transac...,{'[ETHEREUMADDRESS_1]': '0x5c2e0a39b2bebecfa31...,"[[0, 67, O], [67, 109, ETHEREUMADDRESS_1], [10...","[O, O, O, O, O, O, O, O, O, B-ETHEREUMADDRESS,...","[could, you, provide, legal, assistance, for, ...","[{'entity': 'IP', 'score': 0.37932673, 'index'...","[IP, IPV6, IPV6, IPV6, IPV6, IPV6, IPV6, IPV6,..."
