In [None]:
# This notebook is by Anastasia Ruzmaikina for the Kaggle competition "The Learning Agency Lab - PII Data Detection"

The goal of this competition is to develop a model that detects personally identifiable information (PII) in student writing. Your efforts to automate the detection and removal of PII from educational data will lower the cost of releasing educational datasets. This will support learning science research and the development of educational tools.

Reliable automated techniques could allow researchers and industry to tap into the potential that large public educational datasets offer to support the development of effective tools and interventions for supporting teachers and students.

In today’s era of abundant educational data from sources such as ed tech, online learning, and research, widespread PII is a key challenge. PII’s presence is a barrier to analyzing and creating open datasets that advance education, because releasing the data publicly puts students at risk. To reduce these risks, it’s crucial to screen and cleanse educational data for PII before public release, which data science could streamline.
Manually reviewing the entire dataset for PII is currently the most reliable screening method, but this results in significant costs and restricts the scalability of educational datasets. While techniques for automatic PII detection that rely on named entity recognition (NER) exist, these work best for PII that share common formatting such as emails and phone numbers. PII detection systems struggle to correctly label names and distinguish between names that are sensitive (e.g., a student's name) and those that are not (e.g., a cited author).

This notebook uses Presidio Analyzer (with spaCy language models) to detect PII in student essays. Llama-2-7b-chat was too slow given the size of the dataset. The accuracy score of this notebook on the competition test dataset is 68%.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#!sudo apt install python-dev 

In [None]:

!pip install -qq --no-deps /kaggle/input/boto3-13461/boto3-1.34.61-py3-none-any.whl
#!pip install -qq --no-deps /kaggle/input/es-core-news-sm/es_core_news_sm-2.3.1
!pip install -qq --no-deps /kaggle/input/es-core-news/es_core_news_md-3.7.0-py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/phonenumbers/phonenumbers-8.13.32-py2.py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/presidio-analyzer-and-anonymizer-v2-2-351/presidio_analyzer-2.2.351-py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/presidio-analyzer-and-anonymizer-v2-2-351/presidio_anonymizer-2.2.351-py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/presidio-analyzer/presidio_analyzer-2.2.353-py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/requests-file/requests_file-2.0.0-py2.py3-none-any.whl
!pip install -qq --no-deps /kaggle/input/sagemaker/sagemaker-2.212.0-py3-none-any.whl
#!pip install -qq --no-deps /kaggle/input/script-deb-sh/script.deb.sh
!pip install -qq --no-deps /kaggle/input/tldextract/tldextract-5.1.1-py3-none-any.whl

In [None]:
#!pip install -qq --no-deps /kaggle/input/es-core-news-sm/es_core_news_sm-2.3.1

In [None]:
import pandas as pd
# Load the competition's train and test splits from their JSON files into DataFrames.
# Later cells read the 'full_text', 'tokens' and 'trailing_whitespace' columns of df_test.
df_train = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')
df_test = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')

In [None]:
#not commented out when internet on
#!pip install sagemaker --upgrade
#import sagemaker
#!pip install /kaggle/input/d/hashidoyuto/presidio-analyzer/presidio_analyzer
#assert sagemaker.__version__ >= "2.75.0"


In [None]:
#not commented out when internet on
#!sudo apt-get update -y
#!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
#!sudo apt-get install git-lfs git -y

In [None]:
#import sagemaker
#import boto3
#sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
#sagemaker_session_bucket=None
#if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
   # sagemaker_session_bucket = sess.default_bucket()

#try:
    #role = sagemaker.get_execution_role()
#except ValueError:
    #iam = boto3.client('iam')
    #role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

#sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

#print(f"sagemaker role arn: {role}")
#print(f"sagemaker bucket: {sess.default_bucket()}")
#print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
#not commented out when internet on
#!pip install presidio_analyzer

In [None]:
from presidio_analyzer import PatternRecognizer
# Demo recognizer: reports the listed honorifics as entities of type "TITLE".
# The recognizer is built from a deny list (a fixed set of strings), not from a regex.
titles_recognizer = PatternRecognizer(supported_entity="TITLE",
                                      deny_list=["Mr.","Mrs.","Miss"])

In [None]:
titles_recognizer.analyze(text="Mr. Schmidt", entities="TITLE")

In [None]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

# Demo: build a registry explicitly, load Presidio's predefined recognizers into it,
# add the custom TITLE recognizer, and analyze a sample sentence.
registry = RecognizerRegistry()
registry.load_predefined_recognizers()

# Add the recognizer to the existing list of recognizers
registry.add_recognizer(titles_recognizer)

# Set up analyzer with our updated recognizer registry
analyzer = AnalyzerEngine(registry=registry)

# Run with input text
# `text` stays defined as a global and is reused by the following demo cells.
text="His name is Mr. Jones"
results = analyzer.analyze(text=text, language="en")
print(results)

In [None]:
from presidio_analyzer import AnalyzerEngine

# Demo: same analysis as the registry example, but starting from a default
# AnalyzerEngine and adding the TITLE recognizer to the engine's own registry.
# Rebinds the global `analyzer`.
analyzer = AnalyzerEngine()

analyzer.registry.add_recognizer(titles_recognizer)

# `text` is the sample sentence defined in an earlier cell.
results = analyzer.analyze(text=text, language="en")
print(results)

In [None]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

import regex as re

# Demo: create a registry whose regex-based recognizers use these global flags.
# `re` here is the third-party `regex` module imported above, not the stdlib `re`.
# NOTE(review): no recognizers are added to this registry in this cell; confirm
# which recognizers the engine ends up with.
registry = RecognizerRegistry(global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
engine = AnalyzerEngine(registry=registry)
# The result is this cell's displayed output; `text` comes from an earlier cell.
engine.analyze(text, language="en")

In [None]:
#this works!!!
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

# Create configuration containing engine name and models
# One spaCy model per language code: Spanish (es) and English (en).
# Both model packages must already be installed in the environment.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "es", "model_name": "es_core_news_md"},
                {"lang_code": "en", "model_name": "en_core_web_lg"}],
}

# Create NLP engine based on configuration
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine_with_spanish = provider.create_engine()

# Pass the created NLP engine and supported_languages to the AnalyzerEngine
# Rebinds the global `analyzer`.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine_with_spanish, 
    supported_languages=["en", "es"]
)

# Analyze in different languages
results_spanish = analyzer.analyze(text="Mi nombre es Morris", language="es")
print(results_spanish)

results_english = analyzer.analyze(text="My name is Morris", language="en")
print(results_english)

In [None]:
from presidio_analyzer import EntityRecognizer, RecognizerResult
class TransformersRecognizer(EntityRecognizer):
    """Presidio recognizer that wraps a Hugging Face token-classification pipeline.

    The model labels PER / LOC / ORG are translated to Presidio's PERSON /
    LOCATION / ORGANIZATION. Predictions carrying any other label are skipped.

    Args:
        model_id_or_path: Model id or local path handed to ``transformers.pipeline``.
        aggregation_strategy: Sub-token aggregation strategy handed to the pipeline.
        supported_language: Language code this recognizer is registered for.
        ignore_labels: Model labels the pipeline should drop (not mutated here).
    """

    def __init__(self,model_id_or_path=None,aggregation_strategy="average",supported_language="en",ignore_labels=["O","MISC"]):
        # Imported locally: the notebook has no module-level import of `pipeline`,
        # so referencing it directly would raise NameError.
        from transformers import pipeline

        # inits transformers pipeline for given model or path; the caller's
        # aggregation strategy is honoured instead of a hard-coded value
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels,
        )
        # map labels to presidio labels
        self.label2presidio={
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
        }

        # passes entities from model into parent class
        super().__init__(supported_entities=list(self.label2presidio.values()),supported_language=supported_language)

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: list[str]=None, nlp_artifacts: list[str]=None #NlpArtifacts
    ) -> list[RecognizerResult]:
        """
        Extracts entities using Transformers pipeline

        Args:
            text: Text to scan.
            entities: Presidio entity types to keep; ``None`` keeps every mapped type.
            nlp_artifacts: Unused; accepted for interface compatibility.

        Returns:
            One RecognizerResult per kept prediction, with the pipeline's
            start/end offsets and score.
        """
        results = []

        # keep max sequence length in mind
        for e in self.pipeline(text):
            # Labels outside the mapping have no Presidio equivalent; skip them
            # rather than fail with KeyError.
            converted_entity = self.label2presidio.get(e["entity_group"])
            if converted_entity is None:
                continue
            # Test for None first: `x in None` raises TypeError.
            if entities is None or converted_entity in entities:
                results.append(
                    RecognizerResult(
                        entity_type=converted_entity,
                        start=e["start"],
                        end=e["end"],
                        score=e["score"]
                    )
                )
        return results

In [None]:
# -p: do not complain if the directory already exists, so the cell can be re-run.
!mkdir -p code

In [None]:
%%writefile code/inference.py
#%%writefile inference.py

from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine
from typing import List

from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from transformers import pipeline

# load spacy model -> workaround
# Runs at import time of this module; the return code of the command is not checked.
import os
os.system("spacy download en_core_web_lg")

# list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
# Entity types requested from the analyzer when the request does not supply its own list.
DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD",
    "CRYPTO",
    "DATE_TIME",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "IP_ADDRESS",
    "NRP",
    "LOCATION",
    "PERSON",
    "PHONE_NUMBER",
    "MEDICAL_LICENSE",
    "URL",
    "ORGANIZATION"
]

# init anonymize engine
# Module-level instance shared by every predict_fn call.
engine = AnonymizerEngine()

class HFTransformersRecognizer(EntityRecognizer):
    """Presidio recognizer that wraps a Hugging Face token-classification pipeline.

    The model labels PER / LOC / ORG are translated to Presidio's PERSON /
    LOCATION / ORGANIZATION. Predictions carrying any other label are skipped.
    """

    def __init__(
        self,
        model_id_or_path=None,
        aggregation_strategy="simple",
        supported_language="en",
        ignore_labels=["O", "MISC"],
    ):
        """
        Args:
            model_id_or_path: Model id or local path handed to ``transformers.pipeline``.
            aggregation_strategy: Sub-token aggregation strategy handed to the pipeline.
            supported_language: Language code this recognizer is registered for.
            ignore_labels: Model labels the pipeline should drop (not mutated here).
        """
        # inits transformers pipeline for given mode or path
        self.pipeline = pipeline(
            "token-classification", model=model_id_or_path, aggregation_strategy=aggregation_strategy, ignore_labels=ignore_labels
        )
        # map labels to presidio labels
        self.label2presidio = {
            "PER": "PERSON",
            "LOC": "LOCATION",
            "ORG": "ORGANIZATION",
        }

        # passes entities from model into parent class
        super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language)

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None
    ) -> List[RecognizerResult]:
        """
        Extracts entities using Transformers pipeline

        Args:
            text: Text to scan.
            entities: Presidio entity types to keep; ``None`` keeps every mapped type.
            nlp_artifacts: Unused; accepted for interface compatibility.

        Returns:
            One RecognizerResult per kept prediction, with the pipeline's
            start/end offsets and score.
        """
        results = []

        # keep max sequence length in mind
        for e in self.pipeline(text):
            # Labels outside the mapping have no Presidio equivalent; skip them
            # rather than fail with KeyError.
            converted_entity = self.label2presidio.get(e["entity_group"])
            if converted_entity is None:
                continue
            # Test for None first: `x in None` raises TypeError.
            if entities is None or converted_entity in entities:
                results.append(
                    RecognizerResult(
                        entity_type=converted_entity, start=e["start"], end=e["end"], score=e["score"]
                    )
                )
        return results


def model_fn(model_dir):
    """Build the analyzer used for inference.

    Creates an HFTransformersRecognizer from ``model_dir``, then a default
    AnalyzerEngine, and registers the recognizer on that engine.

    Returns:
        The AnalyzerEngine with the extra recognizer registered.
    """
    hf_recognizer = HFTransformersRecognizer(model_dir)
    # Default engine: loads the NLP module (spaCy model by default) and other PII recognizers.
    presidio_analyzer = AnalyzerEngine()
    presidio_analyzer.registry.add_recognizer(hf_recognizer)
    return presidio_analyzer


def predict_fn(data, analyzer):
    """Detect PII in a request payload and optionally anonymize it.

    Args:
        data: Request dict. The text is popped from "inputs" (the dict itself is
            used when that key is missing). An optional "parameters" dict may
            carry "entities" (entity types to look for) and "anonymize" (flag).
        analyzer: Analyzer object whose ``analyze`` method is called.

    Returns:
        ``{"anonymized": <text>}`` when anonymization was requested, otherwise
        ``{"found": [<result dict>, ...]}``.
    """
    sentences = data.pop("inputs", data)

    # Request-level options override the module defaults.
    if "parameters" in data:
        options = data["parameters"]
        anonymization_entities = options.get("entities", DEFAULT_ANOYNM_ENTITIES)
        anonymize_text = options.get("anonymize", False)
    else:
        anonymization_entities = DEFAULT_ANOYNM_ENTITIES
        anonymize_text = False

    # identify entities
    findings = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en")

    if not anonymize_text:
        return {"found": [hit.to_dict() for hit in findings]}

    # anonymize text
    anonymized = engine.anonymize(text=sentences, analyzer_results=findings)
    return {"anonymized": anonymized.text}


In [None]:
%%writefile code/requirements.txt

presidio-analyzer
spacy
transformers
presidio-anonymizer

In [None]:
#from sagemaker.huggingface.model import HuggingFaceModel


# create Hugging Face Model Class
#huggingface_model = HuggingFaceModel(
   #model_data=df_train,       # path to your model and script
   #role=role,                    # iam role with permissions to create an Endpoint
   #transformers_version="4.17",  # transformers version used
   #pytorch_version="1.10",        # pytorch version used
   #py_version='py38',            # python version used
#)

# deploy the endpoint endpoint
#predictor = huggingface_model.deploy(
    #initial_instance_count=1,
    #instance_type="ml.g4dn.xlarge"
    #)

In [None]:
payload="""
Hello, my name is David Johnson and I live in Maine.
I work as a software engineer at Amazon.
My username for Amazon is djon, my username for ebay is Jnd.
You can call me at (123) 456-7890.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111. Can you please check the status on bank account 954567876544?
Kate's social security number is 078-05-1126.  Her driver license? it is 1234567A.

"""

In [None]:
results_english = analyzer.analyze(payload, language='en')
print(results_english)

In [None]:
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

# Build the analyzer from a YAML languages config instead of an inline dict.
# NOTE(review): the path is relative to the working directory; confirm the file
# exists in this environment and check what the provider does when it is missing.
LANGUAGES_CONFIG_FILE = "./docs/analyzer/languages-config.yml"

# Create NLP engine based on configuration file
provider = NlpEngineProvider(conf_file=LANGUAGES_CONFIG_FILE)
nlp_engine_with_spanish = provider.create_engine()

# Pass created NLP engine and supported_languages to the AnalyzerEngine
# Rebinds the global `analyzer` (English only); any recognizer added to a
# previously created analyzer object is not carried over to this new instance.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine_with_spanish, 
    supported_languages=["en"]#, "es"]
)

# Analyze in different languages
#results_spanish = analyzer.analyze(text="Mi nombre es David", language="es")
#print(results_spanish)

# Quick sanity check of the English pipeline.
results_english = analyzer.analyze(text="My name is David", language="en")
print(results_english)

In [None]:
# Worked example on one document (row 1): rebuild the text from its tokens and
# record, for every token, the character span it occupies in the rebuilt text.
df = df_test
text = df.loc[1, 'full_text']
token_list = df.loc[1, 'tokens']
whitespace_list = df.loc[1, 'trailing_whitespace']

New_text = ""
l = []  # (token, start, end) with end exclusive and not counting a trailing space
for position, piece in enumerate(token_list):
    followed_by_space = whitespace_list[position]
    begin = len(New_text)
    if followed_by_space == True:
        New_text += piece + ' '
        l.append((piece, begin, len(New_text) - 1))
    if followed_by_space == False:
        New_text += piece
        l.append((piece, begin, len(New_text)))

print(New_text[:100])
print(l[:11])

In [None]:
def _rebuild_text_with_offsets(tokens, trailing_whitespace):
    """Join tokens back into one string and record each token's character span.

    Args:
        tokens: Token strings in document order.
        trailing_whitespace: Parallel flags; truthy means the token is followed
            by a single space.

    Returns:
        (text, spans) where spans is a list of ``(token, start, end)``; ``end``
        is exclusive and does not include the trailing space.
    """
    pieces = []
    spans = []
    cursor = 0
    for tok, has_space in zip(tokens, trailing_whitespace):
        end = cursor + len(tok)
        spans.append((tok, cursor, end))
        pieces.append(tok + ' ' if has_space else tok)
        # Advance past the token and, when present, its trailing space.
        cursor = end + (1 if has_space else 0)
    return ''.join(pieces), spans


# Rebuild every test document from its tokens so that the analyzer's character
# offsets can later be matched back to token indices.
new = []
l_list = []
for j in range(len(df_test)):
    rebuilt_text, token_spans = _rebuild_text_with_offsets(
        df_test.loc[j, 'tokens'], df_test.loc[j, 'trailing_whitespace']
    )
    new.append(rebuilt_text)
    l_list.append(token_spans)
df_test['text_from_tokens'] = new
df_test['tokens_with_locations'] = l_list

In [None]:
df_train

In [None]:
#summ[0][0]

In [None]:
# Run the analyzer over every rebuilt test document; results are keyed by row position.
summ = {
    row: analyzer.analyze(df_test.loc[row, 'text_from_tokens'], language='en')
    for row in range(len(df_test))
}
print(summ[0])

In [None]:
print(len(summ), len(df_test))

In [None]:
# String form of every analyzer result, per document.
# A new list is built for each document so that `summ` keeps its original result
# objects; assigning `summ[j]` directly would alias the list and overwrite them.
summary_list = {
    j: [str(recognizer_result) for recognizer_result in summ[j]]
    for j in range(len(summ))
}

In [None]:
print(summary_list[0])
print(summ[0])

In [None]:
# Analyzer entity type -> label used in the submission.
_ENTITY_TO_LABEL = {
    'PERSON': 'NAME_STUDENT',
    'EMAIL_ADDRESS': 'EMAIL',
    'USERNAME': 'USERNAME',
    'US_SSN': 'ID_NUM',
    'US_DRIVER_LICENSE': 'ID_NUM',
    'PHONE_NUMBER': 'PHONE_NUM',
    'URL': 'URL_PERSONAL',
    'LOCATION': 'STREET_ADDRESS',
}
# Results scoring at or below this value are ignored.
_MIN_SCORE = 0.3


def _parse_result_string(string):
    """Parse 'type: X, start: N, end: M, score: S' into (X, N, M, S).

    Splitting on the separators avoids fixed-width slicing, so entity names are
    compared without a trailing comma and scores of any length parse correctly.
    """
    fields = {}
    for part in string.split(','):
        key, sep, value = part.partition(':')
        if sep:
            fields[key.strip()] = value.strip()
    return fields['type'], int(fields['start']), int(fields['end']), float(fields['score'])


# summary_dict[j][i]: {label: (start, end)} for result i of document j, or {} when
#                     the result is of an unmapped type or scores too low.
# list_dict[j]:       the non-empty entries of summary_dict[j], in order.
summary_dict = {}
list_dict = {}
for j in range(len(summ)):
    summary_dict[j] = []
    for string in summary_list[j]:
        entity_type, span_start, span_end, text_score = _parse_result_string(string)
        label = _ENTITY_TO_LABEL.get(entity_type)
        if label is not None and text_score > _MIN_SCORE:
            summary_dict[j].append({label: (span_start, span_end)})
        else:
            summary_dict[j].append({})
    list_dict[j] = [entry for entry in summary_dict[j] if len(entry) > 0]

In [None]:
print(list_dict[0])
print(summary_list[0])

In [None]:
#print(list_tokens[0][:100])

In [None]:
list_dict[0][0].values()

In [None]:
df_test.loc[0, 'tokens_with_locations'][:11]
# Per-document list of (token, start, end) tuples, indexed by row position.
list_tokens = [df_test.loc[row, 'tokens_with_locations'] for row in range(len(df_test))]
#print(list_tokens[11])    

In [None]:
print(list_dict[0])
print(list_tokens[0][:100])

In [None]:
# Map each detected character span back to token indices.
# n_token[doc] collects (token_index, label) pairs:
#   * 'B-<label>' for the token that starts exactly where the span starts;
#   * 'I-<label>' for a token that ends exactly where the span ends, or lies
#     strictly inside the span.
# The two cases are exclusive: a single-token entity is labelled 'B-' only, so
# the same token never receives both a 'B-' and an 'I-' label from one span.
_NEWLINE_TOKENS = ('\n', '\n\n')  # newline tokens are never labelled

n_token = {}
for doc in range(len(df_test)):
    n_token[doc] = []
    for i, (tok_text, tok_start, tok_end) in enumerate(list_tokens[doc]):
        if tok_text in _NEWLINE_TOKENS:
            continue
        for entity in list_dict[doc]:
            # Each entry holds a single {label: (start, end)} item.
            label, (ent_start, ent_end) = next(iter(entity.items()))
            if tok_start == ent_start:
                n_token[doc].append((i, 'B-' + label))
            elif tok_end == ent_end or (tok_start > ent_start and tok_end < ent_end):
                n_token[doc].append((i, 'I-' + label))
print(sorted(list(set(n_token[0]))))

In [None]:
print(sorted(list(set(n_token[1])))) 
print(list_tokens[1][:11], list_tokens[1][464:466])

In [None]:
# De-duplicate and sort the (token_index, label) pairs of every document, then
# split them into two parallel lists: token indices and labels.
list_n_token = {}
token_number_list = {}
token_value_list = {}
for doc in range(len(df_test)):
    unique_pairs = sorted(set(n_token[doc]))
    list_n_token[doc] = unique_pairs
    token_number_list[doc] = [pair[0] for pair in unique_pairs]
    token_value_list[doc] = [pair[1] for pair in unique_pairs]
print(token_number_list[1])
print(token_value_list[1])

In [None]:
# Start the submission frame from the first column of df_test
# (presumably the document id -- confirm against the data).
# `.copy()` makes df_new independent of df_test, so adding a column below does
# not write into a slice of df_test (SettingWithCopyWarning).
df_new = df_test.iloc[:, :1].copy()

# token_info[i]: list of (token_index, label) pairs for document i.
token_info = {
    i: list(zip(token_number_list[i], token_value_list[i]))
    for i in range(len(df_new))
}
df_new['token_info'] = [token_info[i] for i in range(len(df_new))]
df_new

In [None]:
# Expand each document's list of (token_index, label) pairs into one row per pair.
df_new = df_new.explode('token_info')
print(df_new)

In [None]:
# Documents with an empty token_info list become a NaN row after explode; drop them.
df_new = df_new.dropna()

In [None]:
# Renumber the rows 0..n-1 and discard the old (repeated) index.
df_new = df_new.reset_index(drop=True)
df_new

In [None]:
df_new.loc[0, 'token_info']

In [None]:
# Split every (token_index, label) pair into its own column.
token = [df_new.loc[row, 'token_info'][0] for row in range(len(df_new))]
label = [df_new.loc[row, 'token_info'][1] for row in range(len(df_new))]
df_new['token'] = token
df_new['label'] = label
df_new

In [None]:
# Drop the helper column and turn the running row number into the 'row_id' index.
df_new = (
    df_new
    .drop(columns=['token_info'])
    .assign(row_id=lambda frame: frame.index)
    .set_index('row_id')
)
df_new

In [None]:
import os

def remove_folder_contents(folder):
    """Delete everything inside ``folder`` while keeping ``folder`` itself.

    Regular files and symbolic links are unlinked; real sub-directories are
    emptied recursively and then removed. A symbolic link is always removed as
    a link and never followed, so the contents of its target are left untouched.

    Best effort: an error on one entry is printed and the remaining entries
    are still processed.

    Args:
        folder: Path of the directory to empty.
    """
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            # Check for links first: os.path.isdir() follows symlinks, which
            # would otherwise make us recurse into the link's target.
            if os.path.islink(file_path) or os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except Exception as e:
            print(e)

folder_path = '/kaggle/working'

In [None]:
df_new.to_csv('submission.csv')

In [None]:
df1 = pd.read_csv('submission.csv')
df1

In [None]:
df_sample = pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
df_sample