<a href="https://colab.research.google.com/github/batust/presidio-db-json/blob/main/pre-prcoessor-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg

Collecting presidio_analyzer
  Downloading presidio_analyzer-2.2.357-py3-none-any.whl.metadata (3.3 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio_analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio_analyzer)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio_analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.357-py3-none-any.whl (112 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.3/112.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading phonenumbers-8.13.55-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[

In [None]:
# Tranformers library
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = """My name is Allwyn Bat Thomas and I live in Kothamangalam. My phone number is 0509821676"""

ner_results = nlp(example)
print(ner_results)


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity': 'B-PER', 'score': 0.99717784, 'index': 4, 'word': 'All', 'start': 11, 'end': 14}, {'entity': 'I-PER', 'score': 0.5963278, 'index': 5, 'word': '##wyn', 'start': 14, 'end': 17}, {'entity': 'I-PER', 'score': 0.9789915, 'index': 6, 'word': 'Bat', 'start': 18, 'end': 21}, {'entity': 'I-PER', 'score': 0.9963566, 'index': 7, 'word': 'Thomas', 'start': 22, 'end': 28}, {'entity': 'B-LOC', 'score': 0.997614, 'index': 12, 'word': 'Ko', 'start': 43, 'end': 45}, {'entity': 'I-LOC', 'score': 0.97299033, 'index': 13, 'word': '##tham', 'start': 45, 'end': 49}, {'entity': 'I-LOC', 'score': 0.99213374, 'index': 14, 'word': '##anga', 'start': 49, 'end': 53}, {'entity': 'I-LOC', 'score': 0.996071, 'index': 15, 'word': '##lam', 'start': 53, 'end': 56}]


In [None]:
# Anonymizing data from transformer side
def anonymize(text, results):
  entities = {}
  anonymized_text = text
  prev_start = None

  for entity in results:
    entity_type = entity['entity'].split('-')[-1]
    word = entity['word'].replace('##', '')
    start, end = entity['start'], entity['end']

    if prev_start is not None and entities[prev_start]['end'] == start:
      entities[prev_start]['word'] += word
      entities[prev_start]['end'] = end

    else:
      entities[start] = {'word':word, 'end': end, 'entity_type': entity_type}
      prev_start = start

  for item in entities.items():
    print('Iter:', item)

anonymize(example, ner_results)


Iter: (11, {'word': 'Allwyn', 'end': 17, 'entity_type': 'PER'})
Iter: (18, {'word': 'Bat', 'end': 21, 'entity_type': 'PER'})
Iter: (22, {'word': 'Thomas', 'end': 28, 'entity_type': 'PER'})
Iter: (43, {'word': 'Kothamangalam', 'end': 56, 'entity_type': 'LOC'})


In [None]:
# Creating patterns in presidio
from presidio_analyzer import PatternRecognizer, AnalyzerEngine

titles_list = [
    "Sir",
    "Ma'am",
    "Madam",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Miss",
    "Dr.",
    "Professor",
]

text = 'Sir Allwyn Bat Thomas is a CIA agent!'

pattern = PatternRecognizer(
    supported_entity="TITLE",
    deny_list=titles_list,
)

result = pattern.analyze(
    text=text,
    entities=['TITLE']
)
print(f'Result: {result}')

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(pattern)
ans = analyzer.analyze(text=text, language='en')
print(ans)

Result: [type: TITLE, start: 0, end: 3, score: 1.0]




[type: TITLE, start: 0, end: 3, score: 1.0, type: PERSON, start: 4, end: 21, score: 0.85]


In [None]:
json_data = {
    "name": "Allwyn Bat Thomas",
    "email":[ "allwynbatthomas@ust.com", 'allwynt700@gmail.com', '287945@ust.com'],
    "backup_email": "b20cs100@mace.ac.in",
    "phone": "+973 30982167",
    "location": "Wakanda, Africa",
    "description": "Allwyn is a software engineer from Wakanda.",
    "metadata": {
        "emergency_contact": "+91 9678785654",
        "company": "UST Global",
        "address": {
            "city": "New York",
            "country": "USA"
        }
    }
}

In [16]:
# Processing JSON Spacey
def processing_json(data):
  if isinstance(data, dict):
    return {key: processing_json(value) for key, value in data.items()}
  elif isinstance(data, list):
    return [processing_json(item) for item in data]
  elif isinstance(data, str):
    result = analyzer.analyze(
        text = data,
        entities = ['PHONE_NUMBER', 'ADDRESS', 'LOCATION', 'PERSON', 'EMAIL_ADDRESS', 'STREET_ADDRESS'],
        language = 'en'
    )
    return anonymizer.anonymize(text=data, analyzer_results=result).text if result else data
  else: return data

In [17]:
# Uploading json file and doing NER on attributes
import json
from google.colab import files
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

file_upload = files.upload()
for filename in file_upload.keys():
  if filename.endswith('.json'):
    with open(filename, 'r') as fp:
      json_data = json.load(fp)
      processed_data = processing_json(json_data)
      print(json.dumps(processed_data, indent = 4))





Saving test.json to test.json
{
    "name": "<PERSON>",
    "age": 25,
    "city": "<LOCATION>",
    "married": false,
    "hobbies": [
        "reading",
        "traveling",
        "programming"
    ],
    "address": {
        "street": "Civil Station, Kudappanakkunnu",
        "city": "<LOCATION>",
        "postal_code": "695043"
    },
    "country": "<LOCATION>",
    "phone": "<PHONE_NUMBER>"
}


In [None]:
addresses = [
    '123 MG Road, Indiranagar, Bengaluru 560038, Karnataka, INDIA',
    'ul. Tverskaya, d. 16, kv. 5, Moskva 125009, RUSSIA',
    '123 Main Street, Springfield, IL 62701, USA',
    'Musterstraße 1, 12345 Musterstadt, GERMANY',
    'Al-Mashtal Street, Khartoum 11111, SUDAN',
    'King Abdulaziz Road, Building 15, Al-Murabba District, 11564 Riyadh, SAUDI ARABIA',
]

In [14]:
# Trying to detect addresses
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

ADDRESS_REGEX = r"(\d*?[a-zA-z-]?,?)?\s[A-Za-z]+\s(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl|Square|Sq)\b"
address_pattern = Pattern(
    name="address",
    regex=ADDRESS_REGEX,
    score=0.8
)
address_recognizer = PatternRecognizer(
    supported_entity="ADDRESS",
    patterns=[address_pattern]
)
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()
analyzer.registry.add_recognizer(address_recognizer)

for address in addresses:
  results = analyzer.analyze(
      text=address,
      entities=['ADDRESS', 'LOCATION', 'PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER'],
      language='en'
  )
  if results:
    print(
        anonymizer.anonymize(
            text=address,
            analyzer_results=results
        ).text
    )




<ADDRESS>, <LOCATION>, <LOCATION> 560038, <LOCATION>, <LOCATION>
ul. <PERSON>, d. 16, kv. 5, Moskva 125009, <LOCATION>
<ADDRESS>, <LOCATION>, IL 62701, <LOCATION>
<PERSON>, 12345 <LOCATION>, <LOCATION>
Al-Mashtal Street, <LOCATION>, SUDAN
Kin<ADDRESS>, <LOCATION>, <PERSON>, 11564 <LOCATION>, SAUDI ARABIA


**TESTING OUT FLAIR**

In [1]:
!pip install flair
!pip install presidio_analyzer
!pip install presidio_anonymizer
!python -m spacy download en_core_web_lg

Collecting flair
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.37.11-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [10]:
story_text = '''My name is Trivandrum'''

In [11]:
# Solution ( kinda ;) )
from flair.data import Sentence
from flair.nn import Classifier
from presidio_analyzer import AnalyzerEngine, RecognizerResult, Pattern, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from flair.models import SequenceTagger

# Presidio objects
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Address pattern
ADDRESS_REGEX = r"\d+\s[A-Za-z]+\s(?:Street|St|Avenue|Ave|Road|Rd|Boulevard|Blvd|Lane|Ln|Drive|Dr|Court|Ct|Place|Pl|Square|Sq)\b"
street_address_pattern = Pattern(
    name="street_address_pattern",
    regex=ADDRESS_REGEX,
    score=0.85
)
street_recognizer = PatternRecognizer(
    supported_entity="STREET_ADDRESS",
    patterns=[street_address_pattern]
)

analyzer.registry.add_recognizer(street_recognizer)

def presidio_text(data):
  results = analyzer.analyze(
      text=data,
      entities=['PERSON', 'EMAIL_ADDRESS', 'PHONE_NUMBER', 'STREET_ADDRESS', 'LOCATION'],
      language='en'
  )
  return results

def flair_text(story_text):
  sentence = Sentence(story_text)
  tagger = SequenceTagger.load("flair/ner-english-large")
  tagger.predict(sentence)

  print('Printing:', tagger.predict(sentence))
  print('Printing:', sentence.to_tagged_string())

  entity_map = {
        'LOC': 'LOCATION',
        'GPE': 'LOCATION',
        'ORG': 'ORGANIZATION',
        'PER': 'PERSON'
    }

  return [
      RecognizerResult(
          entity_type=entity_map[entity.tag],
          start=entity.start_position,
          end=entity.end_position,
          score=entity.score,
      )
      for entity in sentence.get_spans('ner') if entity.tag in entity_map
  ]

flair_data = flair_text(story_text)
presidio_data = presidio_text(story_text)

print('Flair Data:', flair_data)
print('Presidio Data:', presidio_data)

all_data = presidio_data + flair_data

updated_text = anonymizer.anonymize(
    text=story_text,
    analyzer_results=all_data
).text

print('\n\n', updated_text)




2025-03-12 05:28:39,902 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Printing: None
Printing: Sentence[4]: "My name is Trivandrum" → ["Trivandrum"/PER]
Flair Data: [type: PERSON, start: 11, end: 21, score: 0.998149037361145]
Presidio Data: [type: LOCATION, start: 11, end: 21, score: 0.85]


 My name is <PERSON>


In [8]:
json_test_data = {
    "name": "Allwyn BAT Thomas",
    "age": 25,
    "city": "Thiruvananthapuram",
    "married": False,
    "hobbies": ["reading", "traveling", "programming"],
    "address": {
        "street": "Civil Station, Kudappanakkunnu",
        "city": "Thiruvananthapuram City",
        "postal_code": "695043 Area Code"
    },
    "country": "Australia",
    "phone": "+61 8 8672 4617"
}

In [1]:
!pip install flair

Collecting flair
  Downloading flair-0.15.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3>=1.20.27 (from flair)
  Downloading boto3-1.37.12-py3-none-any.whl.metadata (6.7 kB)
Collecting conllu<5.0.0,>=4.0 (from flair)
  Downloading conllu-4.5.3-py2.py3-none-any.whl.metadata (19 kB)
Collecting ftfy>=6.1.0 (from flair)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting langdetect>=1.0.9 (from flair)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mpld3>=0.3 (from flair)
  Downloading mpld3-0.5.10-py3-none-any.whl.metadata (5.1 kB)
Collecting pptree>=3.1 (from flair)
  Downloading pptree-3.1.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-revgrad>=0.2.0 (from flair)
  Downloading pytorch_revgrad-0.2.0-py3-none-any.whl.metadata (1.7 kB)


In [None]:
import pandas as pd
import json
from flair.data import Sentence
from flair.models import SequenceTagger
import re
from pathlib import Path
from google.colab import files

# Load Flair NER model (fast version for efficiency)
tagger = SequenceTagger.load("flair/ner-english-fast")

# Define sensitive and numeric keywords
SENSITIVE_KEYWORDS = ["name", "emp", "tel_emp", "location", "address", "credit", "mobile", "email"]
NUMERIC_KEYWORDS = ["revenue", "tel_rev", "profits"]

# Updated regex patterns for non-NER entities
PHONE_REGEX = r"\b(\+\d{1,4}\s?)?(\(?\d{1,4}\)?[-.]?\s?)?\d{3,4}[-.]?\d{3,4}[-.]?\d{3,4}\b"
EMAIL_REGEX = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
CREDIT_REGEX = r"\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b"

def classify_column(header):
    """Classify column as sensitive or numeric based on header."""
    header_lower = header.lower()
    if any(keyword in header_lower for keyword in SENSITIVE_KEYWORDS):
        return "sensitive"
    if any(keyword in header_lower for keyword in NUMERIC_KEYWORDS):
        return "numeric"
    return "unknown"  # Default to treating as sensitive

def mask_cell(cell, mappings, col_type, entity_counter):
    """Mask a single cell's content if sensitive, return original if numeric."""
    if pd.isna(cell) or col_type != "sensitive":
        return cell, mappings, entity_counter

    cell_str = str(cell)
    sentence = Sentence(cell_str)
    tagger.predict(sentence)

    replacements = {}

    # Detect entities with Flair
    for entity in sentence.get_spans("ner"):
        original_text = entity.text
        if original_text not in replacements:
            entity_placeholder = f"MASKED_VALUE_{entity_counter:03d}"
            replacements[original_text] = entity_placeholder
            mappings[entity_placeholder] = original_text
            entity_counter += 1

    # Detect phone numbers, emails, credit cards with regex
    for pattern, entity_type in [(PHONE_REGEX, "PHONE"), (EMAIL_REGEX, "EMAIL"), (CREDIT_REGEX, "CREDIT")]:
        matches = re.findall(pattern, cell_str)
        for match in matches:
            if match not in replacements:
                entity_placeholder = f"MASKED_VALUE_{entity_counter:03d}"
                replacements[match] = entity_placeholder
                mappings[entity_placeholder] = match
                entity_counter += 1

    # Replace all detected entities
    masked_text = cell_str
    for original, placeholder in replacements.items():
        masked_text = masked_text.replace(original, placeholder)

    return masked_text, mappings, entity_counter

def mask_data(input_file):
    """Mask sensitive data in uploaded file while keeping a mapping for deanonymization."""
    df = pd.read_csv(input_file)

    # Store mappings
    header_mapping = {}
    data_mappings = {col: {} for col in df.columns}

    # Mask headers
    new_columns = []
    for i, col in enumerate(df.columns):
        col_type = classify_column(col)
        masked_col = f"masked_col{i+1}"
        header_mapping[masked_col] = col
        new_columns.append(masked_col)

    df.columns = new_columns

    # Mask data in sensitive columns
    entity_counter = 1  # Unique entity numbering
    for col in df.columns:
        original_col = header_mapping[col]
        col_type = classify_column(original_col)
        if col_type == "sensitive":
            df[col], data_mappings[original_col], entity_counter = zip(
                *[mask_cell(cell, data_mappings[original_col], col_type, entity_counter) for cell in df[col]]
            )

    # Save masked data
    masked_file = "masked_output.csv"
    df.to_csv(masked_file, index=False)
    files.download(masked_file)

    # Save mappings
    mapping_file = "mappings.json"
    with open(mapping_file, "w") as f:
        json.dump({"headers": header_mapping, "data": data_mappings}, f)
    files.download(mapping_file)

def main():
    """Handle file upload in Google Colab."""
    print("Upload the CSV file to anonymize:")
    uploaded = files.upload()
    input_file = list(uploaded.keys())[0]
    mask_data(input_file)

if __name__ == "__main__":
    main()
