# Masking personally identifiable information (PII)

In [9]:
# !pip install faker faker-schema

Collecting numexpr
  Using cached numexpr-2.8.4-cp310-cp310-win_amd64.whl (92 kB)
Collecting numpy>=1.13.3 (from numexpr)
  Using cached numpy-1.24.3-cp310-cp310-win_amd64.whl (14.8 MB)
Installing collected packages: numpy, numexpr
Successfully installed numexpr-2.8.4 numpy-1.24.3


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-bokeh 3.0.7 requires bokeh==3.*, but you have bokeh 2.4.3 which is incompatible.
mlflow 1.28.0 requires importlib-metadata!=4.7.0,<5,>=3.7.0, but you have importlib-metadata 6.5.0 which is incompatible.
mlflow 1.28.0 requires packaging<22, but you have packaging 23.1 which is incompatible.
mlflow 1.28.0 requires pytz<2023, but you have pytz 2023.3 which is incompatible.
numba 0.56.2 requires numpy<1.24,>=1.18, but you have numpy 1.24.3 which is incompatible.
pandas-profiling 3.3.0 requires joblib~=1.1.0, but you have joblib 1.2.0 which is incompatible.
pandas-profiling 3.3.0 requires matplotlib<3.6,>=3.2, but you have matplotlib 3.7.1 which is incompatible.
pandas-profiling 3.3.0 requires numpy<1.24,>=1.16.0, but you have numpy 1.24.3 which is incompatible.
pandas-profiling 3.3.0 requires pandas!=1.4.0,<1.

## Direct Masking (Failed - fail to detect unique ID from dictionary, must be in integer)

In [23]:
from faker import Faker
import random
import string

# Create a Faker instance
fake = Faker()

# Generate dummy personal data
num_records = 10
personal_data = []
for i in range(num_records):
    # Generate a random 9-digit ID number
    id_no = ''.join(random.choices(string.digits, k=9))
    data = {
        "id_no": id_no,
        "name": fake.name(),
        "email": fake.email(),
        "phone_number": fake.phone_number(),
        "address": fake.address(),
    }
    personal_data.append(data)

# Display the original personal data
print("Original Personal Data:")
for data in personal_data:
    print(data)

# Mask the id_no and name columns
masked_data = []
for data in personal_data:
    masked_id_no = 'X' * 7 + data['id_no'][-4:]
    masked_name = 'X' * len(data['name'].split()[0]) + ' ' + ' '.join(data['name'].split()[1:])
    masked_data.append({**data, "id_no": masked_id_no, "name": masked_name})

# Display the masked personal data
print("\nMasked Personal Data:")
for data in masked_data:
    print(data)

# Create a mapping dictionary
mapping_dict = {}
for original_data, masked_data in zip(personal_data, masked_data):
    original_id_no = original_data['id_no']
    masked_id_no = masked_data['id_no']
    original_name = original_data['name']
    masked_name = masked_data['name']
    mapping_dict[masked_id_no] = original_id_no
    mapping_dict[masked_name] = original_name

# Remap the masked data back to the original data
remapped_data = []
for masked_record in masked_data:
    masked_id_no = mapping_dict.get(masked_record['id_no'], masked_record['id_no'])
    masked_name = mapping_dict.get(masked_record['name'], masked_record['name'])
    original_id_no = mapping_dict.get(masked_id_no, masked_id_no)
    original_name = mapping_dict.get(masked_name, masked_name)
    remapped_data.append({**masked_record, "id_no": original_id_no, "name": original_name})

# Display the remapped personal data
print("\nRemapped Personal Data:")
for data in remapped_data:
    print(data)


Original Personal Data:
{'id_no': '543609320', 'name': 'Julie Parker', 'email': 'reynoldsstephanie@example.com', 'phone_number': '001-754-672-7246x524', 'address': '2210 Charles Ford\nCisnerosshire, AR 45535'}
{'id_no': '555367890', 'name': 'Margaret Coleman', 'email': 'bdavis@example.com', 'phone_number': '3669458174', 'address': '43025 Gonzales Squares\nWongfurt, TX 76564'}
{'id_no': '250725242', 'name': 'Patricia Dickerson', 'email': 'taylor27@example.com', 'phone_number': '+1-056-145-7357x74634', 'address': 'PSC 6566, Box 0488\nAPO AA 54827'}
{'id_no': '975874059', 'name': 'Jennifer Miller', 'email': 'griffinjesus@example.org', 'phone_number': '168-944-5536x261', 'address': '7537 Jamie Trail\nWest Terrimouth, AL 01255'}
{'id_no': '438481931', 'name': 'Tyler Bailey', 'email': 'vsanchez@example.com', 'phone_number': '(636)899-3975x8461', 'address': '45918 Goodwin Isle\nPort Autumn, IN 76290'}
{'id_no': '652241524', 'name': 'Kenneth Barker', 'email': 'davidrodriguez@example.net', 'pho

TypeError: string indices must be integers

#### suitable for display the actual data with masking

## SHA-hashlib (Succeed) - Can remapping the original data

### masking column no_id and name

In [37]:
from faker import Faker
import random
import string
import hashlib

# Create a Faker instance
fake = Faker()

# Generate dummy personal data
num_records = 10
personal_data = []
for i in range(num_records):
    # Generate a random 9-digit ID number
    id_no = ''.join(random.choices(string.digits, k=9))
    data = {
        "id_no": id_no,
        "name": fake.name(),
        "email": fake.email(),
        "phone_number": fake.phone_number(),
        "address": fake.address(),
    }
    personal_data.append(data)

# Display the original personal data
print("Original Personal Data:")
for data in personal_data:
    print(data)

# Mask the id_no and name columns
masked_data = []
for data in personal_data:
    masked_id_no = hashlib.sha256(data['id_no'].encode()).hexdigest()
    masked_name = hashlib.sha256(data['name'].encode()).hexdigest()
    masked_data.append({**data, "id_no": masked_id_no, "name": masked_name})

# Display the masked personal data
print("\nMasked Personal Data:")
for data in masked_data:
    print(data)

# Create a mapping dictionary
mapping_dict = {}
for original_data, masked_record in zip(personal_data, masked_data):
    original_id_no = original_data['id_no']
    masked_id_no = masked_record['id_no']
    original_name = original_data['name']
    masked_name = masked_record['name']
    mapping_dict[masked_id_no] = original_id_no
    mapping_dict[masked_name] = original_name

# Remap the masked data back to the original data
remapped_data = []
for masked_record in masked_data:
    masked_id_no = masked_record['id_no']
    masked_name = masked_record['name']
    original_id_no = mapping_dict.get(masked_id_no, masked_id_no)
    original_name = mapping_dict.get(masked_name, masked_name)
    remapped_data.append({**masked_record, "id_no": original_id_no, "name": original_name})

# Display the remapped personal data
print("\nRemapped Personal Data:")
for data in remapped_data:
    print(data)

Original Personal Data:
{'id_no': '516421849', 'name': 'Susan Ortiz', 'email': 'thomasrodriguez@example.net', 'phone_number': '001-670-678-3257x98629', 'address': 'USNS Davis\nFPO AP 47322'}
{'id_no': '730055429', 'name': 'Emily Garcia', 'email': 'mirandatyler@example.com', 'phone_number': '332.400.2721x385', 'address': '9225 Henry Circles\nSouth Ronald, HI 86563'}
{'id_no': '307138181', 'name': 'Stephanie Andrade', 'email': 'kelly29@example.com', 'phone_number': '(623)752-2745', 'address': '40793 Timothy Stream Suite 179\nMarieville, CA 69187'}
{'id_no': '614912960', 'name': 'John James', 'email': 'mark47@example.org', 'phone_number': '581.774.7315', 'address': '91156 Smith Gardens Suite 999\nRobertmouth, ID 96694'}
{'id_no': '242655126', 'name': 'Danny Mathews', 'email': 'johnjensen@example.org', 'phone_number': '(012)940-4785x90533', 'address': '49134 Benjamin Village\nMelissafort, ME 82403'}
{'id_no': '830360754', 'name': 'Susan Miller', 'email': 'barbara96@example.org', 'phone_num

In [38]:
print(personal_data)

[{'id_no': '516421849', 'name': 'Susan Ortiz', 'email': 'thomasrodriguez@example.net', 'phone_number': '001-670-678-3257x98629', 'address': 'USNS Davis\nFPO AP 47322'}, {'id_no': '730055429', 'name': 'Emily Garcia', 'email': 'mirandatyler@example.com', 'phone_number': '332.400.2721x385', 'address': '9225 Henry Circles\nSouth Ronald, HI 86563'}, {'id_no': '307138181', 'name': 'Stephanie Andrade', 'email': 'kelly29@example.com', 'phone_number': '(623)752-2745', 'address': '40793 Timothy Stream Suite 179\nMarieville, CA 69187'}, {'id_no': '614912960', 'name': 'John James', 'email': 'mark47@example.org', 'phone_number': '581.774.7315', 'address': '91156 Smith Gardens Suite 999\nRobertmouth, ID 96694'}, {'id_no': '242655126', 'name': 'Danny Mathews', 'email': 'johnjensen@example.org', 'phone_number': '(012)940-4785x90533', 'address': '49134 Benjamin Village\nMelissafort, ME 82403'}, {'id_no': '830360754', 'name': 'Susan Miller', 'email': 'barbara96@example.org', 'phone_number': '001-184-627

In [39]:
print(masked_data)

[{'id_no': '5bc95e09b059851c850ed505d50d34b271214e299566a49709897319c9e202ac', 'name': 'e5bac38d518f2d06de391217fc97cda20ea39c1b86691ad91a2b665382003d7d', 'email': 'thomasrodriguez@example.net', 'phone_number': '001-670-678-3257x98629', 'address': 'USNS Davis\nFPO AP 47322'}, {'id_no': '9ac5857ea9485d32ea42951ff47a1825e17961b7f857ed28a7b8cf1a41056887', 'name': '290ae1e53dc556580a340775dc11fcc1168eb1558cfc55ed31300c0fa932aa70', 'email': 'mirandatyler@example.com', 'phone_number': '332.400.2721x385', 'address': '9225 Henry Circles\nSouth Ronald, HI 86563'}, {'id_no': '8df390b6cbb295a016925749da36831f6fcc1ff4c3fa0b01ca4760ab17139086', 'name': '8c4ff027bdece52970e7fbcdc344324a3888e84d032956310878bbd702458608', 'email': 'kelly29@example.com', 'phone_number': '(623)752-2745', 'address': '40793 Timothy Stream Suite 179\nMarieville, CA 69187'}, {'id_no': '33ec0830b9074fdb8c757f5fde667c02a99c25b01b469bfdaafdab5a9d06a909', 'name': '0ff038bca6b6171d19920ca6cdf1103fbfed6fc2d0655bf8c485424267e41dc6

In [40]:
print(mapping_dict)

{'5bc95e09b059851c850ed505d50d34b271214e299566a49709897319c9e202ac': '516421849', 'e5bac38d518f2d06de391217fc97cda20ea39c1b86691ad91a2b665382003d7d': 'Susan Ortiz', '9ac5857ea9485d32ea42951ff47a1825e17961b7f857ed28a7b8cf1a41056887': '730055429', '290ae1e53dc556580a340775dc11fcc1168eb1558cfc55ed31300c0fa932aa70': 'Emily Garcia', '8df390b6cbb295a016925749da36831f6fcc1ff4c3fa0b01ca4760ab17139086': '307138181', '8c4ff027bdece52970e7fbcdc344324a3888e84d032956310878bbd702458608': 'Stephanie Andrade', '33ec0830b9074fdb8c757f5fde667c02a99c25b01b469bfdaafdab5a9d06a909': '614912960', '0ff038bca6b6171d19920ca6cdf1103fbfed6fc2d0655bf8c485424267e41dc6': 'John James', 'fb205051d8d7f452eb4128221791dd78fe819875ff9d90f34ad5a297a2bc5349': '242655126', 'ff49843a50a584ccf88e41eb390fc42889612e3b6fa20f882112aca3876cd466': 'Danny Mathews', '07e7c9f9463aa28850c19ee3a6dabe4aadb6cfa1d2d094794d49400d1cceee34': '830360754', '0300bcf1a0a8e08609506f69c666210b2c6bfecb324afc4c33db680b7eeedc52': 'Susan Miller', 'a1a5b

In [41]:
print(remapped_data)

[{'id_no': '516421849', 'name': 'Susan Ortiz', 'email': 'thomasrodriguez@example.net', 'phone_number': '001-670-678-3257x98629', 'address': 'USNS Davis\nFPO AP 47322'}, {'id_no': '730055429', 'name': 'Emily Garcia', 'email': 'mirandatyler@example.com', 'phone_number': '332.400.2721x385', 'address': '9225 Henry Circles\nSouth Ronald, HI 86563'}, {'id_no': '307138181', 'name': 'Stephanie Andrade', 'email': 'kelly29@example.com', 'phone_number': '(623)752-2745', 'address': '40793 Timothy Stream Suite 179\nMarieville, CA 69187'}, {'id_no': '614912960', 'name': 'John James', 'email': 'mark47@example.org', 'phone_number': '581.774.7315', 'address': '91156 Smith Gardens Suite 999\nRobertmouth, ID 96694'}, {'id_no': '242655126', 'name': 'Danny Mathews', 'email': 'johnjensen@example.org', 'phone_number': '(012)940-4785x90533', 'address': '49134 Benjamin Village\nMelissafort, ME 82403'}, {'id_no': '830360754', 'name': 'Susan Miller', 'email': 'barbara96@example.org', 'phone_number': '001-184-627

### masking column name only

In [43]:
from faker import Faker
import random
import string
import hashlib

# Create a Faker instance
fake = Faker()

# Generate dummy personal data
num_records = 10
personal_data = []
for i in range(num_records):
    # Generate a random 9-digit ID number
    id_no = ''.join(random.choices(string.digits, k=9))
    data = {
        "id_no": id_no,
        "name": fake.name(),
        "email": fake.email(),
        "phone_number": fake.phone_number(),
        "address": fake.address(),
    }
    personal_data.append(data)

# Display the original personal data
print("Original Personal Data:")
for data in personal_data:
    print(data)

# Mask the name column
masked_data = []
for data in personal_data:
    masked_name = hashlib.sha256(data['name'].encode()).hexdigest()
    masked_data.append({**data, "name": masked_name})

# Display the masked personal data
print("\nMasked Personal Data:")
for data in masked_data:
    print(data)

# Create a mapping dictionary
mapping_dict = {}
for original_data, masked_record in zip(personal_data, masked_data):
    original_name = original_data['name']
    masked_name = masked_record['name']
    mapping_dict[masked_name] = original_name

# Remap the masked data back to the original data
remapped_data = []
for masked_record in masked_data:
    masked_name = masked_record['name']
    original_name = mapping_dict.get(masked_name, masked_name)
    remapped_data.append({**masked_record, "name": original_name})

# Display the remapped personal data
print("\nRemapped Personal Data:")
for data in remapped_data:
    print(data)


Original Personal Data:
{'id_no': '200748009', 'name': 'James Smith', 'email': 'ygamble@example.org', 'phone_number': '+1-084-062-8814x04455', 'address': '32729 Greg Crest Apt. 319\nSimmonsbury, MI 47066'}
{'id_no': '439509617', 'name': 'Matthew Long', 'email': 'richard38@example.com', 'phone_number': '256.563.6226x505', 'address': '97807 Sandra Knolls Suite 722\nKathleenborough, WA 76037'}
{'id_no': '675581399', 'name': 'James Peterson', 'email': 'melissa01@example.com', 'phone_number': '119.616.8843', 'address': '1713 Arias Corners Apt. 791\nEast Michelle, MI 50773'}
{'id_no': '420819284', 'name': 'Philip Schultz', 'email': 'ftaylor@example.com', 'phone_number': '001-326-527-6602x8659', 'address': 'Unit 6186 Box 9197\nDPO AA 73365'}
{'id_no': '421977096', 'name': 'Kimberly Diaz', 'email': 'steven90@example.org', 'phone_number': '001-687-329-1557x89647', 'address': '531 Wang Courts\nSouth Christopher, MP 70619'}
{'id_no': '296819419', 'name': 'Robin Wade', 'email': 'gary66@example.com

### apply XXXX to masking name and id_no; still can use id_no as reference key

In [50]:
from faker import Faker

# Create a Faker instance
fake = Faker()

# Generate dummy personal data
num_records = 10
personal_data = []
for i in range(num_records):
    data = {
        "id_no": fake.ssn(),
        "name": fake.name(),
        "email": fake.email(),
        "phone_number": fake.phone_number(),
        "address": fake.address(),
    }
    personal_data.append(data)

# Display the original personal data
print("Original Personal Data:")
for data in personal_data:
    print(data)

# Mask the id_no and name columns
masked_data = []
id_no_mapping = {}
for data in personal_data:
    masked_id_no = 'X' * 7 + data['id_no'][-3:] # Mask the first 7 digits of the SSN
    masked_name ='X' * len(data['name'].split()[0]) + ' ' + ' '.join(data['name'].split()[1:]) # Mask the first name
    masked_data.append({**data, "id_no": masked_id_no, "name": masked_name})
    id_no_mapping[data['id_no']] = masked_id_no

# Display the masked personal data
print("\nMasked Personal Data:")
for data in masked_data:
    print(data)

# Use the masked id_no for external references
print("\nExternal References:")
for original_id_no, masked_id_no in id_no_mapping.items():
    print(original_id_no, "->", masked_id_no)

Original Personal Data:
{'id_no': '680-57-8288', 'name': 'Mario Chavez', 'email': 'kimberlytaylor@example.net', 'phone_number': '(814)426-9115', 'address': '8000 Ward Drive Apt. 602\nAmandaside, NJ 23472'}
{'id_no': '770-36-1559', 'name': 'Michael Richard', 'email': 'kevin51@example.org', 'phone_number': '+1-013-328-0938x230', 'address': '025 Tanya Circle\nEast Josephmouth, AK 73669'}
{'id_no': '029-26-6431', 'name': 'Jordan Smith', 'email': 'eterry@example.com', 'phone_number': '357-522-5068', 'address': '982 Young Avenue\nObrienberg, AS 15711'}
{'id_no': '056-20-0822', 'name': 'Pamela Medina', 'email': 'jamesmorgan@example.org', 'phone_number': '265.739.2213x85603', 'address': '47465 Jacob Spring Suite 602\nLake Luis, NC 89396'}
{'id_no': '308-93-1905', 'name': 'Charles Torres', 'email': 'cruzpaul@example.net', 'phone_number': '076.406.4997', 'address': '1946 Joshua Drive Apt. 289\nWest Kathy, CO 46723'}
{'id_no': '583-34-9554', 'name': 'Grant Johnson', 'email': 'sbowman@example.org'

In [51]:
# Remap the masked data back to the original data
remapped_data = []
for masked_record in masked_data:
    original_id_no = id_no_mapping.get(masked_record['id_no'], masked_record['id_no'])
    remapped_data.append({**masked_record, "id_no": original_id_no})

# Display the remapped personal data
print("\nRemapped Personal Data:")
for data in remapped_data:
    print(data)


Remapped Personal Data:
{'id_no': 'XXXXXXX288', 'name': 'XXXXX Chavez', 'email': 'kimberlytaylor@example.net', 'phone_number': '(814)426-9115', 'address': '8000 Ward Drive Apt. 602\nAmandaside, NJ 23472'}
{'id_no': 'XXXXXXX559', 'name': 'XXXXXXX Richard', 'email': 'kevin51@example.org', 'phone_number': '+1-013-328-0938x230', 'address': '025 Tanya Circle\nEast Josephmouth, AK 73669'}
{'id_no': 'XXXXXXX431', 'name': 'XXXXXX Smith', 'email': 'eterry@example.com', 'phone_number': '357-522-5068', 'address': '982 Young Avenue\nObrienberg, AS 15711'}
{'id_no': 'XXXXXXX822', 'name': 'XXXXXX Medina', 'email': 'jamesmorgan@example.org', 'phone_number': '265.739.2213x85603', 'address': '47465 Jacob Spring Suite 602\nLake Luis, NC 89396'}
{'id_no': 'XXXXXXX905', 'name': 'XXXXXXX Torres', 'email': 'cruzpaul@example.net', 'phone_number': '076.406.4997', 'address': '1946 Joshua Drive Apt. 289\nWest Kathy, CO 46723'}
{'id_no': 'XXXXXXX554', 'name': 'XXXXX Johnson', 'email': 'sbowman@example.org', 'ph