# Broadridge India Innovation Hackathon 
# Title : PII data masker 

Problem statement: Many organisations collect and process huge amounts of PII (personally identifiable information), such as names, Aadhaar numbers, credit card numbers and email addresses, which are subject to privacy laws and regulations such as GDPR and CCPA. Failing to protect this information can result in legal action, penalties and reputational damage. However, traditional manual techniques for detecting and masking PII data are complex, error-prone and time-consuming for organisations. A dependable automated system that can solve this problem is therefore necessary.

In [1]:
# Data collection(data can be created using faker library)
from faker import Faker
import csv

# Faker instance used to produce the synthetic value for every cell.
fake = Faker()

# Number of synthetic records written to the CSV (the header row is extra).
num_records = 1000

# Column headers and the Faker generator for each column, in matching order.
fields = ['Name', 'Social Security Number', 'Email', 'Phone number', 'Country']
faker_methods = [fake.name, fake.ssn, fake.email, fake.phone_number, fake.country]

# Write the file explicitly as UTF-8. Without an encoding, open() uses the
# platform locale encoding, whereas pandas reads CSV files as UTF-8 by
# default; pinning the encoding keeps writer and reader in agreement.
with open('dataset.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(fields)
    for _ in range(num_records):
        # One record: call each generator once, in column order.
        row = [method() for method in faker_methods]
        writer.writerow(row)

In [2]:
# importing necessary libraries
import pandas as pd
import random
import string

In [3]:
# Identifying PII data types
# Names of the columns that hold personally identifiable information.
pii_columns = [
    'Name',
    'Social Security Number',
    'Email',
    'Phone number',
]

# Every PII column is treated as sensitive. This binds the same list object
# as pii_columns; it is not a copy.
sensitive_columns = pii_columns

In [4]:
# Randomization ( using random module to create random strings)

def randomize_string(length):
    """Return a random string of ``length`` ASCII letters and digits.

    Args:
        length: Number of characters to generate; ``0`` yields ``''``.

    Returns:
        A string whose characters are drawn with replacement from
        ``string.ascii_letters + string.digits``.
    """
    alphabet = string.ascii_letters + string.digits
    return ''.join(random.choices(alphabet, k=length))


def randomize_data(data):
    """Replace a value with random alphanumerics of the same length.

    Args:
        data: The cell value to mask. Usually a string; other types are
            measured by the length of their ``str()`` form.

    Returns:
        A random alphanumeric string as long as ``str(data)``. Missing
        values (``None`` or a float NaN) are returned unchanged.
    """
    # pandas represents empty CSV cells as float NaN, which has no len();
    # there is nothing to mask, so pass missing values through untouched.
    # (NaN is the only float that is not equal to itself.)
    if data is None or (isinstance(data, float) and data != data):
        return data
    # str() lets numeric cells (e.g. an all-digit column parsed as int) be
    # masked instead of raising TypeError on len().
    return randomize_string(len(str(data)))

In [5]:
# Substitution of PII data

def substitute_email(data):
    """Return a fixed placeholder address in place of an email value.

    The argument is ignored; every call yields the same string.
    """
    placeholder = 'example@gmail.com'
    return placeholder

def substitute_phone(data):
    """Return a fixed ten-asterisk mask in place of a phone number.

    The argument is ignored; every call yields the same string.
    """
    mask = '**********'
    return mask


In [6]:
# Maps each sensitive column name to the function applied to every value in
# that column. Name and SSN share the same length-preserving randomizer;
# email and phone are replaced with fixed placeholders.
masking_rules = {
    **dict.fromkeys(('Name', 'Social Security Number'), randomize_data),
    'Email': substitute_email,
    'Phone number': substitute_phone,
}

In [7]:
# Apply masking techniques to the dataset
def mask_dataset(dataset):
    """Return a copy of ``dataset`` with its sensitive columns masked.

    Each column named in the module-level ``sensitive_columns`` that exists
    in the data is replaced by the result of applying its function from
    ``masking_rules`` to every value. Other columns are copied as-is and
    the input ``dataset`` is not modified.
    """
    result = dataset.copy()
    # Only mask sensitive columns that are actually present in the data.
    present = [name for name in sensitive_columns if name in result.columns]
    for name in present:
        rule = masking_rules[name]
        result[name] = dataset[name].apply(rule)
    return result

In [8]:
# Load the generated records from disk
dataset = pd.read_csv('dataset.csv')
# Keep only the sensitive column names that exist in the loaded data
# (columns that are not listed as sensitive are never masked)
sensitive_columns = [name for name in sensitive_columns if name in dataset.columns]
# Build the masked copy of the dataset
masked_dataset = mask_dataset(dataset)
# Save the masked copy without the index, then load it back from the file
masked_dataset.to_csv('masked_dataset.csv', index=False)
masked_data = pd.read_csv('masked_dataset.csv')

In [9]:
# Preview the first rows of the original (unmasked) dataset
dataset.head() 

Unnamed: 0,Name,Social Security Number,Email,Phone number,Country
0,Thomas Cooke,741-38-7563,doylejack@example.net,+1-716-661-0066,Estonia
1,Christopher Hicks,038-38-0133,bradleypatricia@example.com,646.266.5755x8857,Slovakia (Slovak Republic)
2,Carrie Aguirre,718-97-6493,heatherallen@example.net,(965)542-2865x67180,Palestinian Territory
3,Tommy Klein,476-87-4020,jjones@example.net,2956698160,Ireland
4,Renee Wade,771-87-3011,tescobar@example.net,8549447933,Saint Lucia


In [10]:
# Preview the first rows of the masked dataset as re-read from masked_dataset.csv
masked_data.head()

Unnamed: 0,Name,Social Security Number,Email,Phone number,Country
0,qHUeFRkvrnGY,gWp3xMtKrx4,example@gmail.com,**********,Estonia
1,EU4cUGWANO9UN9FRO,XDaJ5F7ZNpc,example@gmail.com,**********,Slovakia (Slovak Republic)
2,YcBjzxjr9LhoHz,ICtIlpxOUfP,example@gmail.com,**********,Palestinian Territory
3,FZcQixWd0SJ,nhhASoQywb0,example@gmail.com,**********,Ireland
4,9S7fJypqLv,MBiuyfouZDo,example@gmail.com,**********,Saint Lucia
