In [4]:
import pandas as pd
from pprint import pprint
from comprehend_detect import ComprehendDetect
from detect_pi_si import *
from PIDetectAnalysis import *
import boto3
from botocore.exceptions import ClientError
import warnings
# Silence every warning category for the whole kernel session so that library
# warnings do not clutter the cell outputs below.
warnings.simplefilter("ignore")

# Shared Amazon Comprehend client; region and credentials are resolved by
# boto3 from the environment. Every later cell wraps this same client.
comprehend_client = boto3.client(service_name="comprehend")

In [16]:
# One sentence packed with several kinds of PII: a name, a street address,
# an SSN, a credit card number, an email address and a URL.
text = (
    "Sean Dyer a person lived at 4972 W. 129th Ter. Leawood, KS "
    "has ssn 489864926 credit card number 4734391837502655 "
    "analyticmodels@gmail.com https://analyticmodels.net"
)

# Wrap the shared boto3 client and request PII entities for English text.
comp_detect = ComprehendDetect(comprehend_client)
pii_list = comp_detect.detect_pii(text, 'en')

# Pretty-print the detected entities (offsets, score and type per entity).
pprint(pii_list)

[{'BeginOffset': 0,
  'EndOffset': 9,
  'Score': 0.9999734163284302,
  'Type': 'NAME'},
 {'BeginOffset': 28,
  'EndOffset': 58,
  'Score': 0.9999976754188538,
  'Type': 'ADDRESS'},
 {'BeginOffset': 67,
  'EndOffset': 76,
  'Score': 0.9999994039535522,
  'Type': 'SSN'},
 {'BeginOffset': 96,
  'EndOffset': 112,
  'Score': 0.9999847412109375,
  'Type': 'CREDIT_DEBIT_NUMBER'},
 {'BeginOffset': 113,
  'EndOffset': 137,
  'Score': 0.9999936819076538,
  'Type': 'EMAIL'},
 {'BeginOffset': 138,
  'EndOffset': 164,
  'Score': 0.9999992251396179,
  'Type': 'URL'}]


### Names fail to be detected in common scenarios involving addresses

In [25]:
# Sample sentence whose leading words ("James Street") can be read either as
# a person's name or as a street name.
text = (
    "James Street, Seattle, WA working at Wood and Sons "
    "says: 'Reduce raise author play move."
)

# Run the same English-language PII detection as in the previous example.
comp_detect = ComprehendDetect(comprehend_client)
pii_list = comp_detect.detect_pii(text, 'en')

# The recorded output for this cell holds a single ADDRESS entity covering
# offsets 0-12 and no NAME entity.
pprint(pii_list)

[{'BeginOffset': 0,
  'EndOffset': 12,
  'Score': 0.9874647259712219,
  'Type': 'ADDRESS'}]


### Run Comprehend detection and redaction on 10 rows of sample data from S3 and save results to local storage

In [16]:
# Load only the first 10 rows of the sample data set from S3.
df = pd.read_csv("s3://pearsoncomprehend/comprehendData/pii_test_faker.csv", delimiter=",", nrows=10)

# DataFrame.dropna() returns a new frame rather than modifying `df`; the
# original bare `df.dropna()` call threw that result away, so rows with
# missing values were never removed. Assign the result, and rebuild the index
# so the helpers below still receive the contiguous 0..n-1 RangeIndex they
# were getting before this fix.
df = df.dropna().reset_index(drop=True)

# Instantiate the wrapper class around the shared Comprehend client.
comp_detect = ComprehendDetect(comprehend_client)

# Return PI detections as a dataframe.
df_pi = detect_pi(df, comp_detect)

# Redact PI detections.
df_redacted = redact_df(df, df_pi)

# Rescan the redacted data for excluded names.
df_names = detect_names(df_redacted, comp_detect)

# Redact the names found by the second pass.
df_redacted = redact_df(df_redacted, df_names)

# Write the 10-row results to local storage (the data/ directory must exist).
df_pi.to_csv("data/detectedPI.csv", index=False)
df_names.to_csv("data/detectedNames.csv", index=False)
df_redacted.to_csv("data/fakerRedacted.csv", index=False)
# Also keep the (cleaned) input rows next to the results for comparison.
df[:10].to_csv("data/faker10Rows.csv", index=False)


### Run Comprehend detection and redaction on 1000 rows of sample data in S3

In [17]:
# Load the complete sample data set from S3 (no row limit).
df = pd.read_csv("s3://pearsoncomprehend/comprehendData/pii_test_faker.csv", delimiter=",")

# DataFrame.dropna() returns a new frame rather than modifying `df`; the
# original bare `df.dropna()` call threw that result away, so rows with
# missing values were never removed. Assign the result, and rebuild the index
# so the helpers below still receive the contiguous 0..n-1 RangeIndex they
# were getting before this fix.
df = df.dropna().reset_index(drop=True)

# Instantiate the wrapper class around the shared Comprehend client.
comp_detect = ComprehendDetect(comprehend_client)

# Return PI detections as a dataframe.
df_pi = detect_pi(df, comp_detect)

# Redact PI detections.
df_redacted = redact_df(df, df_pi)

# Rescan the redacted data for excluded names.
df_names = detect_names(df_redacted, comp_detect)

# Redact the names found by the second pass.
df_redacted = redact_df(df_redacted, df_names)

# Write results back to S3, alongside the input file.
df_pi.to_csv("s3://pearsoncomprehend/comprehendData/detectedPI.csv", index=False)
df_names.to_csv("s3://pearsoncomprehend/comprehendData/detectedNames.csv", index=False)
df_redacted.to_csv("s3://pearsoncomprehend/comprehendData/fakerRedacted.csv", index=False)

### Generate PI Detection Report

In [None]:
# Location of the PI detections CSV that the full-dataset run wrote to S3.
detections_csv = "s3://pearsoncomprehend/comprehendData/detectedPI.csv"

# Build the report generator on that CSV and produce the markdown report.
# NOTE(review): presumably this writes pi_detection_report.md - confirm
# against PIDetectionReportGenerator.
generator = PIDetectionReportGenerator(detections_csv)
generator.generate_markdown_report()

In [None]:
from IPython.display import Markdown, display

# Markdown report file in the notebook's working directory.
# NOTE(review): presumably produced by the report-generation cell - confirm.
report_path = "pi_detection_report.md"

# Read the whole report into memory, then render it as rich markdown output.
with open(report_path, "r") as f:
    markdown_string = f.read()

display(Markdown(markdown_string))