In [1]:
import csv
import re
from collections import defaultdict
from pathlib import Path

### Patterns

#### Credit Card Number

In [2]:
# Modified from https://www.regular-expressions.info/creditcard.html
#
# Compiled in verbose mode (re.X) so the pattern can carry layout whitespace
# and one inline comment per card brand. The alternation is anchored exactly
# once, by the leading "^" and the trailing "$" around the group, so every
# brand is matched as a complete string; no alternative needs its own anchors.
# Compiling directly also keeps `ccn` from being bound to a plain string first.
ccn = re.compile(
    r"""
^(?:4[0-9]{12}(?:[0-9]{3})?          # Visa: 13 or 16 digits, leading 4
 |  5[0-5][0-9]{14}                  # MasterCard: 16 digits, prefixes 50-55
 |  3[47][0-9]{13}                   # American Express: 15 digits, 34 or 37
 |  3(?:0[0-5]|[68][0-9])[0-9]{11}   # Diners Club: 14 digits
 |  6(?:011|5[0-9]{2})[0-9]{12}      # Discover: 16 digits, 6011 or 65
 |  (?:2131|1800|35\d{3})\d{11}      # JCB: 15 or 16 digits
)$
""",
    re.X,
)

#### Social Security Number

In [3]:
# Two fully anchored alternatives, joined by "|": the first expects hyphen
# separators (AAA-GG-SSSS), the second the same nine digits with no separators.
# In both, negative lookaheads reject an area starting with 000, 666 or 9,
# a group of 00, and a serial of 0000.
ssn = (
    r"^(?!(000|666|9))\d{3}-(?!00)\d{2}-(?!0000)\d{4}$"   # hyphenated form
    r"|^(?!(000|666|9))\d{3}(?!00)\d{2}(?!0000)\d{4}$"    # digits-only form
)

#### Email Address

In [4]:
# One capturing group around the whole address, anchored at both ends.
email = (
    r"(^[a-zA-Z0-9_.+-]+"     # local part: letters, digits, "_", ".", "+", "-"
    r"@[a-zA-Z0-9-]+"         # "@" and the first domain label
    r"\.[a-zA-Z0-9-.]+$)"     # a literal dot, then the rest (may contain dots)
)

#### IP Address

In [5]:
# Dotted-quad IPv4 address, anchored at both ends.
# Each octet is limited to 0-255 rather than "any 1-3 digits", so strings such
# as 256.1.1.1 or 999.999.999.999 are no longer reported as addresses.
# Octet alternatives: 250-255 | 200-249 | 0-199 (leading zeros are tolerated,
# so everything in range that matched before still matches).
ipv4 = (
    r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"   # first three octets + dots
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"            # final octet
)

### Mock PII Sample

In [6]:
# Location of the mock PII sample. The path is relative, so it is resolved
# against the kernel's current working directory -- presumably the folder
# holding this notebook; confirm before running from elsewhere.
pii_file = Path('../tests/data/mock_pii.csv')

In [7]:
# What's in the file?
# Reads the entire file as UTF-8 text and prints it verbatim, header row included.
print(pii_file.read_text(encoding="utf-8"))

first_name,last_name,email,ssn,ccn,ipv4
Gaven,Withnall,gwithnall0@spotify.com,231-31-8375,5048370929250009,0.119.63.89
Hermie,Giorgio,hgiorgio1@domainmarket.com,651-11-9512,5108753996251538,227.30.154.229
Miller,Maytum,mmaytum2@about.me,266-56-5007,5108751515114302,230.142.200.232
Effie,O'Regan,eoregan3@123-reg.co.uk,133-87-3446,5048370400186615,130.41.186.30
Starla,Linggood,slinggood4@merriam-webster.com,619-50-6491,5048370083353243,107.44.242.8
Jeanelle,Warricker,jwarricker5@1und1.de,561-50-3535,5108756745083623,210.27.5.152
Brandice,Gallihawk,bgallihawk6@scientificamerican.com,377-33-6286,5108759765086468,19.94.135.19
Bradley,Stainland,bstainland7@skype.com,758-39-7295,5048374873244448,114.136.212.234
Jakob,Breffit,jbreffit8@devhub.com,415-35-2679,5048376186247091,71.34.164.53
Kendricks,Rohlfing,krohlfing9@globo.com,856-61-8322,5048371006435703,205.209.127.138
Yuri,Side,ysidea@globo.com,769-39-6596,5108758076212508,79.123.251.11
Pauly,Bottrill,pbottrillb@alibaba.com,288-81-8197,5048

### Look for patterns in mock PII file

In [8]:
from blacktape.lib import match_patterns_in_text

# pattern-label pairs, with patterns being either strings or compiled regular expressions
patterns = [
    (ccn, "ccn"),
    (ssn, "ssn"),
    (email, "email"),
    (ipv4, "ipv4"),
]

# Maps each pattern label to the list of match results found for it.
found = defaultdict(list)

# Decode explicitly as UTF-8 (the same encoding used to preview the file) so
# the result does not depend on the platform's default encoding, and pass
# newline="" as the csv module asks for files handed to a reader.
with pii_file.open(mode="r", encoding="utf-8", newline="") as f:
    # For each line of CSV data
    for person in csv.DictReader(f):

        # For each CSV field (the column name is not needed for matching)
        for value in person.values():

            # DictReader fills missing cells of a short row with None and
            # gathers surplus cells of a long row into a list; only actual
            # text can be scanned for patterns.
            if not isinstance(value, str):
                continue

            # For each match of our selected patterns
            for match in match_patterns_in_text(value, patterns):

                # Store match result in label-match mapping
                found[match["label"]].append(match)


# Total match count by pattern
for label, matches in found.items():
    print(f"{label}:\t{len(matches)}")

email:	50
ssn:	50
ccn:	50
ipv4:	50
