In [None]:
import csv
import re
from collections import defaultdict
from pathlib import Path

### Patterns

#### Credit Card Number

In [None]:
# Modified from https://www.regular-expressions.info/creditcard.html
#
# Compiled in verbose mode (re.X) so each card brand sits on its own commented
# line. The single outer ^(?: ... )$ anchors every alternative, so individual
# branches need no anchors of their own; a value only matches when the whole
# string is a card number.
# NOTE(review): the MasterCard branch accepts prefixes 50-55 — confirm that
# the 50 prefix is intended.
ccn = re.compile(
    r"""
    ^(?: 4[0-9]{12}(?:[0-9]{3})?          # Visa
     |   5[0-5][0-9]{14}                  # MasterCard
     |   3[47][0-9]{13}                   # American Express
     |   3(?:0[0-5]|[68][0-9])[0-9]{11}   # Diners Club
     |   6(?:011|5[0-9]{2})[0-9]{12}      # Discover
     |   (?:2131|1800|35\d{3})\d{11}      # JCB
    )$
    """,
    re.X,
)

#### Social Security Number

In [None]:
# Two anchored alternatives: a dashed form (3-2-4 digits) and an undashed form
# (9 digits). In both, negative lookaheads reject a first group that is 000,
# 666 or starts with 9, a middle group of 00, and a last group of 0000.
ssn = (
    r"^(?!(000|666|9))\d{3}-(?!00)\d{2}-(?!0000)\d{4}$"  # with dashes
    r"|^(?!(000|666|9))\d{3}(?!00)\d{2}(?!0000)\d{4}$"   # without dashes
)

#### Email Address

In [None]:
# Whole-string email match, wrapped in a single capturing group.
email = (
    r"(^[a-zA-Z0-9_.+-]+"   # part before the @
    r"@[a-zA-Z0-9-]+"       # first label after the @
    r"\.[a-zA-Z0-9-.]+$)"   # a dot, then the remaining labels (dots allowed)
)

#### IP Address

In [None]:
# Dotted-quad IPv4 address occupying the whole string. Each octet is limited
# to 0-255 (a bare [0-9]{1,3} would also accept values such as 999); leading
# zeros, e.g. 001, are still accepted.
ipv4 = (
    r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]{1,2})\.){3}"  # first three octets
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]{1,2})$"           # final octet
)

### Mock PII Sample

In [None]:
# Relative path to the mock PII CSV scanned in the cells below
pii_file = Path("../tests/data/mock_pii.csv")

In [None]:
# Show the raw contents of the mock PII file before scanning it
mock_pii_text = pii_file.read_text(encoding="utf-8")
print(mock_pii_text)

### Look for patterns in mock PII file

In [None]:
from blacktape.lib import match_patterns_in_text

# Pattern-label pairs; each pattern may be either a plain string or a
# compiled regular expression.
patterns = [
    (ccn, "ccn"),
    (ssn, "ssn"),
    (email, "email"),
    (ipv4, "ipv4"),
]

# Maps each pattern label to the list of match results found for it.
found = defaultdict(list)

# Decode explicitly as UTF-8 (the encoding used when the file is printed
# earlier in the notebook) instead of relying on the platform default, and
# pass newline="" as the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with pii_file.open(mode="r", encoding="utf-8", newline="") as f:
    # For each line of CSV data
    for person in csv.DictReader(f):

        # For each CSV field value (the column names are not needed here)
        for value in person.values():

            # DictReader yields None for missing trailing fields and a list
            # for surplus ones; only actual text is scanned.
            if not isinstance(value, str):
                continue

            # Store every match result under its pattern's label
            for match in match_patterns_in_text(value, patterns):
                found[match["label"]].append(match)


# Total match count by pattern
for label, matches in found.items():
    print(f"{label}:\t{len(matches)}")