In [0]:
%pip install -r ../../requirements.txt -q

In [0]:
# Notebook parameters, exposed as Databricks widgets so they can be changed without editing code.
dbutils.widgets.dropdown("aws_secret_creation", defaultValue="True", choices=["True", "False"])
for widget_name, widget_default in (
    ("schema", "human_resources"),
    ("region", "eu-west-1"),
    ("catalog", "production"),
    ("uc_service_credential", "production-aws-secrets-manager"),
):
    dbutils.widgets.text(widget_name, defaultValue=widget_default)

# Resolve the widget values once; later cells reference these variables.
catalog, region, schema, service_credential = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("catalog", "region", "schema", "uc_service_credential")
)

# current_metastore() is split on ':' and only the last segment is kept as the metastore id.
metastore_row = sql("SELECT element_at(split(current_metastore(), ':'), -1) AS metastore").first()
metastore_id = metastore_row.metastore

## Step 1
* Download the titanic dataset and store it in a UC volume for raw files. 
* We'll use this to simulate a dataset that contains PII (Name, Age, Sex)

In [0]:
%sql
-- Switch to the target catalog, then make sure the schema and the raw_files volume exist.
USE CATALOG IDENTIFIER(:catalog);
CREATE SCHEMA IF NOT EXISTS IDENTIFIER(:catalog || '.' || :schema);
CREATE VOLUME IF NOT EXISTS IDENTIFIER(:catalog || '.' || :schema || '.raw_files');

In [0]:
import subprocess

# Source CSV and its destination inside the raw_files UC volume.
file_url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
volume_dir = f"/Volumes/{catalog}/{schema}/raw_files/"
volume_path = f"{volume_dir}titanic.csv"

# check=True makes the cell fail if wget exits with a non-zero status.
download_command = ["wget", file_url, "-O", volume_path]
subprocess.run(download_command, check=True)

# List the volume so the downloaded file is visible in the cell output.
display(dbutils.fs.ls(volume_dir))

## Step 2
* Generate a key encryption key (KEK) and a data encryption key (DEK), then encrypt (wrap) the DEK with the KEK

In [0]:
# Helper notebook that provides the key generation / wrapping functions used below.
crypto_functions = dbutils.import_notebook("notebooks.envelope_encryption_v2.common.pycrypto_functions")

kek = crypto_functions.generate_kek()
dek = crypto_functions.generate_dek()

# Wrap the DEK's private key with the KEK.
encrypted_dek = crypto_functions.encrypt_with_kek(
    kek_password=kek.get("kek_password"),
    kek_salt=kek.get("kek_salt"),
    to_encrypt=dek.get("private_key"),
)

# Store the wrapped key under "dek" and drop the plaintext private key so it is never persisted.
encrypted_dek["dek"] = encrypted_dek.pop("encrypted_string")
dek.pop("private_key")

# Merge the remaining DEK fields over the wrapped-key fields, then prefix every entry with the
# schema name so entries for different schemas can be told apart by key.
secret = {**encrypted_dek, **dek}
updated_secret = {f"{schema}_{field}": value for field, value in secret.items()}

## Step 3
* Create an AWS secret to store our KEK and DEK
* In order for this to work your UC service credential will need the following privileges: `"secretsmanager:ListSecrets", "secretsmanager:CreateSecret"`
* You can optionally create your secret manually, via your own scripts, or via infrastructure as code (IaC) tooling such as Terraform

In [0]:
import json 
import boto3
from botocore.exceptions import ClientError

# The dropdown yields the literal string "True" or "False". Compare it explicitly instead of
# eval()-ing it: widget values can be overridden (e.g. by job parameters), and eval() would
# execute whatever text is supplied.
create_aws_secret = dbutils.widgets.get("aws_secret_creation").strip().lower() == "true"

if create_aws_secret:

    # The secret payload is the KEK fields plus the schema-prefixed wrapped DEK fields.
    secret_string = json.dumps(kek | updated_secret)

    # Authenticate to AWS through the Unity Catalog service credential.
    boto3_session = boto3.Session(
        botocore_session=dbutils.credentials.getServiceCredentialsProvider(service_credential),
        region_name=region,
    )

    try:
        secret = crypto_functions.create_aws_secret(
            session=boto3_session,
            secret_name=f"unity_catalog/{metastore_id}/{catalog}",
            secret_description=f"KEK and encrypted DEKs for the UC catalog {catalog} in metastore {metastore_id}",
            secret_string=secret_string,
            tags=[],
            kms_key="alias/aws/secretsmanager")
        print(f"Successfully created secret in AWS!\nName: {secret.get('Name')}\nARN: {secret.get('ARN')}\nVersion: {secret.get('VersionId')}")
    except ClientError as e:
        # Best effort: report the AWS error and let the rest of the notebook continue.
        print(e)

## Step 4
* Create an `unwrap_key()` function that can be used to return a decrypted DEK 
* This function is a [Unity Catalog batch python user-defined function (UDF)](https://docs.databricks.com/aws/en/udf/python-batch-udf) that uses a [Unity Catalog service credential to connect to external cloud services](https://docs.databricks.com/aws/en/connect/unity-catalog/cloud-services/use-service-credentials) (in this instance AWS secrets manager)
* Only privileged crypto administrators should have permissions to access this function, and the UC service credential it uses to connect to AWS

> ### IMPORTANT: 
Please update the `CREDENTIALS()` section below with your `uc_service_credential`

In [0]:
%sql
-- Dedicated schema (in the current catalog) that holds the key-handling functions.
CREATE SCHEMA IF NOT EXISTS crypto;
USE SCHEMA crypto;
-- IMPORTANT!!! 
---> BEFORE RUNNING THIS STEP PLEASE UPDATE THE CREDENTIALS() SECTION TO REFERENCE YOUR uc_service_credential
-- crypto.unwrap_key(secret_name, key_name) returns a decrypted DEK as a STRING.
-- It is a Python batch UDF (PARAMETER STYLE PANDAS) whose entry point is the Python function
-- named in HANDLER. The CREDENTIALS clause attaches the UC service credential the function
-- uses as its default AWS credential, and ENVIRONMENT pins pycryptodome, which provides the
-- Crypto.* modules imported in the function body.
CREATE OR REPLACE FUNCTION crypto.unwrap_key(secret_name STRING, key_name STRING) 
RETURNS STRING
LANGUAGE PYTHON
PARAMETER STYLE PANDAS
HANDLER 'batchhandler'
CREDENTIALS (
  `production-aws-secrets-manager` DEFAULT -- IMPORTANT! REPLACE THIS WITH YOUR UC SERVICE CREDENTIAL!!!
  -- service credential should align with the catalog!
)
ENVIRONMENT (
  dependencies = '["pycryptodome==3.22.0"]',
  environment_version = 'None'
)
AS $$
import boto3
from pyspark.taskcontext import TaskContext
from botocore.exceptions import ClientError
from Crypto.Cipher import AES
from Crypto.Protocol.KDF import scrypt
import base64
from typing import Iterator, Tuple
import json
import pandas as pd

def setup_session():
  """Return a boto3 Secrets Manager client.

  The region is read from the Spark task-local property
  "spark.databricks.clusterUsageTags.region".
  """
  aws_region = TaskContext.get().getLocalProperty("spark.databricks.clusterUsageTags.region")
  return boto3.Session().client("secretsmanager", region_name=aws_region)

def decrypt_with_kek(kek_password, kek_salt, dek, nonce, tag):
  """Decrypt a wrapped DEK with a key derived from the KEK password.

  Args:
    kek_password: password the KEK is derived from via scrypt.
    kek_salt: base64-encoded scrypt salt.
    dek: base64-encoded AES-GCM ciphertext of the DEK.
    nonce: base64-encoded AES-GCM nonce.
    tag: base64-encoded AES-GCM authentication tag.

  Returns:
    The decrypted DEK decoded as a UTF-8 string.

  Raises:
    ValueError: if the authentication tag does not verify.
  """
  derived_key = scrypt(kek_password, base64.b64decode(kek_salt), key_len=32, N=2**17, r=8, p=1)
  gcm = AES.new(derived_key, AES.MODE_GCM, nonce=base64.b64decode(nonce))
  # Decrypts and checks the tag in one call; a failed check raises ValueError.
  plaintext = gcm.decrypt_and_verify(base64.b64decode(dek), base64.b64decode(tag))
  return plaintext.decode('utf-8')

# Built once at module level so every batch processed by batchhandler() reuses the same client.
client = setup_session()

def batchhandler(batch_iter: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
  """Unwrap one DEK per input row.

  Args:
    batch_iter: iterator of (secret_name, key_name) pandas Series pairs, one pair per batch.
      secret_name is the AWS Secrets Manager secret id; key_name is the prefix of the
      "<key_name>_dek" / "<key_name>_nonce" / "<key_name>_tag" entries inside that secret.

  Yields:
    For each batch, a pandas Series holding the decrypted DEK for every input row, in input
    order, so the output always has the same length as the input batch.

  Raises:
    botocore.exceptions.ClientError: if a secret cannot be fetched.
    ValueError: from decrypt_with_kek, if a wrapped DEK fails authentication.
  """
  # Per-invocation caches: each secret is fetched once and each DEK is unwrapped once
  # (the scrypt derivation is expensive), even when many rows repeat the same arguments.
  secrets_by_name = {}
  deks_by_key = {}

  for secret_names, key_names in batch_iter:

    unwrapped = []
    for secret_name, key_name in zip(secret_names, key_names):
      cache_key = (secret_name, key_name)
      if cache_key not in deks_by_key:
        if secret_name not in secrets_by_name:
          response = client.get_secret_value(SecretId=secret_name)
          secrets_by_name[secret_name] = json.loads(response.get("SecretString"))
        secret = secrets_by_name[secret_name]
        deks_by_key[cache_key] = decrypt_with_kek(
          kek_password=secret.get("kek_password"),
          kek_salt=secret.get("kek_salt"),
          dek=secret.get(f"{key_name}_dek"),
          nonce=secret.get(f"{key_name}_nonce"),
          tag=secret.get(f"{key_name}_tag"))
      unwrapped.append(deks_by_key[cache_key])

    # Explicit dtype keeps an empty batch well-defined.
    yield pd.Series(unwrapped, dtype="object")
$$;

## Step 5
* Create `encrypt()` and `decrypt()` functions that can be used to encrypt/decrypt data within our catalog. 
* These functions will call our more privileged `unwrap_key()` function in order to unwrap DEKs and encrypt or decrypt the data

In [0]:
# Create crypto.encrypt(col, schema): AES-GCM encrypts `col` with the DEK returned by
# crypto.unwrap_key for `schema`, then base64-encodes the ciphertext.
# The secret name is baked into the function at creation time from metastore_id and catalog,
# so the function must be re-created if either of them changes.
sql(f"""
    CREATE OR REPLACE FUNCTION crypto.encrypt(col STRING, schema STRING) 
    RETURNS STRING
    RETURN base64(aes_encrypt(
            col, 
            (SELECT * FROM (SELECT crypto.unwrap_key("unity_catalog/{metastore_id}/{catalog}", schema))),
            'GCM',  
            'DEFAULT'
        ))
    """)

In [0]:
# Create crypto.decrypt(col, schema):
#   * callers in the account group "<catalog>.<schema>.crypto.user" get the value decrypted
#     with the DEK returned by crypto.unwrap_key for `schema`;
#   * everyone else gets `col` back unchanged (i.e. still encrypted).
# try_aes_decrypt yields NULL when decryption fails, in which case nvl() falls back to `col`.
# The catalog and secret name are baked in at creation time from catalog and metastore_id.
sql(f"""
    CREATE OR REPLACE FUNCTION crypto.decrypt(col STRING, schema STRING) 
    RETURNS STRING
    RETURN 
        CASE WHEN is_account_group_member(CONCAT('{catalog}.', schema, '.crypto.user')) THEN 
        nvl(CAST(try_aes_decrypt(unbase64(col), 
        (SELECT * FROM (SELECT crypto.unwrap_key("unity_catalog/{metastore_id}/{catalog}", schema))),
        'GCM',  
        'DEFAULT') AS STRING), 
        col)
        ELSE col END
    """)

## Step 6
* Create a table from the raw data we downloaded above, encrypting the columns that contain sensitive data

In [0]:
%sql
USE SCHEMA IDENTIFIER(:schema);
-- Build the encrypted table from the raw CSV. The target is fully qualified with :catalog so it
-- is always the same table that the SELECT below (and every later cell) reads, regardless of
-- which catalog happens to be current in the session.
CREATE OR REPLACE TABLE IDENTIFIER(:catalog || '.' || :schema || '.titanic') AS (
SELECT 
PassengerId,
-- The PII columns are stored encrypted; all other columns are copied through unchanged.
crypto.encrypt(Name, :schema) AS Name,
crypto.encrypt(Age, :schema) AS Age,
crypto.encrypt(Sex, :schema) AS Sex,
* EXCEPT(PassengerId, Name, Age, Sex)
FROM read_files(
  concat('/Volumes/', :catalog, '/', :schema, '/raw_files/titanic.csv'),
  format => 'csv',
  header => true,
  mode => 'FAILFAST')
);
-- Show the stored (encrypted) rows.
SELECT * FROM IDENTIFIER(:catalog || '.' || :schema || '.titanic');

## Step 7
* Check that the decrypt functions work as expected...

In [0]:
%sql
-- Read the encrypted table back, passing the PII columns through crypto.decrypt.
SELECT
  PassengerId,
  crypto.decrypt(Name, :schema) AS Name,
  crypto.decrypt(Age, :schema) AS Age,
  crypto.decrypt(Sex, :schema) AS Sex,
  * EXCEPT (PassengerId, Name, Age, Sex)
FROM IDENTIFIER(concat(:catalog, '.', :schema, '.titanic'));

## Step 8 
* You can also add a column mask to the encrypted table
* A column mask serves the following purposes:
  * The calling users don't even need permissions to the `encrypt()` and `decrypt()` functions or the `crypto` schema
  * The whole process of encryption/decryption is abstracted away from them

> ### NOTE: 
Adding a column mask is likely to cause calling the `decrypt()` function directly to fail, since the column mask will try to decrypt the data automatically and you'll be trying to decrypt the already decrypted results!

> ### IMPORTANT: 
Please update the `USING COLUMNS('human_resources')` section below with your `schema` name

In [0]:
%sql
-- IMPORTANT!!!
-- Before running this step, edit the literal in each USING COLUMNS (...) clause so that it is
-- your UC schema name. It is passed to crypto.decrypt as its `schema` argument.
ALTER TABLE IDENTIFIER(concat(:catalog, '.', :schema, '.titanic'))
  ALTER COLUMN Name SET MASK crypto.decrypt USING COLUMNS ('human_resources');
ALTER TABLE IDENTIFIER(concat(:catalog, '.', :schema, '.titanic'))
  ALTER COLUMN Age SET MASK crypto.decrypt USING COLUMNS ('human_resources');
ALTER TABLE IDENTIFIER(concat(:catalog, '.', :schema, '.titanic'))
  ALTER COLUMN Sex SET MASK crypto.decrypt USING COLUMNS ('human_resources');

In [0]:
%sql
-- With the column masks set, a plain SELECT applies crypto.decrypt to Name, Age and Sex.
SELECT * FROM IDENTIFIER(concat(:catalog, '.', :schema, '.titanic'));