In [0]:
%pip install -q faker mimesis

In [0]:
# Restart the notebook's Python process.
# NOTE(review): presumably needed so the packages installed in the previous cell become importable — confirm.
dbutils.library.restartPython()

In [0]:
from databricks.sdk import WorkspaceClient

wsc = WorkspaceClient()

# Row count is offered as string choices; the selected value is converted with int() below.
dbutils.widgets.dropdown("num_rows", defaultValue="1000", choices=["1000", "10000", "100000", "1000000"])

# Offer every catalog returned by the workspace client; the first one is the default.
catalogs = [x.full_name for x in wsc.catalogs.list()]
if not catalogs:
    # Fail with an actionable message instead of an IndexError on catalogs[0].
    raise ValueError("No catalogs were returned by the workspace client; cannot build the 'catalog' widget.")
dbutils.widgets.dropdown("catalog", defaultValue=catalogs[0], choices=catalogs)

# Schema choices are resolved from the catalog widget's value at the time this cell runs;
# re-run the cell after selecting a different catalog to refresh them.
selected_catalog = dbutils.widgets.get("catalog")
schemas = [x.name for x in wsc.schemas.list(catalog_name=selected_catalog)]
if not schemas:
    # Fail with an actionable message instead of an IndexError on schemas[0].
    raise ValueError(f"No schemas were returned for catalog '{selected_catalog}'; cannot build the 'schema' widget.")
dbutils.widgets.dropdown("schema", defaultValue=schemas[0], choices=schemas)

# Free-text name of the table that will receive the generated rows.
dbutils.widgets.text("table_name", defaultValue="fake_pii_data")

# Number of fake rows to generate, read from the dropdown declared above.
num_rows = int(dbutils.widgets.get("num_rows"))

In [0]:
from faker import Faker
from mimesis import Generic
from mimesis.locales import Locale
import random
from pyspark.sql.types import *
import pandas as pd

# Shared Faker generator (US English locale) used by the row-generation code.
fake = Faker("en_US")

# Output columns, in the order they appear in the result schema.
# "date_of_birth" is currently disabled.
PII_COLUMN_NAMES = [
    "name",
    "email",
    "passport",
    "phone_number",
    "ipv4",
    "ipv6",
    "address",
    "location",
    "ssn",
    "itin",
    "bank_number",
    "iban",
    "credit_card_number",
]

# Every column is declared as a non-nullable string.
schema = StructType(
    [StructField(column_name, StringType(), False) for column_name in PII_COLUMN_NAMES]
)

def generate_data(pdf):
    """Fill one record with fake PII values and return it.

    Despite its name, ``pdf`` is treated as a single record: anything that
    supports item assignment (for example a pandas row ``Series`` or a plain
    ``dict``). The record is modified in place and also returned.

    For fields that can come from several Faker providers (email, passport,
    phone number, IPv4, location), one provider is picked uniformly at random
    and only that provider is called, so no fake values are generated just to
    be thrown away.

    Args:
        pdf: Mutable record to populate. Existing keys are left untouched
            unless they collide with one of the generated field names.

    Returns:
        The same ``pdf`` object, with the generated fields set.
    """
    pdf["name"] = fake.name()
    # pdf["date_of_birth"] = fake.date_of_birth()
    # Pick the provider first, then call it: the choice is among bound methods,
    # not among already-generated values.
    pdf["email"] = random.choice([fake.email, fake.company_email, fake.free_email])()
    pdf["passport"] = random.choice([fake.passport_number, fake.passport_full])()
    pdf["phone_number"] = random.choice([fake.phone_number, fake.basic_phone_number])()
    pdf["ipv4"] = random.choice([fake.ipv4, fake.ipv4_private, fake.ipv4_public])()
    pdf["ipv6"] = fake.ipv6()
    pdf["address"] = fake.address()
    pdf["location"] = random.choice([fake.city, fake.country, fake.state, fake.street_address])()
    pdf["ssn"] = fake.ssn()
    pdf["itin"] = fake.itin()
    pdf["bank_number"] = fake.bban()
    pdf["iban"] = fake.iban()
    pdf["credit_card_number"] = fake.credit_card_number()  # alternative: credit_card_full
    # TODO: us_driver_license is not generated yet.
    return pdf

def generate_fake_data(pdf: pd.DataFrame) -> pd.DataFrame:
    """Produce one fake-PII row for every row of the incoming batch.

    The incoming columns are used only to determine how many rows to
    generate; none of them are carried into the result. Each output row is
    built by handing ``generate_data`` a fresh ``dict``, so no row object
    owned by pandas is mutated while the frame is being traversed (pandas
    documents mutating UDFs passed to ``apply`` as unsupported), and the
    frame is constructed once from the collected records rather than through
    a row-wise ``apply``.

    Args:
        pdf: Batch of input rows. Only its length and index are used.

    Returns:
        A DataFrame with the same index as ``pdf`` whose columns are the
        fields set by ``generate_data``, in the order they were set. For an
        empty batch the result is an empty DataFrame.
    """
    records = [generate_data({}) for _ in range(len(pdf))]
    return pd.DataFrame(records, index=pdf.index)

In [0]:
from pyspark.sql.functions import spark_partition_id

# Start from num_rows sequential ids and tag each with the Spark partition it lives in.
id_frame = spark.range(0, num_rows)
partitioned_ids = id_frame.withColumn("partition_id", spark_partition_id())

# Group by that partition tag and let generate_fake_data build the rows for each group,
# returning them in the layout described by `schema`.
df = partitioned_ids.groupBy("partition_id").applyInPandas(generate_fake_data, schema)

# NOTE(review): df is lazy and no Faker seed is set in this notebook, so each action
# (this display, a later write) presumably regenerates different values — confirm.
display(df)

In [0]:
# Build the three-part table name from the widget selections.
target_catalog = dbutils.widgets.get('catalog')
target_schema = dbutils.widgets.get('schema')
target_table = dbutils.widgets.get('table_name')
full_table_name = f"{target_catalog}.{target_schema}.{target_table}"

# Replace any existing table of that name with the generated rows.
df.write.mode("overwrite").saveAsTable(full_table_name)