In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from datetime import datetime, timedelta
import random



# Base directory under which save_csv() writes each generated dataset.
# NOTE(review): the path has no scheme ("dbfs:/") and no leading slash, so it
# is a relative path — confirm it resolves to the intended location.
output_path = "dbfs/Workspace/Users/jusoares_flor@hotmail.com/kyc_risk_project/data/csv_sources"

# Pool of display names that generate_clients() samples from (with repetition).
names = [
    'Carlos L.', 'Sofia R.', 'Lucas M.', 'Isabela F.', 'Gabriel A.',
    'Laura C.', 'Mateus G.', 'Júlia S.', 'Pedro R.', 'Beatriz S.',
    'John S.', 'Maria G.', 'James J.', 'Patricia B.', 'Robert J.',
    'Wei C.', 'Li N.', 'Jing W.', 'Yang L.', 'Wei Z.',
    'Fatima A.', 'Mohammed A.', 'Ahmed A.', 'Aisha A.'
]

# Pool of countries: generate_clients() assigns one to each client and
# generate_high_risk_countries() samples its high-risk list from it.
countries = [
    'Brazil', 'Portugal', 'Spain', 'China', 'Oman',
    'Germany', 'India', 'Russia', 'Argentina', 'Mexico',
    'Iran', 'Venezuela', 'Myanmar', 'Lebanon', 'North Korea'
]


def generate_clients(spark, n_clients=150):
    """Build a DataFrame of randomly generated clients.

    Each row gets a sequential ``client_id`` starting at 1, a name drawn from
    the module-level ``names`` list, an age drawn uniformly from 15 to 75
    (both inclusive), and a country drawn from the module-level ``countries``
    list.

    Args:
        spark: Session object whose ``createDataFrame`` builds the result.
        n_clients: Number of client rows to generate.

    Returns:
        A DataFrame with the non-nullable columns ``client_id``, ``name``,
        ``age`` and ``country``.
    """
    min_age, max_age = 15, 75

    rows = []
    for client_id in range(1, n_clients + 1):
        # Draw order (name, age, country) is fixed so that seeded runs
        # consume the random stream in a stable sequence.
        picked_name = random.choice(names)
        picked_age = random.randint(min_age, max_age)
        picked_country = random.choice(countries)
        rows.append((client_id, picked_name, picked_age, picked_country))

    columns = (
        ("client_id", IntegerType()),
        ("name", StringType()),
        ("age", IntegerType()),
        ("country", StringType()),
    )
    client_schema = StructType(
        [StructField(column, dtype, False) for column, dtype in columns]
    )

    return spark.createDataFrame(rows, schema=client_schema)


def generate_transactions(spark, clients_df, n_transactions=500):
    """Build a DataFrame of random transactions for existing clients.

    Each transaction gets a sequential ``transaction_id`` starting at 1, a
    ``client_id`` picked at random from ``clients_df``, an amount between
    10.00 and 5000.00 rounded to two decimals, and an ISO-formatted date
    falling within the 365 days up to and including today.

    Args:
        spark: Session object whose ``createDataFrame`` builds the result.
        clients_df: DataFrame containing a ``client_id`` column to sample from.
        n_transactions: Number of transaction rows to generate.

    Returns:
        A DataFrame with the non-nullable columns ``transaction_id``,
        ``client_id``, ``transaction_amount`` and ``transaction_date``
        (the date is stored as a ``YYYY-MM-DD`` string).

    Raises:
        IndexError: If ``clients_df`` has no rows and ``n_transactions`` is
            positive (there is nothing to sample from).
    """
    # Only the ids are used, so project before collecting instead of pulling
    # every client column to the driver.
    client_ids = [
        row["client_id"] for row in clients_df.select("client_id").collect()
    ]

    # Resolve the reference date once: it is loop-invariant, and a single
    # value keeps the whole batch consistent if the run crosses midnight.
    today = datetime.now().date()

    transactions = []
    for transaction_id in range(1, n_transactions + 1):
        client_id = random.choice(client_ids)
        amount = round(random.uniform(10.0, 5000.0), 2)
        transaction_date = today - timedelta(days=random.randint(0, 365))
        transactions.append(
            (transaction_id, client_id, amount, transaction_date.isoformat())
        )

    schema = StructType([
        StructField("transaction_id", IntegerType(), False),
        StructField("client_id", IntegerType(), False),
        StructField("transaction_amount", FloatType(), False),
        StructField("transaction_date", StringType(), False)
    ])

    return spark.createDataFrame(transactions, schema=schema)


def generate_high_risk_countries(spark, n_high_risk_countries=10):
    """Build a single-column DataFrame of randomly chosen high-risk countries.

    The countries are sampled without repetition from the module-level
    ``countries`` list.

    Args:
        spark: Session object whose ``createDataFrame`` builds the result.
        n_high_risk_countries: How many distinct countries to pick.

    Returns:
        A DataFrame with one non-nullable string column, ``country``.

    Raises:
        ValueError: If ``n_high_risk_countries`` is negative or exceeds the
            number of available countries (raised by ``random.sample``).
    """
    picked = random.sample(countries, n_high_risk_countries)
    # zip over a single iterable yields 1-tuples, i.e. one-column rows.
    rows = list(zip(picked))

    country_field = StructField("country", StringType(), False)
    country_schema = StructType([country_field])

    return spark.createDataFrame(rows, schema=country_schema)

# Generate the synthetic clients dataset.
# `spark` is not defined in this file — presumably the session injected by the
# notebook runtime; confirm when running outside a notebook.
print("Generating clients...")
clients_df = generate_clients(spark)
# NOTE(review): print() on a DataFrame presumably shows only its repr (column
# names/types), not rows — use clients_df.show() if a data preview is wanted.
print(clients_df)


In [0]:

def save_csv(df, sub_path):
    """Write ``df`` as CSV with a header row under ``output_path``.

    The DataFrame is coalesced to a single partition before writing, and any
    existing output at the destination is overwritten.

    Args:
        df: DataFrame to write.
        sub_path: Location relative to the module-level ``output_path``.
    """
    single_partition = df.coalesce(1)
    writer = single_partition.write.mode("overwrite").option("header", True)
    destination = f"{output_path}/{sub_path}"
    writer.csv(destination)