In [1]:
%pip install duckdb

Collecting duckdb
  Downloading duckdb-1.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Downloading duckdb-1.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: duckdb
Successfully installed duckdb-1.1.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import duckdb
from pathlib import Path

In [4]:
# Open a DuckDB connection (no path is given, so DuckDB's default in-memory
# database is used) and materialise each parquet split as its own table.
con = duckdb.connect()

# table name -> parquet file it is loaded from
parquet_sources = {
    "training_data": "data/train.parquet",
    "validation_data": "data/validation.parquet",
}
for source_table, parquet_path in parquet_sources.items():
    # Running the statements inside a loop also stops the cell from echoing
    # the connection repr that `con.execute(...)` returns when it is the
    # cell's last expression.
    con.execute(f"CREATE TABLE {source_table} AS SELECT * FROM '{parquet_path}'")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x7f7aee1b9c70>

In [5]:
# Columns dropped from both training_data and validation_data before the
# tables are exported.
# NOTE(review): most of these look like customer identifier / contact /
# billing-address encodings — confirm the reason for excluding them.
columns_to_drop = [
    "CUSTOMER_ID_index",
    "customer_name_index",
    "customer_email_index",
    "phone_index",
    "billing_zip",
    "billing_city_index",
    "billing_state_index",
    "x_customer_id",
    "y_customer_id",
]

In [6]:
for table_name in ["training_data", "validation_data"]:
    # Drop bad feature columns. IF EXISTS keeps the cell re-runnable on the
    # same kernel: without it a second run raises because the columns were
    # already removed by the first run.
    for column in columns_to_drop:
        con.execute(f"ALTER TABLE {table_name} DROP COLUMN IF EXISTS {column}")

    # Convert TX_AMOUNT to DOUBLE via a temporary column: add the typed
    # column, copy the cast values across, then swap it in under the old name.
    # NOTE(review): the rebuilt TX_AMOUNT presumably ends up as the last
    # column of the table — confirm downstream consumers do not rely on the
    # original column position.
    con.execute(f"ALTER TABLE {table_name} ADD COLUMN TX_AMOUNT_TEMP DOUBLE")
    con.execute(f"UPDATE {table_name} SET TX_AMOUNT_TEMP = CAST(TX_AMOUNT AS DOUBLE)")
    con.execute(f"ALTER TABLE {table_name} DROP COLUMN TX_AMOUNT")
    con.execute(f"ALTER TABLE {table_name} RENAME COLUMN TX_AMOUNT_TEMP TO TX_AMOUNT")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [7]:
rows_per_file = 1_000_000
cleaned_data_path = "data/cleaned_data"

# Export each cleaned table as a folder of parquet files. Every file holds at
# most `rows_per_file` rows and is named after the row offset it starts at.
output_root = Path(cleaned_data_path)
output_root.mkdir(parents=True, exist_ok=True)

for table_name in ("training_data", "validation_data"):
    total_rows = con.execute(f"SELECT COUNT(*) FROM {table_name}").fetchone()[0]

    table_dir = output_root / table_name
    table_dir.mkdir(parents=True, exist_ok=True)

    # Page through the table one chunk at a time so only `rows_per_file`
    # rows are materialised as a DataFrame at once.
    offset = 0
    while offset < total_rows:
        chunk_sql = f"SELECT * FROM {table_name} LIMIT {rows_per_file} OFFSET {offset}"
        chunk_df = con.execute(chunk_sql).fetchdf()
        chunk_df.to_parquet(table_dir / f"{table_name}_{offset}.parquet")
        offset += rows_per_file

In [8]:
!aws s3 sync data/cleaned_data/ s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/

upload: data/cleaned_data/training_data/training_data_0.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_0.parquet
upload: data/cleaned_data/training_data/training_data_13000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_13000000.parquet
upload: data/cleaned_data/training_data/training_data_1000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_1000000.parquet
upload: data/cleaned_data/training_data/training_data_10000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_10000000.parquet
upload: data/cleaned_data/training_data/training_data_12000000.parquet to s3://sagemaker-us-east-2-386900942011/tech-summit-payments-fraud-workshop-1017/training_data/training_data_12000000.parquet
upload: data/cleaned_data/