In [0]:
import time
import uuid

from pyspark.sql import Window
from pyspark.sql.functions import col, expr, row_number


In [0]:
# Load the two raw CSV exports from the Unity Catalog volume.
# Both files carry a header row; column types are inferred by Spark.
txn_df = spark.read.csv(
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/raw_data_files/train_transaction.csv",
    header=True,
    inferSchema=True,
)

id_df = spark.read.csv(
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/raw_data_files/train_identity.csv",
    header=True,
    inferSchema=True,
)


In [0]:
# Attach identity attributes to each transaction.
# A left join keeps every transaction row, including those with no
# matching row in the identity file.
base_df = txn_df.join(id_df, "TransactionID", "left")


In [0]:
# Keep only the columns that are emitted to the simulated stream.
stream_df = base_df.select(
    [
        # transaction core
        "TransactionID",
        "TransactionDT",
        "TransactionAmt",
        "ProductCD",
        # card attributes
        "card1",
        "card2",
        "card3",
        "card4",
        "card5",
        "card6",
        # address / distance
        "addr1",
        "addr2",
        "dist1",
        "dist2",
        # e-mail domains
        "P_emaildomain",
        "R_emaildomain",
        # device
        "DeviceType",
        "DeviceInfo",
        # label
        "isFraud",
    ]
)


In [0]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, col, lit

# Derive an event time by adding TransactionDT (used here as a number of
# seconds) to the epoch seconds of the fixed reference instant
# 2017-12-01 00:00:00.
# NOTE(review): from_unixtime yields a formatted string column rather than a
# TimestampType column - confirm that downstream consumers expect a string.
reference_epoch_seconds = unix_timestamp(lit("2017-12-01 00:00:00"))
event_epoch_seconds = reference_epoch_seconds + col("TransactionDT")

stream_df = stream_df.withColumn(
    "event_timestamp",
    from_unixtime(event_epoch_seconds),
)


In [0]:
# Emission settings: rows written per batch and the pause between batches.
ROWS_PER_FILE, SLEEP_SECONDS = 1000, 5

# Cursor into the stream and the total number of rows available to emit.
# count() triggers a full Spark job over the joined data.
offset = 0
total_rows = stream_df.count()


In [0]:
# Emit at most MAX_BATCHES micro-batches of ROWS_PER_FILE rows each as JSON
# into the stream_input volume, pausing SLEEP_SECONDS between batches.
# The target volume must already exist when this cell runs.
ROWS_PER_FILE = 1000
MAX_BATCHES = 20        # 👈 CONTROL HERE: upper bound on batches emitted by this cell
SLEEP_SECONDS = 5

# `stream_df_with_id` was referenced here without being defined anywhere in
# the notebook, so the cell failed with NameError on a fresh kernel. Build it
# explicitly: `_rid` is a contiguous, zero-based row index so that every batch
# is a disjoint slice.
# Ordering by (TransactionDT, TransactionID) is deterministic provided
# TransactionID is unique - TODO confirm against the dataset.
# A window without partitionBy moves all rows to a single partition; that is
# accepted here because a global row number is required.
stream_df_with_id = stream_df.withColumn(
    "_rid",
    row_number().over(Window.orderBy("TransactionDT", "TransactionID")) - 1,
)

offset = 0
batch_count = 0

while batch_count < MAX_BATCHES:
    # Rows whose index falls in [offset, offset + ROWS_PER_FILE).
    batch_df = (
        stream_df_with_id
            .filter(
                (col("_rid") >= offset) &
                (col("_rid") < offset + ROWS_PER_FILE)
            )
            .drop("_rid")
    )

    # Stop once the source is exhausted. take(1) is used instead of
    # .rdd.isEmpty() because RDD APIs are not available on every compute type
    # (e.g. Spark Connect based clusters).
    if len(batch_df.take(1)) == 0:
        break

    batch_df.write.mode("append").json(
        "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input"
    )

    print(f"✅ Emitted batch {batch_count + 1}")

    offset += ROWS_PER_FILE
    batch_count += 1
    time.sleep(SLEEP_SECONDS)


In [0]:
%sql
-- Create the volume that backs /Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input.
-- The batch-writing cells target that path, so the volume has to exist before they run.
-- IF NOT EXISTS keeps this cell safe to re-run once the volume has been created.
CREATE VOLUME IF NOT EXISTS angad_kumar91.fraud_detection_raw_data_files.stream_input;


In [0]:
# Emit the rows from the current `offset` up to `total_rows`, one uniquely
# named JSON folder per batch, pausing SLEEP_SECONDS between batches.
# `offset`, `total_rows`, ROWS_PER_FILE and SLEEP_SECONDS come from earlier cells;
# `offset` is intentionally not reset here, so emission continues from wherever
# it currently stands.
#
# The previous slicing, limit(offset + N).subtract(limit(offset)), had three
# problems:
#   * limit() without an ordering may return different rows on each
#     evaluation, so batches could overlap or skip rows;
#   * subtract() is EXCEPT DISTINCT, so duplicate rows were silently dropped;
#   * both limit() branches grow with `offset` and are re-evaluated every
#     iteration.
# A contiguous, zero-based row index makes every batch a deterministic,
# disjoint slice. It is (re)built here so this cell does not depend on kernel
# state left behind by another cell.
# Ordering by (TransactionDT, TransactionID) is deterministic provided
# TransactionID is unique - TODO confirm against the dataset.
stream_df_with_id = stream_df.withColumn(
    "_rid",
    row_number().over(Window.orderBy("TransactionDT", "TransactionID")) - 1,
)

while offset < total_rows:
    # Rows whose index falls in [offset, offset + ROWS_PER_FILE).
    batch_df = (
        stream_df_with_id
            .filter(
                (col("_rid") >= offset) &
                (col("_rid") < offset + ROWS_PER_FILE)
            )
            .drop("_rid")
    )

    # A random id gives each batch its own output folder.
    batch_id = str(uuid.uuid4())
    output_path = (
        f"/Volumes/angad_kumar91/"
        f"fraud_detection_raw_data_files/stream_input/"
        f"batch_{batch_id}"
    )

    (
        batch_df
            .write
            .mode("append")
            .json(output_path)
    )

    # Report what actually landed on storage by reading the small output folder
    # back, instead of calling batch_df.count(), which would re-run the whole
    # upstream plan a second time.
    emitted_rows = spark.read.json(output_path).count()
    print(f"✅ Emitted batch {batch_id} with {emitted_rows} rows")

    offset += ROWS_PER_FILE
    time.sleep(SLEEP_SECONDS)


In [0]:
# List everything currently present in the stream_input volume.
stream_input_dir = "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/"
dbutils.fs.ls(stream_input_dir)


In [0]:
# List the files inside one emitted batch folder.
# The batch id is hard-coded; batch folders are named with a random UUID, so
# this folder only exists if an earlier run produced it.
sample_batch_dir = (
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/"
    "batch_006b16f4-398a-42fb-9a36-0fd1bdb23675/"
)
dbutils.fs.ls(sample_batch_dir)


In [0]:
# Capture the listing of one emitted batch folder in `files` and display it.
# The batch id is hard-coded; it only exists if an earlier run produced it.
sample_batch_dir = (
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/"
    "batch_006b16f4-398a-42fb-9a36-0fd1bdb23675/"
)
files = dbutils.fs.ls(sample_batch_dir)

files


In [0]:
# Read one emitted batch back as JSON, preview ten untruncated rows, and
# show its row count as the cell result.
# The batch id is hard-coded; it only exists if an earlier run produced it.
sample_batch_dir = (
    "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/"
    "batch_006b16f4-398a-42fb-9a36-0fd1bdb23675/"
)
df = spark.read.json(sample_batch_dir)

df.show(n=10, truncate=False)
df.count()


In [0]:
# Read every JSON file under stream_input, descending into the per-batch
# sub-folders (recursiveFileLookup), to check what has been emitted in total.
all_df = (
    spark.read
        .option("recursiveFileLookup", "true")
        .json("/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/")
)

# Only the last expression of a cell is rendered, so the bare `all_df.count()`
# that used to sit here ran a full scan and threw the result away.
# Print it explicitly so the total is actually visible.
print(all_df.count())
display(all_df.limit(20))


In [0]:
# Load one emitted batch folder into `test_df` and render its first ten rows.
# The batch id is hard-coded; it only exists if an earlier run produced it.
test_batch_dir = "/Volumes/angad_kumar91/fraud_detection_raw_data_files/stream_input/batch_006b16f4-398a-42fb-9a36-0fd1bdb23675/"
test_df = spark.read.json(test_batch_dir)

display(test_df.limit(10))

In [0]:
# Row count of the batch loaded into `test_df`; shown as the cell result.
test_df.count()
