### Description
_Mechanism X uploads `CustomerImportance.csv` once, then splits `transactions.csv` into chunks of 10,000 transaction entries and writes one chunk to an S3 folder every second._

In [0]:
import time
import os
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, row_number, monotonically_increasing_id, floor

# Obtain the SparkSession for this notebook (an existing session is reused
# if there is one, otherwise a new one named "Mechanism_X" is created).
spark = (
    SparkSession.builder
    .appName("Mechanism_X")
    .getOrCreate()
)

In [0]:
# One-off upload of CustomerImportance.csv to the S3 bucket.
customer_input_path = "/Volumes/workspace/devdolphins/source/CustomerImportance.csv"
# Alternative local output location (kept for reference):
#customer_s3_path = "/Volumes/workspace/devdolphins/output/customer/"
customer_s3_path = "s3://customer-transactions-detection/input/customers/"

# Explicit schema so column types are not inferred from the file; every
# column is nullable.
customer_importance_schema = (
    StructType()
    .add("customer", StringType(), True)
    .add("merchant", StringType(), True)
    .add("weight", DoubleType(), True)
    .add("typeTrans", StringType(), True)
    .add("fraud", IntegerType(), True)
)

# The source file carries a header row, which is skipped in favour of the
# schema above.
customer_importance_df = (
    spark.read
    .schema(customer_importance_schema)
    .option("header", True)
    .csv(customer_input_path)
)

# Replace whatever is already at the destination with a fresh CSV copy
# (header row included).
(
    customer_importance_df.write
    .mode("overwrite")
    .option("header", True)
    .csv(customer_s3_path)
)

In [0]:
# Upload transactions.csv to the S3 bucket in chunks of `chunk_size` records,
# writing one chunk per second until every record has been uploaded.
transaction_input_path = "/Volumes/workspace/devdolphins/source/transactions.csv"
# Alternative local output location (kept for reference):
#transaction_s3_path = "/Volumes/workspace/devdolphins/output/transactions/"
transaction_s3_path = "s3://customer-transactions-detection/input/transactions/"
chunk_size = 10000

# Explicit schema so column types are not inferred from the file.
transaction_schema = StructType([
    StructField("step", IntegerType()),
    StructField("customer", StringType()),
    StructField("age", StringType()),
    StructField("gender", StringType()),
    StructField("zipcodeOri", StringType()),
    StructField("merchant", StringType()),
    StructField("zipMerchant", StringType()),
    StructField("category", StringType()),
    StructField("amount", DoubleType()),
    StructField("fraud", IntegerType()),
])

transaction_df = spark.read.csv(transaction_input_path, header=True, schema=transaction_schema)

# Number the rows from 0 so they can be bucketed into fixed-size chunks.
# Many rows may share the same `step`, and the DataFrame is lazily re-evaluated
# by every action below, so ordering by `step` alone could number tied rows
# differently from one action to the next and leak rows between chunks.
# The scan-order id is used as a tie-breaker to keep the numbering stable.
# NOTE(review): this assumes the file is scanned with the same partitioning on
# every evaluation - confirm for the target cluster.
transaction_df = transaction_df.withColumn("_scan_order", monotonically_increasing_id())
windowSpec = Window.orderBy("step", "_scan_order")
transaction_df = transaction_df.withColumn("row_num", row_number().over(windowSpec) - 1)
transaction_df = transaction_df.drop("_scan_order")
transaction_df = transaction_df.withColumn("chunk_id", floor(col("row_num") / chunk_size))

# Get total number of chunks
total_chunks = transaction_df.select("chunk_id").distinct().count()

# Write each chunk (all of them, not just a fixed number)
for i in range(total_chunks):
    # Build the destination with "/" explicitly: the target is an S3 URI,
    # not a local filesystem path.
    chunk_path = f"{transaction_s3_path.rstrip('/')}/chunk_{i:04d}"
    chunk_df = transaction_df.filter(col("chunk_id") == i).drop("row_num", "chunk_id")

    # count() launches a Spark job, so run it once and reuse the result.
    chunk_count = chunk_df.count()
    if chunk_count == 0:
        break

    print(f"{chunk_path}: {chunk_count}")

    chunk_df.write.format("csv")\
        .mode("overwrite")\
        .option("header", True)\
        .save(chunk_path)

    time.sleep(1)  # Simulate 1-second delay between chunks