In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
import random
from datetime import datetime, timedelta

# Destination folder for the generated raw taxi data.
output_path = "/Volumes/workspace/default/taxi_data/raw"
# Recursively remove whatever is currently at the destination so each run
# starts from an empty location.
dbutils.fs.rm(output_path, recurse=True)

# Schema of the synthetic taxi-trip rows. Columns are listed as
# (name, Spark type) pairs in tuple order; every column is declared nullable.
schema = StructType([
    StructField(column_name, spark_type, True)
    for column_name, spark_type in (
        ("trip_id", StringType()),
        ("vendor_id", StringType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("passenger_count", IntegerType()),
        ("trip_distance", DoubleType()),
        ("fare_amount", DoubleType()),
        ("total_amount", DoubleType()),
        ("payment_type", StringType()),
    )
])

def generate_rows(n, start_trip_id, seed=None):
    """Generate ``n`` synthetic taxi-trip records as plain tuples.

    Each tuple holds, in order: trip_id, vendor_id, pickup time, dropoff time,
    passenger_count, trip_distance, fare_amount, total_amount, payment_type.

    Args:
        n: Number of rows to generate. ``0`` yields an empty list.
        start_trip_id: Integer used for the first trip id; row ``i`` gets the
            id ``f"trip_{start_trip_id + i}"``.
        seed: Optional seed. When given, the rows are drawn from a private
            ``random.Random(seed)`` so the same arguments always produce the
            same rows. When ``None`` (the default), the module-level
            ``random`` functions are used, so output follows the global
            random state.

    Returns:
        A list of ``n`` 9-element tuples.
    """
    # A private generator keeps seeded runs reproducible without touching the
    # global random state; the unseeded path uses the global functions.
    rng = random if seed is None else random.Random(seed)

    base_time = datetime(2024, 1, 1, 8, 0, 0)
    rows = []
    for i in range(n):
        trip_id = f"trip_{start_trip_id + i}"
        vendor_id = rng.choice(["V1", "V2"])

        # Pickup falls within the 10 hours following base_time (inclusive).
        pickup_time = base_time + timedelta(minutes=rng.randint(0, 60 * 10))
        trip_minutes = rng.randint(5, 40)
        dropoff_time = pickup_time + timedelta(minutes=trip_minutes)

        passenger_count = rng.randint(1, 4)

        # The range deliberately dips below zero so the data contains bad
        # distance values.
        trip_distance = round(rng.uniform(-1.0, 15.0), 2)

        if trip_distance > 0:
            # Base charge plus a per-distance rate, with some noise.
            fare_amount = round(3 + trip_distance * 2 + rng.uniform(-1, 3), 2)
        else:
            # Non-positive distance: fare is unrelated to distance.
            fare_amount = round(rng.uniform(0, 5), 2)

        # Total is the fare scaled up by 0-30%.
        total_amount = round(fare_amount * rng.uniform(1.0, 1.3), 2)
        payment_type = rng.choice(["CASH", "CARD"])

        rows.append((trip_id, vendor_id, pickup_time, dropoff_time,
                     passenger_count, trip_distance, fare_amount,
                     total_amount, payment_type))
    return rows

# Build 500 synthetic trips (ids start at trip_1) and load them into Spark.
rows = generate_rows(n=500, start_trip_id=1)
df = spark.createDataFrame(rows, schema=schema)

# Write a folder of CSV part files with a header row, replacing any
# existing output at the destination.
df.write.csv(output_path, mode="overwrite", header=True)

print("Done writing data.")
# List what was written to the destination.
display(dbutils.fs.ls(output_path))


Done writing data.


path,name,size,modificationTime
dbfs:/Volumes/workspace/default/taxi_data/raw/_SUCCESS,_SUCCESS,0,1764015237000
dbfs:/Volumes/workspace/default/taxi_data/raw/_committed_4438643072072260288,_committed_4438643072072260288,736,1764015237000
dbfs:/Volumes/workspace/default/taxi_data/raw/_started_4438643072072260288,_started_4438643072072260288,0,1764015237000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00000-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-274-1-c000.csv,part-00000-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-274-1-c000.csv,5363,1764015236000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00001-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-275-1-c000.csv,part-00001-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-275-1-c000.csv,5474,1764015236000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00002-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-278-1-c000.csv,part-00002-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-278-1-c000.csv,5427,1764015236000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00003-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-276-1-c000.csv,part-00003-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-276-1-c000.csv,5492,1764015237000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00004-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-277-1-c000.csv,part-00004-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-277-1-c000.csv,5436,1764015237000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00005-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-279-1-c000.csv,part-00005-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-279-1-c000.csv,5523,1764015236000
dbfs:/Volumes/workspace/default/taxi_data/raw/part-00006-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-280-1-c000.csv,part-00006-tid-4438643072072260288-40ed151a-90b5-4828-9782-52c21c383d02-280-1-c000.csv,5425,1764015237000
