- TODO: include a PDF of the analysis once it is done

In [1]:
# all imports

import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow as pa
import polars as pl

In [None]:
# Preprocessing (pyarrow + polars): stream the canvas-history CSV in large
# blocks, clean each block with polars, and append it to one Parquet file.

csv_file = "data/2022_place_canvas_history.csv"
parquet_file = "data/2022pyarrow.parquet"

# NOTE(review): DATESTRING_FORMAT is not used by this cell; it is kept in case
# other cells rely on it. The format actually parsed below also accepts
# fractional seconds, so it gets its own constant.
DATESTRING_FORMAT = "%Y-%m-%d %H:%M:%S"
TIMESTAMP_PARSE_FORMAT = "%Y-%m-%d %H:%M:%S%.f"
BLOCK_SIZE = 100_000_000

read_options = pv.ReadOptions(block_size=BLOCK_SIZE)

# Loop-invariant expression: split "x,y" once and reuse it for both columns.
coordinate_parts = pl.col("coordinate").str.split_exact(",", 1)

# Created lazily because the output schema is only known after the first
# batch has been transformed.
parquet_writer = None

# The context manager closes the streaming CSV reader (and its file handle)
# even when a batch fails part-way through.
with pv.open_csv(csv_file, read_options=read_options) as csv_reader:
    try:
        for record_batch in csv_reader:
            print(f"Processing batch with {record_batch.num_rows} rows...")

            df = pl.from_arrow(record_batch)

            # Convert timestamp strings to datetimes: strip the trailing
            # " UTC" suffix, then parse. strict=False turns values that do not
            # match the format into nulls instead of raising.
            df = df.with_columns(
                pl.col("timestamp")
                .str.replace(r" UTC$", "")
                .str.strptime(
                    pl.Datetime,
                    format=TIMESTAMP_PARSE_FORMAT,
                    strict=False,
                )
                .alias("timestamp")
            )

            # Keep only rows whose coordinate contains exactly one comma
            # (i.e. two fields), then split it into integer x / y columns.
            df = (
                df.filter(pl.col("coordinate").str.count_matches(",") == 1)
                .with_columns(
                    coordinate_parts.struct.field("field_0")
                    .cast(pl.Int64)
                    .alias("x"),
                    coordinate_parts.struct.field("field_1")
                    .cast(pl.Int64)
                    .alias("y"),
                )
                .drop("coordinate")
            )

            # TODO: map user_id to compact integer ids (needs a structure that
            # tracks unique ids across batches). Until that exists the column
            # is simply dropped, so the Parquet output carries no per-user
            # information.
            df = df.drop("user_id")

            table = df.to_arrow()

            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(
                    parquet_file,
                    schema=table.schema,
                    compression="zstd",
                )
            parquet_writer.write_table(table)

    finally:
        # Always finalize the Parquet footer if a writer was opened.
        if parquet_writer is not None:
            parquet_writer.close()

# Only claim success when at least one batch was actually written.
if parquet_writer is None:
    print(f"No record batches read from {csv_file}; {parquet_file} was not written")
else:
    print(f"Successfully converted {csv_file} to {parquet_file}")


# Why were three pixel colors placed so often?
- black, red, white

In [None]:
# distribution of color counts among pixels

In [None]:
# confounding variables
    # which pixel colors are placed at what times (e.g. do many people place black pixels around the same time, or are placements evenly spread out?)
    # how pixel location differs between pixel colors
    # how pixel location changes over time