- **TODO:** include a PDF export of the analysis once it is done

In [1]:
# all imports

import pyarrow.csv as pv
import pyarrow.parquet as pq
import pyarrow as pa
import polars as pl

In [None]:
# preprocessing function (polars)
#
# Streams the canvas-history CSV in large record batches, cleans each batch
# with polars, and appends it to a single zstd-compressed parquet file.
# Output columns, in order: timestamp, pixel_color, x, y, user_id_int.

csv_file = "2022_place_canvas_history.csv"
parquet_file = "2022pyarrow.parquet"

DATESTRING_FORMAT = "%Y-%m-%d %H:%M:%S"  # not referenced in this cell; kept in case other cells use it
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S%.f"  # format actually used for parsing (with fractional seconds)
BLOCK_SIZE = 100_000_000  # bytes of CSV read per record batch

read_options = pv.ReadOptions(block_size=BLOCK_SIZE)
csv_reader = pv.open_csv(csv_file, read_options=read_options)

parquet_writer = None

# Categorical codes are only comparable between DataFrames that were built
# under the same string cache. Each batch becomes its own DataFrame, so the
# whole loop runs inside one StringCache: a given user_id then maps to the
# same user_id_int in every batch instead of the numbering restarting per batch.
with pl.StringCache():
    try:
        for record_batch in csv_reader:
            print(f"Processing batch with {record_batch.num_rows} rows...")

            df = pl.from_arrow(record_batch)

            # convert timestamp to datetime object
            # strict=False: values that do not match the format become null
            # instead of raising.
            df = df.with_columns(
                pl.col("timestamp")
                .str.replace(r" UTC$", "")
                .str.strptime(
                    pl.Datetime,
                    format=TIMESTAMP_FORMAT,
                    strict=False
                )
                .alias("timestamp")
            )

            # split coordinate into cols 'x', 'y'
            # Only rows with exactly one comma ("x,y") are kept; any other
            # coordinate shape is filtered out before the integer cast.
            coordinate_parts = pl.col("coordinate").str.split_exact(",", 1)
            df = (
                df.filter(
                    pl.col("coordinate").str.count_matches(",") == 1
                )
                .with_columns(
                    coordinate_parts.struct.field("field_0").cast(pl.Int64).alias("x"),
                    coordinate_parts.struct.field("field_1").cast(pl.Int64).alias("y"),
                )
                .drop("coordinate")
            )

            # map user_id to ints to save memory
            df = df.with_columns(
                pl.col("user_id")
                .cast(pl.Categorical)  # convert to categorical (assigns unique codes)
                .to_physical()         # get int rep
                .alias("user_id_int")
            )
            df = df.drop("user_id")  # can drop user_id now

            table = df.to_arrow()

            # The writer is created lazily so its schema comes from the first
            # processed batch.
            if parquet_writer is None:
                parquet_writer = pq.ParquetWriter(
                    parquet_file,
                    schema=table.schema,
                    compression="zstd"
                )
            parquet_writer.write_table(table)

    finally:
        # Always close the writer so the parquet footer is written, even if a
        # batch fails part-way through.
        if parquet_writer is not None:
            parquet_writer.close()

if parquet_writer is None:
    # No batch was read, so no parquet file was created.
    print(f"No data read from {csv_file}; {parquet_file} was not written")
else:
    print(f"Successfully converted {csv_file} to {parquet_file}")

In [20]:
# Read only the parquet footer. This avoids rebinding `parquet_file` (a path
# string in the other cells) to a reader object, and leaves no open file handle.
parquet_metadata = pq.read_metadata("2022pyarrow.parquet")

row_count = parquet_metadata.num_rows
columns = parquet_metadata.schema.names

print(f"Number of rows: {row_count}")
print(f"Columns: {columns}")
# and parquet is 1.45 GB

Number of rows: 160353085
Columns: ['timestamp', 'pixel_color', 'x', 'y', 'user_id_int']


In [None]:
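# NOTE: the original user_id strings cannot be recovered from user_id_int alone.
# The preprocessing cell drops `user_id` and never saves a code -> user_id lookup,
# and `.reverse()` only reverses the row order of a column (it does not decode it),
# so the snippet below does NOT restore the IDs:

# df.with_columns(
#     pl.col('user_id_int')
#     .reverse()
#     .alias('user_id')
# )

# To make the IDs recoverable, write a (user_id, user_id_int) lookup table
# during preprocessing.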

# What are the top 3 most painted pixels?

In [8]:
parquet_file = "2022pyarrow.parquet"

# LazyFrame: nothing is read from disk until .collect() is called.
df = pl.scan_parquet(parquet_file)

# Apply head() before collect() so only the first rows are materialised,
# rather than loading the whole file just to preview five rows.
df.head().collect()

timestamp,pixel_color,x,y,user_id_int
datetime[μs],str,i64,i64,u32
2022-04-04 00:53:51.577,"""#00CCC0""",826,1048,0
2022-04-04 00:53:53.758,"""#94B3FF""",583,1031,1
2022-04-04 00:53:54.685,"""#6A5CFF""",1873,558,2
2022-04-04 00:54:57.541,"""#009EAA""",1627,255,3
2022-04-04 00:55:16.307,"""#94B3FF""",49,1478,4


In [9]:
# Lazy query: number of placements per (x, y) pixel, keeping the three
# pixels with the highest counts.
paint_counts = (
    df.group_by("x", "y")
    .agg(count=pl.len())                # placements per pixel
    .sort(by="count", descending=True)  # most-painted first
    .head(3)                            # top 3 pixels
)

paint_counts.collect()

x,y,count
i64,i64,u32
0,0,98807
359,564,69198
349,564,55230


The top 3 painted pixels are at locations
1. (0, 0) painted 98807 times
2. (359, 564) painted 69198 times
3. (349, 564) painted 55230 times 

# Actual Analysis: Why were the 3 most painted pixels hit so often?

In [None]:
# distribution of color counts among pixels

In [None]:
# confounding variables to check
    # which colors were placed when (did many people place e.g. black pixels around the same time, or were placements evenly spread out?)
    # how pixel location varies with pixel color
    # how pixel locations change over time