In [1]:
# color mapping 
import numpy as np

color_table = {
        "red": (255, 0, 0),
        "green": (0, 255, 0),
        "blue": (0, 0, 255),
        "yellow": (255, 255, 0),
        "cyan": (0, 255, 255),
        "magenta": (255, 0, 255),
        "pink": (255, 192, 203), 
        "orange": (255, 165, 0), 
        "purple": (160, 32, 240)
    }

colors = list(color_table.keys())
rgb_vals = np.array(list(color_table.values()))

def most_similar_color(hexcode):

    hex_rgb = np.array([int(hexcode.lstrip('#')[i:i+2], 16) for i in (0, 2, 4)])

    if hex_rgb[0] == hex_rgb[1] == hex_rgb[2]:
        if hex_rgb[0] == 0:
            return "black"
        elif hex_rgb[0] == 255:
            return "white" 
        else: 
            return "gray"

    return colors[np.argmin(np.sum((rgb_vals - hex_rgb) ** 2, axis=1))]


In [2]:
# moss green -> green
most_similar_color('#001C00')

'green'

In [3]:
# hashing 
import base64 

def hashbase64(user_id):   
    return base64.b64encode(user_id.encode()).decode()[:8]

In [4]:
hashbase64("6NSgFa1CvIPly1VniNhlbrmoN3vgDFbMSKqh+c4TTfrr3dMib91oUWONX96g5PPcioIxedF24ldNOu/g5yqDrg==")

'Nk5TZ0Zh'

In [None]:
# loading data
import pyarrow.parquet as pq
import pyarrow.compute as pc 
import polars as pl

parq = pq.ParquetFile('2022_place_canvas_history.parquet')

In [59]:
# batches 
chunk_size = 10_000_000
processed = []

for batch in parq.iter_batches(batch_size=chunk_size, columns=['timestamp', 'user_id', 'pixel_color']):
    chunk = pl.from_arrow(batch)

    chunk = chunk.with_columns(pl.col('user_id').map_elements(hashbase64, return_dtype=pl.String))
    chunk = chunk.with_columns(pl.col('pixel_color').map_elements(most_similar_color, return_dtype=pl.String))
    
    processed.append(chunk) 

In [None]:
# combining 
combined_df = pl.concat(processed)
# sorting time 
combined_df = combined_df.sort('timestamp') 
# adding is_first 
combined_df = combined_df.with_columns(pl.col("user_id").is_first_distinct().alias("is_first"))

In [63]:
combined_df.write_parquet('processed_data.parquet')