In [1]:
# Importing necessary libraries for Parquet
import pyarrow.csv as csv
import pyarrow.parquet as pq
import glob
import os
import pyarrow as pa
from pyarrow import parquet
import polars as pl
from datetime import datetime

In [81]:
# Providing folder path
folder_path = "/Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/"

# Partquet file name
parquet_file = "combined_place_data.parquet"

# Getting csv_file names
csv_files = glob.glob(os.path.join(folder_path, '**/*.csv'), recursive=True)

# Create Schema
schema = pa.schema([
    pa.field("timestamp", pa.string()),
    pa.field("user", pa.string()),
    pa.field("coordinate", pa.string()),
    pa.field("pixel_color", pa.string())
])

# Initializing writer
writer = parquet.ParquetWriter(parquet_file, schema, compression='snappy')

# Iterating through csv files
for file in sorted(csv_files):
    
    print("reading", file)
    table = csv.read_csv(file)
    # Casting table to schema
    table = table.cast(schema)

    print("writing", file)
    writer.write_table(table)
    
if writer:
    writer.close()
    
print(f"Combined CSVs to {parquet_file}")

reading /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000000.csv
writing /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000000.csv
reading /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000001.csv
writing /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000001.csv
reading /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000002.csv
writing /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000002.csv
reading /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000003.csv
writing /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000003.csv
reading /Users/ben/Documents/GitHub/CloudComputing/RedditPlaceData/2023_place_canvas_history-000000000004.csv
writing /U

In [82]:
test_file = pq.ParquetFile("/Users/ben/Documents/GitHub/CloudComputing/combined_place_data.parquet")

In [83]:
print(f"Number of entries (rows) in the Parquet file: {test_file.metadata.num_rows:,d}")

Number of entries (rows) in the Parquet file: 121,339,051


In [2]:
def parse_coordinate(coordinate):
    if len(coordinate) < 20:
        return coordinate
    else:
        coordinate = coordinate.replace(" ", "").replace(":", ",").split(",")
        x = coordinate[1]
        y = coordinate[3]
        return f"{x}" + "," + f"{y}"

In [3]:
print("Read parquet and cast datetime")
# Read the Parquet file
df = pl.scan_parquet('combined_place_data.parquet')
df = df.with_columns(pl.col("timestamp").str.strptime(pl.Datetime, '%Y-%m-%d %H:%M:%S%.f %Z').cast(pl.Datetime, strict=False))
df = df.with_columns(
    pl.col('coordinate').map_elements(parse_coordinate, return_dtype=pl.Object)
)
df = df.with_columns([
    pl.col('coordinate').map_elements(lambda x: int(x.split(",")[0]), return_dtype=pl.Int32).alias('x'),
    pl.col('coordinate').map_elements(lambda x: int(x.split(",")[1]), return_dtype=pl.Int32).alias('y')
])

Read parquet and cast datetime


In [86]:
df.head(10).collect(streaming=True)

timestamp,user,coordinate,pixel_color,x,y
datetime[μs],str,object,str,i32,i32
2023-07-20 13:00:26.088,"""no+8HEIDjbdx7/…","-199,-235","""#FFFFFF""",-199,-235
2023-07-20 13:00:43.658,"""qJ7O6cuUNfkDyn…","0,-298","""#FF4500""",0,-298
2023-07-20 13:00:43.705,"""uqi5XwkBePwcPK…","-42,-218","""#FFFFFF""",-42,-218
2023-07-20 13:01:02.487,"""rgSTj7FHZUHsLX…","-418,-232","""#B44AC0""",-418,-232
2023-07-20 13:01:40.445,"""2bmivBNj8NYvnp…",182164,"""#FF4500""",182,164
2023-07-20 13:01:51.457,"""iyPavVpo8ojDYs…","-113,-1","""#FFFFFF""",-113,-1
2023-07-20 13:01:52.149,"""a6Q+OsCSRDcPxh…","-64,-34","""#3690EA""",-64,-34
2023-07-20 13:01:57.333,"""AS0KN9rxoynWuN…","-267,-142","""#FFFFFF""",-267,-142
2023-07-20 13:02:14.260,"""aWwqNqt6Ydlvny…",-4352,"""#00A368""",-43,52
2023-07-20 13:03:42.173,"""vFCy3asEWbBER9…",-4374,"""#000000""",-43,74


In [4]:
value_counts = df.select("user").group_by("user").agg(pl.col('user').count().alias('count')).sort('count', descending=True)
value_counts = value_counts.with_columns(pl.col("count").cast(int))

In [5]:
value_counts.select("count").quantile(quantile=.95).collect(streaming = True)

count
f64
158.0


In [16]:
pbs_idx = value_counts.filter(pl.col("count") >= 158)

pixels_placed = df.filter(pl.col('user').is_in(pbs_idx.select("user").collect(streaming=True))).collect(streaming=True)

In [17]:
pixels_placed

timestamp,user,coordinate,pixel_color,x,y
datetime[μs],str,object,str,i32,i32
2023-07-20 13:01:52.149,"""a6Q+OsCSRDcPxh…","-64,-34","""#3690EA""",-64,-34
2023-07-20 13:03:42.173,"""vFCy3asEWbBER9…",-4374,"""#000000""",-43,74
2023-07-20 13:04:43.607,"""jijE+llUCOLjUX…",15126,"""#FFD635""",15,126
2023-07-20 13:04:44.439,"""L5EYlx1xHtzNNd…",784,"""#000000""",78,4
2023-07-20 13:04:45.288,"""8fwj/jPEvokM9V…","-36,-211","""#FF4500""",-36,-211
2023-07-20 13:04:45.616,"""veU+pBrJqKPdkY…","-161,-30","""#000000""",-161,-30
2023-07-20 13:04:45.965,"""Yd9kMTAts5VzxD…","468,-405","""#000000""",468,-405
2023-07-20 13:04:46.123,"""Eo/c2yPDrI95KW…",10261,"""#3690EA""",10,261
2023-07-20 13:04:47.278,"""FFt9e2W+8SAGLc…","-14,-299","""#FFA800""",-14,-299
2023-07-20 13:04:47.316,"""43pi/xvQXGFqD/…","-400,-140","""#000000""",-400,-140


In [18]:
22300000 / 121339051

0.18378254829106913

In [5]:
# Run this with value from box above
pbs_idx = value_counts.filter(pl.col("count") >= 60)

pixels_placed = df.filter(pl.col('user').is_in(pbs_idx.select("user").collect(streaming=True))).collect(streaming=True)

latest_coords = pixels_placed.group_by(["x", "y"]).agg(
    pl.col('timestamp').arg_max().alias('idxmax'),
    pl.col("pixel_color").last().alias("latest_color")
)

latest_coords.write_parquet("/Users/ben/Documents/GitHub/CloudComputing/latest_coords99.parquet")
# Go to r/place analysis

In [78]:
pixels_placed

timestamp,user,coordinate,pixel_color,x,y
datetime[μs],str,object,str,i32,i32
2023-07-20 13:00:43.705,"""uqi5XwkBePwcPK…","-42,-218","""#FFFFFF""",-42,-218
2023-07-20 13:01:40.445,"""2bmivBNj8NYvnp…",182164,"""#FF4500""",182,164
2023-07-20 13:01:52.149,"""a6Q+OsCSRDcPxh…","-64,-34","""#3690EA""",-64,-34
2023-07-20 13:03:42.173,"""vFCy3asEWbBER9…",-4374,"""#000000""",-43,74
2023-07-20 13:04:03.685,"""rkTj/gktE+fk1u…",-279242,"""#000000""",-279,242
2023-07-20 13:04:17.971,"""sxYEVPfaohJYIY…","60,-65","""#000000""",60,-65
2023-07-20 13:04:38.197,"""ywG7MbcHONF6gs…",-32263,"""#B44AC0""",-32,263
2023-07-20 13:04:39.757,"""FdP9nI9ZB1x8f0…",-62280,"""#000000""",-62,280
2023-07-20 13:04:41.115,"""Sp4flue6oel2cw…",-143266,"""#000000""",-143,266
2023-07-20 13:04:41.503,"""jFyStllXtkR7+s…","0,-100","""#00A368""",0,-100


In [60]:
pixels_placed.write_parquet("/Users/ben/Documents/GitHub/CloudComputing/pixels_placed.parquet")

In [54]:
latest_coords.select("y").max()

y
i32
999


In [23]:
pixels_placed.with_columns(["x", "y"]).filter((pl.col("x") == -3) & (pl.col("y") == -109))

timestamp,user,coordinate,pixel_color,x,y
datetime[μs],str,object,str,i32,i32
2023-07-20 13:04:48.455,"""rQeyBHwAAz03d1…","-3,-109","""#000000""",-3,-109
2023-07-20 13:56:46.903,"""gXs4E8qMxao7Yu…","-3,-109","""#FF4500""",-3,-109
2023-07-20 15:52:03.044,"""DuiAOwMqY59slD…","-3,-109","""#FFD635""",-3,-109
2023-07-21 01:15:14.710,"""FJq7GEYa7Yb2c+…","-3,-109","""#FFD635""",-3,-109
2023-07-21 01:32:21.497,"""upwxfUb3yvelhr…","-3,-109","""#FFD635""",-3,-109
2023-07-21 01:32:34.191,"""WGCMlbXAiZQMKi…","-3,-109","""#FFD635""",-3,-109
2023-07-21 02:38:32.458,"""wpigWtzZP99uN9…","-3,-109","""#FFFFFF""",-3,-109
2023-07-21 03:35:35.674,"""Zz19OTdL9Q7Lvy…","-3,-109","""#00A368""",-3,-109
2023-07-21 04:00:59.471,"""nCheKBJ53dRMRo…","-3,-109","""#FFFFFF""",-3,-109
2023-07-21 04:01:54.511,"""VtxnG4EI3lp26u…","-3,-109","""#3690EA""",-3,-109


In [17]:
result_df = pixels_placed.filter(pl.col('idxmax').is_in(idx_max_timestamp['idxmax']))

ColumnNotFoundError: unable to find column "idxmax"; valid columns: ["timestamp", "user", "coordinate", "pixel_color", "x", "y"]

Error originated just after this operation:
DF ["timestamp", "user", "coordinate", "pixel_color"]; PROJECT */6 COLUMNS; SELECTION: "None"

In [15]:
df.head(10).group_by("user").agg(place_count = pl.col("coordinate").len()).collect(streaming=True)

user,place_count
str,u32
"""AS0KN9rxoynWuN…",1
"""iyPavVpo8ojDYs…",1
"""vFCy3asEWbBER9…",1
"""qJ7O6cuUNfkDyn…",1
"""aWwqNqt6Ydlvny…",1
"""uqi5XwkBePwcPK…",1
"""2bmivBNj8NYvnp…",1
"""a6Q+OsCSRDcPxh…",1
"""no+8HEIDjbdx7/…",1
"""rgSTj7FHZUHsLX…",1


In [24]:
# Aggregate df to count unique pixels and colors per user
agg_df = df.group_by('user').agg(
    color_count=pl.col('pixel_color').len(),
    place_count=pl.col('coordinate').len()
)

# Filter to find users with only one color or one place
filtered_df = agg_df.filter(
    (pl.col('color_count') == 1) | (pl.col('place_count') == 1)
)

In [23]:
filtered_df.collect(streaming=True)

thread '<unnamed>' panicked at crates/polars-core/src/series/ops/null.rs:77:17:
not implemented for dtype Object("object", None)
--- PyO3 is resuming a panic after fetching a PanicException from Python. ---
Python stack trace below:


PanicException: not implemented for dtype Object("object", None)

PanicException: not implemented for dtype Object("object", None)

In [32]:
df.head(10).collect(streaming=True)

timestamp,user,coordinate,pixel_color,x,y
datetime[μs],str,object,str,i32,i32
2023-07-20 13:00:26.088,"""no+8HEIDjbdx7/…","-199,-235","""#FFFFFF""",-199,-235
2023-07-20 13:00:43.658,"""qJ7O6cuUNfkDyn…","0,-298","""#FF4500""",0,-298
2023-07-20 13:00:43.705,"""uqi5XwkBePwcPK…","-42,-218","""#FFFFFF""",-42,-218
2023-07-20 13:01:02.487,"""rgSTj7FHZUHsLX…","-418,-232","""#B44AC0""",-418,-232
2023-07-20 13:01:40.445,"""2bmivBNj8NYvnp…",182164,"""#FF4500""",182,164
2023-07-20 13:01:51.457,"""iyPavVpo8ojDYs…","-113,-1","""#FFFFFF""",-113,-1
2023-07-20 13:01:52.149,"""a6Q+OsCSRDcPxh…","-64,-34","""#3690EA""",-64,-34
2023-07-20 13:01:57.333,"""AS0KN9rxoynWuN…","-267,-142","""#FFFFFF""",-267,-142
2023-07-20 13:02:14.260,"""aWwqNqt6Ydlvny…",-4352,"""#00A368""",-43,52
2023-07-20 13:03:42.173,"""vFCy3asEWbBER9…",-4374,"""#000000""",-43,74
