# 2. Local lakehouse with MinIO and Delta

In [1]:
import os

import polars as pl

In [2]:
# Register the `dotenv` IPython extension so the `%dotenv` magic becomes available.
%load_ext dotenv

In [3]:
# Load environment variables through the dotenv extension (presumably from a
# `.env` file next to the notebook -- confirm); they are read via `os.environ` below.
%dotenv

In [4]:
# Pull the MinIO connection settings from the environment. Indexing
# `os.environ` directly means a missing variable raises KeyError right here.
minio_endpoint_url, minio_access_key, minio_secret_id = (
    os.environ[var_name]
    for var_name in ("MINIO_ENDPOINT_URL", "MINIO_KEY", "MINIO_SECRET")
)

In [5]:
# Load the submissions from the local Parquet file, newest first.
df = (
    pl.read_parquet("submissions.pq")
    .sort(by="creation_datetime", descending=True)
)
df.head()

title,author_name,creation_datetime,subreddit_name,num_comments,sfw,score,upvote_ratio,is_self,permalink,selftext,flair_text
str,str,"datetime[μs, UTC]",str,i64,bool,i64,f64,bool,str,str,str
"""AITA for kicking guests out of…","""TheHylind""",2024-07-06 05:36:33 UTC,"""r/AmItheAsshole""",7,True,1,1.0,True,"""/r/AmItheAsshole/comments/1dwi…","""For context, I work at a local…","""Not enough info"""
"""AITA for reporting coworker's …","""Allethiia""",2024-07-06 05:21:10 UTC,"""r/AmItheAsshole""",2,True,3,0.6,True,"""/r/AmItheAsshole/comments/1dwi…","""I (28) have been working at my…","""TL;DR"""
"""AITA for cancelling my birthda…","""Lis_wj""",2024-07-06 05:14:00 UTC,"""r/AmItheAsshole""",6,True,1,0.6,True,"""/r/AmItheAsshole/comments/1dwi…","""I (26F) have been really stres…","""Not the A-hole"""
"""AITA: I told my sister she has…","""dswizzle2""",2024-07-06 05:09:53 UTC,"""r/AmItheAsshole""",14,True,18,0.8,True,"""/r/AmItheAsshole/comments/1dwh…","""For context, I(27F) and my sis…","""Not the A-hole"""
"""WIBTA for calling out my frien…","""gremlinoverlord_420""",2024-07-06 04:57:21 UTC,"""r/AmItheAsshole""",9,True,0,0.33,True,"""/r/AmItheAsshole/comments/1dwh…","""I (f) have gotten fed up with …","""Not the A-hole"""


Writing to a local Delta table is fine, but writing to object storage is even better! Enter MinIO:

In [6]:
# Connection settings handed to the Delta reader/writer through `storage_options`.
minio_storage_options = dict(
    # Where MinIO lives and the credentials used to authenticate against it.
    AWS_ENDPOINT_URL=minio_endpoint_url,
    AWS_ACCESS_KEY_ID=minio_access_key,
    AWS_SECRET_ACCESS_KEY=minio_secret_id,
    # The flags below presumably suit a local, plain-HTTP, single-writer MinIO
    # setup -- check the delta-rs S3 documentation before reusing them elsewhere.
    AWS_ALLOW_HTTP="true",
    AWS_S3_ALLOW_UNSAFE_RENAME="true",
    AWS_EC2_METADATA_DISABLED="true",
)

In [None]:
# One-time creation of the Delta table inside the MinIO bucket.
df.write_delta(
    target="s3://reddit-submissions/submissions-raw",
    storage_options=minio_storage_options,
    # Refuse to write when the table already exists, so existing data is never
    # overwritten by accident (re-running this cell is therefore expected to fail).
    mode="error",
)

Rather than just overwriting the data, Delta lets you perform `UPSERT`-like operations (UPdate and inSERT): rows that match an existing record are updated, and all other rows are inserted:

In [15]:
# Upsert the frame into the existing table, matching rows on `permalink`.
(
    df.write_delta(
        target="s3://reddit-submissions/submissions-raw",
        storage_options=minio_storage_options,
        mode="merge",
        # "s" names the incoming frame (source), "t" the stored table (target).
        delta_merge_options=dict(
            predicate="s.permalink = t.permalink",
            source_alias="s",
            target_alias="t",
        ),
    )
    .when_matched_update_all()  # permalink already stored -> refresh every column
    .when_not_matched_insert_all()  # unseen permalink -> add the row
    .execute()  # runs the merge; its metrics are displayed as the cell output
)

{'num_source_rows': 668,
 'num_target_rows_inserted': 507,
 'num_target_rows_updated': 161,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 17,
 'num_output_rows': 685,
 'num_target_files_added': 2,
 'num_target_files_removed': 1,
 'execution_time_ms': 243,
 'scan_time_ms': 0,
 'rewrite_time_ms': 163}

And now you can read your data as follows:

In [16]:
# Read the table back from MinIO, newest submissions first.
# NOTE: this rebinds `df`, replacing the frame loaded from the Parquet file.
df = pl.read_delta(
    source="s3://reddit-submissions/submissions-raw",
    storage_options=minio_storage_options,
).sort(by="creation_datetime", descending=True)
df.head()

title,author_name,creation_datetime,subreddit_name,num_comments,sfw,score,upvote_ratio,is_self,permalink,selftext,flair_text
str,str,"datetime[μs, UTC]",str,i64,bool,i64,f64,bool,str,str,str
"""AITA for telling my sister if …","""aitaeatinghabits""",2024-07-06 05:39:18 UTC,"""r/AmItheAsshole""",2,True,1,1.0,True,"""/r/AmItheAsshole/comments/1dwi…","""I (20f) just learned I am auti…",
"""AITA for kicking guests out of…","""TheHylind""",2024-07-06 05:36:33 UTC,"""r/AmItheAsshole""",7,True,1,1.0,True,"""/r/AmItheAsshole/comments/1dwi…","""For context, I work at a local…","""Not enough info"""
"""AITA for telling my bff to lea…","""Direct-Television426""",2024-07-06 05:34:17 UTC,"""r/AmItheAsshole""",2,True,1,1.0,True,"""/r/AmItheAsshole/comments/1dwi…","""So, my partner and I are both …",
"""AITA for reporting coworker's …","""Allethiia""",2024-07-06 05:21:10 UTC,"""r/AmItheAsshole""",2,True,3,0.6,True,"""/r/AmItheAsshole/comments/1dwi…","""I (28) have been working at my…","""TL;DR"""
"""AITA for cancelling my birthda…","""Lis_wj""",2024-07-06 05:14:00 UTC,"""r/AmItheAsshole""",6,True,1,0.6,True,"""/r/AmItheAsshole/comments/1dwi…","""I (26F) have been really stres…","""Not the A-hole"""


Delta tables allow for _time travel_:

In [None]:
# Time travel: load an earlier snapshot of the table by its version number.
df0 = pl.read_delta(
    source="s3://reddit-submissions/submissions-raw",
    storage_options=minio_storage_options,
    version=0,  # The first version of the data, YMMV
).sort(by="creation_datetime", descending=True)
df0.head()