<a href="https://colab.research.google.com/github/berkyalcinkaya/cs145-project2-systems/blob/main/cs145_project2_systems_template_fa2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab (Main)"/></a>

<a href="https://colab.research.google.com/github/berkyalcinkaya/cs145-project2-systems/blob/berk/cs145_project2_systems_template_fa2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab (berk)"/></a>

## Collaborators

1.   Berk Yalcinkaya
2.   Nick Allen


# Setup

In [2]:
import pandas as pd
import os
import uuid
import argparse
import time
import psutil
import heapq
import pyarrow as pa
import pyarrow.parquet as pq
import random
import string
import numpy as np
from typing import List, Optional, Callable, Dict, Union, Any, Tuple
import shutil
import glob
import gc
from IPython.display import display
import tempfile
from pathlib import Path
from functools import partial
import memory_profiler

In [3]:
def clear_parquet_files():
    """Delete every ``*.parquet`` file found directly in the current working directory."""
    leftover_paths = glob.glob("*.parquet")
    for leftover in leftover_paths:
        os.remove(leftover)


# Start from a clean slate: drop any parquet files left in the working directory.
clear_parquet_files()


# Section 0: Generate Test Data

This section has already been implemented for you.

In [4]:
import gc


def generate_songs_chunk(start, size, string_length=100):
    """Build one chunk of the Songs table as a DataFrame.

    The chunk covers ids ``start`` .. ``start + size - 1``. Each row has a
    ``song_id``, a ``title`` of the form ``Song_<id>``, and ten string columns
    ``extra_col_1`` .. ``extra_col_10``. The extra columns are rotations
    (``np.roll``) of a single array of random strings of length
    ``string_length`` produced by ``generate_base_strings``.
    """
    id_range = range(start, start + size)
    columns = {
        "song_id": id_range,
        "title": [f"Song_{song_id}" for song_id in id_range],
    }
    filler = generate_base_strings(size, string_length)
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_users_chunk(start, size, string_length=100):
    """Build one chunk of the Users table as a DataFrame.

    The chunk covers ids ``start`` .. ``start + size - 1``. ``age`` is derived
    deterministically from the id as ``18 + (user_id % 60)``. The ten
    ``extra_col_<i>`` columns are rotations (``np.roll``) of one array of
    random strings of length ``string_length`` from ``generate_base_strings``.
    """
    id_range = range(start, start + size)
    columns = {
        "user_id": id_range,
        "age": [18 + (user_id % 60) for user_id in id_range],
    }
    filler = generate_base_strings(size, string_length)
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_listens_chunk(start, size, num_users, num_songs, string_length=16):
    """Build one chunk of the Listens table as a DataFrame.

    ``listen_id`` covers ``start`` .. ``start + size - 1``. ``user_id`` and
    ``song_id`` are drawn uniformly with ``np.random.randint`` from
    ``[0, num_users)`` and ``[0, num_songs)`` respectively. The ten
    ``extra_col_<i>`` columns are rotations (``np.roll``) of one array of
    random strings of length ``string_length`` from ``generate_base_strings``.
    """
    # Draw order (users, then songs, then filler strings) fixes how the
    # global NumPy RNG stream is consumed, so keep it as is.
    user_ids = np.random.randint(0, num_users, size=size)
    song_ids = np.random.randint(0, num_songs, size=size)
    columns = {
        "listen_id": range(start, start + size),
        "user_id": user_ids,
        "song_id": song_ids,
    }
    filler = generate_base_strings(size, string_length)
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_base_strings(num_records, string_length):
    """Return a NumPy array of ``num_records`` random strings over the alphabet ``ab``.

    Every string has ``string_length`` characters; the characters are picked
    with ``np.random.randint``, so the result depends on the global NumPy RNG.
    """
    alphabet = np.array(["a", "b"])
    picks = np.random.randint(0, len(alphabet), size=(num_records, string_length))
    return np.array(["".join(row) for row in alphabet[picks]])


def _write_parquet_streamed(
    filename,
    total_rows,
    make_chunk_fn,
    chunk_size=250_000,
    compression="snappy",
):
    """
    Stream DataFrame chunks to a single Parquet file with one ParquetWriter.

    Args:
        filename: Path of the Parquet file to create.
        total_rows: Total number of rows to write.
        make_chunk_fn: Callable ``(start, size) -> pd.DataFrame`` producing the
            rows ``start`` .. ``start + size - 1``.
        chunk_size: Maximum number of rows requested per chunk.
        compression: Compression codec passed to ``pq.ParquetWriter``.

    The file schema is taken from the first chunk. The writer is closed even
    if producing or writing a chunk raises.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the loop could never
            make progress).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    first_chunk = make_chunk_fn(0, min(chunk_size, total_rows))
    first_table = pa.Table.from_pandas(first_chunk, preserve_index=False)
    written = len(first_chunk)
    del first_chunk

    # The context manager guarantees close() (and thus a valid footer attempt /
    # released file handle) on both the success and the error path.
    with pq.ParquetWriter(filename, first_table.schema, compression=compression) as writer:
        writer.write_table(first_table)
        del first_table
        gc.collect()

        while written < total_rows:
            take = min(chunk_size, total_rows - written)
            chunk_df = make_chunk_fn(written, take)
            writer.write_table(pa.Table.from_pandas(chunk_df, preserve_index=False))
            written += take
            # Free the chunk before building the next one to keep peak memory low.
            del chunk_df
            gc.collect()


def generate_test_data(target_size="100MB"):
    """
    Write the Songs, Users and Listens Parquet files for one dataset size.

    The files are named ``songs_<size>.parquet``, ``users_<size>.parquet`` and
    ``listens_<size>.parquet`` and are streamed chunk by chunk through
    ``_write_parquet_streamed``.

    Row counts per ``target_size`` (songs / users / listens):
        - "100MB": 10K / 50K / 1M
        - "1GB":   100K / 500K / 10M
        - "10GB":  1M / 5M / 100M

    The size label is the intended total compressed size on disk, split
    roughly 5% / 20% / 75% across Songs / Users / Listens.

    Foreign keys: ``generate_listens_chunk`` draws ``user_id`` from
    ``[0, num_users)`` and ``song_id`` from ``[0, num_songs)``, which are
    exactly the id ranges written to the Users and Songs files.

    Raises:
        AssertionError: If ``target_size`` is not one of the sizes above.
    """
    assert target_size in ["100MB", "1GB", "10GB"]

    # (songs, users, listens) row counts for each supported size.
    row_counts = {
        "100MB": (10_000, 50_000, 1_000_000),
        "1GB": (100_000, 500_000, 10_000_000),
        "10GB": (1_000_000, 5_000_000, 100_000_000),
    }
    num_songs, num_users, num_listens = row_counts[target_size]

    # Chunk sizes are the same for every target size.
    songs_chunk, users_chunk, listens_chunk = 10_000, 50_000, 1_000_000

    # (progress message, output file, total rows, chunk factory, chunk size)
    jobs = (
        (
            "Writing Songs",
            f"songs_{target_size}.parquet",
            num_songs,
            lambda start, size: generate_songs_chunk(start, size),
            songs_chunk,
        ),
        (
            "Writing Users",
            f"users_{target_size}.parquet",
            num_users,
            lambda start, size: generate_users_chunk(start, size),
            users_chunk,
        ),
        (
            "Writing Listens",
            f"listens_{target_size}.parquet",
            num_listens,
            lambda start, size: generate_listens_chunk(
                start, size, num_users, num_songs
            ),
            listens_chunk,
        ),
    )

    for banner, out_file, row_total, chunk_factory, rows_per_chunk in jobs:
        print(banner)
        _write_parquet_streamed(
            filename=out_file,
            total_rows=row_total,
            make_chunk_fn=chunk_factory,
            chunk_size=rows_per_chunk,
        )

    print("Done!")

# Section 0b: Define Memory and Performance Benchmarking Functions
- Memory will be monitored with the `memory_profiler` IPython magics: `%%memit` at the top of a cell measures the memory usage of the entire cell, while `%memit` measures the memory usage of a single line
- CPU performance will be measured with a custom decorator defined below


In [6]:
%load_ext memory_profiler

In [87]:
def timer(func):
    """
    Decorator to measure and print the execution time of a function.

    Usage:
        @timer
        def my_function(...):
            ...

    When the decorated function is called, it will print the elapsed time in seconds with a descriptive message.
    The wrapper keeps the decorated function's ``__name__`` and ``__doc__``.
    If the wrapped function raises, the exception propagates and nothing is printed.

    Returns:
        The result of the wrapped function, after printing its runtime.
    """
    # Local import: the file only imports `partial` from functools.
    from functools import wraps

    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - start_time
        print(f"Method '{func.__name__}' took {elapsed:.4f} seconds.")
        return result
    return wrapper

In [11]:
%%memit
random.seed(0)
if not os.path.exists("listens_100MB.parquet"):
    generate_test_data("100MB")
else:
    print("100MB data already generated")
if not os.path.exists("listens_1GB.parquet"):
    generate_test_data('1GB')
else:
    print("1GB data already generated")
if not os.path.exists("listens_10GB.parquet"):
    generate_test_data('10GB')
else:
    print("10GB data already generated")

100MB data already generated
1GB data already generated
Writing Songs
Writing Users
Writing Listens
Done!
peak memory: 2553.41 MiB, increment: 718.86 MiB


# Section 1: Parquet-based Columnar Storage

Implement Parquet-based storage for the tables
- For simplicity, store all data for a table in a single Parquet file and use a single DataFrame object as a buffer

In [12]:
# see ed: https://edstem.org/us/courses/87394/discussion/7251811 for advice on writing to a parquet without loading existing into RAM
# A ColumnarDbFile is actually a directory with an arbitrary number of parquet files inside.
# Append writes a new file with the next postfix (or streams into one file while streaming).
# Retrieve reads all parquet files and concatenates them together, done natively by pandas.
class ColumnarDbFile:
    """A table stored as a directory of Parquet files.

    The directory is ``{file_dir}/{file_pfx}_{table_name}`` and is created on
    construction. Files inside are named ``{table_name}-{n}.parquet``.
    """

    # RAM budget used by fits_in_12GB() and can_process_parquet().
    _RAM_BUDGET_BYTES = 12 * 1024**3

    def __init__(self, table_name, file_dir='data', file_pfx=''):
        self.file_pfx = file_pfx
        self.table_name = table_name
        self.file_dir = file_dir
        self.base_file_name = f"{self.file_dir}/{self.file_pfx}_{self.table_name}"
        os.makedirs(self.base_file_name, exist_ok=True)

        # Streaming state
        self._streaming = False
        self._stream_writer = None
        self._stream_file_path = None

    def build_table(self, data):
        """Build and save table data to Parquet.

        ``data`` is either the path of an existing file (copied as-is into the
        table directory) or a pandas DataFrame (written with ``to_parquet``).
        The table directory must not contain any parquet file yet.

        Raises:
            ValueError: If ``data`` is neither a DataFrame nor an existing file path.
        """
        assert self._get_num_parquets() == 0
        target_path = f"{self.base_file_name}/{self.table_name}-0.parquet"
        # If data is a string and is a valid file path, copy it
        if isinstance(data, str) and os.path.isfile(data):
            shutil.copy(data, target_path)
        elif isinstance(data, pd.DataFrame):
            data.to_parquet(target_path)
        else:
            raise ValueError("data must be a pandas DataFrame or a valid file path string")
        return

    def retrieve_data(self, columns=None, sample=None):
        """Create pd.DataFrame by reading from Parquet.

        With ``sample=None`` the whole directory is read. Otherwise only the
        first page of at most ``sample`` rows (see ``iter_pages``) is returned;
        if the table has no rows at all, an empty DataFrame with the requested
        ``columns`` is returned instead of leaking ``StopIteration``.
        """
        if sample is not None:
            first_page = next(
                self.iter_pages(sample, columns=columns, as_pandas=True), None
            )
            if first_page is None:
                return pd.DataFrame(columns=columns)
            return first_page
        return pd.read_parquet(self.base_file_name, columns=columns)

    def append_data(self, data):
        """Append new data to Parquet

        Behavior depends on streaming mode:
        - If streaming (start_stream() called): writes to a single parquet file via ParquetWriter
        - Otherwise: creates a new parquet file for each call
        """
        if self._streaming:
            # Convert DataFrame to PyArrow Table
            table = pa.Table.from_pandas(data, preserve_index=False)

            # Lazy writer creation: the schema is only known once the first batch arrives
            if self._stream_writer is None:
                self._stream_writer = pq.ParquetWriter(self._stream_file_path, table.schema)

            # Write to stream
            self._stream_writer.write_table(table)
        else:
            # Original behavior: create new file
            data.to_parquet(self.get_new_parquet_file())
        return

    def get_new_parquet_file(self):
        '''Return the path for the next file: the postfix is the current number of parquet files.'''
        return f"{self.base_file_name}/{self.table_name}-{self._get_num_parquets()}.parquet"

    def _get_num_parquets(self):
        return len(self.get_all_parquet_paths())

    @staticmethod
    def _file_order_key(path):
        """Sort key placing ``<name>-<n>.parquet`` files in numeric order of ``n``."""
        stem = os.path.splitext(os.path.basename(path))[0]
        postfix = stem.rpartition("-")[2]
        if postfix.isdecimal():
            return (0, int(postfix), path)
        # Files without a numeric postfix go last, ordered by path.
        return (1, 0, path)

    def get_all_parquet_paths(self):
        """Return the table's parquet file paths in a deterministic, numeric-postfix order.

        ``glob`` alone returns files in arbitrary directory order, which would
        make page iteration and sampling vary between runs.
        """
        return sorted(
            glob.glob(f"{self.base_file_name}/*.parquet"), key=self._file_order_key
        )

    def start_stream(self):
        """Start streaming mode for efficient batch writes.

        After calling this, append_data() will write to a single parquet file
        using ParquetWriter (streaming) instead of creating separate files.
        Must call stop_stream() when done to properly close the writer.

        If called multiple times, closes any existing writer and starts a new stream.

        Can also be used as a context manager:
            with output_db:
                output_db.append_data(df1)
                output_db.append_data(df2)
            # Automatically stops streaming
        """
        # Close existing writer if streaming was already active
        if self._streaming and self._stream_writer is not None:
            self._stream_writer.close()

        # Initialize streaming state
        self._streaming = True
        self._stream_file_path = self.get_new_parquet_file()
        self._stream_writer = None  # Will be created lazily on first append_data()

    def stop_stream(self):
        """Stop streaming mode and close the ParquetWriter.

        Safe to call multiple times or if streaming was never started.
        """
        if self._stream_writer is not None:
            self._stream_writer.close()
            self._stream_writer = None

        self._streaming = False
        self._stream_file_path = None

    def __enter__(self):
        """Context manager entry: start streaming mode."""
        self.start_stream()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: stop streaming mode."""
        self.stop_stream()
        return False  # Don't suppress exceptions

    def __del__(self):
        """Destructor: ensure stream is closed if not explicitly stopped."""
        # getattr guards against a partially constructed object (e.g. __init__
        # failed in os.makedirs before the streaming attributes were set).
        writer = getattr(self, "_stream_writer", None)
        if getattr(self, "_streaming", False) and writer is not None:
            try:
                writer.close()
            except Exception:
                pass  # Best-effort cleanup; never raise from a destructor

    def table_metadata(self):
        """Return file count, total rows and total compressed column bytes without loading data.

        ``total_compressed_bytes`` is the sum of ``total_compressed_size`` over
        every column chunk of every row group, read from the Parquet footers.
        """
        parquet_files = self.get_all_parquet_paths()

        total_rows = 0
        total_bytes = 0

        for file in parquet_files:
            meta = pq.ParquetFile(file).metadata
            total_rows += meta.num_rows
            # meta.serialized_size is only the size of the footer itself, so
            # the compressed data size has to be summed from the column chunks.
            for rg_idx in range(meta.num_row_groups):
                row_group = meta.row_group(rg_idx)
                for col_idx in range(row_group.num_columns):
                    total_bytes += row_group.column(col_idx).total_compressed_size

        return {
            "num_files": len(parquet_files),
            "total_rows": total_rows,
            "total_compressed_bytes": total_bytes,
        }

    def table_disk_usage(self):
        """Return the number of parquet files and their summed size on disk in bytes."""
        parquet_files = self.get_all_parquet_paths()

        total_bytes = sum(os.path.getsize(f) for f in parquet_files)

        return {
            "num_files": len(parquet_files),
            "total_bytes": total_bytes
        }

    def iter_pages(self, rows_per_batch: int = 100_000, columns=None, as_pandas=True):
        """Yield the table in batches of at most ``rows_per_batch`` rows, file by file.

        Each batch is a pandas DataFrame when ``as_pandas`` is true, otherwise
        the batch object produced by ``ParquetFile.iter_batches``.
        """
        for path in self.get_all_parquet_paths():
            pf = pq.ParquetFile(path)
            for batch in pf.iter_batches(batch_size=rows_per_batch, columns=columns):
                yield batch.to_pandas() if as_pandas else batch

    @staticmethod
    def fits_in_12GB(bytes_needed: int) -> bool:
        """Return True if ``bytes_needed`` does not exceed 12 GiB."""
        return bytes_needed <= ColumnarDbFile._RAM_BUDGET_BYTES

    @staticmethod
    def can_process_parquet(bytes_on_disk: int, compression_factor: int = 5) -> bool:
        """
        Returns True if a Parquet dataset of `bytes_on_disk` can be processed
        within 12 GB of RAM, after accounting for decompression expansion.
        """
        estimated_ram = bytes_on_disk * compression_factor
        return ColumnarDbFile.fits_in_12GB(estimated_ram)

In [14]:
%%memit
# Rebuild the on-disk table directories from scratch so files from a previous
# run cannot trip build_table's "directory must be empty" assertion.
print("Building tables...")
if os.path.exists('data'):
    print("Removing existing data directory")
    shutil.rmtree('data')

# One ColumnarDbFile per (table, size) pair, keyed like "Songs_100MB".
sizes = ["100MB", "1GB", "10GB"]
tables = {}
for size in sizes:
    for table_name in ["Songs", "Users", "Listens"]:
        key = f"{table_name}_{size}"
        tables[key] = ColumnarDbFile(f"{table_name}_{size}", file_dir='data')
        # Source files use lower-case names, e.g. "songs_100MB.parquet".
        parquet_path = f"{table_name.lower()}_{size}.parquet"
        assert os.path.exists(parquet_path)
        # Passing a path makes build_table copy the file instead of loading it.
        tables[key].build_table(parquet_path)

print("Tables built successfully.")


Building tables...
Removing existing data directory
Tables built successfully.
peak memory: 1770.94 MiB, increment: 0.14 MiB


In [15]:
# Retrieve only the song_id and title columns of the 100MB Songs table.
tables['Songs_100MB'].retrieve_data(columns = ['song_id', 'title'])

Unnamed: 0,song_id,title
0,0,Song_0
1,1,Song_1
2,2,Song_2
3,3,Song_3
4,4,Song_4
...,...,...
9995,9995,Song_9995
9996,9996,Song_9996
9997,9997,Song_9997
9998,9998,Song_9998


In [16]:
# Retrieve only the three key columns of the 100MB Listens table.
tables['Listens_100MB'].retrieve_data(columns = ['listen_id', 'user_id', 'song_id'])

Unnamed: 0,listen_id,user_id,song_id
0,0,19936,7687
1,1,37756,9045
2,2,35676,3593
3,3,18861,2977
4,4,9826,4653
...,...,...,...
999995,999995,15502,4168
999996,999996,1562,1217
999997,999997,5838,2871
999998,999998,35276,1541


Analyze and report on:
- Space efficiency compared to row storage
  - e.g. Compare file sizes on disk: How much disk space does Parquet use vs. a row storage format like CSV?
- Compression ratios achieved with Parquet
  - e.g. Compare Parquet’s uncompressed encoded size (reported in its metadata) to its compressed on-disk size to compute compression ratios.
  - You could also report the memory expansion factor: how much larger the dataset becomes when loaded into a `pd.DataFrame` compared to the compressed file size.
- Read/write performance characteristics
  - e.g. Read performance: How long does it take to read all columns from Parquet vs. CSV?
  - e.g. Columnar advantage: How long does it take to read selective columns from Parquet vs. reading all columns?
  - e.g. Write performance: How long does it take to write data to Parquet vs. CSV?

In [53]:
def analyze(size="100MB"):
    """Compare Parquet with CSV for the Songs, Users and Listens files of one size.

    For each ``<table>_<size>.parquet`` file in the working directory this
    measures: on-disk Parquet and CSV sizes, the Parquet compression ratio
    (uncompressed column bytes from the footer / file size), the memory
    expansion (DataFrame bytes / file size), and wall-clock times for writing
    and for reading all columns versus all-but-the-last column in both formats.

    Prints a heading and returns one summary row per table as a DataFrame.
    """

    def seconds_taken(action, *args, **kwargs):
        # Wall-clock duration of a single call; the call's result is discarded.
        began = time.perf_counter()
        action(*args, **kwargs)
        return time.perf_counter() - began

    report_rows = []

    for table_name in ("Songs", "Users", "Listens"):
        parquet_path = Path(f"{table_name.lower()}_{size}.parquet")

        df = pd.read_parquet(parquet_path)
        mem_usage_bytes = df.memory_usage(deep=True).sum()  # in-memory footprint
        parquet_size_bytes = parquet_path.stat().st_size  # compressed size on disk

        # Total uncompressed size: sum over every column chunk of every row group.
        metadata = pq.ParquetFile(parquet_path).metadata
        column_chunks = (
            metadata.row_group(rg_idx).column(col_idx)
            for rg_idx in range(metadata.num_row_groups)
            for col_idx in range(metadata.row_group(rg_idx).num_columns)
        )
        uncompressed_bytes = sum(
            chunk.total_uncompressed_size
            for chunk in column_chunks
            if chunk.total_uncompressed_size is not None
        )

        compression_ratio = uncompressed_bytes / parquet_size_bytes
        memory_expansion = mem_usage_bytes / parquet_size_bytes

        # Selective read = every column except the last one.
        subset_columns = list(df.columns)[:-1]

        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            csv_path = scratch / f"{parquet_path.stem}.csv"
            parquet_tmp_path = scratch / f"{parquet_path.stem}.parquet"

            write_csv_time = seconds_taken(df.to_csv, csv_path, index=False)
            csv_size_bytes = csv_path.stat().st_size
            write_parquet_time = seconds_taken(
                df.to_parquet, parquet_tmp_path, index=False
            )

            read_parquet_all = seconds_taken(pd.read_parquet, parquet_path)
            read_csv_all = seconds_taken(pd.read_csv, csv_path)
            read_parquet_subset = seconds_taken(
                pd.read_parquet, parquet_path, columns=subset_columns
            )
            read_csv_subset = seconds_taken(
                pd.read_csv, csv_path, usecols=subset_columns
            )

        size_saving_pct = 100.0 * (1 - parquet_size_bytes / csv_size_bytes)

        megabyte = 1024 ** 2
        report_rows.append(
            {
                "table": table_name,
                "parquet_size_mb": parquet_size_bytes / megabyte,
                "csv_size_mb": csv_size_bytes / megabyte,
                "size_saving_pct": size_saving_pct,
                "compression_ratio": compression_ratio,
                "memory_expansion": memory_expansion,
                "read_parquet_all_s": read_parquet_all,
                "read_csv_all_s": read_csv_all,
                "read_parquet_subset_s": read_parquet_subset,
                "read_csv_subset_s": read_csv_subset,
                "write_parquet_s": write_parquet_time,
                "write_csv_s": write_csv_time,
            }
        )

        # Release the table before loading the next (larger) one.
        del df
        gc.collect()

    summary = pd.DataFrame(report_rows)
    print("Analysis Summary for Tables of Size " + size + " (sizes in MB, times in seconds):")
    return summary

In [None]:
# Run the storage/performance analysis on the 100MB tables and render the summary.
display(analyze(size="100MB"))

Analysis Summary for Tables of Size 100MB (sizes in MB, times in seconds):


Unnamed: 0,table,parquet_size_mb,csv_size_mb,size_saving_pct,compression_ratio,memory_expansion,read_parquet_all_s,read_csv_all_s,read_parquet_subset_s,read_csv_subset_s,write_parquet_s,write_csv_s
0,Songs,4.271927,9.773173,56.289255,2.41591,3.47343,0.00923,0.070341,0.007404,0.064247,0.02648,0.099192
1,Users,20.347857,48.579238,58.114089,2.471382,3.529207,0.040647,0.349069,0.036233,0.315945,0.098274,0.457715
2,Listens,79.926873,178.866784,55.31486,2.43253,8.042059,0.259581,1.787849,0.283681,1.761461,0.606222,2.841885


peak memory: 1890.39 MiB, increment: 507.52 MiB


In [None]:
# Run the storage/performance analysis on the 1GB tables and render the summary.
display(analyze(size="1GB"))

Analysis Summary for Tables of Size 1GB (sizes in MB, times in seconds):


Unnamed: 0,table,parquet_size_mb,csv_size_mb,size_saving_pct,compression_ratio,memory_expansion,read_parquet_all_s,read_csv_all_s,read_parquet_subset_s,read_csv_subset_s,write_parquet_s,write_csv_s
0,Songs,42.661434,97.92129,56.432933,2.421422,3.480348,0.107253,0.775493,0.101759,0.723374,0.193498,0.89897
1,Users,203.403031,486.268065,58.170596,2.4723,3.530512,0.703518,3.862735,0.607247,3.443182,0.965171,4.605921
2,Listens,834.570071,1817.281654,54.07591,2.409382,7.701888,1.894318,21.883511,3.630786,20.319147,6.950246,29.20366


peak memory: 5707.55 MiB, increment: 4143.92 MiB


# Section 2: Parse SQL Query

In this section, you should implement logic to parse the following SQL query:
```sql
    SELECT s.song_id, AVG(u.age) AS avg_age,
       COUNT(DISTINCT l.user_id) AS count_distinct_users
    FROM Songs s
    JOIN Listens l ON s.song_id = l.song_id
    JOIN Users u ON l.user_id = u.user_id
    GROUP BY s.song_id, s.title
    ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
```

You should manually extract the components from the provided query (i.e. you don't need to implement a general SQL parser, just handle this specific query).

In [51]:
# The one query this notebook parses and executes. parse_sql lower-cases it
# before matching, so keyword and identifier case here does not matter.
# NOTE(review): COUNT(DISTINCT l.user_id) has no "AS count_distinct_users"
# alias here, unlike the query in the section text — confirm this is intended.
query = """SELECT s.song_id, AVG(u.age) AS avg_age,
COUNT(DISTINCT l.user_id)
FROM Songs s
JOIN Listens l ON s.song_id = l.song_id
JOIN Users u ON l.user_id = u.user_id
GROUP BY s.song_id, s.title
ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
"""

In [49]:
import re
import re

def parse_tables(query):
    """Map each single-letter table alias to its (lower-cased) table name.

    Looks for ``FROM <table> <alias>`` and ``JOIN <table> <alias>`` in
    ``query``. The query is lower-cased first, like the other ``parse_*``
    helpers do, so the function also works when called with the raw query.

    Returns:
        dict such as ``{'s': 'songs', 'l': 'listens'}``; empty if nothing matches.
    """
    # Matches "from songs s" or "join listens l". The word boundaries keep a
    # following keyword (e.g. "from songs where") from being read as an alias.
    pattern = r"\b(?:from|join)\s+([a-z_]+)\s+([a-z])\b"

    return {
        alias: table_name
        for table_name, alias in re.findall(pattern, query.lower())
    }

def parse_joins(query):
    """Extract the base table and the ordered JOIN clauses from ``query``.

    The query is lower-cased first, like the other ``parse_*`` helpers do, so
    the function also works when called with the raw query.

    Returns:
        ``{"base_table": (alias, table_name), "Joins": [...]}`` where each
        join is a dict with the keys ``joined_table_alias``,
        ``joined_table_name``, ``left_alias``, ``left_column``,
        ``right_alias`` and ``right_column``. "Left"/"right" refer to the two
        sides of the ``=`` in the ON condition.

    Raises:
        ValueError: If no ``FROM <table> <alias>`` clause is found.
    """
    q = query.lower()

    # 1) Base table from the FROM clause (alias must be a single letter).
    base_match = re.search(r"\bfrom\s+([a-z_]+)\s+([a-z])\b", q)
    if not base_match:
        raise ValueError("Could not find FROM clause")
    base_table = (base_match.group(2), base_match.group(1))

    # 2) Each JOIN clause, in query order. Pattern matches e.g.:
    #   join listens l on s.song_id = l.song_id
    join_pattern = (
        r"\bjoin\s+([a-z_]+)\s+([a-z])\s+on\s+"
        r"([a-z])\.([a-z_]+)\s*=\s*([a-z])\.([a-z_]+)"
    )

    joins = [
        {
            "joined_table_alias": m.group(2),
            "joined_table_name": m.group(1),
            "left_alias": m.group(3),
            "left_column": m.group(4),
            "right_alias": m.group(5),
            "right_column": m.group(6),
        }
        for m in re.finditer(join_pattern, q)
    ]

    return {"base_table": base_table, "Joins": joins}


def parse_group_by(query):
    """
    Extract the GROUP BY columns as (alias, column) tuples.

    The clause is everything between GROUP BY and the next ORDER BY,
    semicolon, or end of the query; every entry must look like alias.column.
    Returns [] when the query has no GROUP BY clause.
    Example: [('s', 'song_id'), ('s', 'title')]
    """
    found = re.search(
        r"group\s+by\s+(.+?)(order\s+by|;|$)", query.lower(), re.DOTALL
    )
    if found is None:
        return []

    pairs = []
    for qualified in found.group(1).strip().split(","):
        # Unpacking raises ValueError unless the entry is exactly alias.column
        alias, column = qualified.strip().split(".")
        pairs.append((alias, column))
    return pairs

def parse_select_and_aggregations(query):
    """
    Parse the SELECT list into (select_list, aggregations).

    aggregations: {agg_key: {"func", "source", "distinct", "output_name"}},
      with integer keys numbered from 1 in order of appearance.
    select_list: one dict per SELECT item; aggregation items point at their
      entry in `aggregations` through "agg_key", plain alias.column items
      carry their "source" directly.

    Supports AVG(alias.col) and COUNT(DISTINCT alias.col), each with an
    optional "AS name". Returns ([], {}) when no SELECT ... FROM is found.
    """
    clause = re.search(r"select\s+(.+?)\s+from", query.lower(), re.DOTALL)
    if clause is None:
        return [], {}

    # (item prefix, full pattern, func name, distinct flag, error prefix)
    agg_specs = (
        (
            "avg(",
            r"avg\(\s*([a-z])\.([a-z_]+)\s*\)(\s+as\s+([a-z_]+))?",
            "avg",
            False,
            "Could not parse AVG aggregation",
        ),
        (
            "count(",
            r"count\(\s*distinct\s+([a-z])\.([a-z_]+)\s*\)(\s+as\s+([a-z_]+))?",
            "count",
            True,
            "Could not parse COUNT aggregation",
        ),
    )

    select_list = []
    aggregations = {}

    for piece in clause.group(1).strip().split(","):
        item = piece.strip()
        if not item:
            continue

        spec = next((s for s in agg_specs if item.startswith(s[0])), None)

        # Plain column: alias.column
        if spec is None:
            alias_letter, col_name = item.split(".")
            select_list.append(
                {
                    "kind": "column",
                    "source": (alias_letter, col_name),
                    "alias": None,
                }
            )
            continue

        _, pattern, func_name, is_distinct, error_prefix = spec
        parsed = re.match(pattern, item)
        if parsed is None:
            raise ValueError(f"{error_prefix}: {item}")

        out_alias = parsed.group(4) or None
        agg_key = len(aggregations) + 1  # keys run 1, 2, ... in SELECT order
        aggregations[agg_key] = {
            "func": func_name,
            "source": (parsed.group(1), parsed.group(2)),
            "distinct": is_distinct,
            "output_name": out_alias,
        }
        select_list.append(
            {
                "kind": "aggregation",
                "agg_key": agg_key,
                "alias": out_alias,
            }
        )

    return select_list, aggregations


def parse_order_by(query, aggregations):
    """
    Parse the ORDER BY clause into a list of sort entries.

    Each entry is either {"kind": "column", "source": (alias, column),
    "direction"} or {"kind": "aggregation", "agg_key", "direction"}, where
    agg_key refers to the matching COUNT(DISTINCT ...) entry in
    `aggregations`. Direction defaults to "asc". Returns [] when the query
    has no ORDER BY clause.
    """
    clause = re.search(r"order\s+by\s+(.+?)(;|$)", query.lower(), re.DOTALL)
    if clause is None:
        return []

    order_by = []

    for piece in clause.group(1).strip().split(","):
        item = piece.strip()
        if not item:
            continue

        # Peel off an optional trailing direction keyword.
        expr, direction = item, "asc"
        for suffix in (" desc", " asc"):
            if expr.endswith(suffix):
                direction = suffix.strip()
                expr = expr[: -len(suffix)].strip()
                break

        # Anything that is not COUNT(...) is assumed to be alias.column
        if not expr.startswith("count("):
            alias_letter, col_name = expr.split(".")
            order_by.append(
                {
                    "kind": "column",
                    "source": (alias_letter, col_name),
                    "direction": direction,
                }
            )
            continue

        # COUNT(DISTINCT ...) → look up the aggregation with the same source
        parsed = re.match(r"count\(\s*distinct\s+([a-z])\.([a-z_]+)\s*\)", expr)
        if parsed is None:
            raise ValueError(f"Could not parse ORDER BY aggregation: {expr}")
        src = (parsed.group(1), parsed.group(2))

        agg_key = next(
            (
                key
                for key, agg in aggregations.items()
                if agg["func"] == "count" and agg["distinct"] and agg["source"] == src
            ),
            None,
        )
        if agg_key is None:
            raise ValueError(f"No matching aggregation found for ORDER BY expr: {expr}")

        order_by.append(
            {
                "kind": "aggregation",
                "agg_key": agg_key,
                "direction": direction,
            }
        )

    return order_by

def parse_sql(query):
    """
    Parse the supported query into its components.

    The query is lower-cased once and handed to the individual parse_*
    helpers. Returns a dict with the keys "tables", "joins", "GroupBy",
    "select", "aggregations" and "orderBy".
    """
    lowered = query.lower()

    # Keep this call order: it decides which helper's error surfaces first
    # for a malformed query.
    tables = parse_tables(lowered)
    joins = parse_joins(lowered)
    group_by = parse_group_by(lowered)
    select_list, aggregations = parse_select_and_aggregations(lowered)
    order_by = parse_order_by(lowered, aggregations)

    return {
        "tables": tables,
        "joins": joins,
        "GroupBy": group_by,
        "select": select_list,
        "aggregations": aggregations,
        "orderBy": order_by,
    }

In [52]:
# Parse the query and print each parsed component on its own line.
output = parse_sql(query)
for key, value in output.items():
    print(f"{key}: {value}")

tables: {'s': 'songs', 'l': 'listens', 'u': 'users'}
joins: {'base_table': ('s', 'songs'), 'Joins': [{'joined_table_alias': 'l', 'joined_table_name': 'listens', 'left_alias': 's', 'left_column': 'song_id', 'right_alias': 'l', 'right_column': 'song_id'}, {'joined_table_alias': 'u', 'joined_table_name': 'users', 'left_alias': 'l', 'left_column': 'user_id', 'right_alias': 'u', 'right_column': 'user_id'}]}
GroupBy: [('s', 'song_id'), ('s', 'title')]
select: [{'kind': 'column', 'source': ('s', 'song_id'), 'alias': None}, {'kind': 'aggregation', 'agg_key': 1, 'alias': 'avg_age'}, {'kind': 'aggregation', 'agg_key': 2, 'alias': None}]
aggregations: {1: {'func': 'avg', 'source': ('u', 'age'), 'distinct': False, 'output_name': 'avg_age'}, 2: {'func': 'count', 'source': ('l', 'user_id'), 'distinct': True, 'output_name': None}}
orderBy: [{'kind': 'aggregation', 'agg_key': 2, 'direction': 'desc'}, {'kind': 'column', 'source': ('s', 'song_id'), 'direction': 'asc'}]


# Section 3: Implement Join Algorithms

In this section, you will implement the execution operators (*how* to join) and aggregation after joins.

**Reminder:** If you use temporary files or folders, you should clean them up either as part of your join logic, or after each run. Otherwise you might run into correctness issues!

In [17]:
import hashlib

def HASHVALUE(value, B):
    """
    Map `value` to a bucket number in [0, B).

    Integers (Python ints and NumPy integer scalars alike) are bucketed with
    the built-in integer hash, so the same key lands in the same bucket no
    matter which of the two integer types it arrives as. Every other value is
    converted with str() and bucketed by its SHA-256 digest, which is stable
    across processes.

    B must be a non-zero integer; B == 0 raises ZeroDivisionError.
    """
    if isinstance(value, (int, np.integer)):
        # int() normalises NumPy scalars so the result is a plain Python int
        return hash(int(value)) % B
    digest = hashlib.sha256(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16) % B

In [18]:
def hash_partition(
    table: ColumnarDbFile,
    hash_keys: List[str],
    num_partitions: int,
    parquet_batch_size: int,
    hash_value_fn: Callable[[object, int], int],
    make_partition_path_fn: Callable[[int], str],
    columns: Optional[List[str]] = None,
):
    """
    Hash-partition `table` into up to `num_partitions` Parquet files.

    - `hash_keys` is a non-empty list of column names.
    - If len(hash_keys) > 1, the key columns are converted to strings and
      joined with "|" to form the value that is hashed. That combined value is
      never written to the partition files.
    - `hash_value_fn(key, num_partitions)` returns an int in [0, num_partitions).
    - `make_partition_path_fn(part_id)` returns the file path for a partition.
      A file is only created for partitions that receive at least one row.
    - `columns` are the columns to write into each partition. Any `hash_keys`
      missing from it are added to the columns read and written; the caller's
      list itself is left untouched.

    Raises ValueError when `hash_keys` is empty.
    """
    if not hash_keys:
        raise ValueError("hash_keys must contain at least one column name")
    is_multi_col = len(hash_keys) > 1

    # Extend a private copy so the caller's list is not modified
    selected_columns = columns
    if columns:
        selected_columns = list(columns)
        for col in hash_keys:
            if col not in selected_columns:
                selected_columns.append(col)

    writers: Dict[int, pq.ParquetWriter] = {}
    try:
        for batch_df in table.iter_pages(columns=selected_columns, rows_per_batch=parquet_batch_size):
            if batch_df.empty:
                continue

            # Value to hash: the single key column, or the "|"-joined key columns
            if is_multi_col:
                key_values = batch_df[hash_keys].astype(str).agg("|".join, axis=1)
            else:
                key_values = batch_df[hash_keys[0]]

            # Partition id per row, as a plain array so grouping is positional
            part_ids = key_values.apply(
                lambda x: hash_value_fn(x, num_partitions)
            ).to_numpy()

            out_df = batch_df[selected_columns] if selected_columns else batch_df

            # Group rows by partition and append each group to its file
            for part_id, part_df in out_df.groupby(part_ids):
                part_id = int(part_id)
                part_table = pa.Table.from_pandas(part_df, preserve_index=False)

                writer = writers.get(part_id)
                if writer is None:
                    part_path = make_partition_path_fn(part_id)
                    writer = pq.ParquetWriter(part_path, part_table.schema)
                    writers[part_id] = writer

                writer.write_table(part_table)
    finally:
        # Close every writer even if a batch failed, so no file handle leaks
        for w in writers.values():
            w.close()

Implement `HashPartitionJoin`:
1. Hash partition both tables
2. Build hash table from smaller partition
3. Probe with larger partition
4. Return joined results

In [None]:
class FastHashPartitionJoin:
    """
    Inner equi-join of two ColumnarDbFile tables using hash partitioning.

    Both inputs are hash-partitioned on their join key into Parquet files under
    a scratch directory. Each pair of partitions is then joined by loading the
    side with fewer rows into memory, indexing its rows by key, and probing
    that index with the other side one batch at a time.
    """

    def __init__(self, num_partitions=4, parquet_batch_size=100_000, use_streaming=False, time_it=True):
        self.num_partitions = num_partitions
        self.parquet_batch_size = parquet_batch_size
        self.use_streaming = use_streaming
        # Kept on the instance; no method of this class reads it.
        self.time_it = time_it

    def join(self, table1: "ColumnarDbFile", table2: "ColumnarDbFile", join_key1, join_key2,
             temp_dir='temp', columns_table1=None, columns_table2=None):
        """
        Perform an optimized hash partition join between two ColumnarDbFile instances.

        Parameters:
        - table1 / table2: left and right tables
        - join_key1 / join_key2: join column in table1 / table2
        - temp_dir: scratch directory for partition files; it is emptied before
          partitioning and removed again before this method returns or raises
        - columns_table1 / columns_table2: columns to keep from each table
          (the caller's lists are not modified)

        Returns:
        - ColumnarDbFile named "hpj_<table1>_<table2>" holding the joined rows

        Speed ups:
        - the smaller partition is loaded into memory and indexed with a pandas
          groupby; the larger partition is probed in batches
        - result rows are assembled with positional indexing instead of being
          built one row at a time: see _vectorized_join
        """
        # Start from an empty scratch directory: a partition file left behind by
        # an earlier run would otherwise be joined as if it belonged to this one.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir)

        try:
            # Partition both tables
            self._hash_partition(table1, join_key1, temp_dir, 'left', columns_table1)
            self._hash_partition(table2, join_key2, temp_dir, 'right', columns_table2)

            output = ColumnarDbFile(f"hpj_{table1.table_name}_{table2.table_name}")

            # Clean up any existing files in the output directory
            if os.path.exists(output.base_file_name):
                for file_path in output.get_all_parquet_paths():
                    os.remove(file_path)

            if self.use_streaming:
                output.start_stream()
            try:
                for part_id in range(self.num_partitions):
                    left_path = self._make_partition_path(temp_dir, "left", part_id)
                    right_path = self._make_partition_path(temp_dir, "right", part_id)

                    # A partition that is missing on either side cannot produce matches
                    if not (os.path.exists(left_path) and os.path.exists(right_path)):
                        continue

                    self._process_partition(
                        left_path, right_path, join_key1, join_key2, output
                    )
            finally:
                # Stop the stream even when a partition fails
                if self.use_streaming:
                    output.stop_stream()
        finally:
            # Best-effort cleanup; must not mask an error raised above
            shutil.rmtree(temp_dir, ignore_errors=True)

        return output

    def _process_partition(self, left_path, right_path, join_key1, join_key2, output):
        """
        Join one pair of partition files.

        The row counts stored in the Parquet metadata decide which side is
        loaded into memory (the one with fewer rows, left on ties); the other
        side is used for probing.
        """
        left_size = pq.ParquetFile(left_path).metadata.num_rows
        right_size = pq.ParquetFile(right_path).metadata.num_rows

        if left_size <= right_size:
            # Build from left, probe with right
            self._build_and_probe(left_path, right_path, join_key1, join_key2,
                                  output, left_is_build=True)
        else:
            # Build from right, probe with left
            self._build_and_probe(right_path, left_path, join_key2, join_key1,
                                  output, left_is_build=False)

    def _build_and_probe(self, build_path, probe_path, build_key, probe_key,
                         output, left_is_build):
        """
        Load the build side, index it by key, and probe it batch by batch.

        Every non-empty batch of joined rows is appended to `output`.
        """
        # Read the build side once and index the in-memory frame
        build_df = pq.read_table(build_path).to_pandas()
        hash_map = self._index_rows_by_key(build_df, build_key)

        probe_file = pq.ParquetFile(probe_path)
        for probe_batch in probe_file.iter_batches(batch_size=self.parquet_batch_size):
            probe_df = probe_batch.to_pandas()

            joined_df = self._vectorized_join(
                build_df, probe_df, hash_map, probe_key, left_is_build
            )

            if not joined_df.empty:
                output.append_data(joined_df)

            # Explicit memory cleanup between batches
            del probe_df
            del joined_df
            gc.collect()

        del build_df
        del hash_map
        gc.collect()

    def _build_hash_map(self, file_path, key_column):
        """
        Read a partition file and index its rows by `key_column`.

        Returns a dictionary mapping each key to a numpy array of row positions.
        """
        return self._index_rows_by_key(pq.read_table(file_path).to_pandas(), key_column)

    def _index_rows_by_key(self, df, key_column):
        """
        Map each distinct value of `key_column` to the rows of `df` holding it.

        The rows are identified by the values reset_index() puts in its "index"
        column; for a frame with a default RangeIndex these are the row
        positions that _vectorized_join passes to iloc.
        """
        # pandas groupby instead of a manual Python loop over the rows
        return df.reset_index().groupby(key_column)['index'].apply(np.array).to_dict()

    def _vectorized_join(self, build_df, probe_df, hash_map, probe_key, left_is_build):
        """
        Join one probe batch against the in-memory build side.

        1. Look up each probe key in `hash_map` to find the matching build rows
        2. Build two parallel position arrays: entry i of `build_indices` is
           joined with entry i of `probe_indices`
        3. Materialise both sides with iloc and concatenate them column-wise

        `left_is_build` says which side supplies the left-hand columns. Columns
        present on both sides are taken from the left side only. Returns an
        empty DataFrame when no probe key matches.
        """
        probe_keys = probe_df[probe_key].values

        matched_build = []   # one array of build positions per matching probe row
        matched_probe = []   # position of that probe row
        match_counts = []    # number of build rows it matched

        for probe_idx, key in enumerate(probe_keys):
            build_idxs = hash_map.get(key)
            if build_idxs is None or len(build_idxs) == 0:
                continue
            matched_build.append(build_idxs)
            matched_probe.append(probe_idx)
            match_counts.append(len(build_idxs))

        if not matched_build:
            return pd.DataFrame()

        build_indices = np.concatenate(matched_build)
        # Repeat each probe position once per build row it matched
        probe_indices = np.repeat(matched_probe, match_counts)

        if left_is_build:
            left_result = build_df.iloc[build_indices].reset_index(drop=True)
            right_result = probe_df.iloc[probe_indices].reset_index(drop=True)
        else:
            left_result = probe_df.iloc[probe_indices].reset_index(drop=True)
            right_result = build_df.iloc[build_indices].reset_index(drop=True)

        # Drop duplicate columns from right side (keeping left)
        common_columns = set(left_result.columns) & set(right_result.columns)
        if common_columns:
            right_result = right_result.drop(columns=list(common_columns))

        return pd.concat([left_result, right_result], axis=1)

    def _make_partition_path(self, output_dir, side, part_id):
        """Return the path of partition `part_id` for `side` ('left' or 'right')."""
        return f"{output_dir}/{side}_part{part_id}.parquet"

    def _hash_partition(self, table: "ColumnarDbFile", join_key, output_dir, side, columns=None):
        """Hash-partition `table` on `join_key` into `output_dir` for the given side."""
        make_partition_path_fn = partial(self._make_partition_path, output_dir, side)
        # Hand over a copy so the caller's column list can never be modified
        columns = list(columns) if columns else columns
        hash_partition(table, [join_key], self.num_partitions, self.parquet_batch_size,
                       HASHVALUE, make_partition_path_fn, columns=columns)

In [36]:
# see ed: https://edstem.org/us/courses/87394/discussion/7151010 for discussion on this implementation
class SlowHashPartitionJoin:
    """
    Hash-partition join that builds and probes one row at a time.

    Both tables are hash-partitioned on their join key; every pair of
    partitions is loaded fully into memory and joined with plain Python loops.
    NOTE(review): presumably kept as the baseline that FastHashPartitionJoin
    is measured against - confirm before optimising the loops.
    """

    def __init__(self, num_partitions=4, parquet_batch_size=100_000, use_streaming=False):
        self.num_partitions = num_partitions
        self.parquet_batch_size = parquet_batch_size
        self.use_streaming = use_streaming

    def join(self, table1: "ColumnarDbFile", table2: "ColumnarDbFile", join_key1, join_key2,
             temp_dir='temp', columns_table1=None, columns_table2=None):
        """
        Perform a hash partition join between two ColumnarDbFile instances.

        Parameters:
        - table1: Left table (ColumnarDbFile)
        - table2: Right table (ColumnarDbFile)
        - join_key1: Join key from table1
        - join_key2: Join key from table2
        - temp_dir: Directory to store temporary files; it is emptied before
          partitioning and removed again before this method returns or raises
        - columns_table1: List of columns to select from table1
        - columns_table2: List of columns to select from table2

        Returns:
        - join_result_table: ColumnarDbFile instance containing the join results
        """
        # Start from an empty scratch directory so that partition files left by
        # an earlier run cannot be mistaken for this run's partitions.
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir)

        try:
            # Partition both tables
            self._hash_partition(table1, join_key1, temp_dir, 'left', columns_table1)
            self._hash_partition(table2, join_key2, temp_dir, 'right', columns_table2)

            output = ColumnarDbFile(f"hpj_{table1.table_name}_{table2.table_name}")

            # Clean up any existing files in the output directory to avoid reading old files
            if os.path.exists(output.base_file_name):
                for file_path in output.get_all_parquet_paths():
                    os.remove(file_path)

            if self.use_streaming:
                output.start_stream()
            try:
                for part_id in range(self.num_partitions):
                    left_path = self._make_partition_path(temp_dir, "left", part_id)
                    right_path = self._make_partition_path(temp_dir, "right", part_id)

                    if not (os.path.exists(left_path) and os.path.exists(right_path)):
                        continue

                    # BOTTLENECK: both partitions are read in full; paginating the
                    # reads would be better, raising num_partitions is the workaround
                    left_df = pq.read_table(left_path).to_pandas()
                    right_df = pq.read_table(right_path).to_pandas()

                    joined_rows = self._join_partition(left_df, right_df, join_key1, join_key2)
                    if not joined_rows:
                        continue

                    output.append_data(pd.DataFrame(joined_rows))
            finally:
                # Stop the stream even when a partition fails
                if self.use_streaming:
                    output.stop_stream()
        finally:
            # Best-effort cleanup; must not mask an error raised above
            shutil.rmtree(temp_dir, ignore_errors=True)

        return output

    def _join_partition(self, left_df, right_df, join_key1, join_key2):
        """
        Join two in-memory partitions and return the result rows as dicts.

        The frame with fewer rows (left on ties) is indexed by its join key and
        the other frame is scanned row by row. Each output dict holds the left
        columns first, then the right columns; for a column name present on
        both sides the right value is the one kept.
        """
        small_is_left = len(left_df) <= len(right_df)
        if small_is_left:
            small_df, big_df = left_df, right_df
            small_key, big_key = join_key1, join_key2
        else:
            small_df, big_df = right_df, left_df
            small_key, big_key = join_key2, join_key1

        # Build hash map from the smaller side, storing row positions only
        hash_map = {}
        for i in range(len(small_df)):
            hash_map.setdefault(small_df.iloc[i][small_key], []).append(i)

        # Probe with the larger side
        joined_rows = []
        for big_i in range(len(big_df)):
            big_row = big_df.iloc[big_i]
            matches = hash_map.get(big_row[big_key])
            if matches is None:
                continue
            for small_i in matches:
                small_row = small_df.iloc[small_i]
                if small_is_left:
                    l_row, r_row = small_row, big_row
                else:
                    l_row, r_row = big_row, small_row
                combined = {col: l_row[col] for col in left_df.columns}
                for col in right_df.columns:
                    combined[col] = r_row[col]
                joined_rows.append(combined)
        return joined_rows

    def _make_partition_path(self, output_dir, side, part_id):
        """Return the path of partition `part_id` for `side` ('left' or 'right')."""
        return f"{output_dir}/{side}_part{part_id}.parquet"

    def _hash_partition(self, table: "ColumnarDbFile", join_key, output_dir, side, columns=None):
        """Hash-partition `table` on `join_key` into `output_dir` for the given side."""
        make_partition_path_fn = partial(self._make_partition_path, output_dir, side)
        # Hand over a copy so the caller's column list can never be modified
        columns = list(columns) if columns else columns
        hash_partition(table, [join_key], self.num_partitions, self.parquet_batch_size,
                       HASHVALUE, make_partition_path_fn, columns=columns)

In [85]:
%%memit

# Benchmark configuration for this cell.
SIZE = "1GB" #["100MB", "1GB", "10GB"]
# Not referenced anywhere in this cell.
SAMPLE = 100
USE_STREAMING = True

# Look up the Songs and Listens tables of the chosen size.
songs_table = tables[f'Songs_{SIZE}']
listens_table = tables[f'Listens_{SIZE}']

# Select specific columns from each table
songs_cols = ['song_id', 'title']
listens_cols = ['listen_id', 'song_id', 'user_id']

# Create HashPartitionJoin instance
hpj1 = FastHashPartitionJoin(
    num_partitions=4, 
    parquet_batch_size=1000000,
    use_streaming=USE_STREAMING  
)

# Perform the join
result_songs_listens = hpj1.join(
    table1=songs_table,           
    table2=listens_table,         
    join_key1='song_id',          
    join_key2='song_id',          
    temp_dir='temp_songs_listens',
    columns_table1=songs_cols,    
    columns_table2=listens_cols   
)

# Load the joined table back; sample=None is passed explicitly.
result_df = result_songs_listens.retrieve_data(sample=None)

peak memory: 2734.20 MiB, increment: 119.58 MiB


In [45]:
USE_STREAMING = True
SIZE = "100MB"
# Optional: Verify your implementation against pd.merge
def test_hash_partition_join_comprehensive():
    """
    Check FastHashPartitionJoin against pd.merge on Songs JOIN Listens.

    Compares row count, column set, the set of song_ids and, after sorting
    both results the same way, every value of every column. Progress and the
    final verdict are printed; the return value is True only when every check
    passed.
    """
    rule = "=" * 70

    print(rule)
    print("Comprehensive Hash Partition Join Test")
    print(rule)

    # Test: Songs JOIN Listens - FULL DATA VALIDATION
    print("\n" + rule)
    print("Test: Songs JOIN Listens ")
    print(rule)

    songs = tables[f'Songs_{SIZE}']
    listens = tables[f'Listens_{SIZE}']
    songs_cols = ['song_id', 'title']
    listens_cols = ['listen_id', 'song_id', 'user_id']

    # Result under test
    joiner = FastHashPartitionJoin(num_partitions=4, parquet_batch_size=100_000, use_streaming=USE_STREAMING)
    joined_table = joiner.join(
        songs, listens,
        join_key1='song_id', join_key2='song_id',
        temp_dir='temp_test_songs_listens_comp',
        columns_table1=songs_cols,
        columns_table2=listens_cols
    )
    actual = joined_table.retrieve_data()

    # Reference result from pandas
    expected = pd.merge(
        songs.retrieve_data(columns=songs_cols),
        listens.retrieve_data(columns=listens_cols),
        on='song_id', how='inner'
    )

    print(f"\nHPJ result shape: {actual.shape}")
    print(f"pd.merge result shape: {expected.shape}")

    # Single verdict flag: there is only one test, so it doubles as the overall result
    passed = True

    # 1. Row count check
    if len(actual) == len(expected):
        print("Row counts match!")
    else:
        print(f"Row count mismatch -- HPJ: {len(actual)}, pd.merge: {len(expected)}")
        passed = False

    # 2. Column check
    actual_cols = set(actual.columns)
    expected_cols = set(expected.columns)
    if actual_cols == expected_cols:
        print("Columns match!")
    else:
        print(f"Column mismatch -- HPJ: {actual_cols}, pd.merge: {expected_cols}")
        passed = False

    # The value-level checks only make sense once shape and columns agree
    if passed:
        # 3. Bring both results into the same row order
        sort_cols = ['song_id']
        if 'listen_id' in actual.columns:
            sort_cols.append('listen_id')
        actual_sorted = actual.sort_values(sort_cols).reset_index(drop=True)
        expected_sorted = expected.sort_values(sort_cols).reset_index(drop=True)

        # 4. Check unique keys
        if set(actual['song_id'].unique()) == set(expected['song_id'].unique()):
            print("Unique song_ids match!")
        else:
            print("Unique song_ids differ!")
            passed = False

        # 5. FULL DATA VALUE COMPARISON - stop at the first column that differs
        print("\nPerforming full data value comparison...")
        bad_col = None
        for col in sorted(actual_cols):
            if not np.array_equal(actual_sorted[col].values, expected_sorted[col].values):
                bad_col = col
                break

        if bad_col is None:
            print("All data values match exactly!")
            print(f"Verified {len(actual_sorted)} rows × {len(actual_cols)} columns = {len(actual_sorted) * len(actual_cols)} values")
        else:
            print(f"Column '{bad_col}' data mismatch")

            actual_vals = actual_sorted[bad_col].values
            expected_vals = expected_sorted[bad_col].values
            diff_rows = np.where(actual_vals != expected_vals)[0]
            if len(diff_rows) > 0:
                first = diff_rows[0]
                print(f"  First mismatch at row {first}:")
                print(f"    HPJ: {actual_vals[first]}")
                print(f"    pd.merge: {expected_vals[first]}")
                print(f"  Total mismatches: {len(diff_rows)}")

            print("✗ Data values do NOT match!")
            passed = False

        # 6. Duplicate rows are reported only; they do not affect the verdict
        actual_dups = actual_sorted.duplicated().sum()
        expected_dups = expected_sorted.duplicated().sum()
        if actual_dups == expected_dups:
            print(f"Duplicate row counts match ({actual_dups} duplicates)")
        else:
            print(f"Duplicate row counts differ (HPJ: {actual_dups}, pd.merge: {expected_dups})")

    print("\n Test PASSED" if passed else "\n Test FAILED!")

    # Summary
    print("\n" + rule)
    print("COMPREHENSIVE TEST SUMMARY")
    print(rule)
    if passed:
        print("✓ ALL TESTS PASSED: Hash Partition Join is CORRECT!")
        print("  - Row counts match")
        print("  - Column structure matches")
        print("  - Unique keys match")
        print("  - ALL DATA VALUES match exactly")
    else:
        print("✗ TESTS FAILED: Implementation has issues")
    print(rule)

    return passed

test_hash_partition_join_comprehensive()

Comprehensive Hash Partition Join Test

Test: Songs JOIN Listens 

HPJ result shape: (1000000, 4)
pd.merge result shape: (1000000, 4)
Row counts match!
Columns match!
Unique song_ids match!

Performing full data value comparison...
All data values match exactly!
Verified 1000000 rows × 4 columns = 4000000 values
Duplicate row counts match (0 duplicates)

 Test PASSED

COMPREHENSIVE TEST SUMMARY
✓ ALL TESTS PASSED: Hash Partition Join is CORRECT!
  - Row counts match
  - Column structure matches
  - Unique keys match
  - ALL DATA VALUES match exactly


True

Implement `SortMergeJoin`:
1. Sort both tables by join key
2. Merge sorted sequences
3. Handle duplicates

In [None]:
BWAY_MERGE_FACTOR = 10


class SortMergeJoin:
    """
    Join two ColumnarDbFile tables by sorting each on its join key first.

    A table is sorted in memory when the table reports that it can be
    processed that way; otherwise it is cut into sorted runs on disk which are
    then merged in passes. The final join of the two sorted tables is
    delegated to pd.merge on fully loaded frames.
    """

    def __init__(
        self, bway_merge_factor: int = BWAY_MERGE_FACTOR, num_pages_per_split=1000
    ):
        # B: one output buffer plus B - 1 input runs merged at a time
        self.bway_merge_factor = bway_merge_factor
        # Number of row groups collected before a sorted run is written
        self.num_pages_per_split = num_pages_per_split

    def _flush_run(
        self,
        dfs: List[pd.DataFrame],
        join_key: str,
        output_dir: str,
        side: str,
        run_idx: int,
    ) -> str:
        """
        Sort the buffered chunks by `join_key` and write them as one run file.

        `dfs` is emptied in place so the caller can keep filling the same list.
        Returns the path of the run file, "<output_dir>/<side>_run_<run_idx>.parquet".
        """
        df_run = pd.concat(dfs, ignore_index=True)
        df_run_sorted = df_run.sort_values(by=join_key)

        run_file = os.path.join(output_dir, f"{side}_run_{run_idx}.parquet")
        # The shuffled pandas index carries no information; keep it out of the file
        df_run_sorted.to_parquet(run_file, index=False)

        dfs.clear()
        del df_run, df_run_sorted
        gc.collect()

        return run_file

    def _external_sort(
        self,
        table: "ColumnarDbFile",
        join_key: str,
        output_dir: str,
        side: str,
        columns: Optional[List[str]] = None,
    ) -> "ColumnarDbFile":
        """
        Sort `table` by `join_key` and return the sorted copy as a ColumnarDbFile.

        The sorted table is named "<side>_<table name>_sorted" and is created
        with `output_dir` as its file_dir. When `table.can_process_parquet`
        accepts the table's on-disk size the sort happens in memory; otherwise
        sorted runs of `num_pages_per_split` row groups are written to
        `output_dir` and merged with _merge_all_runs.
        """
        # Get table size (on disk)
        disk_usage = table.table_disk_usage()
        total_bytes = disk_usage["total_bytes"]

        if table.can_process_parquet(total_bytes):
            # Read everything and sort in RAM
            df = table.retrieve_data(columns=columns)
            df_sorted = df.sort_values(by=join_key).reset_index(drop=True)

            sorted_name = f"{side}_{table.table_name}_sorted"
            sorted_table = ColumnarDbFile(sorted_name, file_dir=output_dir)
            sorted_table.build_table(df_sorted)

            del df, df_sorted
            gc.collect()
            return sorted_table

        print("sorting table ", table.table_name, "with ", total_bytes, "bytes using external sort")
        print("GBs : ", total_bytes / (1024 * 1024 * 1024))
        # Parquet files that make up the table
        parquet_files = glob.glob(f"{table.base_file_name}/*.parquet")

        runs_path: List[str] = []
        run_idx = 0
        current_dfs: List[pd.DataFrame] = []
        current_row_groups = 0

        print(f"Sorting {len(parquet_files)} files")
        for file in parquet_files:
            pf = pq.ParquetFile(file)

            # A row group is the bounded unit of work ("page") for sorting
            for rg in range(pf.metadata.num_row_groups):
                current_dfs.append(pf.read_row_group(rg, columns=columns).to_pandas())
                current_row_groups += 1

                # Flush as soon as the run holds num_pages_per_split row groups
                if current_row_groups >= self.num_pages_per_split:
                    print("flushing run ", run_idx)
                    run_file = self._flush_run(
                        current_dfs, join_key, output_dir, side, run_idx
                    )
                    runs_path.append(run_file)
                    print(f"Flushed run {run_idx} at {run_file}")
                    run_idx += 1
                    current_row_groups = 0

        # Flush remaining partial run
        if current_dfs:
            run_file = self._flush_run(
                current_dfs, join_key, output_dir, side, run_idx
            )
            runs_path.append(run_file)

        # Create the wrapper first so we write where it will read
        sorted_table = ColumnarDbFile(
            table_name=f"{side}_{table.table_name}_sorted",
            file_dir=output_dir,
        )

        # Write the final merged file inside that directory, matching ColumnarDbFile
        final_sorted_path = os.path.join(
            sorted_table.base_file_name, f"{sorted_table.table_name}-0.parquet"
        )
        print("merging all runs into ", final_sorted_path)
        self._merge_all_runs(runs_path, final_sorted_path, join_key)

        return sorted_table

    def _merge_all_runs(self, sorted_files: List[str], output_file: str, join_key: str):
        """
        Merge multiple sorted Parquet files into a single sorted Parquet file.

        Runs are merged in passes of at most B - 1 files each (never fewer than
        two, otherwise a pass could not reduce the number of runs). Files
        produced by an earlier pass are deleted as soon as they have been
        merged, so `output_file` is the only file this method leaves in its
        directory. The caller's original run files are left in place, except
        when there is exactly one run, which is moved to `output_file`.

        Returns `output_file`. Raises ValueError when `sorted_files` is empty.
        """
        runs = list(sorted_files)
        if not runs:
            raise ValueError("cannot merge an empty list of sorted runs")

        # B - 1 input buffers + 1 output buffer; at least 2 inputs per merge
        fan_in = max(2, self.bway_merge_factor - 1)

        base_dir = os.path.dirname(output_file)
        if base_dir:
            os.makedirs(base_dir, exist_ok=True)

        intermediates = set()   # files created by earlier passes, still on disk
        pass_idx = 0

        try:
            while len(runs) > 1:
                print("merging pass ", pass_idx)
                # When everything fits in one merge, write straight to the target
                is_final_pass = len(runs) <= fan_in
                next_runs = []

                for start in range(0, len(runs), fan_in):
                    batch = runs[start:start + fan_in]

                    if is_final_pass:
                        merged_path = output_file
                    else:
                        merged_path = os.path.join(
                            base_dir,
                            f"bway_pass{pass_idx}_run{len(next_runs)}.parquet",
                        )
                        intermediates.add(merged_path)

                    self._bway_merge(batch, merged_path, join_key)

                    # Inputs that came from an earlier pass are no longer needed
                    for consumed in batch:
                        if consumed in intermediates:
                            os.remove(consumed)
                            intermediates.discard(consumed)

                    next_runs.append(merged_path)

                runs = next_runs
                pass_idx += 1
        finally:
            # After a failure, do not leave half-merged files next to the output
            for leftover in intermediates:
                if os.path.exists(leftover):
                    os.remove(leftover)

        # Only reached with a single run that was never merged: move it into place
        final_run = runs[0]
        if final_run != output_file:
            if os.path.exists(output_file):
                os.remove(output_file)
            shutil.move(final_run, output_file)

        return output_file

    def _bway_merge(self, sorted_files: List[str], output_file: str, join_key: str):
        """
        Merge a batch of sorted files into a single sorted file by join_key.

        All files of the batch are loaded into memory, concatenated and sorted
        again, so one batch has to fit in RAM.
        """
        dfs = [pd.read_parquet(path) for path in sorted_files]

        merged = pd.concat(dfs, ignore_index=True)
        merged_sorted = merged.sort_values(by=join_key)
        merged_sorted.to_parquet(output_file, index=False)
        print(sorted_files)

    def join(
        self,
        table1: "ColumnarDbFile",
        table2: "ColumnarDbFile",
        join_key1: str,
        join_key2: str,
        temp_dir: str = "temp",
        columns_table1: Optional[List[str]] = None,
        columns_table2: Optional[List[str]] = None,
    ) -> Optional["ColumnarDbFile"]:
        """
        Perform a sort-merge join between two ColumnarDbFile instances.

        Both tables are sorted on their join key into `temp_dir`, loaded back
        as DataFrames and inner-joined with pd.merge. The result is stored as
        the table "join_result" with `temp_dir` as its file_dir, so `temp_dir`
        is not removed here.
        """
        os.makedirs(temp_dir, exist_ok=True)

        # 1) sort both tables on their join key
        left_sorted = self._external_sort(
            table1, join_key1, temp_dir, "left", columns_table1
        )
        right_sorted = self._external_sort(
            table2, join_key2, temp_dir, "right", columns_table2
        )

        # 2) load sorted dataframes from the ColumnarDbFiles
        left_df = left_sorted.retrieve_data(columns=columns_table1)
        right_df = right_sorted.retrieve_data(columns=columns_table2)

        # 3) inner join
        joined_df = pd.merge(
            left_df,
            right_df,
            left_on=join_key1,
            right_on=join_key2,
            how="inner",
        )

        result_table = ColumnarDbFile("join_result", file_dir=temp_dir)
        result_table.build_table(joined_df)

        return result_table

In [None]:
# Look up the three tables; users_table is not used further down in this cell.
songs_table = tables['Songs']
users_table = tables['Users']
listens_table = tables['Listens']

# Sort-merge join with the default merge factor and run size
smj = SortMergeJoin()

# Example: join Songs with Listens on song_id
sorted_join_result = smj.join(
    songs_table,
    listens_table,
    join_key1="song_id",
    join_key2="song_id",
    temp_dir="temp_songs_listens",
    columns_table1= ["song_id", "title"],
    columns_table2= ["song_id", "user_id"]
)

# join() returns a ColumnarDbFile, so this shows the object, not its rows
display(sorted_join_result)


<__main__.ColumnarDbFile at 0x7b08d42781a0>

Implement GROUP BY after joins:
- Here you can use pandas' `DataFrame.groupby` or implement the aggregation manually

In [77]:
# GROUP BY s.song_id, s.title
class HashGroupbyAverageAndDistinct():
    """Hash-partitioned GROUP BY that computes AVG(col) and COUNT(DISTINCT col).

    The input table is first hash-partitioned on the group-by columns, so all
    rows of one group land in the same partition file. Each partition is then
    aggregated on its own with an in-memory hash table.

    Args:
        num_partitions: Number of hash partitions to split the input into.
        parquet_batch_size: Row batch size used when partitioning and when
            reading each partition file back.
        use_streaming: When True, the output table is written between
            `start_stream()` and `stop_stream()` calls on the output
            ColumnarDbFile.
    """

    def __init__(self, num_partitions, parquet_batch_size, use_streaming=False):
        self.num_partitions = num_partitions
        self.parquet_batch_size = parquet_batch_size
        self.use_streaming = use_streaming

    def _make_partition_path(self, temp_dir, part_id):
        """Return the parquet path used for partition `part_id` inside `temp_dir`."""
        return os.path.join(temp_dir, f"group_part{part_id}.parquet")

    def _aggregate_partition(self, part_path, groupby_cols, average_col, distinct_col):
        """Aggregate one partition file into {key tuple: [sum, count, distinct set]}.

        The file is read in batches of `self.parquet_batch_size` rows; each
        batch is pre-aggregated with pandas and folded into the running state.
        """
        # pandas returns scalar index labels when grouping by a single column
        # and tuples otherwise; normalise to tuples so keys are uniform.
        single_key = len(groupby_cols) == 1
        agg_map: Dict[tuple, list] = {}

        pf = pq.ParquetFile(part_path)
        for batch in pf.iter_batches(batch_size=self.parquet_batch_size):
            df = batch.to_pandas()

            grouped = df.groupby(groupby_cols).agg(
                sum_avg=(average_col, "sum"),
                cnt_avg=(average_col, "count"),  # SQL AVG ignores NULLs
                # SQL ignores NULL in COUNT DISTINCT
                distinct_set=(distinct_col, lambda s: set(s.dropna())),
            )

            for key, row in grouped.iterrows():
                key_tuple = (key,) if single_key else key
                state = agg_map.setdefault(key_tuple, [0.0, 0, set()])
                state[0] += row["sum_avg"]
                state[1] += row["cnt_avg"]
                state[2] |= row["distinct_set"]

        return agg_map

    def groupby_average_distinct(self,
                        table: ColumnarDbFile, 
                        groupby_cols: List[str],
                        average_col: str, 
                        average_col_name: str, 
                        distinct_col: str,
                        distinct_col_name: str,
                        select_cols: List[str], 
                        temp_dir='groupby_temp') -> ColumnarDbFile:
        """
        Perform:
            SELECT select_cols..., AVG(average_col) AS average_col_name,
                   COUNT(DISTINCT distinct_col) AS distinct_col_name
            FROM table
            GROUP BY groupby_cols...

        Hash partitioning on groupby_cols, then in-memory aggregation per partition.

        Args:
            table: Input table; partitioned via `hash_partition`.
            groupby_cols: Non-empty list of grouping columns.
            average_col: Column to average (NULLs are ignored).
            average_col_name: Output column name for the average.
            distinct_col: Column whose distinct non-NULL values are counted.
            distinct_col_name: Output column name for the distinct count.
            select_cols: Grouping columns to emit; must be a subset of groupby_cols.
            temp_dir: Scratch directory for partition files. It is deleted and
                recreated at the start of every call.

        Returns:
            A ColumnarDbFile named "<table.table_name>_groupby_avg" with one row
            per group. A group whose average_col values are all NULL gets NaN
            as its average (SQL would return NULL).

        Raises:
            ValueError: if groupby_cols is empty or select_cols is not a subset
                of groupby_cols.

        Assumptions:
        - Per-partition hash table fits in memory
        """
        if not groupby_cols:
            raise ValueError("groupby_cols must be non-empty")
        not_grouped = [col for col in select_cols if col not in groupby_cols]
        if not_grouped:
            raise ValueError(
                f"select_cols must be a subset of groupby_cols; not grouped: {not_grouped}"
            )

        # Position of every selected column inside the group key tuple
        # (loop-invariant, so computed once).
        key_position = {col: groupby_cols.index(col) for col in select_cols}

        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
        os.makedirs(temp_dir, exist_ok=True)

        # Order-preserving de-duplication: average_col and distinct_col may be
        # the same column, and a set() would make the column order vary
        # between runs.
        needed_columns = list(dict.fromkeys([*groupby_cols, average_col, distinct_col]))

        hash_partition(
            table=table,
            hash_keys=groupby_cols,  # hash on groupby columns (safe: same group always shares this)
            num_partitions=self.num_partitions,
            parquet_batch_size=self.parquet_batch_size,
            hash_value_fn=HASHVALUE,
            make_partition_path_fn=partial(self._make_partition_path, temp_dir),
            columns=needed_columns,
        )

        output_db = ColumnarDbFile(f"{table.table_name}_groupby_avg")

        if self.use_streaming:
            output_db.start_stream()
        try:
            for part_id in range(self.num_partitions):
                part_path = self._make_partition_path(temp_dir, part_id)
                # A partition with no file on disk has nothing to aggregate.
                if not os.path.exists(part_path):
                    continue

                agg_map = self._aggregate_partition(
                    part_path, groupby_cols, average_col, distinct_col
                )
                if not agg_map:
                    continue

                out_rows = []
                for key_tuple, (total, count, distinct_values) in agg_map.items():
                    row_dict = {col: key_tuple[pos] for col, pos in key_position.items()}
                    # count == 0 means every average_col value in the group was
                    # NULL; emit NaN instead of dividing by zero.
                    row_dict[average_col_name] = total / count if count else float("nan")
                    row_dict[distinct_col_name] = len(distinct_values)
                    out_rows.append(row_dict)

                output_db.append_data(pd.DataFrame(out_rows))
        finally:
            # Always close the stream, even if a partition fails mid-way.
            if self.use_streaming:
                output_db.stop_stream()

        return output_db

In [84]:
%%memit
# test implementation

# Dataset size to test against; selects the "Listens_<SIZE>" entry of `tables`.
SIZE = "1GB" #["100MB", "1GB", "10GB"]
SAMPLE = 100
USE_STREAMING = False

listens_table = tables[f'Listens_{SIZE}']

groupby_average_distinct = HashGroupbyAverageAndDistinct(
    num_partitions=4,
    parquet_batch_size=10000000,
    use_streaming=USE_STREAMING
)

# Per song: average user_id and number of distinct user_ids.
results = groupby_average_distinct.groupby_average_distinct(
    table=listens_table,
    groupby_cols=['song_id'],
    average_col='user_id',
    average_col_name='avg_user_id',
    distinct_col='user_id',
    distinct_col_name='distinct_user_id',
    select_cols=['song_id']
)
# Read back the group-by output computed above. The previous code read
# `result_songs_listens`, a table this cell does not produce.
result_df = results.retrieve_data(sample=None)

peak memory: 2980.83 MiB, increment: 722.27 MiB


# Section 4: Query Planning & Optimization

In this section, you'll implement smart query planning using metadata analysis. The key idea is to **avoid loading data unnecessarily** by:
1. Analyzing Parquet metadata first (row counts, column names, file sizes)
2. Making intelligent decisions about join order and algorithm selection
3. Loading only the columns you actually need for the query

In [None]:
def analyze_metadata_before_loading(file_paths):
    """Collect per-table Parquet statistics WITHOUT loading row data.

    Only the Parquet footer is read (via `pq.read_metadata`) plus the file
    size from the filesystem; `pd.read_parquet()` is never called.

    Args:
        file_paths: Either a mapping of table name -> Parquet file path
            (e.g. {'songs': ..., 'users': ..., 'listens': ...}), an iterable
            of Parquet file paths, or a single path. For bare paths the table
            name is the file name without its extension.

    Returns:
        dict mapping table name -> dict with keys:
            "path": the path as given,
            "num_rows": total row count from the footer,
            "num_columns": number of columns in the Parquet schema,
            "columns": list of top-level column names,
            "num_row_groups": number of row groups in the file,
            "file_size_bytes": on-disk size of the file.
        Empty input yields an empty dict.
    """
    # A single path would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    if hasattr(file_paths, "items"):
        named_paths = list(file_paths.items())
    else:
        named_paths = [
            (os.path.splitext(os.path.basename(os.fspath(path)))[0], path)
            for path in file_paths
        ]

    metadata = {}
    for table_name, path in named_paths:
        footer = pq.read_metadata(path)  # reads the footer only, no row data
        metadata[table_name] = {
            "path": path,
            "num_rows": footer.num_rows,
            "num_columns": footer.num_columns,
            "columns": list(footer.schema.to_arrow_schema().names),
            "num_row_groups": footer.num_row_groups,
            "file_size_bytes": os.path.getsize(path),
        }
    return metadata


def plan_query_execution(metadata, parsed_query):
    """YOUR TASK: turn table metadata into an execution plan.

    Not implemented yet; currently returns None for any input.

    Decisions the plan should eventually cover:
    - Join order: which table is smallest / largest, and whether
      smallest-first (or another strategy) is best.
    - Join algorithm: HPJ when the hash table fits in memory, else SMJ.
    - Column pruning: which columns of each table the query actually needs.
    """
    # TODO: derive join order, algorithm choice and per-table column lists
    # from `metadata` and `parsed_query`.
    return None  # Your implementation here


# After planning, load ONLY what you need:
# Example (you implement the actual logic):
# columns_needed = ['song_id', 'artist']  # From your planning
# df = pd.read_parquet('songs.parquet', columns=columns_needed)

In [None]:
class QueryPlanner:
    """Placeholder for the query planner; no behavior yet."""
    # Your implementation here


class QueryExecutor:
    """Runs the hard-coded Songs/Listens/Users query over pre-built tables.

    Args:
        tables: Mapping from "<TableName>_<size>" (e.g. "Listens_1GB") to the
            table object for that dataset size.
        num_partitions: Partition count intended for the partitioned operators.
        output_dir: Directory for intermediate/output files; created if missing.
        planner: Query planner to use; a default QueryPlanner is created when
            None is given.
        size: Dataset size suffix used to look tables up in `tables`.
    """

    def __init__(self, tables, num_partitions=8, output_dir="temp", planner=None, size="100MB"):
        self.tables = tables
        self.num_partitions = num_partitions
        self.output_dir = output_dir
        self.planner = planner if planner is not None else QueryPlanner()
        # Kept on the instance so lookups do not depend on a notebook-level
        # SIZE global.
        self.size = size
        os.makedirs(self.output_dir, exist_ok=True)

    def _join(self, left, right, algorithm, right_columns):
        """Join `left` with `right` using `algorithm` ("HPJ" or "SMJ").

        Raises:
            ValueError: for an unknown algorithm name.
            NotImplementedError: always, for now - both join paths are TODO.
        """
        if algorithm not in ("HPJ", "SMJ"):
            raise ValueError(f"Unknown join algorithm: {algorithm!r}")
        # TODO: dispatch to the hash-partition join (HPJ) or sort-merge join (SMJ).
        raise NotImplementedError(f"{algorithm} join step is not implemented yet")

    def execute_hardcoded_query(self):
        """
        Executes the following SQL query:

        SELECT s.song_id, AVG(u.age) AS avg_age,
        COUNT(DISTINCT l.user_id)
        FROM Songs s
        JOIN Listens l ON s.song_id = l.song_id
        JOIN Users u ON l.user_id = u.user_id
        GROUP BY s.song_id, s.title
        ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;

        Raises:
            KeyError: if a required "<Table>_<size>" entry is missing from
                `self.tables`.
            NotImplementedError: the join, GROUP BY and ORDER BY stages are
                not implemented yet.
        """
        # Hardcoded: columns each base table must contribute to the query.
        columns = {"Songs": ["song_id", "title"], "Listens": ["listen_id", "song_id", "user_id"], "Users": ["user_id", "age"]}

        # TODO: this should be specified by the query planner
        join_order = ["Listens", "Users", "Songs"]
        join_algorithm = ["HPJ", "HPJ"]

        # do joins: fold the remaining tables into the running result,
        # one join algorithm per step.
        result = self.tables[f"{join_order[0]}_{self.size}"]
        for table_name, algorithm in zip(join_order[1:], join_algorithm):
            table = self.tables[f"{table_name}_{self.size}"]
            result = self._join(result, table, algorithm, columns[table_name])

        # do group by
        groupby_plan = {
            "avg_col": "age",
            "avg_col_name": "avg_age",
            "distinct_col": "user_id",
            "distinct_col_name": "distinct_user_id",
            "select_cols": ["song_id"],
        }

        # TODO: group `result` according to groupby_plan, then sort by the
        # distinct count (descending) and song_id.
        raise NotImplementedError(
            f"GROUP BY / ORDER BY stages are not implemented yet (planned: {groupby_plan})"
        )

# Section 5: Performance Benchmarking

In [None]:
def benchmark_query(executor, dataset_size):
    """Benchmark the query execution time and memory usage.

    Args:
        executor: Object exposing `execute_hardcoded_query()`.
        dataset_size: Label of the dataset being benchmarked; only used in the
            printed header.

    Returns:
        Whatever `executor.execute_hardcoded_query()` returns.

    Notes:
        - Elapsed time uses `time.perf_counter()`, a monotonic clock, so it is
          not affected by system clock adjustments during the run.
        - "Memory Usage" is the change in this process's resident set size
          between start and end, not the peak during execution.
    """
    print(f"\nBenchmarking with {dataset_size} dataset...")
    process = psutil.Process(os.getpid())
    bytes_per_mb = 1024 * 1024

    start_mem = process.memory_info().rss / bytes_per_mb
    start_time = time.perf_counter()

    result = executor.execute_hardcoded_query()

    elapsed = time.perf_counter() - start_time
    end_mem = process.memory_info().rss / bytes_per_mb

    print(f"Execution Time: {elapsed:.2f} seconds")
    print(f"Memory Usage: {end_mem - start_mem:.2f} MB")
    return result

## 100MB Benchmark

In [None]:
# Your implementation here

## 1GB Benchmark

In [None]:
# Your implementation here

## Performance Analysis

In [None]:
# baselines: https://edstem.org/us/courses/87394/discussion/7276409
# Your implementation here