<a href="https://colab.research.google.com/github/berkyalcinkaya/cs145-project2-systems/blob/main/cs145_project2_systems_template_fa2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Collaborators

1.   Berk Yalcinkaya
2.   Nick Allen


# Setup

In [1]:
import pandas as pd
import os
import uuid
import argparse
import time
import psutil
import heapq
import pyarrow as pa
import pyarrow.parquet as pq
import random
import string
import numpy as np
from typing import List, Optional
import shutil
import glob
import gc
from IPython.display import display
import tempfile
from pathlib import Path

# Section 0: Generate Test Data

This section has already been implemented for you.

In [None]:
import gc

def generate_songs_chunk(start, size, string_length=100):
    """Build one chunk of the Songs table as a DataFrame.

    Rows carry consecutive ``song_id`` values ``start .. start + size - 1``,
    a matching ``Song_<id>`` title, and ten filler columns
    (``extra_col_1`` .. ``extra_col_10``), each a rotation of the same batch
    of strings returned by ``generate_base_strings``.
    """
    song_ids = range(start, start + size)
    columns = {
        "song_id": song_ids,
        "title": [f"Song_{song_id}" for song_id in song_ids],
    }
    filler = generate_base_strings(size, string_length)
    # One batch of strings is reused for every filler column, shifted by the
    # column number.
    columns.update(
        (f"extra_col_{shift}", np.roll(filler, shift=shift))
        for shift in range(1, 11)
    )
    return pd.DataFrame(columns)


def generate_users_chunk(start, size, string_length=100):
    """Build one chunk of the Users table as a DataFrame.

    ``user_id`` runs over ``start .. start + size - 1`` and ``age`` is derived
    from the id as ``18 + (user_id % 60)``. Ten filler columns
    (``extra_col_1`` .. ``extra_col_10``) are rotations of one batch of
    strings returned by ``generate_base_strings``.
    """
    user_ids = range(start, start + size)
    frame = {
        "user_id": user_ids,
        "age": [18 + (user_id % 60) for user_id in user_ids],
    }
    filler = generate_base_strings(size, string_length)
    for shift in range(1, 11):
        frame[f"extra_col_{shift}"] = np.roll(filler, shift=shift)
    return pd.DataFrame(frame)


def generate_listens_chunk(start, size, num_users, num_songs, string_length=16):
    """Build one chunk of the Listens table as a DataFrame.

    ``listen_id`` runs over ``start .. start + size - 1``. ``user_id`` and
    ``song_id`` are drawn uniformly from ``[0, num_users)`` and
    ``[0, num_songs)`` with NumPy's global RNG. Ten filler columns
    (``extra_col_1`` .. ``extra_col_10``) are rotations of one batch of
    strings returned by ``generate_base_strings``.
    """
    # The draws happen in the order user ids, song ids, filler strings.
    user_ids = np.random.randint(0, num_users, size=size)
    song_ids = np.random.randint(0, num_songs, size=size)
    filler = generate_base_strings(size, string_length)

    frame = {
        "listen_id": range(start, start + size),
        "user_id": user_ids,
        "song_id": song_ids,
    }
    frame.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(frame)


def generate_base_strings(num_records, string_length):
    """Return an array of ``num_records`` random strings over the alphabet {a, b}.

    Each string has ``string_length`` characters; the characters are chosen
    with a single ``np.random.randint`` call on NumPy's global RNG.
    """
    alphabet = np.array(list("ab"))
    picks = np.random.randint(0, len(alphabet), size=(num_records, string_length))
    # Index the alphabet with the random grid, then glue each row into a string.
    return np.array(["".join(row) for row in alphabet[picks]])


def _write_parquet_streamed(
    filename,
    total_rows,
    make_chunk_fn,
    chunk_size=250_000,
    compression="snappy",
):
    """
    Stream DataFrame chunks to a single Parquet file with one ParquetWriter.

    Parameters:
    - filename: path of the Parquet file to create
    - total_rows: total number of rows to write
    - make_chunk_fn: callable ``(start, size) -> pd.DataFrame`` producing the
      rows ``start .. start + size - 1``
    - chunk_size: maximum number of rows generated and written per chunk
    - compression: Parquet compression codec passed to the writer

    The schema is inferred from the first chunk and applied to every later
    chunk so all row groups agree. The writer is closed even if generating
    or writing a chunk fails.

    Raises:
    - ValueError: if ``chunk_size`` is not positive (the write loop could
      never make progress)
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    first_chunk = make_chunk_fn(0, min(chunk_size, total_rows))
    first_table = pa.Table.from_pandas(first_chunk, preserve_index=False)
    schema = first_table.schema
    written = len(first_chunk)
    del first_chunk

    writer = pq.ParquetWriter(filename, schema, compression=compression)
    try:
        writer.write_table(first_table)
        del first_table
        gc.collect()

        while written < total_rows:
            take = min(chunk_size, total_rows - written)
            chunk_df = make_chunk_fn(written, take)
            # Reuse the first chunk's schema instead of re-inferring it.
            writer.write_table(
                pa.Table.from_pandas(chunk_df, schema=schema, preserve_index=False)
            )
            written += take
            # Drop the chunk before building the next one to limit peak memory.
            del chunk_df
            gc.collect()
    finally:
        writer.close()


def generate_test_data(target_size="100MB"):
    """
    Write the Songs, Users and Listens Parquet files for one dataset size.

    Row counts per target size (sizes are the intended compressed totals):
        100MB: Songs 10K rows, Users 50K rows, Listens 1M rows
        1GB:   Songs 100K rows, Users 500K rows, Listens 10M rows

    Files are written to the working directory as
    ``songs_<size>.parquet``, ``users_<size>.parquet`` and
    ``listens_<size>.parquet``.

    Listens draws its ``user_id`` / ``song_id`` values from
    ``[0, num_users)`` / ``[0, num_songs)``, the id ranges written to Users
    and Songs, so every foreign key refers to an existing row.
    """
    assert target_size in ["100MB", "1GB"]

    # (num_songs, num_users, num_listens) for each supported size.
    row_counts = {
        "100MB": (10_000, 50_000, 1_000_000),
        "1GB": (100_000, 500_000, 10_000_000),
    }
    num_songs, num_users, num_listens = row_counts[target_size]

    # Chunk sizes are the same for both dataset sizes.
    songs_chunk, users_chunk, listens_chunk = 10_000, 50_000, 1_000_000

    def make_listens(start, size):
        return generate_listens_chunk(start, size, num_users, num_songs)

    # (label, file stem, total rows, chunk factory, chunk size)
    jobs = (
        ("Songs", "songs", num_songs, generate_songs_chunk, songs_chunk),
        ("Users", "users", num_users, generate_users_chunk, users_chunk),
        ("Listens", "listens", num_listens, make_listens, listens_chunk),
    )
    for label, stem, total, make_chunk, chunk in jobs:
        print(f"Writing {label}")
        _write_parquet_streamed(
            filename=f"{stem}_{target_size}.parquet",
            total_rows=total,
            make_chunk_fn=make_chunk,
            chunk_size=chunk,
        )

    print("Done!")

In [3]:
# Seed both RNGs: the chunk generators draw from NumPy's global RNG
# (np.random.randint), which random.seed() alone does not affect.
random.seed(0)
np.random.seed(0)

generate_test_data('100MB')
generate_test_data('1GB')

Writing Songs
Writing Users
Writing Listens
Done!
Writing Songs
Writing Users
Writing Listens
Done!


# Section 1: Parquet-based Columnar Storage

Implement Parquet-based storage for the tables
- For simplicity, store all data for a table in a single Parquet file and use a single DataFrame object as a buffer

In [2]:
# see ed: https://edstem.org/us/courses/87394/discussion/7251811 for advice on writing to a parquet without loading existing into RAM
# a ColumnarDbFile is actually a directory with an arbitrary number of parquet files inside
# Append writes a new file with the next postfix
# Retrieve reads all parquet files and concatenates them together, done natively by pandas
class ColumnarDbFile:
    """A table stored as a directory of Parquet part files.

    The directory is ``<file_dir>/<file_pfx>_<table_name>`` and is created on
    construction. Part files are named ``<table_name>-<n>.parquet``.
    ``build_table`` writes part 0, ``append_data`` writes a new part, and
    ``retrieve_data`` reads the whole directory through pandas.
    """

    def __init__(self, table_name, file_dir='data', file_pfx=''):
        self.file_pfx = file_pfx
        self.table_name = table_name
        self.file_dir = file_dir
        # Directory that holds every part file of this table.
        self.base_file_name = f"{self.file_dir}/{self.file_pfx}_{self.table_name}"
        os.makedirs(self.base_file_name, exist_ok=True)

    def _part_path(self, index):
        """Return the path of part file number ``index``."""
        return f"{self.base_file_name}/{self.table_name}-{index}.parquet"

    def build_table(self, data):
        """Build and save table data to Parquet (written as part 0)."""
        data.to_parquet(self._part_path(0))

    def get_new_parquet_file(self):
        '''Return a path for a new part file that does not exist yet.'''
        # Start from the current file count, but skip names already taken:
        # if an earlier part was removed, the count alone would point at an
        # existing file and the next write would overwrite it.
        index = self._get_num_parquets()
        while os.path.exists(self._part_path(index)):
            index += 1
        return self._part_path(index)

    def retrieve_data(self, columns=None):
        """Create pd.DataFrame by reading every part file in the directory.

        ``columns`` optionally restricts the columns that are read.
        """
        return pd.read_parquet(self.base_file_name, columns=columns)

    def append_data(self, data):
        """Append new data to the table by writing it as a new part file."""
        data.to_parquet(self.get_new_parquet_file())

    def _get_num_parquets(self):
        """Count the ``*.parquet`` files currently in the table directory."""
        return len(glob.glob(f"{self.base_file_name}/*.parquet"))

In [4]:
print("Building tables...")
# Remove any previous build so old part files cannot mix with the new tables.
if os.path.exists('data'):
    shutil.rmtree('data')

# Constructing a ColumnarDbFile creates its directory under 'data'.
tables = {
    table_name: ColumnarDbFile(table_name, file_dir='data')
    for table_name in ('Songs', 'Users', 'Listens')
}

# Load the generated source files for the chosen dataset size.
size = "100MB"
songs_data = pd.read_parquet(f'songs_{size}.parquet')
users_data = pd.read_parquet(f'users_{size}.parquet')
listens_data = pd.read_parquet(f'listens_{size}.parquet')

# Write each DataFrame as the first part file of its table.
tables['Songs'].build_table(songs_data)
tables['Users'].build_table(users_data)
tables['Listens'].build_table(listens_data)
print("Tables built successfully.")

Building tables...
Tables built successfully.


In [5]:
# Retrieve only the song_id and title columns of Songs; the notebook displays
# the returned DataFrame because it is the last expression of the cell.
tables['Songs'].retrieve_data(columns = ['song_id', 'title'])

Unnamed: 0,song_id,title
0,0,Song_0
1,1,Song_1
2,2,Song_2
3,3,Song_3
4,4,Song_4
...,...,...
9995,9995,Song_9995
9996,9996,Song_9996
9997,9997,Song_9997
9998,9998,Song_9998


In [6]:
# Retrieve only the three id columns of Listens (listen_id, user_id, song_id).
tables['Listens'].retrieve_data(columns = ['listen_id', 'user_id', 'song_id'])

Unnamed: 0,listen_id,user_id,song_id
0,0,34466,442
1,1,648,4282
2,2,10186,2750
3,3,9396,6732
4,4,35402,7472
...,...,...,...
999995,999995,39479,4803
999996,999996,24831,7059
999997,999997,29334,4934
999998,999998,4672,6217


Analyze and report on:
- Space efficiency compared to row storage
  - e.g. Compare file sizes on disk: How much disk space does Parquet use vs. a row storage format like CSV?
- Compression ratios achieved with Parquet
  - e.g. Compare Parquet’s uncompressed encoded size (reported in its metadata) to its compressed on-disk size to compute compression ratios.
  - You could also report the memory expansion factor: how much larger the dataset becomes when loaded into a `pd.DataFrame` compared to the compressed file size.
- Read/write performance characteristics
  - e.g. Read performance: How long does it take to read all columns from Parquet vs. CSV?
  - e.g. Columnar advantage: How long does it take to read selective columns from Parquet vs. reading all columns?
  - e.g. Write performance: How long does it take to write data to Parquet vs. CSV?

In [8]:
def analyze(size="100MB"):
    """Measure storage size, compression and read/write timings per table.

    For each of the Songs, Users and Listens source files of the given
    ``size`` this records: Parquet vs. CSV size on disk, the ratio of
    Parquet's uncompressed size (from its metadata) to its on-disk size, the
    ratio of in-memory DataFrame size to on-disk size, and wall-clock times
    for writing and reading both formats (all columns and a column subset).

    Returns a DataFrame with one row per table.
    """

    def _timed(action):
        """Run ``action()`` and return the elapsed wall-clock seconds."""
        began = time.perf_counter()
        action()
        return time.perf_counter() - began

    sources = {
        "Songs": f"songs_{size}.parquet",
        "Users": f"users_{size}.parquet",
        "Listens": f"listens_{size}.parquet",
    }

    report_rows = []

    for table_name, parquet_file in sources.items():
        parquet_path = Path(parquet_file)

        df = pd.read_parquet(parquet_path)
        mem_usage_bytes = df.memory_usage(deep=True).sum()  # in-memory footprint
        parquet_size_bytes = parquet_path.stat().st_size  # compressed size on disk

        # Sum the uncompressed size of every column chunk in every row group.
        metadata = pq.ParquetFile(parquet_path).metadata
        chunk_sizes = (
            metadata.row_group(rg_idx).column(col_idx).total_uncompressed_size
            for rg_idx in range(metadata.num_row_groups)
            for col_idx in range(metadata.row_group(rg_idx).num_columns)
        )
        uncompressed_bytes = sum(
            chunk_size for chunk_size in chunk_sizes if chunk_size is not None
        )

        compression_ratio = uncompressed_bytes / parquet_size_bytes
        memory_expansion = mem_usage_bytes / parquet_size_bytes

        # The "subset" read uses every column except the last one.
        subset_columns = list(df.columns)[:-1]

        with tempfile.TemporaryDirectory() as tmpdir:
            scratch = Path(tmpdir)
            csv_path = scratch / f"{parquet_path.stem}.csv"
            parquet_tmp_path = scratch / f"{parquet_path.stem}.parquet"

            write_csv_time = _timed(lambda: df.to_csv(csv_path, index=False))
            csv_size_bytes = csv_path.stat().st_size

            write_parquet_time = _timed(
                lambda: df.to_parquet(parquet_tmp_path, index=False)
            )

            read_parquet_all = _timed(lambda: pd.read_parquet(parquet_path))
            read_csv_all = _timed(lambda: pd.read_csv(csv_path))
            read_parquet_subset = _timed(
                lambda: pd.read_parquet(parquet_path, columns=subset_columns)
            )
            read_csv_subset = _timed(
                lambda: pd.read_csv(csv_path, usecols=subset_columns)
            )

        size_saving_pct = 100.0 * (1 - parquet_size_bytes / csv_size_bytes)

        report_rows.append(
            {
                "table": table_name,
                "parquet_size_mb": parquet_size_bytes / (1024 ** 2),
                "csv_size_mb": csv_size_bytes / (1024 ** 2),
                "size_saving_pct": size_saving_pct,
                "compression_ratio": compression_ratio,
                "memory_expansion": memory_expansion,
                "read_parquet_all_s": read_parquet_all,
                "read_csv_all_s": read_csv_all,
                "read_parquet_subset_s": read_parquet_subset,
                "read_csv_subset_s": read_csv_subset,
                "write_parquet_s": write_parquet_time,
                "write_csv_s": write_csv_time,
            }
        )

        # Release this table before loading the next one.
        del df
        gc.collect()

    summary = pd.DataFrame(report_rows)
    print("Analysis Summary for Tables of Size " + size + " (sizes in MB, times in seconds):")
    return summary

In [9]:
# Run the analysis on the 100MB source files and render the summary DataFrame.
display(analyze(size="100MB"))

Analysis Summary for Tables of Size 100MB (sizes in MB, times in seconds):


Unnamed: 0,table,parquet_size_mb,csv_size_mb,size_saving_pct,compression_ratio,memory_expansion,read_parquet_all_s,read_csv_all_s,read_parquet_subset_s,read_csv_subset_s,write_parquet_s,write_csv_s
0,Songs,4.270176,9.773173,56.307171,2.416896,3.474854,0.062576,0.169019,0.053261,0.150658,0.067855,0.355316
1,Users,20.345389,48.579238,58.119167,2.471677,3.529635,0.386665,0.890072,0.289729,0.839951,0.345185,2.673603
2,Listens,79.988129,178.867496,55.280792,2.432883,8.0359,3.091684,6.910288,3.648525,6.145732,3.289582,12.617512


In [10]:
# display(analyze(size="1GB"))

# Section 2: Parse SQL Query

In this section, you should implement logic to parse the following SQL query:
```sql
    SELECT s.song_id, AVG(u.age) AS avg_age,
       COUNT(DISTINCT l.user_id) AS count_distinct_users
    FROM Songs s
    JOIN Listens l ON s.song_id = l.song_id
    JOIN Users u ON l.user_id = u.user_id
    GROUP BY s.song_id, s.title
    ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
```

You should manually extract the components from the provided query (i.e. you don't need to implement a general SQL parser, just handle this specific query).

In [11]:
# SQL text handed to parse_sql below.
# NOTE(review): the COUNT(DISTINCT l.user_id) item carries no AS alias here,
# unlike the query shown in the section text — confirm this is intended.
query = """SELECT s.song_id, AVG(u.age) AS avg_age,
COUNT(DISTINCT l.user_id)
FROM Songs s
JOIN Listens l ON s.song_id = l.song_id
JOIN Users u ON l.user_id = u.user_id
GROUP BY s.song_id, s.title
ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
"""

In [None]:
import re
import re

def parse_tables(query):
    """Map each single-letter table alias to its (lowercased) table name.

    Scans the query for ``FROM <table> <alias>`` and ``JOIN <table> <alias>``.
    The query is lowercased here, like in the other parse_* helpers, so raw
    SQL text can be passed directly.

    Example: {'s': 'songs', 'l': 'listens', 'u': 'users'}
    """
    # pattern matches: "from songs s" or "join listens l"
    pattern = r"(from|join)\s+([a-z_]+)\s+([a-z])"
    matches = re.findall(pattern, query.lower())
    return {alias: table_name for _, table_name, alias in matches}

def parse_joins(query):
    """Extract the base table and the ordered list of JOIN clauses.

    The query is lowercased here, like in the other parse_* helpers, so raw
    SQL text can be passed directly.

    Returns a dict with:
      "base_table": (alias, table_name) taken from the FROM clause
      "Joins": one dict per ``JOIN <table> <alias> ON a.col = b.col`` clause,
               in query order, holding the joined table and both sides of
               the equality condition

    Raises:
      ValueError: if no FROM clause is found
    """
    q = query.lower()

    # 1) Base table from the FROM clause
    base_match = re.search(r"from\s+([a-z_]+)\s+([a-z])", q)
    if not base_match:
        raise ValueError("Could not find FROM clause")
    base_table = (base_match.group(2), base_match.group(1))

    # 2) Each JOIN clause, in order. Pattern matches:
    #   join listens l on s.song_id = l.song_id
    join_pattern = (
        r"join\s+([a-z_]+)\s+([a-z])\s+on\s+"
        r"([a-z])\.([a-z_]+)\s*=\s*([a-z])\.([a-z_]+)"
    )

    joins = [
        {
            "joined_table_alias": m.group(2),
            "joined_table_name": m.group(1),
            "left_alias": m.group(3),
            "left_column": m.group(4),
            "right_alias": m.group(5),
            "right_column": m.group(6),
        }
        for m in re.finditer(join_pattern, q)
    ]

    return {"base_table" : base_table, "Joins" : joins}


def parse_group_by(query):
    """
    Return the GROUP BY columns as (alias, column) tuples, in query order.

    The clause text is everything between GROUP BY and the next ORDER BY,
    semicolon, or end of the query; each comma-separated entry must have the
    form ``alias.column``. An empty list is returned when there is no
    GROUP BY clause.
    Example: [('s', 'song_id'), ('s', 'title')]
    """
    lowered = query.lower()
    clause = re.search(r"group\s+by\s+(.+?)(order\s+by|;|$)", lowered, re.DOTALL)
    if clause is None:
        return []

    # Unpacking into exactly two names rejects entries that are not alias.column.
    pieces = (
        entry.strip().split(".") for entry in clause.group(1).strip().split(",")
    )
    return [(alias, column) for alias, column in pieces]

def parse_select_and_aggregations(query):
    """
    Parse the SELECT list into select items and an aggregation table.

    Returns ``(select_list, aggregations)``:
      aggregations: {agg_key: {"func", "source", "distinct", "output_name"}}
                    with agg_key counting up from 1 in SELECT order
      select_list:  one dict per SELECT item; plain ``alias.column`` items
                    have kind "column", AVG(...) / COUNT(DISTINCT ...) items
                    have kind "aggregation" and point at their agg_key

    Returns ``([], {})`` when no SELECT ... FROM span is found. Raises
    ValueError for an ``avg(`` / ``count(`` item that does not match the
    supported form.
    """
    lowered = query.lower()

    found = re.search(r"select\s+(.+?)\s+from", lowered, re.DOTALL)
    if found is None:
        return [], {}

    # (item prefix, full pattern, func name, distinct flag, label for errors)
    agg_specs = (
        (
            "avg(",
            r"avg\(\s*([a-z])\.([a-z_]+)\s*\)(\s+as\s+([a-z_]+))?",
            "avg",
            False,
            "AVG",
        ),
        (
            "count(",
            r"count\(\s*distinct\s+([a-z])\.([a-z_]+)\s*\)(\s+as\s+([a-z_]+))?",
            "count",
            True,
            "COUNT",
        ),
    )

    select_list = []
    aggregations = {}
    next_key = 1

    for piece in found.group(1).strip().split(","):
        item = piece.strip()
        if not item:
            continue

        spec = next((s for s in agg_specs if item.startswith(s[0])), None)

        # Plain column: alias.column
        if spec is None:
            alias_letter, col_name = item.split(".")
            select_list.append(
                {
                    "kind": "column",
                    "source": (alias_letter, col_name),
                    "alias": None,
                }
            )
            continue

        _, pattern, func, distinct, label = spec
        parsed = re.match(pattern, item)
        if parsed is None:
            raise ValueError(f"Could not parse {label} aggregation: {item}")

        # Group 4 is the optional "AS <name>" output alias.
        out_alias = parsed.group(4) or None
        aggregations[next_key] = {
            "func": func,
            "source": (parsed.group(1), parsed.group(2)),
            "distinct": distinct,
            "output_name": out_alias,
        }
        select_list.append(
            {
                "kind": "aggregation",
                "agg_key": next_key,
                "alias": out_alias,
            }
        )
        next_key += 1

    return select_list, aggregations


def parse_order_by(query, aggregations):
    """
    Parse the ORDER BY clause into a list of sort keys, in query order.

    Each entry has a "direction" ("asc" unless the item ends in " desc") and
    is either kind "column" with a ``(alias, column)`` source, or kind
    "aggregation" with the ``agg_key`` of the matching COUNT(DISTINCT ...)
    entry in ``aggregations``.

    Returns [] when there is no ORDER BY clause. Raises ValueError when a
    ``count(`` item is malformed or has no matching aggregation.
    """
    lowered = query.lower()

    clause = re.search(r"order\s+by\s+(.+?)(;|$)", lowered, re.DOTALL)
    if clause is None:
        return []

    order_by = []
    for piece in clause.group(1).strip().split(","):
        expr = piece.strip()
        if not expr:
            continue

        # Peel off an explicit direction keyword, defaulting to ascending.
        direction = "asc"
        for suffix in (" desc", " asc"):
            if expr.endswith(suffix):
                direction = suffix.strip()
                expr = expr[: -len(suffix)].strip()
                break

        if not expr.startswith("count("):
            # assume plain column: alias.column
            alias_letter, col_name = expr.split(".")
            order_by.append(
                {
                    "kind": "column",
                    "source": (alias_letter, col_name),
                    "direction": direction,
                }
            )
            continue

        # COUNT(DISTINCT ...) → look up the aggregation it refers to
        parsed = re.match(r"count\(\s*distinct\s+([a-z])\.([a-z_]+)\s*\)", expr)
        if parsed is None:
            raise ValueError(f"Could not parse ORDER BY aggregation: {expr}")
        src = (parsed.group(1), parsed.group(2))

        agg_key = next(
            (
                key
                for key, agg in aggregations.items()
                if agg["func"] == "count" and agg["distinct"] and agg["source"] == src
            ),
            None,
        )
        if agg_key is None:
            raise ValueError(f"No matching aggregation found for ORDER BY expr: {expr}")

        order_by.append(
            {
                "kind": "aggregation",
                "agg_key": agg_key,
                "direction": direction,
            }
        )

    return order_by

def parse_sql(query):
    """
    Parse the project's SQL query into a dictionary of its components.

    The query is lowercased once and handed to the individual parsers. The
    result has the keys "tables", "joins", "GroupBy", "select",
    "aggregations" and "orderBy", in that order.
    """
    lowered = query.lower()

    plan = {
        "tables": parse_tables(lowered),
        "joins": parse_joins(lowered),
        "GroupBy": parse_group_by(lowered),
    }
    select_items, aggregations = parse_select_and_aggregations(lowered)
    plan.update(select=select_items, aggregations=aggregations)
    # ORDER BY items may refer to aggregations, so it is parsed last.
    plan["orderBy"] = parse_order_by(lowered, aggregations)

    return plan

In [23]:
# Parse the query and print each component of the result on its own line.
output = parse_sql(query)
for key, value in output.items():
    print(f"{key}: {value}")

tables: {'s': 'songs', 'l': 'listens', 'u': 'users'}
joins: {'base_table': ('s', 'songs'), 'Joins': [{'joined_table_alias': 'l', 'joined_table_name': 'listens', 'left_alias': 's', 'left_column': 'song_id', 'right_alias': 'l', 'right_column': 'song_id'}, {'joined_table_alias': 'u', 'joined_table_name': 'users', 'left_alias': 'l', 'left_column': 'user_id', 'right_alias': 'u', 'right_column': 'user_id'}]}
GroupBy: [('s', 'song_id'), ('s', 'title')]
select: [{'kind': 'column', 'source': ('s', 'song_id'), 'alias': None}, {'kind': 'aggregation', 'agg_key': 1, 'alias': 'avg_age'}, {'kind': 'aggregation', 'agg_key': 2, 'alias': None}]
aggregations: {1: {'func': 'avg', 'source': ('u', 'age'), 'distinct': False, 'output_name': 'avg_age'}, 2: {'func': 'count', 'source': ('l', 'user_id'), 'distinct': True, 'output_name': None}}
orderBy: [{'kind': 'aggregation', 'agg_key': 2, 'direction': 'desc'}, {'kind': 'column', 'source': ('s', 'song_id'), 'direction': 'asc'}]


# Section 3: Implement Join Algorithms

In this section, you will implement the execution operators (*how* to join) and aggregation after joins.

**Reminder:** If you use temporary files or folders, you should clean them up either as part of your join logic, or after each run. Otherwise you might run into correctness issues!

In [3]:
import hashlib

def HASHVALUE(value, B):
    """Map ``value`` to a bucket number in ``[0, B)``.

    Integers (Python ``int`` and NumPy integer scalars alike) are bucketed by
    ``hash(int(value)) % B`` so that equal keys land in the same bucket
    regardless of which integer type carries them. Any other value is
    bucketed by the SHA-256 digest of its ``str()`` form, which is stable
    across processes.
    """
    if isinstance(value, (int, np.integer)):
        # Normalise NumPy scalars to int first; otherwise np.int64(7) would
        # take the string path and could disagree with plain 7.
        return hash(int(value)) % B
    digest = hashlib.sha256(str(value).encode("utf-8")).hexdigest()
    return int(digest, 16) % B

Implement `HashPartitionJoin`:
1. Hash partition both tables
2. Build hash table from smaller partition
3. Probe with larger partition
4. Return joined results

In [7]:
# see ed: https://edstem.org/us/courses/87394/discussion/7151010 for discussion on this implementation
class HashPartitionJoin:
    """Partitioned hash join between two ColumnarDbFile tables.

    Both inputs are first split into ``num_partitions`` Parquet files by the
    hash of their join key. Each pair of matching partitions is then joined
    with an in-memory hash table built from the smaller side and probed with
    the larger side.
    """

    def __init__(self, num_partitions=4, parquet_batch_size=1000):
        self.num_partitions = num_partitions
        # Number of rows read per batch while partitioning an input table.
        self.parquet_batch_size = parquet_batch_size

    def join(self, table1: "ColumnarDbFile", table2: "ColumnarDbFile", join_key1, join_key2,
             temp_dir='temp', columns_table1=None, columns_table2=None):
        """
        Perform a hash partition join between two ColumnarDbFile instances.

        Parameters:
        - table1: Left table (ColumnarDbFile)
        - table2: Right table (ColumnarDbFile)
        - join_key1: Join key from table1
        - join_key2: Join key from table2
        - temp_dir: Directory to store temporary partition files; it is
          removed before returning, also when the join fails
        - columns_table1: List of columns to select from table1 (the join key
          is always carried along); None/empty selects all columns
        - columns_table2: List of columns to select from table2 (same rules)

        Returns:
        - join_result_table: ColumnarDbFile instance containing the join
          results. Results left by an earlier run of the same join are
          removed first, so re-running does not duplicate rows. When a
          column name exists on both sides, the right table's value is kept.
        """
        os.makedirs(temp_dir, exist_ok=True)
        join_writer = None

        try:
            # Partition files left behind by an earlier, interrupted run must
            # not be mistaken for partitions of this run.
            for side in ("left", "right"):
                for part_id in range(self.num_partitions):
                    stale = self._make_partition_path(temp_dir, side, part_id)
                    if os.path.exists(stale):
                        os.remove(stale)

            # Partition both tables
            self._hash_partition(table1, join_key1, temp_dir, 'left', columns_table1)
            self._hash_partition(table2, join_key2, temp_dir, 'right', columns_table2)

            # Output table for the final joined rows; drop parts written by
            # previous runs so retrieve_data() only sees this run's result.
            output = ColumnarDbFile(f"hpj_{table1.table_name}_{table2.table_name}")
            for old_part in glob.glob(f"{output.base_file_name}/*.parquet"):
                os.remove(old_part)
            output_path = output.get_new_parquet_file()

            for part_id in range(self.num_partitions):
                joined_df = self._join_partition(temp_dir, part_id, join_key1, join_key2)
                if joined_df is None:
                    continue

                joined_table = pa.Table.from_pandas(joined_df, preserve_index=False)

                # Initialize writer on first non-empty batch
                if join_writer is None:
                    join_writer = pq.ParquetWriter(output_path, joined_table.schema)
                join_writer.write_table(joined_table)
        finally:
            if join_writer is not None:
                join_writer.close()
            shutil.rmtree(temp_dir)

        return output

    def _join_partition(self, temp_dir, part_id, join_key1, join_key2):
        """Join one left/right partition pair; return a DataFrame or None if empty."""
        left_path = self._make_partition_path(temp_dir, "left", part_id)
        right_path = self._make_partition_path(temp_dir, "right", part_id)

        # If either side is missing, nothing to join
        if not (os.path.exists(left_path) and os.path.exists(right_path)):
            return None

        # Load both sides fully for this partition
        left_df = pq.read_table(left_path).to_pandas()
        right_df = pq.read_table(right_path).to_pandas()
        if left_df.empty or right_df.empty:
            return None

        # Build on the smaller side, probe with the larger one.
        build_is_left = len(left_df) <= len(right_df)
        if build_is_left:
            build_df, build_key = left_df, join_key1
            probe_df, probe_key = right_df, join_key2
        else:
            build_df, build_key = right_df, join_key2
            probe_df, probe_key = left_df, join_key1

        hash_map = {}
        for _, row in build_df.iterrows():
            hash_map.setdefault(row[build_key], []).append(row)

        joined_rows = []
        for _, probe_row in probe_df.iterrows():
            for build_row in hash_map.get(probe_row[probe_key], ()):
                if build_is_left:
                    l_row, r_row = build_row, probe_row
                else:
                    l_row, r_row = probe_row, build_row
                # Left columns first, then right columns (right wins on a
                # shared column name).
                combined = {col: l_row[col] for col in left_df.columns}
                combined.update({col: r_row[col] for col in right_df.columns})
                joined_rows.append(combined)

        if not joined_rows:
            return None
        return pd.DataFrame(joined_rows)

    def _make_partition_path(self, output_dir, side, part_id):
        """Return the path of partition ``part_id`` for the given side."""
        return f"{output_dir}/{side}_part{part_id}.parquet"

    @staticmethod
    def _columns_to_read(columns, join_key):
        """Return the column list to read: ``columns`` plus the join key.

        None or an empty selection means "all columns" and yields None.
        """
        if not columns:
            return None
        selected = list(columns)
        if join_key not in selected:
            selected.append(join_key)
        return selected

    def _hash_partition(self, table: "ColumnarDbFile", join_key, output_dir, side, columns=None):
        """Split ``table`` into per-partition Parquet files under ``output_dir``.

        Rows are read in batches of ``parquet_batch_size`` and routed by
        ``HASHVALUE(join_key value, num_partitions)``. The join key is always
        written to the partition files, even if ``columns`` omits it, because
        the join step needs it.

        Raises:
        - ValueError: if the table directory contains no parquet files
        """
        # Find all parquet files in the directory
        parquet_files = glob.glob(f"{table.base_file_name}/*.parquet")
        if not parquet_files:
            raise ValueError(f"No parquet files found in {table.base_file_name}")

        read_columns = self._columns_to_read(columns, join_key)
        writers = {}  # partition id -> open pq.ParquetWriter

        try:
            for parquet_file_path in parquet_files:
                parquet_file = pq.ParquetFile(parquet_file_path)

                for batch in parquet_file.iter_batches(
                    batch_size=self.parquet_batch_size, columns=read_columns
                ):
                    batch_df = batch.to_pandas()

                    # Add partition column based on join_key
                    batch_df["_part"] = batch_df[join_key].apply(
                        lambda x: HASHVALUE(x, self.num_partitions)
                    )

                    # Keep the requested column order (join key included).
                    if read_columns:
                        batch_df = batch_df[read_columns + ["_part"]]

                    # Group rows by partition id and write them out
                    for part_id, part_df in batch_df.groupby("_part"):
                        # Drop helper column before writing
                        part_table = pa.Table.from_pandas(
                            part_df.drop(columns=["_part"]), preserve_index=False
                        )

                        # Lazily create writer for this partition
                        writer = writers.get(part_id)
                        if writer is None:
                            part_path = self._make_partition_path(output_dir, side, part_id)
                            writer = pq.ParquetWriter(part_path, part_table.schema)
                            writers[part_id] = writer

                        # Append this batch's rows as a new row group
                        writer.write_table(part_table)
        finally:
            # Close all writers, also when partitioning fails midway
            for w in writers.values():
                w.close()


In [8]:
# Test Hash Partition Join against pd.merge using existing test data
def _compare_join_to_merge(hpj_df, pd_df, key, id_col="listen_id"):
    """Print a comparison of a join result against its pd.merge reference.

    Checks, in order: row counts, column names (warning only), the set of
    distinct join-key values, and finally the row contents over the columns
    both frames share.

    Args:
        hpj_df: DataFrame produced by the join under test.
        pd_df: DataFrame produced by pd.merge on the same inputs.
        key: Name of the join-key column present in both frames.
        id_col: Column used as the secondary sort key when it is present.

    Returns:
        True when row counts, key sets and shared-column contents all agree,
        False otherwise.
    """
    print(f"\nHPJ result shape: {hpj_df.shape}")
    print(f"pd.merge result shape: {pd_df.shape}")

    # A differing row count makes every later check meaningless, so stop here.
    if len(hpj_df) != len(pd_df):
        print(f"✗ Row count mismatch! HPJ: {len(hpj_df)}, pd.merge: {len(pd_df)}")
        return False
    print("✓ Row counts match!")

    passed = True

    # Column-name differences are reported but do not fail the comparison.
    hpj_cols = set(hpj_df.columns)
    pd_cols = set(pd_df.columns)
    if hpj_cols != pd_cols:
        print("WARNING: Column names differ")
        print(f"  HPJ has: {hpj_cols - pd_cols}")
        print(f"  pd.merge has: {pd_cols - hpj_cols}")

    hpj_keys = set(hpj_df[key].unique())
    pd_keys = set(pd_df[key].unique())
    if hpj_keys != pd_keys:
        print(f"✗ Unique {key}s differ!")
        print(f"  HPJ has {len(hpj_keys - pd_keys)} extra {key}s")
        print(f"  pd.merge has {len(pd_keys - hpj_keys)} extra {key}s")
        passed = False
    else:
        print(f"✓ Unique {key}s match!")

    # Row order may differ between the two joins, so sort both the same way.
    # Sorting on every shared column (key first) makes the order fully
    # deterministic even when the key/id pair alone has ties.
    shared_cols = [col for col in pd_df.columns if col in hpj_cols]
    lead_cols = [key, id_col] if id_col in shared_cols else [key]
    sort_cols = lead_cols + [col for col in shared_cols if col not in lead_cols]
    hpj_sorted = hpj_df.sort_values(sort_cols).reset_index(drop=True)
    pd_sorted = pd_df.sort_values(sort_cols).reset_index(drop=True)

    # Row-level check: equal counts and key sets do not prove the rows match.
    # dtypes are ignored because a Parquet round trip may change integer width.
    try:
        pd.testing.assert_frame_equal(
            hpj_sorted[shared_cols], pd_sorted[shared_cols], check_dtype=False
        )
    except AssertionError as exc:
        print("✗ Row contents differ!")
        print(exc)
        passed = False
    else:
        print("✓ Row contents match!")

    if len(hpj_sorted) > 0:
        print("\nSample comparison (first 3 rows):")
        print("HPJ result:")
        print(hpj_sorted.head(3))
        print("\npd.merge result:")
        print(pd_sorted.head(3))

    return passed


def test_hash_partition_join_vs_pd_merge():
    """
    Compare HashPartitionJoin results with pd.merge to verify correctness.
    Uses the existing Songs, Users, and Listens tables loaded via ColumnarDbFile API.

    Two joins are exercised: Songs with Listens on song_id, and Listens with
    Users on user_id. Each is run through HashPartitionJoin and through
    pd.merge (inner join), and the two outputs are compared.

    Returns:
        True when every test case matches pd.merge, False when any case
        differs, the 'data' directory is missing, or a table cannot be read.
    """
    banner = "=" * 70
    print(banner)
    print("Testing Hash Partition Join against pd.merge")
    print(banner)

    # Ensure tables are loaded (using the pattern from the provided code)
    if not os.path.exists('data'):
        print("ERROR: 'data' directory does not exist. Please run the table building cell first.")
        return False

    # Load existing tables using ColumnarDbFile API
    print("\nLoading existing tables using ColumnarDbFile API...")
    tables = {
        'Songs': ColumnarDbFile("Songs", file_dir='data'),
        'Users': ColumnarDbFile("Users", file_dir='data'),
        'Listens': ColumnarDbFile("Listens", file_dir='data')
    }

    # Verify tables exist by reading each one once
    for name, table in tables.items():
        try:
            sample_data = table.retrieve_data()
            print(f"  ✓ {name} table loaded ({len(sample_data)} rows, {len(sample_data.columns)} columns)")
        except Exception as e:
            print(f"  ✗ {name} table not found: {e}")
            return False

    # One entry per join to verify; only a few columns are selected to keep
    # the comparison fast.
    cases = [
        {
            "title": "Songs JOIN Listens (on song_id)",
            "left": 'Songs',
            "right": 'Listens',
            "key": 'song_id',
            "left_cols": ['song_id', 'title'],
            "right_cols": ['listen_id', 'song_id', 'user_id'],
            "temp_dir": 'temp_test_songs_listens',
        },
        {
            "title": "Listens JOIN Users (on user_id)",
            "left": 'Listens',
            "right": 'Users',
            "key": 'user_id',
            "left_cols": ['listen_id', 'user_id', 'song_id'],
            "right_cols": ['user_id', 'age'],
            "temp_dir": 'temp_test_listens_users',
        },
    ]

    all_tests_passed = True

    for case_no, case in enumerate(cases, start=1):
        print("\n" + banner)
        print(f"Test Case {case_no}: {case['title']}")
        print(banner)

        left_table = tables[case["left"]]
        right_table = tables[case["right"]]
        key = case["key"]

        # Perform hash partition join
        hpj = HashPartitionJoin(num_partitions=4, parquet_batch_size=1000)
        result_table = hpj.join(
            left_table, right_table,
            join_key1=key, join_key2=key,
            temp_dir=case["temp_dir"],
            columns_table1=case["left_cols"],
            columns_table2=case["right_cols"]
        )
        hpj_result = result_table.retrieve_data()

        # Build the pd.merge reference from the same column selections
        left_df = left_table.retrieve_data(columns=case["left_cols"])
        right_df = right_table.retrieve_data(columns=case["right_cols"])
        pd_result = pd.merge(left_df, right_df, on=key, how='inner')

        if _compare_join_to_merge(hpj_result, pd_result, key):
            print(f"\n✓ Test Case {case_no} PASSED!")
        else:
            print(f"\n✗ Test Case {case_no} FAILED!")
            all_tests_passed = False

    # Summary
    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    if all_tests_passed:
        print("✓ All tests PASSED: Hash Partition Join matches pd.merge!")
    else:
        print("✗ Some tests FAILED: Results do not match pd.merge")
    print(banner)

    return all_tests_passed

# Run the test now; as the cell's final expression, the returned boolean
# (True when every case passed) is echoed in the notebook output.
test_hash_partition_join_vs_pd_merge()


Testing Hash Partition Join against pd.merge

Loading existing tables using ColumnarDbFile API...
  ✓ Songs table loaded (10000 rows, 12 columns)
  ✓ Users table loaded (50000 rows, 12 columns)
  ✓ Listens table loaded (1000000 rows, 13 columns)

Test Case 1: Songs JOIN Listens (on song_id)

HPJ result shape: (1000000, 4)
pd.merge result shape: (1000000, 4)
✓ Row counts match!
✓ Unique song_ids match!

Sample comparison (first 3 rows):
HPJ result:
   song_id   title  listen_id  user_id
0        0  Song_0       3365     5884
1        0  Song_0       7528    24413
2        0  Song_0       8799    44982

pd.merge result:
   song_id   title  listen_id  user_id
0        0  Song_0       3365     5884
1        0  Song_0       7528    24413
2        0  Song_0       8799    44982

✓ Test Case 1 PASSED!

Test Case 2: Listens JOIN Users (on user_id)

HPJ result shape: (1000000, 4)
pd.merge result shape: (1000000, 4)
✓ Row counts match!
✓ Unique user_ids match!

Sample comparison (first 3 rows):
H

True

In [None]:
# Optional: Verify your implementation against pd.merge

Implement `SortMergeJoin`:
1. Sort both tables by join key
2. Merge sorted sequences
3. Handle duplicates

In [None]:
# Default value for SortMergeJoin's bway_merge_factor argument.
BWAY_MERGE_FACTOR = 10


class SortMergeJoin:
    """Scaffold for a sort-merge join between two ColumnarDbFile tables.

    Only the constructor and the first step of join() (externally sorting
    both inputs) are wired up; the sorting, merging and joining logic is
    still to be written.
    """

    def __init__(
        self,
        bway_merge_factor: int = BWAY_MERGE_FACTOR,
        num_pages_per_split=1000,
    ):
        """Store the tuning knobs on the instance.

        Args:
            bway_merge_factor: Presumably the number of sorted runs merged
                per pass by _bway_merge (TODO confirm once implemented).
            num_pages_per_split: Presumably the size of each initial sorted
                run (TODO confirm once implemented).
        """
        self.num_pages_per_split = num_pages_per_split
        self.bway_merge_factor = bway_merge_factor

    def _external_sort(
        self,
        table: ColumnarDbFile,
        join_key: str,
        output_dir: str,
        side: str,
        columns: Optional[List[str]] = None,
    ) -> ColumnarDbFile:
        """
        Externally sort `table` on `join_key` and hand back the sorted
        result as a ColumnarDbFile; sorted files are meant to be combined
        with _bway_merge.

        Not implemented yet: the method currently returns None.
        """
        # Your implementation here

    def _bway_merge(self, sorted_files: List[str], output_file: str, join_key: str):
        """
        Combine several already-sorted Parquet files into one sorted Parquet
        file with a B-way merge.

        Not implemented yet: the method currently does nothing.
        """
        # Your implementation here

    def join(
        self,
        table1: ColumnarDbFile,
        table2: ColumnarDbFile,
        join_key1: str,
        join_key2: str,
        temp_dir: str = "temp",
        columns_table1: Optional[List[str]] = None,
        columns_table2: Optional[List[str]] = None,
    ) -> Optional[ColumnarDbFile]:
        """
        Sort-merge join two ColumnarDbFile instances, returning a sorted
        ColumnarDbFile.

        Currently this only creates `temp_dir` and sorts both inputs; the
        merge step is missing, so None is returned.
        """
        os.makedirs(temp_dir, exist_ok=True)

        # Left input first, then right; both sorted runs share temp_dir and
        # are distinguished by the side label.
        sorted_table1 = self._external_sort(
            table=table1,
            join_key=join_key1,
            output_dir=temp_dir,
            side="left",
            columns=columns_table1,
        )
        sorted_table2 = self._external_sort(
            table=table2,
            join_key=join_key2,
            output_dir=temp_dir,
            side="right",
            columns=columns_table2,
        )

        # Your implementation here

In [None]:
# Optional: Verify your implementation against pd.merge

Implement GROUP BY after joins:
- Here you could use pandas' `DataFrame.groupby` or do manual aggregation

In [None]:
# Your implementation here

# Section 4: Query Planning & Optimization

In this section, you'll implement smart query planning using metadata analysis. The key idea is to **avoid loading data unnecessarily** by:
1. Analyzing Parquet metadata first (row counts, column names, file sizes)
2. Making intelligent decisions about join order and algorithm selection
3. Loading only the columns you actually need for the query

In [None]:
def analyze_metadata_before_loading(file_paths):
    """YOUR TASK: gather table statistics from Parquet metadata only.

    Scaffold: nothing is computed yet, so the function currently returns None.

    Hints:
    - Open each file with pq.ParquetFile() and inspect its metadata
    - Collect: num_rows, column names, file sizes
    - Avoid pd.read_parquet() here - it loads the data itself!

    Args:
        file_paths: Locations of the table files. The expected container
            shape is not fixed by this scaffold (TODO confirm when
            implementing).
    """
    # TODO: For every table ('songs', 'users', 'listens'):
    #   - Open its Parquet file without loading any rows
    #   - Read the metadata (row count, columns, sizes)
    #   - Record the values in the dictionary below
    metadata = dict()  # destined to hold the per-table statistics
    return None  # Your implementation here


def plan_query_execution(metadata, parsed_query):
    """YOUR TASK: turn table metadata into an execution plan.

    Scaffold: no plan is produced yet, so the function currently returns None.

    Questions the plan should answer:
    - Which table is the smallest, and which the largest?
    - Does a hash table fit in memory?
    - Which columns does the query really need?
    - What join order is optimal?

    Args:
        metadata: Table statistics, presumably the output of
            analyze_metadata_before_loading (TODO confirm when implementing).
        parsed_query: Representation of the query to plan; its structure is
            not fixed by this scaffold.
    """
    # TODO: From the metadata, decide:
    #   1. The join order (smallest first, or another strategy?)
    #   2. The algorithm (HPJ when it fits in memory, otherwise SMJ)
    #   3. The columns to load from each table
    return None  # Your implementation here


# After planning, load ONLY what you need:
# Example (you implement the actual logic):
# columns_needed = ['song_id', 'title']  # From your planning
# df = pd.read_parquet('songs.parquet', columns=columns_needed)

In [None]:
class QueryPlanner:
    """Placeholder planner: defines no attributes or methods yet."""
    # Your implementation here


class QueryExecutor:
    """Holds the tables and settings needed to run the hard-coded query.

    Attributes are plain copies of the constructor arguments; the query
    itself is not implemented yet.
    """

    def __init__(self, tables, num_partitions=8, output_dir="temp", planner=None):
        """Store the configuration and make sure the output directory exists.

        Args:
            tables: Tables the query will read; stored as given.
            num_partitions: Partition count stored for later use.
            output_dir: Directory created here (parents included) if missing.
            planner: Planner to use; a new QueryPlanner is created only when
                this is None.
        """
        self.tables = tables
        self.num_partitions = num_partitions
        self.output_dir = output_dir
        # Test identity against None rather than truthiness, so a supplied
        # planner that happens to be falsy (e.g. defines __len__ or __bool__)
        # is not silently replaced by the default.
        self.planner = planner if planner is not None else QueryPlanner()
        os.makedirs(self.output_dir, exist_ok=True)

    def execute_hardcoded_query(self):
        """
        Executes the following SQL query:

        SELECT s.song_id, AVG(u.age) AS avg_age,
        COUNT(DISTINCT l.user_id)
        FROM Songs s
        JOIN Listens l ON s.song_id = l.song_id
        JOIN Users u ON l.user_id = u.user_id
        GROUP BY s.song_id, s.title
        ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;

        Not implemented yet: the method currently returns None.
        """

        # Your implementation here

# Section 5: Performance Benchmarking

In [None]:
def benchmark_query(executor, dataset_size):
    """Benchmark the query execution time and memory usage.

    Args:
        executor: Object exposing execute_hardcoded_query().
        dataset_size: Label for the dataset, used only in the printed banner.

    Returns:
        Whatever executor.execute_hardcoded_query() returns.

    Note:
        The reported memory figure is the change in this process's resident
        set size between the start and end of the run, not the peak reached
        during it, so it can be small or negative even for a heavy query.
    """
    print(f"\nBenchmarking with {dataset_size} dataset...")
    bytes_per_mb = 1024 * 1024
    # One handle serves both RSS samples.
    process = psutil.Process(os.getpid())
    start_mem = process.memory_info().rss / bytes_per_mb
    # perf_counter is monotonic and high resolution, so the measured interval
    # is unaffected by system clock adjustments (unlike time.time()).
    start_time = time.perf_counter()

    result = executor.execute_hardcoded_query()

    elapsed = time.perf_counter() - start_time
    end_mem = process.memory_info().rss / bytes_per_mb

    print(f"Execution Time: {elapsed:.2f} seconds")
    print(f"Memory Usage: {end_mem - start_mem:.2f} MB")
    return result

## 100MB Benchmark

In [None]:
# Your implementation here

## 1GB Benchmark

In [None]:
# Your implementation here

## Performance Analysis

In [None]:
# Your implementation here