## Collaborators

1.   Berk Yalcinkaya
2.   Nick Allen


# Setup

In [1]:
import pandas as pd
import os
import uuid
import argparse
import time
import psutil
import heapq
import pyarrow as pa
import pyarrow.parquet as pq
import random
import string
import numpy as np
from typing import List, Optional
import shutil

# Section 0: Generate Test Data

This section has already been implemented for you.

In [2]:
import gc


def generate_songs_chunk(start, size, string_length=100):
    """Build one chunk of the Songs table as a DataFrame.

    Rows receive consecutive ``song_id`` values ``start .. start + size - 1``
    and a matching ``Song_<id>`` title. The ten filler columns
    ``extra_col_1`` .. ``extra_col_10`` are rotations (``np.roll``) of one
    batch of strings produced by ``generate_base_strings(size, string_length)``.

    Parameters:
    - start: First ``song_id`` of the chunk.
    - size: Number of rows to generate.
    - string_length: Length of every filler string.

    Returns:
    - pd.DataFrame with columns ``song_id``, ``title``, ``extra_col_1..10``.
    """
    song_ids = range(start, start + size)
    columns = {
        "song_id": song_ids,
        "title": [f"Song_{song_id}" for song_id in song_ids],
    }
    # A single random batch is rotated by 1..10 positions, so the filler
    # columns hold the same values in different row orders.
    filler = generate_base_strings(size, string_length)
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_users_chunk(start, size, string_length=100):
    """Build one chunk of the Users table as a DataFrame.

    Rows receive consecutive ``user_id`` values ``start .. start + size - 1``.
    ``age`` is derived from the id as ``18 + (user_id % 60)``, i.e. it cycles
    deterministically through 18..77. The ten filler columns
    ``extra_col_1`` .. ``extra_col_10`` are rotations (``np.roll``) of one
    batch of strings produced by ``generate_base_strings(size, string_length)``.

    Parameters:
    - start: First ``user_id`` of the chunk.
    - size: Number of rows to generate.
    - string_length: Length of every filler string.

    Returns:
    - pd.DataFrame with columns ``user_id``, ``age``, ``extra_col_1..10``.
    """
    user_ids = range(start, start + size)
    columns = {
        "user_id": user_ids,
        "age": [18 + (user_id % 60) for user_id in user_ids],
    }
    filler = generate_base_strings(size, string_length)
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_listens_chunk(start, size, num_users, num_songs, string_length=16):
    """Build one chunk of the Listens table as a DataFrame.

    Rows receive consecutive ``listen_id`` values ``start .. start + size - 1``.
    ``user_id`` and ``song_id`` are drawn uniformly from ``[0, num_users)`` and
    ``[0, num_songs)`` with ``np.random.randint``. The ten filler columns
    ``extra_col_1`` .. ``extra_col_10`` are rotations (``np.roll``) of one
    batch of strings produced by ``generate_base_strings(size, string_length)``.

    Parameters:
    - start: First ``listen_id`` of the chunk.
    - size: Number of rows to generate.
    - num_users: Exclusive upper bound for generated ``user_id`` values.
    - num_songs: Exclusive upper bound for generated ``song_id`` values.
    - string_length: Length of every filler string.

    Returns:
    - pd.DataFrame with columns ``listen_id``, ``user_id``, ``song_id``,
      ``extra_col_1..10``.
    """
    # Draw order (users, then songs, then filler strings) is kept fixed so the
    # sequence of values taken from NumPy's global RNG does not change.
    user_ids = np.random.randint(0, num_users, size=size)
    song_ids = np.random.randint(0, num_songs, size=size)
    filler = generate_base_strings(size, string_length)

    columns = {
        "listen_id": range(start, start + size),
        "user_id": user_ids,
        "song_id": song_ids,
    }
    columns.update(
        {f"extra_col_{shift}": np.roll(filler, shift=shift) for shift in range(1, 11)}
    )
    return pd.DataFrame(columns)


def generate_base_strings(num_records, string_length):
    """Return ``num_records`` random strings over the alphabet ``{"a", "b"}``.

    Character indices are drawn in a single ``np.random.randint`` call of
    shape ``(num_records, string_length)``; each row is then joined into one
    string of ``string_length`` characters.

    Returns:
    - 1-D NumPy array with one string per record.
    """
    alphabet = np.array(["a", "b"])
    picks = np.random.randint(0, alphabet.size, size=(num_records, string_length))
    return np.array(["".join(row) for row in alphabet[picks]])


def _write_parquet_streamed(
    filename,
    total_rows,
    make_chunk_fn,
    chunk_size=250_000,
    compression="snappy",
):
    """
    Stream DataFrame chunks to a single Parquet file with one ParquetWriter.
    - schema_df: optional small DataFrame to lock schema; if None we'll infer from the first chunk.
    """
    written = 0

    first_chunk = make_chunk_fn(0, min(chunk_size, total_rows))
    first_table = pa.Table.from_pandas(first_chunk, preserve_index=False)
    writer = pq.ParquetWriter(filename, first_table.schema, compression=compression)
    writer.write_table(first_table)

    written += len(first_chunk)
    del first_chunk
    gc.collect()

    while written < total_rows:
        take = min(chunk_size, total_rows - written)
        chunk_df = make_chunk_fn(written, take)
        writer.write_table(pa.Table.from_pandas(chunk_df, preserve_index=False))
        written += take
        del chunk_df
        gc.collect()

    writer.close()


def generate_test_data(target_size="100MB"):
    """
    Generate the Songs, Users and Listens Parquet files for one dataset size.

    Files are written relative to the current working directory as
    ``songs_<target_size>.parquet``, ``users_<target_size>.parquet`` and
    ``listens_<target_size>.parquet``.

    Target COMPRESSED Parquet file sizes on disk:
    100MB total compressed:
        - Songs: 10K rows → ~5MB (5% of total)
        - Users: 50K rows → ~20MB (20% of total)
        - Listens: 1M rows → ~75MB (75% of total)
    1GB total compressed:
        - Songs: 100K rows → ~50MB (5% of total)
        - Users: 500K rows → ~200MB (20% of total)
        - Listens: 10M rows → ~750MB (75% of total)

    Each table has:
        - Primary key column(s)
        - 10 additional string columns (100 characters each for Songs and
          Users, 16 for Listens — the chunk generators' defaults)
        - For Users: an 'age' column that cycles deterministically through
          18..77 based on user_id

    Foreign keys in Listens are valid by construction: user_id is drawn from
    [0, num_users) and song_id from [0, num_songs), which are exactly the ids
    generated for Users and Songs.

    Parameters:
    - target_size: Either "100MB" or "1GB".

    Raises:
    - ValueError: If ``target_size`` is not one of the supported sizes.
    """
    # (num_songs, num_users, num_listens) per supported dataset size.
    row_counts = {
        "100MB": (10_000, 50_000, 1_000_000),
        "1GB": (100_000, 500_000, 10_000_000),
    }
    # Explicit validation instead of `assert`: asserts are stripped under
    # `python -O`, which would let any other value fall through and silently
    # generate a dataset.
    try:
        num_songs, num_users, num_listens = row_counts[target_size]
    except (KeyError, TypeError):
        raise ValueError(
            f"target_size must be one of {list(row_counts)}, got {target_size!r}"
        ) from None

    # Chunk sizes are the same for both dataset sizes.
    songs_chunk = 10_000
    users_chunk = 50_000
    listens_chunk = 1_000_000

    print("Writing Songs")
    _write_parquet_streamed(
        filename=f"songs_{target_size}.parquet",
        total_rows=num_songs,
        make_chunk_fn=generate_songs_chunk,
        chunk_size=songs_chunk,
    )

    print("Writing Users")
    _write_parquet_streamed(
        filename=f"users_{target_size}.parquet",
        total_rows=num_users,
        make_chunk_fn=generate_users_chunk,
        chunk_size=users_chunk,
    )

    print("Writing Listens")
    _write_parquet_streamed(
        filename=f"listens_{target_size}.parquet",
        total_rows=num_listens,
        # Listens additionally needs the id ranges of the two referenced tables.
        make_chunk_fn=lambda start, size: generate_listens_chunk(
            start, size, num_users, num_songs
        ),
        chunk_size=listens_chunk,
    )

    print("Done!")

In [3]:
# The chunk generators draw from NumPy's global RNG (np.random), so that is
# the generator that has to be seeded for the datasets to be reproducible;
# seeding only the stdlib `random` module does not affect them.
random.seed(0)
np.random.seed(0)

generate_test_data('100MB')
generate_test_data('1GB')

Writing Songs
Writing Users
Writing Listens
Done!
Writing Songs
Writing Users
Writing Listens
Done!


# Section 1: Parquet-based Columnar Storage

Implement Parquet-based storage for the tables
- For simplicity, store all data for a table in a single Parquet file and use a single DataFrame object as a buffer

In [None]:
# see ed: https://edstem.org/us/courses/87394/discussion/7251811 for advice on writing to a parquet without loading existing into RAM
class ColumnarDbFile:
    """Single-file Parquet storage for one table.

    All data of the table lives in one file at
    ``<file_dir>/<file_pfx>_<table_name>`` (no extension is appended).
    """

    def __init__(self, table_name, file_dir='data', file_pfx=''):
        # The storage directory is created eagerly so later writes can assume it exists.
        os.makedirs(file_dir, exist_ok=True)
        self.table_name = table_name
        self.file_dir = file_dir
        self.file_pfx = file_pfx
        self.base_file_name = f"{file_dir}/{file_pfx}_{table_name}"

    def build_table(self, data):
        """Write the DataFrame ``data`` to this table's Parquet file."""
        target_path = self.base_file_name
        data.to_parquet(target_path)

    def retrieve_data(self, columns=None):
        """Read this table's Parquet file into a pd.DataFrame.

        ``columns`` optionally restricts the read to the listed columns; it is
        passed straight to ``pd.read_parquet``.
        """
        frame = pd.read_parquet(self.base_file_name, columns=columns)
        return frame

    def append_data(self, data):
        """Append new data to Parquet (placeholder: currently does nothing)."""
        # Your implementation here

In [None]:
print("Building tables...")
# Start from an empty storage directory so files from earlier runs cannot leak in.
if os.path.exists('data'):
    shutil.rmtree('data')

# One storage handle per table, all under the same directory.
tables = {
    table_name: ColumnarDbFile(table_name, file_dir='data')
    for table_name in ('Songs', 'Users', 'Listens')
}

# Load the generated source files of the chosen dataset size.
size = "100MB"
songs_data = pd.read_parquet(f'songs_{size}.parquet')
users_data = pd.read_parquet(f'users_{size}.parquet')
listens_data = pd.read_parquet(f'listens_{size}.parquet')

# Persist each DataFrame through its storage handle.
tables['Songs'].build_table(songs_data)
tables['Users'].build_table(users_data)
tables['Listens'].build_table(listens_data)
print("Tables built successfully.")

In [None]:
# Read back only the song_id and title columns of the Songs table. The call is
# left as a bare expression (its result is not assigned to a name).
tables['Songs'].retrieve_data(columns = ['song_id', 'title'])

In [None]:
# Read back only the key columns of the Listens table (listen_id plus the two
# foreign keys); the filler columns are not requested.
tables['Listens'].retrieve_data(columns = ['listen_id', 'user_id', 'song_id'])

Analyze and report on:
- Space efficiency compared to row storage
  - e.g. Compare file sizes on disk: How much disk space does Parquet use vs. a row storage format like CSV?
- Compression ratios achieved with Parquet
  - e.g. Compare Parquet’s uncompressed encoded size (reported in its metadata) to its compressed on-disk size to compute compression ratios.
  - You could also report the memory expansion factor: how much larger the dataset becomes when loaded into a `pd.DataFrame` compared to the compressed file size.
- Read/write performance characteristics
  - e.g. Read performance: How long does it take to read all columns from Parquet vs. CSV?
  - e.g. Columnar advantage: How long does it take to read selective columns from Parquet vs. reading all columns?
  - e.g. Write performance: How long does it take to write data to Parquet vs. CSV?

In [None]:
def analyze():
    """Placeholder for the Section 1 storage analysis (not implemented yet).

    Intended to report on space efficiency versus row storage, Parquet
    compression ratios, and read/write performance. Currently does nothing
    and returns ``None``.
    """
    # Your implementation here
    return None

# Section 2: Parse SQL Query

In this section, you should implement logic to parse the following SQL query:
```sql
    SELECT s.song_id, AVG(u.age) AS avg_age,
       COUNT(DISTINCT l.user_id) AS count_distinct_users
    FROM Songs s
    JOIN Listens l ON s.song_id = l.song_id
    JOIN Users u ON l.user_id = u.user_id
    GROUP BY s.song_id, s.title
    ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
```

You should manually extract the components from the provided query (i.e. you don't need to implement a general SQL parser, just handle this specific query).

In [None]:
# The one SQL query this notebook handles; it is the string passed to
# parse_sql() below.
# NOTE(review): the Section 2 text shows the COUNT(DISTINCT ...) column with an
# `AS count_distinct_users` alias, which this string omits — confirm which
# form is intended.
query = """SELECT s.song_id, AVG(u.age) AS avg_age,
COUNT(DISTINCT l.user_id)
FROM Songs s
JOIN Listens l ON s.song_id = l.song_id
JOIN Users u ON l.user_id = u.user_id
GROUP BY s.song_id, s.title
ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
"""

In [None]:
def parse_sql(query):
    """Placeholder for the query parser (not implemented yet).

    YOUR TASK: extract from the SQL string ``query``:
    - the tables involved
    - the join conditions
    - the GROUP BY columns
    - the aggregation functions

    Currently does nothing and returns ``None``.
    """
    # Your implementation here
    return None

In [None]:
# Run the parser on the notebook's query string; the result is left as a bare
# expression (not assigned to a name).
parse_sql(query)

# Section 3: Implement Join Algorithms

In this section, you will implement the execution operators (*how* to join) and aggregation after joins.

**Reminder:** If you use temporary files or folders, you should clean them up either as part of your join logic, or after each run. Otherwise you might run into correctness issues!

In [None]:
import hashlib

def HASHVALUE(value, B):
    """Map a join-key value to a partition index.

    Integer keys — Python ``int`` and NumPy integer scalars alike — go through
    the built-in ``hash``, so equal keys land in the same partition regardless
    of which of the two types the caller holds (values taken out of a
    pandas/NumPy column are NumPy scalars such as ``np.int64``, which are not
    ``int`` instances). Every other value is hashed with SHA-256 over its
    ``str()`` form.

    Parameters:
    - value: Key to hash.
    - B: Number of partitions. For positive ``B`` the result is in
      ``range(B)``; ``B == 0`` raises ``ZeroDivisionError``.

    Returns:
    - int: Partition index.
    """
    if isinstance(value, (int, np.integer)):
        return hash(value) % B
    digest = hashlib.sha256(str(value).encode("utf-8")).digest()
    # Big-endian interpretation of the digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big") % B

Implement `HashPartitionJoin`:
1. Hash partition both tables
2. Build hash table from smaller partition
3. Probe with larger partition
4. Return joined results

In [None]:
class HashPartitionJoin:
    """Scaffold for a hash partition join over ``ColumnarDbFile`` tables.

    Only the constructor and the partitioning calls in ``join`` are wired up;
    ``_hash_partition`` and the build/probe phase of ``join`` are still to be
    implemented.
    """

    def __init__(self, num_partitions=4):
        self.num_partitions = num_partitions

    def join(self, table1: ColumnarDbFile, table2: ColumnarDbFile, join_key1, join_key2,
             temp_dir='temp', columns_table1=None, columns_table2=None):
        """
        Perform a hash partition join between two ColumnarDbFile instances.

        Parameters:
        - table1: Left table (ColumnarDbFile)
        - table2: Right table (ColumnarDbFile)
        - join_key1: Join key from table1
        - join_key2: Join key from table2
        - temp_dir: Directory to store temporary files
        - columns_table1: List of columns to select from table1
        - columns_table2: List of columns to select from table2

        Returns:
        - join_result_table: ColumnarDbFile instance containing the join results
        """
        os.makedirs(temp_dir, exist_ok=True)
        # Partition both tables
        partitions1 = self._hash_partition(table1, join_key1, temp_dir, 'left', columns_table1)
        partitions2 = self._hash_partition(table2, join_key2, temp_dir, 'right', columns_table2)

        # Your implementation here

    def _hash_partition(self, table: ColumnarDbFile, join_key, output_dir, side, columns=None):
        """Partition ``table`` on ``join_key`` (not implemented yet).

        ``join`` calls this once per input with ``side`` set to ``'left'`` or
        ``'right'``, ``output_dir`` set to its ``temp_dir`` and ``columns`` set
        to the column selection for that table. The shape of the return value
        is up to the implementation; ``join`` stores it as ``partitions1`` /
        ``partitions2``.
        """
        # Your implementation here
        # A comment alone is not a function body, so without a statement here
        # the whole class fails to parse. Raising makes the missing piece
        # explicit instead of silently returning None.
        raise NotImplementedError("HashPartitionJoin._hash_partition is not implemented yet")

In [None]:
# Optional: Verify your implementation against pd.merge

Implement `SortMergeJoin`:
1. Sort both tables by join key
2. Merge sorted sequences
3. Handle duplicates

In [None]:
BWAY_MERGE_FACTOR = 10


class SortMergeJoin:
    """Scaffold for an external sort-merge join over ``ColumnarDbFile`` tables.

    Only the constructor and the sorting calls in ``join`` are wired up;
    ``_external_sort``, ``_bway_merge`` and the merge phase of ``join`` are
    still placeholders.
    """

    def __init__(
        self, bway_merge_factor: int = BWAY_MERGE_FACTOR, num_pages_per_split=1000
    ):
        # Both settings are only stored here; nothing in this class reads them yet.
        self.num_pages_per_split = num_pages_per_split
        self.bway_merge_factor = bway_merge_factor

    def _external_sort(
        self,
        table: ColumnarDbFile,
        join_key: str,
        output_dir: str,
        side: str,
        columns: Optional[List[str]] = None,
    ) -> ColumnarDbFile:
        """
        Externally sort ``table`` on ``join_key`` and return the sorted result as a ColumnarDbFile.
        Sorted files are meant to be combined with _bway_merge.
        (Placeholder: currently returns None.)
        """
        # Your implementation here

    def _bway_merge(self, sorted_files: List[str], output_file: str, join_key: str):
        """
        B-way merge of several sorted Parquet files into one sorted Parquet file.
        (Placeholder: currently does nothing.)
        """
        # Your implementation here

    def join(
        self,
        table1: ColumnarDbFile,
        table2: ColumnarDbFile,
        join_key1: str,
        join_key2: str,
        temp_dir: str = "temp",
        columns_table1: Optional[List[str]] = None,
        columns_table2: Optional[List[str]] = None,
    ) -> Optional[ColumnarDbFile]:
        """
        Sort-merge join of two ColumnarDbFile instances, returning a sorted ColumnarDbFile.

        Only the sort phase is wired up so far, so the method currently
        returns None after sorting both inputs.
        """
        os.makedirs(temp_dir, exist_ok=True)

        # Sort each input on its own join key; "left"/"right" is forwarded as
        # the `side` argument of _external_sort.
        sorted_left = self._external_sort(
            table=table1,
            join_key=join_key1,
            output_dir=temp_dir,
            side="left",
            columns=columns_table1,
        )
        sorted_right = self._external_sort(
            table=table2,
            join_key=join_key2,
            output_dir=temp_dir,
            side="right",
            columns=columns_table2,
        )

        # Your implementation here

In [None]:
# Optional: Verify your implementation against pd.merge

Implement GROUP BY after joins:
- Here you could use `pd.groupby` or do manual aggregation

In [None]:
# Your implementation here

# Section 4: Query Planning & Optimization

In this section, you'll implement smart query planning using metadata analysis. The key idea is to **avoid loading data unnecessarily** by:
1. Analyzing Parquet metadata first (row counts, column names, file sizes)
2. Making intelligent decisions about join order and algorithm selection
3. Loading only the columns you actually need for the query

In [None]:
def analyze_metadata_before_loading(file_paths):
    """Placeholder: collect table statistics WITHOUT loading data.

    YOUR TASK. Hints:
    - Access metadata through pq.ParquetFile()
    - Extract: num_rows, column names, file sizes
    - Avoid pd.read_parquet() here - that loads data!

    Currently does nothing with ``file_paths`` and returns ``None``.
    """
    metadata = {}  # starting point for the statistics dictionary; not filled yet

    # TODO: For each table ('songs', 'users', 'listens'):
    #   - Open the Parquet file (but don't load data)
    #   - Extract metadata like row count, columns, sizes
    #   - Store in a dictionary
    # Your implementation here
    return None


def plan_query_execution(metadata, parsed_query):
    """Placeholder: turn table metadata into an execution plan.

    YOUR TASK. Questions the plan should answer:
    - Which table is smallest? Largest?
    - Will a hash table fit in memory?
    - Which columns does the query actually need?
    - What's the optimal join order?

    Currently ignores both arguments and returns ``None``.
    """
    # TODO: Based on metadata, decide:
    #   1. Join order (smallest first? or different strategy?)
    #   2. Algorithm choice (HPJ if fits in memory, else SMJ)
    #   3. Which columns to load for each table
    # Your implementation here
    return None


# After planning, load ONLY what you need:
# Example (you implement the actual logic):
# columns_needed = ['song_id', 'title']  # From your planning
# df = pd.read_parquet('songs.parquet', columns=columns_needed)

In [None]:
class QueryPlanner:
    """Placeholder for the query planner (no behavior defined yet)."""
    # Your implementation here


class QueryExecutor:
    """Holds the tables and settings needed to run the notebook's fixed query.

    Construction stores the arguments and creates ``output_dir`` if missing;
    the query itself is still a placeholder.
    """

    def __init__(self, tables, num_partitions=8, output_dir="temp", planner=None):
        # Any falsy planner (including the default None) is replaced by a
        # fresh QueryPlanner.
        if not planner:
            planner = QueryPlanner()
        os.makedirs(output_dir, exist_ok=True)
        self.tables = tables
        self.num_partitions = num_partitions
        self.output_dir = output_dir
        self.planner = planner

    def execute_hardcoded_query(self):
        """
        Execute this SQL query (placeholder: currently returns None):

        SELECT s.song_id, AVG(u.age) AS avg_age,
        COUNT(DISTINCT l.user_id)
        FROM Songs s
        JOIN Listens l ON s.song_id = l.song_id
        JOIN Users u ON l.user_id = u.user_id
        GROUP BY s.song_id, s.title
        ORDER BY COUNT(DISTINCT l.user_id) DESC, s.song_id;
        """
        # Your implementation here
        return None

# Section 5: Performance Benchmarking

In [None]:
def benchmark_query(executor, dataset_size):
    """Run the executor's hardcoded query once and print time and memory figures.

    Parameters:
    - executor: Object exposing ``execute_hardcoded_query()``.
    - dataset_size: Label used only in the printed header (e.g. "100MB").

    Returns:
    - Whatever ``executor.execute_hardcoded_query()`` returns.

    The printed "Memory Usage" is the change in this process's resident set
    size (RSS) between the start and the end of the run, in MB. It is not a
    peak measurement and can be negative.
    """
    print(f"\nBenchmarking with {dataset_size} dataset...")
    process = psutil.Process(os.getpid())
    bytes_per_mb = 1024 * 1024
    start_mem = process.memory_info().rss / bytes_per_mb
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    result = executor.execute_hardcoded_query()

    elapsed = time.perf_counter() - start_time
    end_mem = process.memory_info().rss / bytes_per_mb

    print(f"Execution Time: {elapsed:.2f} seconds")
    print(f"Memory Usage: {end_mem - start_mem:.2f} MB")
    return result

## 100MB Benchmark

In [None]:
# Your implementation here

## 1GB Benchmark

In [None]:
# Your implementation here

## Performance Analysis

In [None]:
# Your implementation here