
# storage.ipynb — Storage & Validation

This notebook persists the **clean DataFrame** into:
- **SQLite** (table: `university_rankings`)
- **Parquet** (columnar format)

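It also validates the round trip by reading both copies back and reporting their row counts and checksums alongside those of the original DataFrame, so the three can be compared.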


In [None]:

import sqlite3
import pandas as pd
from pathlib import Path
import hashlib

# All paths are relative, so they resolve against the current working directory.
# Input CSV that load_clean() reads in the execution section of this notebook.
CLEAN_CSV = Path("./data_clean/rankings_clean.csv")
# SQLite database file passed to to_sqlite() and read_sqlite().
DB_PATH = Path("./rankings.db")
# Parquet file passed to to_parquet() and read_parquet().
PARQUET_PATH = Path("./rankings.parquet")
# Name of the SQLite table the DataFrame is stored in.
TABLE = "university_rankings"


In [None]:

def load_clean(path: Path) -> pd.DataFrame:
    """Read the CSV file at ``path`` into a DataFrame.

    The file is parsed with ``pandas.read_csv`` using its default options,
    so the first row is taken as the header and column dtypes are inferred.

    Args:
        path: Location of the CSV file to load.

    Returns:
        The parsed contents of the file.
    """
    frame = pd.read_csv(path)
    return frame


In [None]:

def to_sqlite(df: pd.DataFrame, db_path: Path, table: str) -> None:
    """Write ``df`` to a table in the SQLite database at ``db_path``.

    An existing table with the same name is dropped and recreated
    (``if_exists="replace"``). The DataFrame index is not stored.

    Args:
        df: Data to persist.
        db_path: SQLite database file; sqlite3 creates it if it is missing.
        table: Name of the destination table.
    """
    con = sqlite3.connect(db_path)
    try:
        # ``with con`` only commits (or rolls back on error); it does not
        # close the connection, hence the explicit close() in ``finally``.
        with con:
            df.to_sql(table, con, if_exists="replace", index=False)
    finally:
        con.close()


In [None]:

def to_parquet(df: pd.DataFrame, path: Path) -> None:
    """Write ``df`` to a Parquet file at ``path``.

    The DataFrame index is left out of the file; every other option is the
    pandas default.

    Args:
        df: Data to persist.
        path: Destination file.
    """
    keep_index = False
    df.to_parquet(path, index=keep_index)


In [None]:

def read_sqlite(db_path: Path, table: str) -> pd.DataFrame:
    """Read every row and column of ``table`` from the SQLite database.

    Args:
        db_path: SQLite database file to open.
        table: Name of the table to read. It is treated as one identifier
            and quoted, so names containing spaces, dashes or SQL keywords
            are read correctly.

    Returns:
        The full contents of the table as a DataFrame.
    """
    # A table name cannot be bound as a SQL parameter, so it is embedded as a
    # double-quoted identifier with any embedded double quote doubled.
    quoted_table = '"' + table.replace('"', '""') + '"'
    con = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(f"SELECT * FROM {quoted_table}", con)
    finally:
        # A sqlite3 connection used as a context manager is not closed on
        # exit, so close it explicitly to release the file handle.
        con.close()


In [None]:

def read_parquet(path: Path) -> pd.DataFrame:
    """Load the Parquet file at ``path`` into a DataFrame.

    Args:
        path: Parquet file to read.

    Returns:
        The contents of the file, read with the pandas default options.
    """
    loaded = pd.read_parquet(path)
    return loaded


In [None]:

def df_checksum(df: pd.DataFrame) -> str:
    """Return an MD5 hex digest of ``df`` that does not depend on row order.

    Rows are sorted by ``University`` and ``Year`` (whichever of the two are
    present), with every remaining column used as a tie-breaker. The sorted
    frame is then serialised to CSV without its index and hashed. Column
    order and value formatting still affect the digest.

    Args:
        df: Frame to fingerprint; it is not modified.

    Returns:
        The 32-character hexadecimal MD5 digest of the sorted CSV text.
    """
    key_cols = [c for c in ("University", "Year") if c in df.columns]
    # Sorting by the keys alone leaves rows that share a key (or every row,
    # when no key column exists) in their incoming order, which would make
    # the digest order-dependent. The other columns break those ties.
    tie_breakers = [c for c in df.columns if c not in key_cols]
    sort_cols = key_cols + tie_breakers
    try:
        ordered = df.sort_values(sort_cols, kind="mergesort") if sort_cols else df
    except (TypeError, ValueError):
        # A tie-breaker column is not sortable (mixed types) or its label is
        # duplicated; fall back to ordering by the key columns only.
        ordered = df.sort_values(key_cols, kind="mergesort") if key_cols else df
    payload = ordered.to_csv(index=False).encode("utf-8")
    return hashlib.md5(payload).hexdigest()



# ---- Execute storage and validation ----
# Load the source frame that both storage formats are written from.
clean = load_clean(CLEAN_CSV)
print("Clean rows:", len(clean))

to_sqlite(clean, DB_PATH, TABLE)
to_parquet(clean, PARQUET_PATH)
print("Saved SQLite and Parquet.")

# Read both copies back so they can be compared against the original.
back_sql = read_sqlite(DB_PATH, TABLE)
back_parq = read_parquet(PARQUET_PATH)
print("Back from SQLite:", len(back_sql), "rows")
print("Back from Parquet:", len(back_parq), "rows")

checksums = {
    "orig": df_checksum(clean),
    "sqlite": df_checksum(back_sql),
    "parquet": df_checksum(back_parq),
}
for label, digest in checksums.items():
    print(f"Checksum ({label}):", digest)

# Compare the values instead of leaving the reader to eyeball the digests.
# A mismatch is reported rather than raised so later cells can still run.
row_counts_match = len(clean) == len(back_sql) == len(back_parq)
checksums_match = len(set(checksums.values())) == 1
print("Row counts match:", row_counts_match)
print("Checksums match:", checksums_match)
if not (row_counts_match and checksums_match):
    print("WARNING: stored copies differ from the clean DataFrame.")
