In [1]:
import os
import pandas as pd

def clean_and_merge(orig_df, perf_df):
    """Join performance rows to their origination record and tidy the result.

    Parameters
    ----------
    orig_df : pandas.DataFrame
        Origination data. Must contain a "Loan Sequence Number" column with
        one row per loan (enforced by ``validate="m:1"`` on the merge).
    perf_df : pandas.DataFrame
        Performance data. Must contain a "Loan Sequence Number" column; many
        rows per loan are allowed.

    Neither input frame is modified.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``perf_df`` with ``orig_df`` on the loan key, with
        snake_case column names, rows ordered by ``loan_sequence_number`` and
        then (when that column exists) ``monthly_reporting_period``, and a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If either frame lacks the "Loan Sequence Number" column.
    pandas.errors.MergeError
        If the loan key is not unique in ``orig_df``.
    """
    key = "Loan Sequence Number"

    # Compare keys as strings on both sides. ``assign`` returns new frames, so
    # the caller's DataFrames keep their original key dtype.
    orig_keyed = orig_df.assign(**{key: orig_df[key].astype(str)})
    perf_keyed = perf_df.assign(**{key: perf_df[key].astype(str)})

    # Many performance rows map to exactly one origination row.
    merged = perf_keyed.merge(orig_keyed, on=key, how="inner", validate="m:1")

    # Snake-case the column names: lower-case, collapse runs of spaces,
    # hyphens, slashes and parentheses into "_", trim stray underscores.
    merged.columns = (
        merged.columns
        .str.lower()
        .str.replace(r"[ \-\/()]+", "_", regex=True)
        .str.strip("_")
    )

    # Parse the date columns that are present; unparseable values become NaT.
    # NOTE(review): the reporting period is parsed without an explicit format.
    # If the interim files store it as integer YYYYMM this would not yield
    # calendar months -- confirm against the interim data.
    if "monthly_reporting_period" in merged.columns:
        merged["monthly_reporting_period"] = pd.to_datetime(
            merged["monthly_reporting_period"], errors="coerce"
        )
    if "first_payment_date" in merged.columns:
        merged["first_payment_date"] = pd.to_datetime(
            merged["first_payment_date"], format="%Y%m", errors="coerce"
        )

    # Order by loan, then by reporting period. The period column is optional
    # (its parsing above is guarded), so only sort on it when it exists.
    sort_cols = [
        col
        for col in ("loan_sequence_number", "monthly_reporting_period")
        if col in merged.columns
    ]
    merged = merged.sort_values(sort_cols).reset_index(drop=True)

    # Numeric casts; values that cannot be parsed become NaN.
    for col in ("current_actual_upb", "original_upb", "interest_rate"):
        if col in merged.columns:
            merged[col] = pd.to_numeric(merged[col], errors="coerce")

    return merged

    




In [4]:
# Destination folder for the per-year merged files.
os.makedirs('../data/processed/merged_by_year', exist_ok=True)

# Vintage years 1999 through 2024 inclusive.
for year in range(1999, 2025):
    o_path = f"../data/interim/origination_{year}.parquet"
    p_path = f"../data/interim/performance_{year}.parquet"

    # A year is processed only when both interim inputs exist. Origination is
    # checked first, so it is the one reported when both files are missing.
    if not os.path.exists(o_path):
        skip_note = f"  Skipping origination_{year}.parquet (not found)"
    elif not os.path.exists(p_path):
        skip_note = f"  Skipping performance_{year}.parquet (not found)"
    else:
        skip_note = None

    if skip_note is not None:
        print(skip_note)
        continue

    print(f"Processing {year}...")

    # Load both inputs, then join and clean them.
    orig = pd.read_parquet(o_path)
    perf = pd.read_parquet(p_path)
    merged = clean_and_merge(orig, perf)

    # Persist the merged year and report its row count.
    out_path = f"../data/processed/merged_by_year/merged_{year}.parquet"
    merged.to_parquet(out_path, index=False)
    print(f"  Saved merged_{year}.parquet ({len(merged):,} rows)")

Processing 1999...
  Saved merged_1999.parquet (2,507,096 rows)
Processing 2000...
  Saved merged_2000.parquet (1,439,897 rows)
Processing 2001...
  Saved merged_2001.parquet (1,960,552 rows)
Processing 2002...
  Saved merged_2002.parquet (2,488,606 rows)
Processing 2003...
  Saved merged_2003.parquet (4,040,135 rows)
Processing 2004...
  Saved merged_2004.parquet (3,959,723 rows)
Processing 2005...
  Saved merged_2005.parquet (3,856,796 rows)
Processing 2006...
  Saved merged_2006.parquet (3,184,782 rows)
Processing 2007...
  Saved merged_2007.parquet (2,989,909 rows)
Processing 2008...
  Saved merged_2008.parquet (2,437,525 rows)
Processing 2009...
  Saved merged_2009.parquet (3,122,523 rows)
Processing 2010...
  Saved merged_2010.parquet (3,390,711 rows)
Processing 2011...
  Saved merged_2011.parquet (3,548,320 rows)
Processing 2012...
  Saved merged_2012.parquet (4,352,734 rows)
Processing 2013...
  Saved merged_2013.parquet (4,027,063 rows)
Processing 2014...
  Saved merged_2014.p

In [10]:

import os, glob
import pyarrow as pa
import pyarrow.parquet as pq

yearly_dir = "../data/processed/merged_by_year"
final_path = "../data/processed/merged_loan_performance.parquet"

# 1) Find all per-year files (sorted so they are appended in name order)
files = sorted(glob.glob(os.path.join(yearly_dir, "merged_*.parquet")))
if not files:
    raise RuntimeError(f"No files found in {yearly_dir}")

# 2) Build a "master" schema from the first file, converting any
#    NullType column to string so it has a writable type.
# NOTE(review): only the first file's schema is consulted. Columns that appear
# only in later files are not written, and later files are cast to the first
# file's types -- confirm this is the intended behaviour.
base_schema = pq.read_schema(files[0])
fields = [
    pa.field(fld.name, pa.string()) if pa.types.is_null(fld.type) else fld
    for fld in base_schema
]
master_schema = pa.schema(fields)

# 3) Stream every yearly file into a single output file.
#    The writer is used as a context manager so it is closed even when a
#    read or cast fails part-way through, instead of leaking the open handle.
os.makedirs(os.path.dirname(final_path), exist_ok=True)
with pq.ParquetWriter(final_path, master_schema) as writer:
    for path in files:
        print("Appending", os.path.basename(path))
        tbl = pq.read_table(path)
        present = set(tbl.schema.names)

        # 3a/3b) Build the columns in master order: cast existing columns to
        # the master type, and fill columns this file lacks with nulls.
        arrays = []
        for fld in master_schema:
            if fld.name in present:
                col = tbl.column(fld.name)
                if not col.type.equals(fld.type):
                    col = col.cast(fld.type)
            else:
                # pa.nulls allocates the null column directly, without first
                # building a Python list with one None per row.
                col = pa.nulls(tbl.num_rows, type=fld.type)
            arrays.append(col)
        tbl2 = pa.Table.from_arrays(arrays, names=master_schema.names)

        # 3c) Write it out
        writer.write_table(tbl2)

print(" Final merged file saved to", final_path)


Appending merged_1999.parquet
Appending merged_2000.parquet
Appending merged_2001.parquet
Appending merged_2002.parquet
Appending merged_2003.parquet
Appending merged_2004.parquet
Appending merged_2005.parquet
Appending merged_2006.parquet
Appending merged_2007.parquet
Appending merged_2008.parquet
Appending merged_2009.parquet
Appending merged_2010.parquet
Appending merged_2011.parquet
Appending merged_2012.parquet
Appending merged_2013.parquet
Appending merged_2014.parquet
Appending merged_2015.parquet
Appending merged_2016.parquet
Appending merged_2017.parquet
Appending merged_2018.parquet
Appending merged_2019.parquet
Appending merged_2020.parquet
Appending merged_2021.parquet
Appending merged_2022.parquet
Appending merged_2023.parquet
Appending merged_2024.parquet
 Final merged file saved to ../data/processed/merged_loan_performance.parquet
