## College Scorecard Merge
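This notebook merges the yearly `MERGED20*_PP.csv` files from the College Scorecard dataset into a single table, tagging each row with the year label taken from its source file name.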

It also writes a summary file with each file's shape and columns.

In [13]:
import pandas as pd
from pathlib import Path

#### Define file paths

In [14]:
# Folder holding the raw Scorecard CSVs (relative to the current working directory).
data_dir = Path("../data/raw")

# Gather the matching yearly files, then put them in path order so every
# later cell processes them in the same sequence.
scorecard_files = list(data_dir.glob("MERGED20*_PP.csv"))
scorecard_files.sort()

# Destination of the plain-text per-file summary.
summary_file = Path("../data/processed/scorecard_summaries.txt")

#### Summarize each file

In [15]:
# Write one entry per input file (name, shape, column list) to summary_file.
# Create the output folder first so a fresh checkout does not fail on open().
summary_file.parent.mkdir(parents=True, exist_ok=True)

failed_files = []
with open(summary_file, "w", encoding="utf-8") as out:
    for file in scorecard_files:
        try:
            df = pd.read_csv(file, low_memory=False)
        except Exception as e:
            # Best effort: note the unreadable file in the summary and keep going.
            out.write(f"Failed to read {file.name}: {e}\n")
            failed_files.append(file.name)
            continue
        out.write(f"{file.name}\n")
        out.write(f"Shape: {df.shape}\n")
        out.write(f"Columns: {list(df.columns)}\n")
        out.write("-"*80 + "\n")

# Report only the files that were actually summarized; failures are listed separately.
print(f"Wrote summary for {len(scorecard_files) - len(failed_files)} files to {summary_file}")
if failed_files:
    print(f"Could not read {len(failed_files)} files: {failed_files}")

Wrote summary for 23 files to ..\data\processed\scorecard_summaries.txt


#### Merge all files

In [16]:
# Read each yearly file once, tag its rows with the year label taken from the
# file name, and combine everything with a single concat. Concatenating inside
# the loop would re-copy the growing frame on every iteration.
yearly_frames = []
for file in scorecard_files:
    try:
        df = pd.read_csv(file, low_memory=False)
    except Exception as e:
        # Best effort: report the unreadable file and keep merging the rest.
        print(f"Failed to merge {file.name}: {e}")
        continue
    # Strip the fixed prefix/suffix from the stem; a stem like
    # "MERGED2000_01_PP" would give "2000_01".
    df["Year"] = file.stem.replace("MERGED", "").replace("_PP", "")
    yearly_frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no file could be read (or none matched the glob).
if yearly_frames:
    merged_scorecard = pd.concat(yearly_frames, ignore_index=True)
else:
    merged_scorecard = pd.DataFrame()

# The per-file frames are no longer needed; drop the list so it does not keep
# a second copy of the data referenced.
del yearly_frames

if "UNITID" in merged_scorecard.columns:
    # Stable sort: rows sharing a UNITID keep the order in which their files were read.
    merged_scorecard = merged_scorecard.sort_values("UNITID", kind="stable").reset_index(drop=True)

print(f"Merged shape: {merged_scorecard.shape}")

Merged shape: (162482, 3306)


#### Save result

In [18]:
# Write the merged table to CSV without the positional index.
output_path = Path("../data/processed/merged_scorecard.csv")
# Create the output folder here so this cell does not depend on an earlier
# cell having created it.
output_path.parent.mkdir(parents=True, exist_ok=True)
merged_scorecard.to_csv(output_path, index=False)
print(f"Saved merged dataset to {output_path}")

Saved merged dataset to ..\data\processed\merged_scorecard.csv
