In [1]:
import pandas as pd
from pathlib import Path

In [5]:
# Locate the raw Scorecard extracts: every file matching MERGED20*_PP.csv
# inside ../data/raw, kept in ascending path order.
scorecard_dir = Path("../data/raw")
merged_files = list(scorecard_dir.glob("MERGED20*_PP.csv"))
merged_files.sort()

In [6]:
# Show the name of each MERGED file that the glob picked up, one per line
print("Found MERGED files:")
for f in merged_files:
    print(f" - {f.name}")

Found MERGED files:
 - MERGED2000_01_PP.csv
 - MERGED2001_02_PP.csv
 - MERGED2002_03_PP.csv
 - MERGED2003_04_PP.csv
 - MERGED2004_05_PP.csv
 - MERGED2005_06_PP.csv
 - MERGED2006_07_PP.csv
 - MERGED2007_08_PP.csv
 - MERGED2008_09_PP.csv
 - MERGED2009_10_PP.csv
 - MERGED2010_11_PP.csv
 - MERGED2011_12_PP.csv
 - MERGED2012_13_PP.csv
 - MERGED2013_14_PP.csv
 - MERGED2014_15_PP.csv
 - MERGED2015_16_PP.csv
 - MERGED2016_17_PP.csv
 - MERGED2017_18_PP.csv
 - MERGED2018_19_PP.csv
 - MERGED2019_20_PP.csv
 - MERGED2020_21_PP.csv
 - MERGED2021_22_PP.csv
 - MERGED2022_23_PP.csv


In [7]:
# Empty dictionary, presumably meant to hold the loaded Scorecard dataframes
# (the earlier comment said "crosswalk", which looks like a copy-paste slip).
# NOTE(review): none of the visible cells read or populate it — confirm it is
# still needed.
scorecard_data = {}

In [9]:
# Output a summary of each file (shape, first 20 column names, first rows)
# to a single text file.
output_file = Path("../data/processed/scorecard_summaries.txt")
# Create the target directory if it is missing; open(..., "w") does not create
# parent directories and would raise FileNotFoundError on a fresh checkout.
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as out:
    for file in merged_files:
        # Build the whole record in memory first so that a failure part-way
        # through never leaves a half-written summary in the output file.
        try:
            df = pd.read_csv(file, low_memory=False)
            summary = "".join(
                [
                    f"\n{file.name} - Shape: {df.shape}\n",
                    "Columns:\n",
                    # First 20 columns only; str() guards against non-string labels
                    ", ".join(map(str, df.columns[:20])),
                    "\n\nHead:\n",
                    df.head().to_string(index=False),
                    "\n" + "-" * 80 + "\n",
                ]
            )
        except Exception as e:
            # Best effort: record the failure and carry on with the next file
            summary = f"Failed to load {file.name}: {e}\n"
        out.write(summary)

print(f"Scorecard summaries written to: {output_file}")

Scorecard summaries written to: ..\data\processed\scorecard_summaries.txt


In [12]:
# Merge all scorecard files into one dataframe
def merge_scorecard_files(file_list):
    """Read every CSV in ``file_list`` and stack the rows into one DataFrame.

    Each file is read with ``pd.read_csv(..., low_memory=False)`` and gets a
    ``Year`` column holding the file stem with the ``MERGED`` and ``_PP``
    parts removed (e.g. ``MERGED2000_01_PP.csv`` -> ``"2000_01"``).

    Parameters
    ----------
    file_list : iterable of pathlib.Path
        Paths of the CSV files to merge; ``.stem`` and ``.name`` are used.

    Returns
    -------
    pandas.DataFrame
        All successfully loaded files concatenated with a fresh integer index.
        If a ``UNITID`` column is present the rows are sorted by it, and rows
        sharing a ``UNITID`` keep the order of ``file_list``. An empty
        DataFrame is returned when no file could be loaded.

    Notes
    -----
    A file that fails to load is reported with ``print`` and skipped; the
    exception is not re-raised.
    """
    frames = []
    for file in file_list:
        try:
            df = pd.read_csv(file, low_memory=False)
            year = file.stem.replace("MERGED", "").replace("_PP", "")
            df["Year"] = year
            frames.append(df)
        except Exception as e:
            print(f"Failed to merge {file.name}: {e}")

    # pd.concat raises on an empty list, so handle "nothing loaded" explicitly
    if not frames:
        return pd.DataFrame()

    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously loaded rows on every iteration.
    merged_df = pd.concat(frames, ignore_index=True)

    if "UNITID" in merged_df.columns:
        # Stable sort so rows with equal UNITID stay in file (input) order
        merged_df = merged_df.sort_values(by="UNITID", kind="mergesort").reset_index(drop=True)
    return merged_df


# Build the combined dataset from every MERGED file and report its size
merged_scorecard = merge_scorecard_files(merged_files)
print(f"Merged dataset shape: {merged_scorecard.shape}")

# Save the merged dataset into the processed directory. The directory is
# created first because to_csv does not create missing parent directories.
merged_scorecard_path = Path("../data/processed/merged_scorecard.csv")
merged_scorecard_path.parent.mkdir(parents=True, exist_ok=True)
merged_scorecard.to_csv(merged_scorecard_path, index=False)
print("Merged Scorecard dataset saved to: data/processed/merged_scorecard.csv")

Merged dataset shape: (162482, 3306)
Merged Scorecard dataset saved to: data/processed/merged_scorecard.csv
