# Econometrics – Data Load

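This notebook loads the replication data from **112446-V1** (student test data plus student and teacher presence/attendance data). The data are Stata `.dta` files, which we read with pandas. It also inspects the additional datasets under `data/` described in the next section.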

In [1]:
import pandas as pd
from pathlib import Path

# Folder holding the 112446-V1 replication files. The path is relative, so it
# is resolved against the current working directory; the original note reads
# "repo root (notebook lives in Econometrics/)" -- TODO confirm which directory
# the kernel is expected to start in.
BASE = Path("data") / "supplemental" / "112446-V1" / "data"
# Fail fast, reporting the absolute path so a wrong working directory is obvious.
assert BASE.exists(), f"Data path not found: {BASE.absolute()}"

## New datasets in `data/`

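The next two cells inventory **`data/`** and then load and inspect two of the datasets in it: STAR public-use (116327-V1) and the Dataverse MRC tables (.dta / .csv). Run them to see the shape, columns, and a preview of each; dtypes are printed for STAR only, and only the first MRC table (by file name) is loaded.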

In [2]:
# Root of all datasets, plus a one-line-per-entry inventory of what it holds.
DATA_ROOT = Path("data")
assert DATA_ROOT.exists(), f"Data path not found: {DATA_ROOT.absolute()}"

print("Contents of data/:")
# Hidden entries (names starting with ".") are left out of the listing.
visible_entries = sorted(
    entry for entry in DATA_ROOT.iterdir() if not entry.name.startswith(".")
)
for entry in visible_entries:
    if not entry.is_dir():
        # Plain files are listed by name only.
        print(f"  {entry.name}")
        continue
    # rglob searches recursively, so the counts include nested sub-folders.
    n_dta = sum(1 for _ in entry.rglob("*.dta"))
    n_csv = sum(1 for _ in entry.rglob("*.csv"))
    print(f"  {entry.name}/  -> {n_dta} .dta, {n_csv} .csv")

Contents of data/:
  116327-V1/  -> 1 .dta, 0 .csv
  dataverse_files/  -> 50 .dta, 20 .csv
  supplemental/  -> 40 .dta, 28 .csv


In [3]:
# --- 1. STAR public-use (116327-V1)
# Read the single STAR Stata file and print a structural summary. A missing
# file is reported with a message instead of raising, so section 2 still runs.
star_path = DATA_ROOT / "116327-V1/STAR_extracted/STARdatapost/STAR_public_use.dta"
if star_path.exists():
    star = pd.read_stata(star_path)
    print("=== STAR_public_use.dta (116327-V1) ===")
    print("Shape:", star.shape)
    print("Columns:", list(star.columns))
    print("\nDtypes:\n", star.dtypes)
    print("\nFirst rows:")
    display(star.head())
else:
    print("STAR file not found:", star_path)

# --- 2. Dataverse MRC tables (dataverse_files)
mrc_dir = DATA_ROOT / "dataverse_files"
if mrc_dir.exists():
    # Only the top level of dataverse_files is searched (glob, not rglob).
    # Ordering is by plain file name, so "mrc_table10" sorts before "mrc_table2".
    mrc_dta = sorted(mrc_dir.glob("mrc_table*.dta"), key=lambda p: p.name)
    mrc_csv = sorted(mrc_dir.glob("mrc_table*.csv"), key=lambda p: p.name)
    print("\n=== Dataverse MRC tables ===")
    print("Available .dta:", [p.name for p in mrc_dta[:5]], "..." if len(mrc_dta) > 5 else "")
    print("Available .csv:", [p.name for p in mrc_csv[:5]], "..." if len(mrc_csv) > 5 else "")

    # Pick the first table to inspect: prefer .dta, fall back to .csv.
    if mrc_dta:
        mrc1_path, mrc1_reader = mrc_dta[0], pd.read_stata
    elif mrc_csv:
        mrc1_path, mrc1_reader = mrc_csv[0], pd.read_csv
    else:
        mrc1_path = mrc1_reader = None

    if mrc1_path is not None:
        mrc1 = mrc1_reader(mrc1_path)
        print(f"\n--- {mrc1_path.name} ---")
        print("Shape:", mrc1.shape)
        print("Columns:", list(mrc1.columns))
        display(mrc1.head())
    else:
        # Previously this case printed nothing and left `mrc1` undefined;
        # say so explicitly, matching the other "not found" messages.
        print("No mrc_table*.dta or mrc_table*.csv files found in:", mrc_dir)
else:
    print("dataverse_files not found:", mrc_dir)

=== STAR_public_use.dta (116327-V1) ===
Shape: (1656, 48)
Columns: ['GPA_year1', 'GPA_year2', 'age', 'chooseUTM', 'compsurv', 'control', 'credits_earned1', 'credits_earned2', 'dad1', 'dad2', 'dad_edn', 'english', 'female', 'finish4', 'goodstanding_year1', 'goodstanding_year2', 'gpa0', 'graddeg', 'grade_20059_fall', 'hcom', 'hsgroup', 'lastmin', 'lm_never', 'lm_rarely', 'mathsci', 'mom1', 'mom2', 'mom_edn', 'mtongue', 'noshow', 'numcourses_nov1', 'prob_year1', 'prob_year2', 'sex', 'sfp', 'sfp_p', 'sfpany', 'sfpany_p', 'sfsp', 'sfsp_p', 'signup', 'ssp', 'ssp_p', 'totcredits_year1', 'used_adv', 'used_fsg', 'used_ssp', 'work1']

Dtypes:
 GPA_year1              float32
GPA_year2              float32
age                       int8
chooseUTM              float32
compsurv               float32
control                   int8
credits_earned1        float32
credits_earned2        float32
dad1                   float32
dad2                   float32
dad_edn               category
english          

Unnamed: 0,GPA_year1,GPA_year2,age,chooseUTM,compsurv,control,credits_earned1,credits_earned2,dad1,dad2,...,sfsp,sfsp_p,signup,ssp,ssp_p,totcredits_year1,used_adv,used_fsg,used_ssp,work1
0,2.58,3.49,18,0.0,1.0,1,2.5,3.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
1,3.55,2.96,17,0.0,1.0,0,3.5,3.5,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
2,3.75,3.31,19,0.0,1.0,1,3.0,4.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0
3,2.37,2.62,18,0.0,1.0,1,3.5,2.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0
4,1.68,2.47,19,0.0,1.0,1,2.5,3.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0



=== Dataverse MRC tables ===
Available .dta: ['mrc_table1-2.dta', 'mrc_table10.dta', 'mrc_table11.dta', 'mrc_table12.dta', 'mrc_table13.dta'] ...
Available .csv: ['mrc_table1.csv', 'mrc_table10.csv', 'mrc_table11.csv', 'mrc_table12.csv', 'mrc_table13.csv'] ...

--- mrc_table1-2.dta ---
Shape: (2202, 15)
Columns: ['super_opeid', 'name', 'czname', 'state', 'par_median', 'k_median', 'par_q1', 'par_top1pc', 'kq5_cond_parq1', 'ktop1pc_cond_parq1', 'mr_kq5_pq1', 'mr_ktop1_pq1', 'trend_parq1', 'trend_bottom40', 'count']


Unnamed: 0,super_opeid,name,czname,state,par_median,k_median,par_q1,par_top1pc,kq5_cond_parq1,ktop1pc_cond_parq1,mr_kq5_pq1,mr_ktop1_pq1,trend_parq1,trend_bottom40,count
0,2665.0,Vaughn College Of Aeronautics And Technology,New York,NY,30900.0,53000.0,0.364779,0.001198,0.448435,0.017666,0.16358,0.006444,-0.079988,-0.057506,207.666667
1,7273.0,CUNY Bernard M. Baruch College,New York,NY,42800.0,57600.0,0.276322,0.005592,0.468242,0.025568,0.129386,0.007065,-0.091865,-0.122972,1083.0
2,2688.0,City College Of New York - CUNY,New York,NY,35500.0,48500.0,0.325465,0.002335,0.360216,0.014087,0.117237,0.004585,-0.098016,-0.138794,582.333333
3,7022.0,CUNY Lehman College,New York,NY,32500.0,40700.0,0.367075,0.0,0.27883,0.001896,0.102351,0.000696,-0.05734,-0.090723,468.333333
4,1140.0,"California State University, Los Angeles",Los Angeles,CA,36600.0,43000.0,0.331169,0.00156,0.299498,0.000836,0.099185,0.000277,-0.133136,-0.149198,1179.666667


In [4]:
# Read the three 112446-V1 Stata files that live directly under BASE
# (data/supplemental/112446-V1/data).
student_test = pd.read_stata(BASE.joinpath("student_test_data.dta"))
student_pres = pd.read_stata(BASE.joinpath("student_pres_data.dta"))
teacher_pres = pd.read_stata(BASE.joinpath("teacher_pres_data.dta"))

# Report each frame's (rows, columns) as a quick sanity check on the load.
print(
    "Loaded:",
    f"  student_test_data: {student_test.shape}",
    f"  student_pres_data: {student_pres.shape}",
    f"  teacher_pres_data: {teacher_pres.shape}",
    sep="\n",
)

Loaded:
  student_test_data: (7022, 106)
  student_pres_data: (97100, 10)
  teacher_pres_data: (2484, 12)


In [5]:
# Peek at the student test data (noted as the main analysis file per
# analysis.do): only the first 20 columns and the first 10 rows are shown.
display_cols = student_test.columns[:20].tolist()
student_test.loc[:, display_cols].head(10)

Unnamed: 0,pupilid,schoolid,district,bungoma,division,zone,tracking,sbm,girl,agetest,etpteacher,lowstream,stream_meanpercentile,SDstream_std_mark,MEANstream_std_mark,bottomhalf,tophalf,bottomquarter,secondquarter,thirdquarter
0,4301001,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
1,4301002,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,1.0,12.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
2,4301003,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,1.0,8.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
3,4301004,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,14.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
4,4301005,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,11.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
5,4301007,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,10.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
6,4301009,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,10.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
7,4301010,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,1.0,9.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
8,4301011,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,10.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0
9,4301012,430.0,BUNGOMA,1.0,KANDUYI,MUNICIPALITY,1.0,1.0,0.0,10.0,0.0,1.0,25.115055,0.669245,-0.81929,1.0,0.0,1.0,0.0,0.0


In [6]:
# For each 112446-V1 frame, list the dtypes of its first 15 columns under a
# banner line, followed by a blank line as a separator.
loaded_frames = {
    "student_test": student_test,
    "student_pres": student_pres,
    "teacher_pres": teacher_pres,
}
for name, df in loaded_frames.items():
    print(f"=== {name} ===", df.dtypes.iloc[:15], "", sep="\n")

=== student_test ===
pupilid                    int32
schoolid                 float32
district                     str
bungoma                  float32
division                     str
zone                         str
tracking                 float32
sbm                      float32
girl                     float32
agetest                  float32
etpteacher               float32
lowstream                float32
stream_meanpercentile    float32
SDstream_std_mark        float32
MEANstream_std_mark      float32
dtype: object

=== student_pres ===
schoolid      float32
tracking      float32
bungoma       float32
girl          float32
pupilid         int32
etpteacher    float32
visit             str
realdate      float32
pres          float64
bottomhalf    float32
dtype: object

=== teacher_pres ===
teacherid           float64
schoolid            float32
tracking            float32
bungoma             float32
etpteacher          float32
lowstream           float32
yrstaught           floa