# Schools and Education Pipeline

This notebook loads public school locations for Chicago and assigns each school to a Community Area using a spatial join. It then counts the number of schools per Community Area as an indicator of family and youth infrastructure. The resulting counts make it possible to compare family-oriented amenities against nightlife, retail, or fitness density in each area.


In [1]:
import pandas as pd
import geopandas as gpd

# -----------------------------------------------------------
# 1. Load data
# -----------------------------------------------------------

# Chicago Data Portal endpoints: school records (CSV) and
# Community Area boundaries (GeoJSON).
schools_url = "https://data.cityofchicago.org/resource/76dk-7ieb.csv"
ca_url = "https://data.cityofchicago.org/resource/igwz-8jzy.geojson"

schools = pd.read_csv(schools_url)

# Read the boundaries, put them in EPSG:4326, and normalise the key columns
# in one chain: integer area number and upper-cased, trimmed area name.
ca = (
    gpd.read_file(ca_url)
    .to_crs("EPSG:4326")
    .rename(columns={"area_numbe": "ca_num", "community": "ca_name"})
    .assign(
        ca_num=lambda areas: areas["ca_num"].astype(int),
        ca_name=lambda areas: areas["ca_name"].str.upper().str.strip(),
    )
)

Index(['school_id', 'legacy_unit_id', 'finance_id', 'short_name', 'long_name',
       'school_type', 'primary_category', 'is_high_school', 'is_middle_school',
       'is_elementary_school', 'is_pre_school', 'summary',
       'administrator_title', 'administrator', 'secondary_contact_title',
       'secondary_contact', 'address', 'city', 'state', 'zip', 'phone', 'fax',
       'school_website', 'website', 'facebook', 'twitter', 'youtube',
       'pinterest', 'attendance_boundaries', 'grades_offered_all',
       'grades_offered', 'student_count_total', 'student_count_low_income',
       'student_count_special_ed', 'student_count_english_learners',
       'student_count_black', 'student_count_hispanic', 'student_count_white',
       'student_count_asian', 'student_count_native_american',
       'student_count_other_ethnicity', 'student_count_asian_pacific_islander',
       'student_count_multi', 'student_count_hawaiian_pacific_islander',
       'student_count_ethnicity_not_available', 'sta

In [6]:
# -----------------------------------------------------------
# 2. Prepare school points
# -----------------------------------------------------------

# Rows without both coordinates cannot be turned into points, so drop them.
schools = schools.dropna(subset=["school_latitude", "school_longitude"])

# points_from_xy expects x (longitude) first, then y (latitude).
school_lon = schools["school_longitude"].astype(float)
school_lat = schools["school_latitude"].astype(float)
school_points = gpd.points_from_xy(school_lon, school_lat)

# Same CRS as the Community Area polygons so the spatial join lines up.
schools_gdf = gpd.GeoDataFrame(schools, geometry=school_points, crs="EPSG:4326")


In [7]:
# -----------------------------------------------------------
# 3. Spatial join to community areas
# -----------------------------------------------------------

# Only the identifying columns and the polygons are needed on the right side.
ca_polygons = ca[["ca_num", "ca_name", "geometry"]]

# Inner join: keep each school point that lies within a Community Area
# polygon, tagged with that area's number and name.
schools_join = schools_gdf.sjoin(ca_polygons, how="inner", predicate="within")

In [None]:
# -----------------------------------------------------------
# 4. Aggregate counts
# -----------------------------------------------------------

# One row per Community Area that received at least one school in the join.
school_counts = schools_join.groupby("ca_num").size().reset_index(name="school_count")

# Left-merge onto the full Community Area list so that areas with no matched
# schools are kept in the output.
school_df = ca[["ca_num", "ca_name"]].merge(school_counts, on="ca_num", how="left")

# The left merge leaves NaN for areas without schools, which upcasts the
# column to float. Fill with 0 and cast back to int so counts stay integers
# (written as "7" rather than "7.0") regardless of whether any area is empty.
school_df["school_count"] = school_df["school_count"].fillna(0).astype(int)

# Save output
school_df.to_csv("../datasets/schools_by_CA.csv", index=False)

school_df.head()

Unnamed: 0,ca_num,ca_name,school_count
0,1,ROGERS PARK,7
1,2,WEST RIDGE,10
2,3,UPTOWN,7
3,4,LINCOLN SQUARE,5
4,5,NORTH CENTER,7
