# 🗺️ Geographic Analysis of NYC Schools

**Objective:** Map NYC schools by performance and identify geographic patterns

**Created:** October 6, 2025

## What We'll Do:
1. Load school data with SAT scores
2. Fetch geographic coordinates for schools
3. Create interactive maps with Folium
4. Analyze geographic patterns in school performance
5. Create a performance heat map and a borough-clustered marker map

In [None]:
# Imports
import sys
from pathlib import Path

import folium
from folium import plugins
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Add project root to path
# PROJECT_ROOT is the parent of the current working directory, so this
# presumably expects the notebook to be started from a direct subfolder of
# the project — TODO confirm against the repository layout.
PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

# Project-local helpers; placed after the sys.path change above, presumably so
# that the `scripts` package can be resolved from PROJECT_ROOT.
from scripts.utils import load_csv, save_csv

# Configuration
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook.
%matplotlib inline
# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')

print("✅ Imports successful!")
print(f"📁 Project root: {PROJECT_ROOT}")

## 1. Load Data

In [None]:
# Load the analyzed education data.
# The pre-built file is preferred; when it is missing the table is rebuilt
# from the two raw extracts below.
try:
    df = load_csv("nyc_education_analyzed.csv", subfolder="processed")
except FileNotFoundError:
    # Load and merge raw data
    sat = load_csv("zt9s-n5aj_20251006_114228.csv")
    demo = load_csv("s3k6-pzi2_20251006_114239.csv")

    # Convert scores: anything that does not parse as a number becomes NaN
    section_columns = ('mathematics_mean', 'critical_reading_mean', 'writing_mean')
    for col in (*section_columns, 'number_of_test_takers'):
        sat[col] = pd.to_numeric(sat[col], errors='coerce')

    # Total score is the plain sum of the three section means
    # (NaN as soon as one section is missing).
    math_mean, reading_mean, writing_mean = (sat[name] for name in section_columns)
    sat['total_score'] = math_mean + reading_mean + writing_mean

    # Merge: keep every SAT row; clashing columns from the second file get a
    # "_demo" suffix
    df = pd.merge(sat, demo, how='left', on='dbn', suffixes=('', '_demo'))

    # Map boroughs: one-letter code -> display name
    borough_names = {
        'M': 'Manhattan',
        'X': 'Bronx',
        'K': 'Brooklyn',
        'Q': 'Queens',
        'R': 'Staten Island',
    }
    if 'boro' in df.columns:
        df['borough_name'] = df['boro'].map(borough_names)

    print(f"✅ Loaded and merged data: {len(df)} schools")
else:
    print(f"✅ Loaded processed data: {len(df)} schools")

df.head()

## 2. Add Geographic Coordinates

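NYC Open Data includes location information for schools. We use the `latitude`/`longitude` columns when they are present; otherwise we fall back to approximate borough centroids with a small random jitter so the points do not overlap.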

In [None]:
# Check if we have location data.
# The substring match is intentionally loose, so this list is only a
# diagnostic and can include unrelated columns whose names contain "lat"/"lon".
location_cols = [
    col for col in df.columns
    if any(token in col.lower() for token in ('lat', 'lon', 'location'))
]
print(f"Location columns found: {location_cols}")

# Show a sample of the raw location field, if present (it is not parsed here)
if 'location_1' in df.columns:
    print("\nSample location data:")
    print(df['location_1'].head())

# Check for latitude/longitude columns
if 'latitude' in df.columns and 'longitude' in df.columns:
    df['lat'] = pd.to_numeric(df['latitude'], errors='coerce')
    df['lon'] = pd.to_numeric(df['longitude'], errors='coerce')
    print(f"\n✅ Found lat/lon columns: {df['lat'].notna().sum()} schools with coordinates")
else:
    print("\n⚠️ No direct lat/lon columns found. Will use borough centroids for now.")
    # Borough centroids (approximate)
    borough_coords = {
        'Manhattan': (40.7831, -73.9712),
        'Bronx': (40.8448, -73.8648),
        'Brooklyn': (40.6782, -73.9442),
        'Queens': (40.7282, -73.7949),
        'Staten Island': (40.5795, -74.1502)
    }
    # Used for a non-null borough name that is not in the table above
    fallback_center = (40.7, -74.0)

    if 'borough_name' in df.columns:
        # Schools without a borough get NaN and are filtered out before mapping
        df['lat'] = df['borough_name'].map(
            lambda b: borough_coords.get(b, fallback_center)[0] if pd.notna(b) else np.nan
        )
        df['lon'] = df['borough_name'].map(
            lambda b: borough_coords.get(b, fallback_center)[1] if pd.notna(b) else np.nan
        )

        # Add some random jitter so schools in one borough do not overlap;
        # the fixed seed keeps the layout identical between runs
        np.random.seed(42)
        df['lat'] = df['lat'] + np.random.normal(0, 0.02, len(df))
        df['lon'] = df['lon'] + np.random.normal(0, 0.02, len(df))

        print(f"✅ Using borough centroids with jitter: {df['lat'].notna().sum()} schools")
    else:
        # Always create the columns so the mapping cells can filter on them
        # instead of failing with a KeyError.
        df['lat'] = np.nan
        df['lon'] = np.nan
        print("⚠️ No borough information available either: no school can be placed on the maps.")

## 3. Create Performance Categories

In [None]:
# Categorize schools by performance
def categorize_performance(score):
    """Return the performance band label for a combined SAT score.

    Missing scores (anything ``pd.isna`` reports as null) are labelled
    'Unknown'. Otherwise the score falls into the first band whose lower
    bound it reaches, checked from the highest band down.
    """
    if pd.isna(score):
        return 'Unknown'

    # (inclusive lower bound, label), highest band first
    bands = (
        (1500, 'High (1500+)'),
        (1200, 'Above Average (1200-1499)'),
        (1000, 'Average (1000-1199)'),
    )
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Below Average (<1000)'

# Label every school with its performance band
df['performance_category'] = df['total_score'].map(categorize_performance)

# Marker colour for each band (listed from best to worst, then 'Unknown')
performance_colors = dict([
    ('High (1500+)', 'darkgreen'),
    ('Above Average (1200-1499)', 'green'),
    ('Average (1000-1199)', 'orange'),
    ('Below Average (<1000)', 'red'),
    ('Unknown', 'gray'),
])

df['color'] = df['performance_category'].map(performance_colors)

# Number of schools in each band
category_counts = df['performance_category'].value_counts()
print("Performance Distribution:")
print(category_counts)

## 4. Create Interactive Map with Folium

In [None]:
from html import escape

# Filter out schools without coordinates
df_map = df[df['lat'].notna() & df['lon'].notna()].copy()

print(f"📍 Mapping {len(df_map)} schools with coordinates")

# Create base map centered on NYC
nyc_center = [40.7128, -73.9960]
m = folium.Map(
    location=nyc_center,
    zoom_start=11,
    tiles='OpenStreetMap'
)


def _fmt_number(value):
    """Format a numeric cell without decimals, or 'N/A' when it is missing."""
    return 'N/A' if pd.isna(value) else f"{value:.0f}"


# Add schools to map
for _, school in df_map.iterrows():
    # Names come straight from the data files and are inserted into HTML, so
    # escape them (e.g. "&" or "<" inside a school name).
    school_name = escape(str(school['school_name']))
    borough = school.get('borough_name')
    borough_label = 'N/A' if pd.isna(borough) else escape(str(borough))
    total_label = _fmt_number(school['total_score'])

    # Create popup with school information
    popup_html = f"""
    <div style="width: 200px">
        <h4>{school_name}</h4>
        <b>Total SAT:</b> {total_label}<br>
        <b>Math:</b> {_fmt_number(school['mathematics_mean'])}<br>
        <b>Reading:</b> {_fmt_number(school['critical_reading_mean'])}<br>
        <b>Writing:</b> {_fmt_number(school['writing_mean'])}<br>
        <b>Test Takers:</b> {_fmt_number(school['number_of_test_takers'])}<br>
        <b>Borough:</b> {borough_label}<br>
        <b>Category:</b> {escape(str(school['performance_category']))}
    </div>
    """

    folium.CircleMarker(
        location=[school['lat'], school['lon']],
        radius=5,
        popup=folium.Popup(popup_html, max_width=300),
        tooltip=f"{school_name}: {total_label}",
        color=school['color'],
        fill=True,
        fillColor=school['color'],
        fillOpacity=0.7,
        weight=2
    ).add_to(m)

# Add a legend. It lists every band used for the markers, including the gray
# "Unknown" one; the box has no fixed height so all entries fit.
legend_html = '''
<div style="position: fixed; 
            bottom: 50px; right: 50px; width: 200px; 
            background-color: white; border:2px solid grey; z-index:9999; 
            font-size:14px; padding: 10px">
<h4 style="margin-top:0">Performance Legend</h4>
<p><span style="color:darkgreen">⬤</span> High (1500+)</p>
<p><span style="color:green">⬤</span> Above Avg (1200-1499)</p>
<p><span style="color:orange">⬤</span> Average (1000-1199)</p>
<p><span style="color:red">⬤</span> Below Avg (&lt;1000)</p>
<p><span style="color:gray">⬤</span> Unknown</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Add layer control
folium.LayerControl().add_to(m)

# Save map (create the output folder first so a fresh checkout does not fail)
map_path = PROJECT_ROOT / 'data' / 'processed' / 'nyc_schools_map.html'
map_path.parent.mkdir(parents=True, exist_ok=True)
m.save(str(map_path))

print(f"\n✅ Map saved to: {map_path}")
print("\n💡 Open the HTML file in your browser to view the interactive map!")

# Display in notebook
m

## 5. Create Heat Map of School Performance

In [None]:
# Create a heat map based on SAT scores
heat_map = folium.Map(
    location=nyc_center,
    zoom_start=11,
    tiles='OpenStreetMap'
)

# Prepare data for heat map (lat, lon, weight=score).
# Only schools with a score contribute; min and range are computed once
# rather than once per school.
scored_schools = df_map[df_map['total_score'].notna()]
score_min = scored_schools['total_score'].min()
score_range = scored_schools['total_score'].max() - score_min

if score_range > 0:
    # Normalize score to 0-1 range for heat map
    weights = (scored_schools['total_score'] - score_min) / score_range
else:
    # All scores identical (or no scores): avoid dividing by zero and give
    # every school the same weight.
    weights = pd.Series(1.0, index=scored_schools.index)

heat_data = [
    [lat, lon, weight]
    for lat, lon, weight in zip(
        scored_schools['lat'].tolist(),
        scored_schools['lon'].tolist(),
        weights.tolist(),
    )
]

# Add heat map layer
if heat_data:
    plugins.HeatMap(
        heat_data,
        min_opacity=0.3,
        max_zoom=13,
        radius=15,
        blur=20,
        gradient={
            '0.0': 'red',
            '0.5': 'yellow',
            '1.0': 'green'
        }
    ).add_to(heat_map)
else:
    print("⚠️ No schools with both coordinates and a score: the heat map has no data layer.")

# Save heat map (create the output folder first so a fresh checkout does not fail)
heat_map_path = PROJECT_ROOT / 'data' / 'processed' / 'nyc_schools_heatmap.html'
heat_map_path.parent.mkdir(parents=True, exist_ok=True)
heat_map.save(str(heat_map_path))

print(f"✅ Heat map saved to: {heat_map_path}")

# Display in notebook
heat_map

## 6. Borough-Level Clustering Map

In [None]:
from html import escape

# Create map with marker clusters by borough
cluster_map = folium.Map(
    location=nyc_center,
    zoom_start=11,
    tiles='CartoDB positron'
)

# City-wide median of the mapped schools, computed once; markers are coloured
# relative to it.
median_score = df_map['total_score'].median()

# Create separate marker clusters for each borough.
# groupby skips schools without a borough; sort=False keeps the boroughs in
# order of first appearance.
if 'borough_name' in df_map.columns:
    for borough, borough_schools in df_map.groupby('borough_name', sort=False):
        borough_label = escape(str(borough))

        # Create a marker cluster for this borough
        marker_cluster = plugins.MarkerCluster(name=borough).add_to(cluster_map)

        for _, school in borough_schools.iterrows():
            # Names come from the data files and are inserted into HTML
            school_name = escape(str(school['school_name']))
            score = school['total_score']

            if pd.isna(score):
                # A missing score is not "below the median": mark it neutrally
                score_label = 'N/A'
                icon_color = 'gray'
            else:
                score_label = f"{score:.0f}"
                icon_color = 'blue' if score > median_score else 'red'

            popup_html = f"""
            <div style="width: 200px">
                <h4>{school_name}</h4>
                <b>Borough:</b> {borough_label}<br>
                <b>Total SAT:</b> {score_label}<br>
                <b>Category:</b> {escape(str(school['performance_category']))}
            </div>
            """

            folium.Marker(
                location=[school['lat'], school['lon']],
                popup=folium.Popup(popup_html, max_width=300),
                tooltip=school_name,
                icon=folium.Icon(color=icon_color, icon='info-sign')
            ).add_to(marker_cluster)

# Add layer control (one toggle per borough cluster)
folium.LayerControl().add_to(cluster_map)

# Save clustered map (create the output folder first so a fresh checkout does not fail)
cluster_map_path = PROJECT_ROOT / 'data' / 'processed' / 'nyc_schools_clusters.html'
cluster_map_path.parent.mkdir(parents=True, exist_ok=True)
cluster_map.save(str(cluster_map_path))

print(f"✅ Cluster map saved to: {cluster_map_path}")

# Display in notebook
cluster_map

## 7. Geographic Patterns Analysis

In [None]:
# Analyze geographic patterns: per-borough summary table plus a 2x2 chart
# panel that is also written to disk.
if 'borough_name' in df.columns:
    print("🗽 Borough Performance Summary\n" + "="*50)

    borough_stats = df.groupby('borough_name').agg({
        'school_name': 'count',
        'total_score': ['mean', 'median', 'min', 'max', 'std'],
        'number_of_test_takers': 'sum'
    }).round(0)

    # Flatten the two-level column index produced by agg()
    borough_stats.columns = ['Schools', 'Avg Score', 'Median Score', 'Min Score', 'Max Score', 'Std Dev', 'Total Students']
    borough_stats = borough_stats.sort_values('Avg Score', ascending=False)

    print(borough_stats)
    print("\n")

    # Visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Average scores by borough
    borough_avg = df.groupby('borough_name')['total_score'].mean().sort_values(ascending=False)
    axes[0, 0].barh(borough_avg.index, borough_avg.values, color='steelblue')
    axes[0, 0].set_xlabel('Average SAT Score')
    axes[0, 0].set_title('Average SAT Score by Borough')
    axes[0, 0].axvline(df['total_score'].mean(), color='red', linestyle='--', label='NYC Average')
    axes[0, 0].legend()

    # School count by borough
    borough_count = df['borough_name'].value_counts()
    axes[0, 1].bar(borough_count.index, borough_count.values, color='coral')
    axes[0, 1].set_ylabel('Number of Schools')
    axes[0, 1].set_title('Schools by Borough')
    axes[0, 1].tick_params(axis='x', rotation=45)

    # Box plot of scores by borough
    df.boxplot(column='total_score', by='borough_name', ax=axes[1, 0])
    # boxplot(by=...) adds a "Boxplot grouped by ..." title to the whole
    # figure; remove it so it does not sit above the other three panels.
    fig.suptitle('')
    axes[1, 0].set_xlabel('Borough')
    axes[1, 0].set_ylabel('Total SAT Score')
    axes[1, 0].set_title('Score Distribution by Borough')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Performance category distribution by borough (row percentages)
    perf_by_borough = pd.crosstab(df['borough_name'], df['performance_category'], normalize='index') * 100
    # crosstab sorts the categories alphabetically, so colours must be looked
    # up by category name instead of being passed positionally. Columns follow
    # the order of performance_colors; any unexpected category is kept and
    # drawn gray.
    known_categories = [c for c in performance_colors if c in perf_by_borough.columns]
    other_categories = [c for c in perf_by_borough.columns if c not in performance_colors]
    perf_by_borough = perf_by_borough[known_categories + other_categories]
    category_palette = [performance_colors.get(c, 'gray') for c in perf_by_borough.columns]
    perf_by_borough.plot(kind='bar', stacked=True, ax=axes[1, 1],
                         color=category_palette)
    axes[1, 1].set_ylabel('Percentage')
    axes[1, 1].set_title('Performance Category Distribution by Borough')
    axes[1, 1].tick_params(axis='x', rotation=45)
    axes[1, 1].legend(title='Category', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    # Create the output folder first so a fresh checkout does not fail
    chart_path = PROJECT_ROOT / 'data' / 'processed' / 'borough_geographic_analysis.png'
    chart_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(chart_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n✅ Analysis complete! Visualization saved.")

## 8. Key Geographic Insights

In [None]:
# Print headline findings per borough, then list the files produced by the
# earlier cells.
print("🔍 KEY GEOGRAPHIC INSIGHTS\n" + "="*50 + "\n")

if 'borough_name' in df.columns:
    scores_by_borough = df.groupby('borough_name')['total_score']

    # Best performing borough (boroughs with no scored school are ignored)
    borough_means = scores_by_borough.mean().dropna()
    if not borough_means.empty:
        best_borough = borough_means.idxmax()
        best_score = borough_means.max()
        print(f"🏆 Highest performing borough: {best_borough} ({best_score:.0f} avg SAT)")

    # Most schools
    borough_counts = df['borough_name'].value_counts()
    if not borough_counts.empty:
        most_schools_borough = borough_counts.idxmax()
        school_count = borough_counts.max()
        print(f"📚 Most schools: {most_schools_borough} ({school_count} schools)")

    # Highest variability (std is NaN for a borough with fewer than two scores)
    borough_stds = scores_by_borough.std().dropna()
    if not borough_stds.empty:
        highest_var = borough_stds.idxmax()
        var_value = borough_stds.max()
        print(f"📊 Highest variability: {highest_var} (std: {var_value:.0f})")

    # Best school in each borough.
    # groupby skips schools without a borough; sort=False keeps the boroughs
    # in order of first appearance.
    print("\n🌟 Top School by Borough:")
    for borough, borough_df in df.groupby('borough_name', sort=False):
        borough_scores = borough_df['total_score'].dropna()
        if borough_scores.empty:
            # idxmax() cannot pick a row when no school in the borough has a score
            continue
        top_school = borough_df.loc[borough_scores.idxmax()]
        print(f"  • {borough}: {top_school['school_name']} ({top_school['total_score']:.0f})")

print("\n" + "="*50)
print("\n✅ Geographic analysis complete!")
print("\n📁 Generated files:")
print("  • nyc_schools_map.html - Interactive school map")
print("  • nyc_schools_heatmap.html - Performance heat map")
print("  • nyc_schools_clusters.html - Clustered borough view")
print("  • borough_geographic_analysis.png - Analysis charts")

## 9. Next Steps

### What You Can Do Next:

1. **Enhance Maps**: Add more data layers (demographics, programs, etc.)
2. **Spatial Analysis**: Calculate geographic clusters and patterns
3. **Distance Analysis**: Measure distance to high-performing schools
4. **Neighborhood Analysis**: Compare school performance by neighborhood
5. **Transit Access**: Overlay with subway/bus data

### Additional Data Sources:
- NYC Borough boundaries (GeoJSON)
- Neighborhood tabulation areas
- Census tract data
- Transit route information