Load the 2007 Common Core of Data
===================================


In [27]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from functools import partial

import scipy

from IPython.display import Markdown as md, HTML
from nycschools import schools, geo, ui, class_size


Unnamed: 0,dbn,beds,district,geo_district,boro,school_name,short_name,ay,year,school_type,...,missing_race_ethnicity_data_pct,swd_n,swd_pct,ell_n,ell_pct,poverty_n,poverty_pct,eni_pct,clean_name,zip
162,02M033,310200010033,2,2,Manhattan,P.S. 033 Chelsea Prep,PS 33,2016,2016-17,community,...,0.0,87,0.14,20,0.032,286,0.46,0.415,chelsea prep,10001
163,02M033,310200010033,2,2,Manhattan,P.S. 033 Chelsea Prep,PS 33,2017,2017-18,community,...,0.0,101,0.155,29,0.045,313,0.482,0.499,chelsea prep,10001
164,02M033,310200010033,2,2,Manhattan,P.S. 033 Chelsea Prep,PS 33,2018,2018-19,community,...,0.0,107,0.169,38,0.06,312,0.494,0.494,chelsea prep,10001
165,02M033,310200010033,2,2,Manhattan,P.S. 033 Chelsea Prep,PS 33,2019,2019-20,community,...,0.0,99,0.162,45,0.074,291,0.475,0.496,chelsea prep,10001
166,02M033,310200010033,2,2,Manhattan,P.S. 033 Chelsea Prep,PS 33,2020,2020-21,community,...,0.0,87,0.157,34,0.061,274,0.494,0.472,chelsea prep,10001
797,02M570,310200011570,2,2,Manhattan,Satellite Academy High School,HS 570,2016,2016-17,community,...,0.0,53,0.189,15,0.054,217,0.775,0.703,satellite academy high school,10001
798,02M570,310200011570,2,2,Manhattan,Satellite Academy High School,HS 570,2017,2017-18,community,...,0.0,64,0.215,13,0.044,232,0.779,0.843,satellite academy high school,10001
799,02M570,310200011570,2,2,Manhattan,Satellite Academy High School,NA 570,2018,2018-19,community,...,0.0,80,0.281,21,0.074,216,0.758,0.822,satellite academy high school,10001
800,02M570,310200011570,2,2,Manhattan,Satellite Academy High School,HS 570,2019,2019-20,community,...,0.0,73,0.287,24,0.094,213,0.839,0.872,satellite academy high school,10001
801,02M570,310200011570,2,2,Manhattan,Satellite Academy High School,HS 570,2020,2020-21,community,...,0.0,53,0.233,24,0.106,194,0.855,0.889,satellite academy high school,10001


In [29]:
# load common core of data for 2007-2008 to match the data in the journal article
# description of columns: https://nces.ed.gov/ccd/pdf/psu071bgen.pdf
ccd = pd.read_csv("_data/ccod-2007.csv")
ccd.head()

print(list(ccd.columns))
[c for c in ccd.columns if "ZIP" in c]
ccd[ccd["MZIP07"] == "10001"]

ccd["zip"] = ccd["MZIP07"].apply(lambda x: f"{x:05}")
ccd["zip"].unique()


['NCESSCH', 'FIPST', 'LEAID', 'school_id', 'STID07', 'SEASCH07', 'LEANM07', 'school_name', 'phone', 'MSTREE07', 'MCITY07', 'MSTATE07', 'MZIP07', 'MZIP407', 'LSTREE07', 'city', 'state', 'zip', 'LZIP407', 'TYPE07', 'STATUS07', 'ULOCAL07', 'latitude', 'longitude', 'CONUM07', 'county', 'FTE07', 'CDCODE07', 'GSLO07', 'GSHI07', 'LEVEL07', 'TITLEI07', 'STITLI07', 'MAGNET07', 'CHARTR07', 'SHARED07', 'FRELCH07', 'REDLCH07', 'TOTFRL07', 'MIGRNT07', 'PK07', 'AMPKM07', 'AMPKF07', 'ASPKM07', 'ASPKF07', 'HIPKM07', 'HIPKF07', 'BLPKM07', 'BLPKF07', 'WHPKM07', 'WHPKF07', 'KG07', 'AMKGM07', 'AMKGF07', 'ASKGM07', 'ASKGF07', 'HIKGM07', 'HIKGF07', 'BLKGM07', 'BLKGF07', 'WHKGM07', 'WHKGF07', 'G0107', 'AM01M07', 'AM01F07', 'AS01M07', 'AS01F07', 'HI01M07', 'HI01F07', 'BL01M07', 'BL01F07', 'WH01M07', 'WH01F07', 'G0207', 'AM02M07', 'AM02F07', 'AS02M07', 'AS02F07', 'HI02M07', 'HI02F07', 'BL02M07', 'BL02F07', 'WH02M07', 'WH02F07', 'G0307', 'AM03M07', 'AM03F07', 'AS03M07', 'AS03F07', 'HI03M07', 'HI03F07', 'BL03M07

  ccd = pd.read_csv("_data/ccod-2007.csv")


array(['35220', '36016', '36057', ..., '00830', '00831', '00804'],
      dtype=object)

In [4]:
encoding = "latin-1"
cbsa = pd.read_csv("_data/zip07_cbsa06.csv", encoding=encoding)
cbsa.head()


  cbsa = pd.read_csv("_data/zip07_cbsa06.csv", encoding=encoding)


Unnamed: 0,ZIP5,ZIP4,ZIP9,STATE CODE,STATE,COUNTY CODE,COUNTY NAME,CBSA CODE,CBSA TITLE,CBSA LSAD,METRO DIVISION CODE,METRO DIVISION TITLE,METRO DIVISION LSAD,CSA CODE,CSA TITLE,CSA LSAD
0,4841,,4841,23,ME,13,Knox County,40500.0,"Rockland, ME",Micropolitan Statistical Area,,,,,,
1,4843,,4843,23,ME,13,Knox County,40500.0,"Rockland, ME",Micropolitan Statistical Area,,,,,,
2,4846,,4846,23,ME,13,Knox County,40500.0,"Rockland, ME",Micropolitan Statistical Area,,,,,,
3,4847,,4847,23,ME,13,Knox County,40500.0,"Rockland, ME",Micropolitan Statistical Area,,,,,,
4,4848,,4848,23,ME,27,Waldo County,,,,,,,,,


In [31]:
ny = cbsa[(cbsa["STATE"] == "NY") & (cbsa["COUNTY NAME"] == "New York County")]
ny[["ZIP5","COUNTY NAME","CBSA CODE"]].drop_duplicates().head()
ny["zip"] = ny["ZIP5"].apply(lambda x: f"{x:05}")
ny["zip"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ny["zip"] = ny["ZIP5"].apply(lambda x: f"{x:05}")


array(['10001', '10002', '10003', '10004', '10005', '10006', '10007',
       '10008', '10009', '10010', '10011', '10012', '10013', '10014',
       '10016', '10017', '10018', '10019', '10020', '10021', '10022',
       '10023', '10024', '10025', '10026', '10027', '10028', '10029',
       '10030', '10031', '10032', '10033', '10034', '10035', '10036',
       '10037', '10038', '10039', '10040', '10041', '10043', '10044',
       '10045', '10047', '10048', '10055', '10069', '10072', '10080',
       '10081', '10082', '10087', '10101', '10102', '10103', '10104',
       '10105', '10106', '10107', '10108', '10109', '10110', '10111',
       '10112', '10113', '10114', '10115', '10116', '10117', '10118',
       '10119', '10120', '10121', '10122', '10123', '10124', '10125',
       '10126', '10128', '10129', '10130', '10131', '10132', '10133',
       '10138', '10149', '10150', '10151', '10152', '10153', '10154',
       '10155', '10156', '10157', '10158', '10159', '10160', '10162',
       '10163', '101

In [35]:
ccod_ny_metro = ccd[ccd["zip"].isin(ny["zip"])]
ccod_ny_metro.count()

NCESSCH         354
FIPST           354
LEAID           354
school_id       354
STID07          354
               ... 
Unnamed: 224      0
Unnamed: 225      0
Unnamed: 226      0
Unnamed: 227      0
Unnamed: 228      0
Length: 229, dtype: int64

In [None]:

ccod_ny_metro = ccod_ny_metro[["school_id", "ASIAN07", "BLACK07", "HISP07", "WHITE07"]]
ccod_ny_metro.rename(columns={"ASIAN07": "asian_n", "BLACK07": "black_n", "HISP07": "hispanic_n", "WHITE07":"white_n"}, inplace=True)
ccod_ny_metro.to_csv("_data/ccod-2007-ny_metro.csv", index=False)
