In [1]:
import requests
import pandas as pd
import zipfile
import io
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException

In [None]:
urls = [
    # from 2015 to 2018, there are 5 files in common core, and a separate geocode file

    # these are school-level files

    # to find these files via the website, go to this URL:
    # https://nces.ed.gov/ccd/files.asp
    # select 'Nonfiscal' and 'School' for the level.
    # the geocode file is separate:
    # https://nces.ed.gov/programs/edge/Geographic/SchoolLocations

    # three-char codes in the filenames:
    # 029 = directory file
    # 052 = membership file
    # 059 = staff file
    # 129 = school characteristics file
    # 033 = lunch program accessibility

    # 2020
    "https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1920_w_1a_082120.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_SCH_052_1920_l_1a_082120.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1920_l_1a_082120.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1920_w_1a_082120.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1920_l_1a_082120.zip"
    # geocode
    ,"https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_PUBLICSCH_1920.zip"

    # 2019
    ,"https://nces.ed.gov/ccd/data/zip/ccd_sch_029_1819_w_1a_091019.zip"
    ,"https://nces.ed.gov/ccd/data/zip/ccd_sch_052_1819_l_1a_091019.zip"
    ,"https://nces.ed.gov/ccd/data/zip/ccd_sch_059_1819_l_1a_091019.zip"
    ,"https://nces.ed.gov/ccd/data/zip/ccd_sch_129_1819_w_1a_091019.zip"
    ,"https://nces.ed.gov/ccd/data/zip/ccd_sch_033_1819_l_1a_091019.zip"
    # geocode
    ,"https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_PUBLICSCH_1819.zip"

    # 2018
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1718_w_1a_083118.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_052_1718_l_1a_083118.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1718_l_1a_083118.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1718_w_1a_083118.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1718_l_1a_083118.zip"
    # geocode
    ,"https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_PUBLICSCH_1718.zip"

    # 2017
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1617_w_1a_11212017_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_SCH_052_1617_l_2a_11212017_CSV.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1617_l_2a_11212017_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1617_w_1a_11212017_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1617_l_2a_11212017_csv.zip"
    # geocode
    ,"https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_PUBLICSCH_1617.zip"

    # 2016
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1516_w_2a_011717_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_052_1516_w_2a_011717_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1516_w_2a_011717_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1516_w_2a_011717_csv.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1516_w_2a_011717_csv.zip"
    # geocode
    ,"https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_PUBLICSCH_1516.zip"

    # 2015
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_029_1415_w_0216601a_txt.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_052_1415_w_0216161a_txt.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_059_1415_w_0216161a_txt.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_129_1415_w_0216161a_txt.zip"
    ,"https://nces.ed.gov/ccd/Data/zip/ccd_sch_033_1415_w_0216161a_txt.zip"
    # last year that geocode data is provided as part of common core data; after this, it's on EDGE page
    ,"https://nces.ed.gov/ccd/Data/zip/EDGE_GEOIDS_201415_PUBLIC_SCHOOL_csv.zip"

    # 2014
    ,"https://nces.ed.gov/ccd/Data/zip/sc132a_txt.zip"
    # 2013
    ,"https://nces.ed.gov/ccd/Data/zip/sc122a_txt.zip"
    # 2012
    ,"https://nces.ed.gov/ccd/Data/zip/sc111a_supp_txt.zip"
    # 2011
    ,"https://nces.ed.gov/ccd/Data/zip/sc102a_txt.zip"
    # 2010
    ,"https://nces.ed.gov/ccd/data/zip/sc092a_txt.zip"
    # 2009
    ,"https://nces.ed.gov/ccd/data/zip/sc081b_txt.zip"
    # 2008
    ,"https://nces.ed.gov/ccd/data/zip/sc071b_txt.zip"

    # 2007
    ,"https://nces.ed.gov/ccd/data/zip/sc061cai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc061ckn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc061cow_dat.zip"

    # 2006
    ,"https://nces.ed.gov/ccd/data/zip/sc051aai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc051akn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc051aow_dat.zip"

    # 2005
    ,"https://nces.ed.gov/ccd/data/zip/sc041bai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc041bkn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc041bow_dat.zip"

    # 2004
    ,"https://nces.ed.gov/ccd/data/zip/sc031aai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc031akn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc031aow_dat.zip"

    # 2003
    ,"https://nces.ed.gov/ccd/data/zip/sc021aai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc021akn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc021aow_dat.zip"

    # 2002
    ,"https://nces.ed.gov/ccd/data/zip/sc011aai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc011akn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc011aow_dat.zip"

    # 2001
    ,"https://nces.ed.gov/ccd/data/zip/sc001aai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc001akn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc001aow_dat.zip"

    # 2000
    ,"https://nces.ed.gov/ccd/data/zip/sc991bai_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc991bkn_dat.zip"
    ,"https://nces.ed.gov/ccd/data/zip/sc991bow_dat.zip"

]

In [None]:
# opt = webdriver.ChromeOptions()
# opt.add_argument("--headless")
# driver = webdriver.Chrome(options=opt)
# driver.get(url)
# html = driver.page_source

# soup = BeautifulSoup(html, 'html.parser')

In [2]:
years = range(1986, 2024)
# years = range(2001, 2024)
# years = range(2009, 2024)

fmt_urla = "https://nces.ed.gov/ccd/data/txt/psu{year}lay.txt"
fmt_urlb = "https://nces.ed.gov/ccd/data/txt/sc{year}1alay.txt"

data_urls = [
    "https://nces.ed.gov/ccd/data/zip/psu{year}ai_dat.zip",
    "https://nces.ed.gov/ccd/data/zip/psu{year}kn_dat.zip",
    "https://nces.ed.gov/ccd/data/zip/psu{year}ow_dat.zip"
]


def get_fmt_url(year, session):
    y = year % 100
    if year in [2002, 2003, 2005, 2008]:
        y = f"{y:02d}1a"
    elif year in [2009, 2010]:
        y = f"{y:02d}2a"
    elif year > 2001:
        y = f"{y:02d}1b"
    else:
        y = f"{y:02d}"

    if year > 2010:
        url = fmt_urlb.format(year=year % 100)
    else:
        url = fmt_urla.format(year=y)
    
    response = session.get(url)
    txt = response.content.decode('latin1')
    if response.status_code == 404:
        return (year, url, False)
    return (year, url, True)


with requests.Session() as s:
    fmt_urls = [get_fmt_url(year, s) for year in years]
    # x = get_fmt_url(2011, s)
fmt_urls


[(1986, 'https://nces.ed.gov/ccd/data/txt/psu86lay.txt', True),
 (1987, 'https://nces.ed.gov/ccd/data/txt/psu87lay.txt', True),
 (1988, 'https://nces.ed.gov/ccd/data/txt/psu88lay.txt', True),
 (1989, 'https://nces.ed.gov/ccd/data/txt/psu89lay.txt', True),
 (1990, 'https://nces.ed.gov/ccd/data/txt/psu90lay.txt', True),
 (1991, 'https://nces.ed.gov/ccd/data/txt/psu91lay.txt', True),
 (1992, 'https://nces.ed.gov/ccd/data/txt/psu92lay.txt', True),
 (1993, 'https://nces.ed.gov/ccd/data/txt/psu93lay.txt', True),
 (1994, 'https://nces.ed.gov/ccd/data/txt/psu94lay.txt', True),
 (1995, 'https://nces.ed.gov/ccd/data/txt/psu95lay.txt', True),
 (1996, 'https://nces.ed.gov/ccd/data/txt/psu96lay.txt', True),
 (1997, 'https://nces.ed.gov/ccd/data/txt/psu97lay.txt', True),
 (1998, 'https://nces.ed.gov/ccd/data/txt/psu98lay.txt', True),
 (1999, 'https://nces.ed.gov/ccd/data/txt/psu99lay.txt', True),
 (2000, 'https://nces.ed.gov/ccd/data/txt/psu00lay.txt', True),
 (2001, 'https://nces.ed.gov/ccd/data/tx

In [3]:
failed = {y: url for y, url, success in fmt_urls if not success}
# failed[2000]
'https://nces.ed.gov/ccd/data/txt/psu00lay.txt'  # 2000
'https://nces.ed.gov/ccd/data/txt/psu01lay.txt' # 2001
'https://nces.ed.gov/ccd/data/txt/psu021alay.txt' # 2002
'https://nces.ed.gov/ccd/data/txt/psu092alay.txt'  # 2009
'https://nces.ed.gov/ccd/data/txt/psu102alay.txt' #2010
'https://nces.ed.gov/ccd/data/txt/sc111alay.txt'  # 2011

valid = {y: url for y, url, success in fmt_urls if success}
valid

{1986: 'https://nces.ed.gov/ccd/data/txt/psu86lay.txt',
 1987: 'https://nces.ed.gov/ccd/data/txt/psu87lay.txt',
 1988: 'https://nces.ed.gov/ccd/data/txt/psu88lay.txt',
 1989: 'https://nces.ed.gov/ccd/data/txt/psu89lay.txt',
 1990: 'https://nces.ed.gov/ccd/data/txt/psu90lay.txt',
 1991: 'https://nces.ed.gov/ccd/data/txt/psu91lay.txt',
 1992: 'https://nces.ed.gov/ccd/data/txt/psu92lay.txt',
 1993: 'https://nces.ed.gov/ccd/data/txt/psu93lay.txt',
 1994: 'https://nces.ed.gov/ccd/data/txt/psu94lay.txt',
 1995: 'https://nces.ed.gov/ccd/data/txt/psu95lay.txt',
 1996: 'https://nces.ed.gov/ccd/data/txt/psu96lay.txt',
 1997: 'https://nces.ed.gov/ccd/data/txt/psu97lay.txt',
 1998: 'https://nces.ed.gov/ccd/data/txt/psu98lay.txt',
 1999: 'https://nces.ed.gov/ccd/data/txt/psu99lay.txt',
 2000: 'https://nces.ed.gov/ccd/data/txt/psu00lay.txt',
 2001: 'https://nces.ed.gov/ccd/data/txt/psu01lay.txt',
 2002: 'https://nces.ed.gov/ccd/data/txt/psu021alay.txt',
 2003: 'https://nces.ed.gov/ccd/data/txt/psu03

In [16]:

format_files = {
    1986: 'https://nces.ed.gov/ccd/data/txt/psu86lay.txt',
    1987: 'https://nces.ed.gov/ccd/data/txt/psu87lay.txt',
    1988: 'https://nces.ed.gov/ccd/data/txt/psu88lay.txt',
    1989: 'https://nces.ed.gov/ccd/data/txt/psu89lay.txt',
    1990: 'https://nces.ed.gov/ccd/data/txt/psu90lay.txt',
    1991: 'https://nces.ed.gov/ccd/data/txt/psu91lay.txt',
    1992: 'https://nces.ed.gov/ccd/data/txt/psu92lay.txt',
    1993: 'https://nces.ed.gov/ccd/data/txt/psu93lay.txt',
    1994: 'https://nces.ed.gov/ccd/data/txt/psu94lay.txt',
    1995: 'https://nces.ed.gov/ccd/data/txt/psu95lay.txt',
    1996: 'https://nces.ed.gov/ccd/data/txt/psu96lay.txt',
    1997: 'https://nces.ed.gov/ccd/data/txt/psu97lay.txt',
    1998: 'https://nces.ed.gov/ccd/data/txt/psu98lay.txt',
    1999: 'https://nces.ed.gov/ccd/data/txt/psu99lay.txt',
    2000: 'https://nces.ed.gov/ccd/data/txt/psu00lay.txt',
    2001: 'https://nces.ed.gov/ccd/data/txt/psu01lay.txt',
    2002: 'https://nces.ed.gov/ccd/data/txt/psu021alay.txt',
    2003: 'https://nces.ed.gov/ccd/data/txt/psu031alay.txt',
    2004: 'https://nces.ed.gov/ccd/data/txt/psu041blay.txt',
    2005: 'https://nces.ed.gov/ccd/data/txt/psu051alay.txt',
    2006: 'https://nces.ed.gov/ccd/data/txt/psu061blay.txt',
    2007: 'https://nces.ed.gov/ccd/data/txt/psu071blay.txt',
    2008: 'https://nces.ed.gov/ccd/data/txt/psu081alay.txt',
    2009: 'https://nces.ed.gov/ccd/data/txt/psu092alay.txt',
    2010: 'https://nces.ed.gov/ccd/data/txt/psu102alay.txt',
    2011: 'https://nces.ed.gov/ccd/data/txt/sc111alay.txt',
    2012: 'https://nces.ed.gov/ccd/data/txt/sc121alay.txt',
    2013: 'https://nces.ed.gov/ccd/data/txt/sc131alay.txt'
}

with requests.Session() as s:
    for year, url in valid.items():
        print(f"Downloading {year} data from {url}")
        response = s.get(url)
        if response.status_code != 200:
            print(f"Failed to download {year} data.")
            continue
        txt = response.content.decode('latin1')
        # Save the file
        filename = f"layout-{year}.txt"
        path = f"/home/mxc/Projects/school-data-portal/drafts/_data/ccd/layouts/{filename}"
        with open(path, 'w', encoding='latin1') as f:
            f.write(txt)
        print(f"Saved {filename}")

Downloading 1986 data from https://nces.ed.gov/ccd/data/txt/psu86lay.txt
Saved layout-1986.txt
Downloading 1987 data from https://nces.ed.gov/ccd/data/txt/psu87lay.txt
Saved layout-1987.txt
Downloading 1988 data from https://nces.ed.gov/ccd/data/txt/psu88lay.txt
Saved layout-1988.txt
Downloading 1989 data from https://nces.ed.gov/ccd/data/txt/psu89lay.txt
Saved layout-1989.txt
Downloading 1990 data from https://nces.ed.gov/ccd/data/txt/psu90lay.txt
Saved layout-1990.txt
Downloading 1991 data from https://nces.ed.gov/ccd/data/txt/psu91lay.txt
Saved layout-1991.txt
Downloading 1992 data from https://nces.ed.gov/ccd/data/txt/psu92lay.txt
Saved layout-1992.txt
Downloading 1993 data from https://nces.ed.gov/ccd/data/txt/psu93lay.txt
Saved layout-1993.txt
Downloading 1994 data from https://nces.ed.gov/ccd/data/txt/psu94lay.txt
Saved layout-1994.txt
Downloading 1995 data from https://nces.ed.gov/ccd/data/txt/psu95lay.txt
Saved layout-1995.txt
Downloading 1996 data from https://nces.ed.gov/ccd

In [None]:

def get_format(txt):
    txt = re.sub("\s+DKB.*$", "", txt, flags=re.MULTILINE)
    i = txt.find("\nName")
    txt = txt[i:]
    txt = txt.replace("  GRADE", " GRADE")
    txt = re.sub(r"^(\s*\+.*) AN", r"\1\tAN", txt, flags=re.MULTILINE)
    txt = txt.replace("\t", "    ")
    lines = txt.splitlines()
    lines = [line.strip() for line in lines if line.strip()]

    rows = [re.split("\s\s+", line) for line in lines]
    rows = [[word.strip() for word in row if word.strip()] for row in rows]
    cols = rows.pop(0)
    
    df = pd.DataFrame(rows, columns=cols)
    df.loc[df["Type"].str.endswith("*"), "Type"] = "F"
    df.loc[df["Size"].str.endswith("*"), "Type"] = "F"
    df.Position = df.Position.str.replace("*", "")
    df.Size = df.Size.str.replace("*", "")
    df["start"] = df.Position.str.split("-").str[0].astype(int) - 1
    df["end"] = df.Position.str.split("-").str[1].astype(int)
    df["Name"] = df["Name"].str.replace("+", "")
    return df


def get_fmt_year(year, session):
    url = format_files[year]
    
    response = session.get(url)

    txt = response.content.decode('latin1')
    try:
        format = get_format(txt)
        format["ay"] = year
        return format
    except Exception as e:
        print(f"Error processing format for year {year}: {e}")
        return None



with requests.Session() as s:
    formats = {year: get_fmt_year(year, s) for year in range(1986, 1995)}

# # formats

# format = get_fmt_year(1988, requests.Session())
formats[1995].head(50)

Error processing format for year 1996: pop from empty list


Unnamed: 0,Name,Type,Position,Size,Description,start,end,ay
0,NCESSHC,AN,001-012,12,UNIQUE SCHOOL ID (NCES ASSIGNED),0,12,1995
1,FIPS,AN,001-002,2,FIPS STATE CODE FOR LOCATION OF SCHOOL,0,2,1995
2,LEAID,AN,001-007,7,UNIQUE SYSTEM ID (NCES ASSIGNED),0,7,1995
3,SCHNO,AN,008-012,5,SCHOOL NUMBER WITHIN STATE (NCES),7,12,1995
4,STID95,AN,013-026,14,STATE AGENCY ID,12,26,1995
5,LEANM95,AN,027-056,30,NAME OF OPERATING AGENCY,26,56,1995
6,SEASCH95,AN,057-076,20,STATE SCHOOL ID,56,76,1995
7,SCHNAM95,AN,077-106,30,SCHOOL NAME,76,106,1995
8,STREET95,AN,107-136,30,MAILING ADDRESS OF SCHOOL,106,136,1995
9,CITY95,AN,137-154,18,CITY NAME (MAILING ADDRESS),136,154,1995


In [25]:
# Check for any invalid rows

def isint(n):
    try:
        int(n)
        return True
    except ValueError:
        return False

for f in formats.values():
    f["valid"] = f.apply(lambda x: isint(x.start) and isint(x.end), axis=1 )
    bad  = f[~f["valid"]]
    if not bad.empty:
        display(bad)
formats.keys()

dict_keys([1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995])

In [34]:


def val(t, v):
    if t == "AN":
        return v
    if v == ".":
        return 0
    if t == "N":
        return int(v) if v.isdigit() else float(v)
    return float(v)

def line_to_dict(line, format):
    row = {}
    def split(x):
        value = line[x["start"]:x["end"]].strip()
        value = val(x.Type, value)
        row[x.Name] = value

    format.apply(split, axis=1)
    return row


def read_data(year, format, session):
    results = []
    
    pre = ["ai", "kn", "ow"]
    for p in pre:
        url = f"https://nces.ed.gov/ccd/data/zip/psu{year % 100}{p}_dat.zip"
        response = session.get(url)
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            fname = z.namelist()[0]
            with z.open(fname) as f:
                lines = [line_to_dict(line.decode("latin1"), format) for line in f]
                results.extend(lines)
    return pd.DataFrame(lines)


out = "/home/mxc/Projects/school-data-portal/drafts/_data/ccd/data"
with requests.Session() as s:
    for year in range(1987, 1996):
        try:
            df = read_data(year, formats[year], s)
        except Exception as e:
            print(f"Error reading data for year {year}: {e}")
            continue
        try:
            df.to_csv(f"{out}/ccd-{year}.csv", index=False, encoding="utf-8", escapechar="\\")
        except Exception as e:
            print(f"Error saving data for year {year}: {e}")
            continue
df

Error reading data for year 1987: could not convert string to float: 'N'
Error reading data for year 1988: could not convert string to float: 'N'
Error reading data for year 1989: could not convert string to float: 'M'
Error reading data for year 1990: could not convert string to float: 'M'
Error reading data for year 1991: could not convert string to float: 'M'
Error reading data for year 1992: could not convert string to float: 'N'
Error reading data for year 1993: could not convert string to float: 'N'
Error reading data for year 1994: could not convert string to float: 'N'
Error reading data for year 1995: could not convert string to float: 'N'


Unnamed: 0,NCESSCH,FIPS,AGENCY,LEAID,SCHNO,STID86,LEANM86,SEASCH86,SCHNAM86,ADRESS86,...,G0586,G0686,G0786,G0886,G0986,G1086,G1186,G1286,MEMBER86,YEAR
0,390434800001,39,4348,3904348,1,043489,AKRON CITY SD,001537,BARBER ELEM SCHOOL,665 GARRY RD,...,72,65,0,0,0,0,0,0,480,86
1,390434800002,39,4348,3904348,2,043489,AKRON CITY SD,065490,BARRETT ELEM SCHOOL,888 JONATHAN AVE,...,82,86,0,0,0,0,0,0,751,86
2,390434800003,39,4348,3904348,3,043489,AKRON CITY SD,002600,BETTES ELEM SCHOOL,1333 BETANA AVE,...,53,42,0,0,0,0,0,0,384,86
3,390434800004,39,4348,3904348,4,043489,AKRON CITY SD,002634,BETTY JANE ELEM SCHOOL,444 DARROW RD,...,102,103,0,0,0,0,0,0,644,86
4,390434800005,39,4348,3904348,5,043489,AKRON CITY SD,004077,BUCHTEL HIGH SCHOOL,1040 COPLEY RD,...,0,0,0,0,346,320,288,270,1224,86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29733,600003000043,60,30,6000030,43,,N MARIANAS DEPT OF EDUC,CM69000300024PU,SUSUPE HEADSTART,C/O MARIANA EDUC DEPT,...,0,0,0,0,0,0,0,0,19,85
29734,600003000044,60,30,6000030,44,,N MARIANAS DEPT OF EDUC,CM69000300025PU,TANAPAG HEADSTART,C/O MARIANA EDUC DEPT,...,0,0,0,0,0,0,0,0,20,85
29735,600003000045,60,30,6000030,45,,N MARIANAS DEPT OF EDUC,CM69000300026PU,SAN ROQUE HEADSTART,C/O MARIANA EDUC DEPT,...,0,0,0,0,0,0,0,0,14,85
29736,600003000046,60,30,6000030,46,,N MARIANAS DEPT OF EDUC,CM69000300027PU,ROTA HEADSTART,SONGSONG VILLAGE,...,0,0,0,0,0,0,0,0,41,85


In [120]:
nyc = df[(df.ST86 == "NY") & (df.LEANM86 == "NEW YORK CITY SCH DIST")].copy()

cols = [
"NCESSCH", "SCHNO", "SEASCH86","SCHNAM86","ADRESS86","CITY86",
"ST86","ZIP86","TYPE86", "STATUS86","LOCALE86","FTE86","GRSPAN86","MEMBER86"
]
nyc = nyc[cols]
nyc.sort_values("MEMBER86", ascending=False).head(30)

Unnamed: 0,NCESSCH,SCHNO,SEASCH86,SCHNAM86,ADRESS86,CITY86,ST86,ZIP86,TYPE86,STATUS86,LOCALE86,FTE86,GRSPAN86,MEMBER86
50700,362058001914,1914.0,317800010510,AUXILIARY H S,198 FORSYTH ST,NEW YORK,NY,10002,1,1,,110.0,1010,7047
50719,362058001940,1940.0,327800010440,DE WITT CLINTON H S,MOSHOLU PKWY+ PAUL AV,BRONX,NY,10468,1,1,,255.5,0912,5038
50785,362058002016,2016.0,327800010475,JOHN F KENNEDY H S,99 TERRACE VIEW AVE,BRONX,NY,10463,1,1,,262.8,0912,4986
50693,362058001907,1907.0,327800010450,ADLAI E STEVENSON H S,1980 LAFAYETTE AVE,BRONX,NY,10473,1,1,,248.5,0912,4912
50733,362058001954,1954.0,337800010420,FRANKLIN K LANE H S,999 JAMAICA AVE,BROOKLYN,NY,11208,1,1,,214.2,0912,4526
50711,362058001928,1928.0,337800010430,BROOKLYN TECHNICAL H S,29 FT GREENE PL,BROOKLYN,NY,11217,1,1,,199.6,0912,4366
50803,362058002038,2038.0,347800010455,NEWTOWN H S,48 01 90TH ST,ELMHURST,NY,11373,1,1,,205.4,0912,4137
50706,362058001921,1921.0,337800010455,BOYS + GIRLS H S,1700 FULTON ST,BROOKLYN,NY,11213,1,1,,186.0,0912,4128
50742,362058001965,1965.0,347800010505,HILLCREST H S,160-05 HIGHLAND AVE,JAMAICA,NY,11432,1,1,,175.5,0912,3822
50724,362058001945,1945.0,337800010465,ERASMUS HALL H S,911 FLATBUSH AVE,BROOKLYN,NY,11226,1,1,,186.1,0912,3728


In [121]:
nyc.to_csv("ccd-1986.csv", index=False, encoding="utf-8")
