# Read in CSVs based on NCES Tables

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
from ipywidgets import interactive
pd.set_option('display.max_columns', 500)

ELSI Table Generator - https://nces.ed.gov/ccd/elsi/tableGenerator.aspx?savedTableID=357855

Select a Table Row - District

Select Years - 13-14, 14-15, 15-16, 16-17, 17-18, 18-19, 19-20, 20-21

Select Table Columns - Information (7), Characteristics (4), Enrollments (1), Teachers & Staff (3)

Select Filters (Refinements) - All 50 States + DC

In [2]:
year_1314 = pd.read_csv("../data/elsi1314.csv", skiprows=6, nrows=18609)
#year_1314.tail()

In [3]:
year_1415 = pd.read_csv("../data/elsi1415.csv", skiprows=6, nrows=18620)
#year_1415.tail()

In [4]:
year_1516 = pd.read_csv("../data/elsi1516.csv", skiprows=6, nrows=18678)
#year_1516.tail()

In [5]:
year_1617 = pd.read_csv("../data/elsi1617.csv", skiprows=6, nrows=18468)
#year_1617.tail()

In [6]:
year_1718 = pd.read_csv("../data/elsi1718.csv", skiprows=6, nrows=18440)
#year_1718.tail()

In [7]:
year_1819 = pd.read_csv("../data/elsi1819.csv", skiprows=6, nrows=19406)
#year_1819.tail()

In [8]:
year_1920 = pd.read_csv("../data/elsi1920.csv", skiprows=6, nrows=19534)
#year_1920.tail()

In [9]:
year_2021 = pd.read_csv("../data/elsi2021.csv", skiprows=6, nrows=19388)
#year_2021.tail()

## DataFrame Shape
Confirms that each DataFrame has an equivalent number of columns

In [10]:
#year_1314.shape

In [11]:
#year_1415.shape

In [12]:
#year_1516.shape

In [13]:
#year_1617.shape

In [14]:
#year_1718.shape

In [15]:
#year_1819.shape

In [16]:
#year_1920.shape

In [17]:
#year_2021.shape

## Adding Year Column
By adding a static year column to each DataFrame, the columns can be given generic names in order to be combined later

In [18]:
year_1314['Year'] = '2013-2014'
#year_1314.head(2)

In [19]:
year_1415['Year'] = '2014-2015'
#year_1415.head(2)

In [20]:
year_1516['Year'] = '2015-2016'
#year_1516.head(2)

In [21]:
year_1617['Year'] = '2016-2017'
#year_1617.head(2)

In [22]:
year_1718['Year'] = '2017-2018'
#year_1718.head(2)

In [23]:
year_1819['Year'] = '2018-2019'
#year_1819.head(2)

In [24]:
year_1920['Year'] = '2019-2020'
#year_1920.head(2)

In [25]:
year_2021['Year'] = '2020-2021'
#year_2021.head(2)

## Standardizing Column Names
Each annual DataFrame will have the same columns. Based on the number of nulls in each, either the State Name or State column will be dropped. State Name is derived from the 'last year available' data and State is derived from the selected year.

In [26]:
year_1314.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1314.head(2)

In [27]:
year_1415.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1415.head(2)

In [28]:
year_1516.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1516.head(2)

In [29]:
year_1617.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1617.head(2)

In [30]:
year_1718.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1718.head(2)

In [31]:
year_1819.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1819.head(2)

In [32]:
year_1920.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_1920.head(2)

In [33]:
year_2021.columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year']
#year_2021.head(2)

## pd.Replace
Convert symbols included from ELSI into NA and NaN for more accurate EDA

In [34]:
year_13141 =year_1314.replace('†', 'NA')
#year_13141.head()

In [35]:
year_14151 = year_1415.replace('†', 'NA')
#year_14151.head()

In [36]:
year_15161 = year_1516.replace('†', 'NA')
#year_15161.head(2)

In [37]:
year_16171 = year_1617.replace('†', 'NA')
#year_16171.head(2)

In [38]:
year_17181 = year_1718.replace('†', 'NA')
#year_17181.head()

In [39]:
year_18191 = year_1819.replace('†', 'NA')
#year_18191.head()

In [40]:
year_19201 = year_1920.replace('†', 'NA')
#year_19201.head()

In [41]:
year_20211 = year_2021.replace('†', 'NA')
#year_20211.head()

In [42]:
year_13142 =year_13141.replace('–', 'NaN')
#year_13142.head()

In [43]:
year_14152 =year_14151.replace('–', 'NaN')
#year_14152.head()

In [44]:
year_15162 =year_15161.replace('–', 'NaN')
#year_15162.head()

In [45]:
year_16172 =year_16171.replace('–', 'NaN')
#year_16172.head()

In [46]:
year_17182 =year_17181.replace('–', 'NaN')
#year_17182.head()

In [47]:
year_18192 =year_18191.replace('–', 'NaN')
#year_18192.head()

In [48]:
year_19202 =year_19201.replace('–', 'NaN')
#year_19202.head()

In [49]:
year_20212 =year_20211.replace('–', 'NaN')
#year_20212.head()

## pd.Concat & df.reindex

In [50]:
enrollments1 = pd.concat([year_13142, year_14152, year_15162, year_16172, year_17182, year_18192, year_19202, year_20212], axis=0)
#enrollments1.head()

In [51]:
enrollments1 = enrollments1.reindex(columns = ['Agency Name', 'State Name', 'State Abbreviation', 'NCES ID', 'State', 'District', 'County', 'School Count', 'ZIP Code', 'District Type', 'Locale', 'Start of Year Status', 'Updated Status', 'Student Count', 'FTE Teachers', 'Pupil/Teacher Ratio', 'Total Staff', 'Year'])
#enrollments1.head()

In [52]:
enrollments1['State Name'] = enrollments1['State Name'].str.upper()
#enrollments1.head()

In [53]:
enrollments1['State'] = enrollments1['State'].str.upper()
#enrollments1.head()

In [54]:
enrollments1['District'] = enrollments1['District'].str.upper()
#enrollments1.head(15)

## General EDA

In [55]:
pd.set_option("display.max_rows", None)

In [56]:
enrollments1.info()
#symbols used by NCES are see as non-null by Python. Will need further EDA to discern best state column to keep

<class 'pandas.core.frame.DataFrame'>
Int64Index: 151143 entries, 0 to 19387
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Agency Name           151143 non-null  object
 1   State Name            151141 non-null  object
 2   State Abbreviation    151143 non-null  object
 3   NCES ID               151143 non-null  object
 4   State                 151143 non-null  object
 5   District              151143 non-null  object
 6   County                151143 non-null  object
 7   School Count          151143 non-null  object
 8   ZIP Code              151143 non-null  object
 9   District Type         151143 non-null  object
 10  Locale                151143 non-null  object
 11  Start of Year Status  151143 non-null  object
 12  Updated Status        132534 non-null  object
 13  Student Count         151143 non-null  object
 14  FTE Teachers          151143 non-null  object
 15  Pupil/Teacher Rati

In [57]:
enrollments1['Agency Name'].value_counts()

JEFFERSON COUNTY                                                40
DEPARTMENT OF CORRECTIONS                                       39
DISCOVERY CHARTER SCHOOL                                        37
SALEM SCHOOL DISTRICT                                           32
LIBERTY                                                         32
WINDHAM SCHOOL DISTRICT                                         32
MONROE SCHOOL DISTRICT                                          32
MORGAN COUNTY                                                   32
JACKSON COUNTY                                                  32
CLAY COUNTY                                                     32
COLUMBIA SCHOOL DISTRICT                                        32
INDIAN CAPITAL TECHNOLOGY CTR                                   32
WASHINGTON COUNTY                                               32
FRANKLIN COUNTY                                                 32
CLINTON                                                       

In [58]:
enrollments1['State Name'].value_counts()

CALIFORNIA              12311
TEXAS                    9951
OHIO                     8768
ILLINOIS                 8469
NEW YORK                 8399
MICHIGAN                 7313
PENNSYLVANIA             6359
ARIZONA                  5731
NEW JERSEY               5547
OKLAHOMA                 4800
MINNESOTA                4637
MISSOURI                 4540
MONTANA                  3936
WISCONSIN                3739
INDIANA                  3430
MASSACHUSETTS            3384
IOWA                     2754
WASHINGTON               2650
NORTH CAROLINA           2643
KANSAS                   2575
NEW HAMPSHIRE            2435
ARKANSAS                 2361
VERMONT                  2335
NEBRASKA                 2262
MAINE                    2171
COLORADO                 2139
GEORGIA                  1834
NORTH DAKOTA             1808
OREGON                   1770
VIRGINIA                 1749
CONNECTICUT              1640
KENTUCKY                 1498
LOUISIANA                1488
ALABAMA   

In [59]:
#enrollments1['State'].value_counts()
#will drop this column as the numbers in State Name are consistently higher and indicate a more complete column

In [60]:
enrollments = enrollments1.drop("State", axis=1)
#enrollments.head()

In [61]:
enrollments['District Type'].value_counts()

1-Regular local school district that is NOT a component of a supervisory union                            104922
7-Independent Charter District                                                                             27917
4-Regional Education Service Agency (RESA)                                                                  8178
2-Local school district that is a component of a supervisory union                                          3607
5-State agency providing elementary and/or secondary level instruction                                      1976
3-Supervisory union administrative center (or county superintendent's office serving the same purpose)      1748
8-Other education agencies                                                                                  1305
9-Specialized public school district                                                                        1264
NA                                                                                              

In [62]:
enrollments['District Type'].value_counts(normalize=True)

1-Regular local school district that is NOT a component of a supervisory union                            0.694190
7-Independent Charter District                                                                            0.184706
4-Regional Education Service Agency (RESA)                                                                0.054108
2-Local school district that is a component of a supervisory union                                        0.023865
5-State agency providing elementary and/or secondary level instruction                                    0.013074
3-Supervisory union administrative center (or county superintendent's office serving the same purpose)    0.011565
8-Other education agencies                                                                                0.008634
9-Specialized public school district                                                                      0.008363
NA                                                                              

In [63]:
enrollments['Locale'].value_counts()

21-Suburb: Large       27929
42-Rural: Distant      26995
43-Rural: Remote       20661
41-Rural: Fringe       17163
11-City: Large         15494
32-Town: Distant       11355
33-Town: Remote         8357
13-City: Small          6637
31-Town: Fringe         4970
12-City: Mid-size       4649
22-Suburb: Mid-size     3773
23-Suburb: Small        2875
NA                       285
Name: Locale, dtype: int64

In [64]:
enrollments['Locale'].value_counts(normalize=True)

21-Suburb: Large       0.184785
42-Rural: Distant      0.178606
43-Rural: Remote       0.136698
41-Rural: Fringe       0.113555
11-City: Large         0.102512
32-Town: Distant       0.075128
33-Town: Remote        0.055292
13-City: Small         0.043912
31-Town: Fringe        0.032883
12-City: Mid-size      0.030759
22-Suburb: Mid-size    0.024963
23-Suburb: Small       0.019022
NA                     0.001886
Name: Locale, dtype: float64

In [65]:
enrollments['School Count'].value_counts()
#replace ="0" with 0

1       39112
2       19598
3       17540
4       11097
="0"     9129
5        7408
1        6363
6        5219
7        3632
2        2816
8        2798
3        2530
9        2228
10       1688
4        1576
11       1484
0        1137
12       1091
5        1053
13        769
6         732
14        729
15        596
7         525
17        504
16        498
19        426
18        405
8         400
20        329
22        326
23        317
9         299
21        261
10        244
24        231
11        227
28        215
25        208
26        208
NA        188
27        185
30        147
29        143
12        138
32        134
34        125
35        115
13        115
14        114
31        103
33         98
38         91
36         90
42         86
15         84
40         76
43         75
41         74
37         73
44         72
17         71
39         69
46         69
18         69
49         66
16         64
51         61
48         59
19         57
53         57
47    

In [84]:
enrollments = enrollments.replace('="0"', 0)
#enrollments['School Count'].value_counts()

In [67]:
#enrollments['Pupil/Teacher Ratio'].value_counts()
#need to remove ="" notation from some entries. regex?

In [68]:
enrollments['Pupil/Teacher Ratio'] = enrollments['Pupil/Teacher Ratio'].map(lambda x: x.strip('="').rstrip('"'))
#https://stackoverflow.com/questions/13682044/remove-unwanted-parts-from-strings-in-a-column

In [83]:
#enrollments['Pupil/Teacher Ratio'].value_counts()

In [71]:
enrollments['ZIP Code'] = enrollments['ZIP Code'].map(lambda x: str(x).strip('="').rstrip('"'))

In [82]:
#enrollments['ZIP Code'].value_counts()

In [73]:
enrollments['NCES ID'] = enrollments['NCES ID'].map(lambda x: str(x).strip('="').rstrip('"'))

In [81]:
#enrollments['NCES ID'].value_counts()

In [75]:
enrollments['FTE Teachers'] = enrollments['FTE Teachers'].map(lambda x: str(x).strip('="').rstrip('"'))

In [80]:
#enrollments['FTE Teachers'].value_counts()

In [77]:
enrollments['Total Staff'] = enrollments['Total Staff'].map(lambda x: str(x).strip('="').rstrip('"'))

In [79]:
#enrollments['Total Staff'].value_counts()