New York City School Data Basics
=============================
03 Basic Statistics <small>[[◀ 02 Querying Data](02-querying-data.ipynb)] [[04 Calculating New Columns ▶](04-calculating-columns.ipynb)]</small>
---------------------------------------------------------------------

This notebook shows how to use the `pandas` library to compute
basic quantitative descriptions of our data.

Topics in this Notebook:
<br><br>
- selecting/dropping columns 
- sorting data using the [`sort_values()`](https://www.w3schools.com/python/pandas/ref_df_sort_values.asp) method
- ['slicing'](https://datacarpentry.org/python-ecology-lesson/03-index-slice-subset/#extracting-range-based-subsets-slicing) DataFrames with the `[]` operator
<br><br>
- calculating various statistics of sets of data using [methods](https://www.w3schools.com/python/pandas/pandas_ref_dataframe.asp) of DataFrames [`min()`, `max()`, `mean()`, `median()`, `mode()`...]
- statistical summaries with [`describe()`](https://www.w3schools.com/python/pandas/ref_df_describe.asp)
- correlation matrices with [`corr()`](https://www.w3schools.com/python/pandas/ref_df_corr.asp)



In [1]:
# import schools from the nycschool package
from nycschools import schools

# load the demographic data into a `DataFrame` called df
df = schools.load_school_demographics()

# the subset of columns used in this notebook:
# school identifiers, enrollment, and the percentage measures
cols = [
    "dbn", "district", "boro", "school_name",
    "total_enrollment",
    "asian_pct", "black_pct", "hispanic_pct", "white_pct",
    "swd_pct", "ell_pct", "poverty_pct",
]

# in one step, keep only the rows where `ay` is 2020 (one year of data)
# and only the columns listed above
df = df.loc[df.ay == 2020, cols]

# preview the first five rows
df.head()

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
4,01M015,1,Manhattan,P.S. 015 Roberto Clemente,193,0.135,0.275,0.528,0.057,0.223,0.109,0.819
9,01M019,1,Manhattan,P.S. 019 Asher Levy,212,0.061,0.193,0.613,0.08,0.392,0.042,0.712
14,01M020,1,Manhattan,P.S. 020 Anna Silver,412,0.248,0.133,0.522,0.073,0.218,0.119,0.709
19,01M034,1,Manhattan,P.S. 034 Franklin D. Roosevelt,273,0.026,0.381,0.557,0.029,0.392,0.062,0.96
24,01M063,1,Manhattan,The STAR Academy - P.S.63,208,0.029,0.192,0.635,0.091,0.279,0.014,0.769


In [2]:
# sort_values() gives back a new DataFrame (df itself is left in its original order)
# 'by' names the column to sort on; ascending=False orders the rows
# in descending order (greatest enrollment to least)

data = df.sort_values(by="total_enrollment", ascending=False)
data

# To sort in ascending order (least to greatest) instead, comment out the
# two lines of code above and uncomment the two below.
# Take note that the only change is the value of the 'ascending' parameter.

# data = df.sort_values(by="total_enrollment", ascending=True)
# data

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
3723,13K430,13,Brooklyn,Brooklyn Technical High School,5921,0.599000,0.056000,0.066000,0.237000,0.016000,0.001,0.590
5310,20K490,20,Brooklyn,Fort Hamilton High School,4678,0.290000,0.025000,0.317000,0.347000,0.153000,0.162,0.706
6686,26Q430,26,Queens,Francis Lewis High School,4424,0.577000,0.047000,0.210000,0.150000,0.150000,0.115,0.690
5802,22K405,22,Brooklyn,Midwood High School,4109,0.354000,0.255000,0.133000,0.225000,0.138000,0.036,0.723
5807,22K425,22,Brooklyn,James Madison High School,3851,0.209000,0.128000,0.175000,0.472000,0.154000,0.121,0.758
...,...,...,...,...,...,...,...,...,...,...,...,...
9324,84M337,84,Manhattan,NYC Autism Charter School East Harlem,40,0.075000,0.200000,0.475000,0.200000,1.000000,0.025,0.950
9906,84X587,84,Bronx,NYC Autism Charter School Bronx,28,0.071429,0.392857,0.500000,0.035714,1.000000,0.036,0.893
9965,84X633,84,Bronx,Wildflower New York Charter School,19,0.000000,0.578947,0.421053,0.000000,0.263158,0.053,0.842
8712,79M331,79,Manhattan,The Judith S. Kaye School - D79,14,0.071000,0.429000,0.500000,0.000000,0.429000,0.000,0.857


In [3]:
# slice out the first 3 rows with the [] operator
# the slice start:stop counts rows by position; stop (3) is not included

data[0:3]

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
3723,13K430,13,Brooklyn,Brooklyn Technical High School,5921,0.599,0.056,0.066,0.237,0.016,0.001,0.59
5310,20K490,20,Brooklyn,Fort Hamilton High School,4678,0.29,0.025,0.317,0.347,0.153,0.162,0.706
6686,26Q430,26,Queens,Francis Lewis High School,4424,0.577,0.047,0.21,0.15,0.15,0.115,0.69


In [4]:
# pull out just the total_enrollment column -- a single column is called a Series in pandas
enrollment = df.total_enrollment

# compute each statistic first...
largest = enrollment.max()        # maximum enrollment
smallest = enrollment.min()       # minimum enrollment
average = enrollment.mean()       # arithmetic mean of the enrollments
middle = enrollment.median()      # median enrollment
# mode() returns a Series because several values can tie for most common,
# so we turn it into a plain list before printing
most_common = list(enrollment.mode())

# ...then report them
print("The largest school:", largest)
print("The smallest school:", smallest)
print("Average school size:", average)
print("Median school size:", middle)
print("Mode (can return multiple values) of school sizes:", most_common)

The largest school: 5921
The smallest school: 7
Average school size: 560.79296875
Median school size: 460.5
Mode (can return multiple values) of school sizes: [479, 714]


In [5]:
# the built-in describe() method calculates several descriptive statistics
# (count, mean, std, min, 25%/50%/75% percentiles, max) for each numeric column
# in the data frame and returns them as a new dataframe
# text columns such as dbn, boro and school_name are left out of the summary
df.describe()

Unnamed: 0,district,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
count,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0,2048.0
mean,27.62207,560.792969,0.11771,0.302644,0.431721,0.118602,0.237641,0.141212,0.76656
std,26.68098,461.01451,0.166804,0.256375,0.246118,0.168041,0.16078,0.143127,0.194297
min,1.0,7.0,0.0,0.0,0.015,0.0,0.0,0.0,0.05
25%,10.0,304.0,0.015,0.081474,0.206,0.016,0.163,0.049,0.711
50%,19.0,460.5,0.046,0.245,0.405,0.036,0.208,0.1,0.827
75%,30.0,663.25,0.148,0.47225,0.63125,0.14525,0.261,0.18725,0.903
max,84.0,5921.0,0.926,0.935,1.0,0.945,1.0,1.0,0.96


In [12]:
# describe() also works on a single column (a Series);
# here we summarize just the swd_pct column
df["swd_pct"].describe()

count    2048.000000
mean        0.237641
std         0.160780
min         0.000000
25%         0.163000
50%         0.208000
75%         0.261000
max         1.000000
Name: swd_pct, dtype: float64

In [19]:
# just checking in on what our DataFrame 'df' looks like
df

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
4,01M015,1,Manhattan,P.S. 015 Roberto Clemente,193,0.135000,0.275000,0.528000,0.057000,0.223000,0.109,0.819
9,01M019,1,Manhattan,P.S. 019 Asher Levy,212,0.061000,0.193000,0.613000,0.080000,0.392000,0.042,0.712
14,01M020,1,Manhattan,P.S. 020 Anna Silver,412,0.248000,0.133000,0.522000,0.073000,0.218000,0.119,0.709
19,01M034,1,Manhattan,P.S. 034 Franklin D. Roosevelt,273,0.026000,0.381000,0.557000,0.029000,0.392000,0.062,0.960
24,01M063,1,Manhattan,The STAR Academy - P.S.63,208,0.029000,0.192000,0.635000,0.091000,0.279000,0.014,0.769
...,...,...,...,...,...,...,...,...,...,...,...,...
9980,84X705,84,Bronx,Family Life Academy Charter School,416,0.007212,0.211538,0.776442,0.000000,0.100962,0.197,0.960
9985,84X706,84,Bronx,Harriet Tubman Charter School,647,0.000000,0.616692,0.358578,0.006182,0.097372,0.110,0.832
9990,84X717,84,Bronx,Icahn Charter School,328,0.012195,0.500000,0.478659,0.000000,0.070122,0.076,0.878
9995,84X718,84,Bronx,Bronx Charter School for Better Learning,570,0.008772,0.845614,0.100000,0.014035,0.108772,0.012,0.819


In [21]:
# the corr() method shows how strongly each pair of columns is correlated
# correlations close to -1 or 1 represent strong correlations
# correlations closer to 0 represent weak correlations

# these columns are categorical -- labels rather than the measure of a value --
# so we take them out of the data before calculating correlations
categorical_columns = ["district", "dbn", "school_name", "boro"]

data = df.drop(columns=categorical_columns)
data.corr()

Unnamed: 0,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
total_enrollment,1.0,0.350881,-0.242553,-0.111551,0.179056,-0.175962,-0.018582,-0.155614
asian_pct,0.350881,1.0,-0.452366,-0.370264,0.210809,-0.210489,0.138261,-0.287675
black_pct,-0.242553,-0.452366,1.0,-0.416505,-0.450801,0.138008,-0.362794,0.299685
hispanic_pct,-0.111551,-0.370264,-0.416505,1.0,-0.396739,0.070421,0.441878,0.490632
white_pct,0.179056,0.210809,-0.450801,-0.396739,1.0,-0.083534,-0.177996,-0.795595
swd_pct,-0.175962,-0.210489,0.138008,0.070421,-0.083534,1.0,0.000117,0.233832
ell_pct,-0.018582,0.138261,-0.362794,0.441878,-0.177996,0.000117,1.0,0.359285
poverty_pct,-0.155614,-0.287675,0.299685,0.490632,-0.795595,0.233832,0.359285,1.0


In [22]:
# last, we can use styles to make the correlation table easier to read
# note: you need to run this cell to see the colors -- it gets saved without the styled output
corr = data.corr()

# a coolwarm color map shows values in a gradient from blue (low) to red (high)
# by default background_gradient() scales the colors separately for each column,
# using that column's own smallest and largest value, so the same color could
# mean different correlations in different columns
# pinning vmin=-1 and vmax=1 puts every cell on the same scale:
# -1 is the deepest blue, 0 is neutral, and 1 is the deepest red
corr.style.background_gradient(cmap='coolwarm', vmin=-1, vmax=1)

Unnamed: 0,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
total_enrollment,1.0,0.350881,-0.242553,-0.111551,0.179056,-0.175962,-0.018582,-0.155614
asian_pct,0.350881,1.0,-0.452366,-0.370264,0.210809,-0.210489,0.138261,-0.287675
black_pct,-0.242553,-0.452366,1.0,-0.416505,-0.450801,0.138008,-0.362794,0.299685
hispanic_pct,-0.111551,-0.370264,-0.416505,1.0,-0.396739,0.070421,0.441878,0.490632
white_pct,0.179056,0.210809,-0.450801,-0.396739,1.0,-0.083534,-0.177996,-0.795595
swd_pct,-0.175962,-0.210489,0.138008,0.070421,-0.083534,1.0,0.000117,0.233832
ell_pct,-0.018582,0.138261,-0.362794,0.441878,-0.177996,0.000117,1.0,0.359285
poverty_pct,-0.155614,-0.287675,0.299685,0.490632,-0.795595,0.233832,0.359285,1.0
