Basic Statistics
================

This notebook shows how to use `pandas` to find
basic quantitative descriptions of our data.

Topics in this Notebook:

- dropping columns
- minimum and maximum ranges
- averages
- counts
- sorting data
- correlations with `corr()`
- `describe()`

In [1]:
# import schools from the nycschools package
from nycschools import schools

# the subset of columns this notebook works with
cols = [
    'dbn',
    'district',
    'boro',
    'school_name',
    'total_enrollment',
    'asian_pct',
    'black_pct',
    'hispanic_pct',
    'white_pct',
    'swd_pct',
    'ell_pct',
    'poverty_pct'
]

# load the demographic data, then in one step keep a single year of
# data (ay == 2020) and only the columns listed above;
# the result is a `DataFrame` called df
df = (
    schools.load_school_demographics()
    .loc[lambda frame: frame.ay == 2020, cols]
)

# preview the first few rows
df.head()

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
1790,01M015,1,Manhattan,P.S. 015 Roberto Clemente,193,0.134715,0.274611,0.528497,0.056995,0.227979,0.108808,0.834197
1795,01M020,1,Manhattan,P.S. 020 Anna Silver,412,0.247573,0.133495,0.521845,0.072816,0.223301,0.118932,0.720874
1800,01M034,1,Manhattan,P.S. 034 Franklin D. Roosevelt,273,0.025641,0.380952,0.556777,0.029304,0.395604,0.062271,0.96
1805,01M063,1,Manhattan,The STAR Academy - P.S.63,208,0.028846,0.192308,0.634615,0.091346,0.283654,0.014423,0.778846
1810,01M064,1,Manhattan,P.S. 064 Robert Simon,220,0.031818,0.181818,0.727273,0.040909,0.277273,0.018182,0.918182


In [2]:
# rank the schools by enrollment in descending order (biggest --> smallest)
data = df.sort_values("total_enrollment", ascending=False)

# head(10) keeps the first 10 rows, i.e. just the 10 largest schools
data.head(10)

Unnamed: 0,dbn,district,boro,school_name,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
5154,13K430,13,Brooklyn,Brooklyn Technical High School,5921,0.598716,0.056409,0.066036,0.236615,0.0152,0.000844,0.595676
6617,20K490,20,Brooklyn,Fort Hamilton High School,4678,0.289654,0.024583,0.316802,0.346729,0.150492,0.162035,0.712911
7887,26Q430,26,Queens,Francis Lewis High School,4424,0.576854,0.047242,0.210443,0.150316,0.14896,0.114602,0.695072
7038,22K405,22,Brooklyn,Midwood High School,4109,0.353857,0.255293,0.133366,0.224629,0.13872,0.035775,0.729131
7043,22K425,22,Brooklyn,James Madison High School,3851,0.209296,0.128278,0.1745,0.471826,0.153726,0.120748,0.762399
8442,28Q440,28,Queens,Forest Hills High School,3775,0.246623,0.065166,0.380927,0.271788,0.164503,0.09404,0.711258
9309,31R455,31,Staten Island,Tottenville High School,3726,0.072195,0.014224,0.132045,0.760601,0.224101,0.023349,0.42351
6838,21K525,21,Brooklyn,Edward R. Murrow High School,3691,0.27418,0.176104,0.192631,0.308318,0.178542,0.124898,0.678136
6607,20K445,20,Brooklyn,New Utrecht High School,3572,0.390817,0.030235,0.335946,0.233763,0.164614,0.217805,0.807391
7882,26Q415,26,Queens,Benjamin N. Cardozo High School,3405,0.422908,0.213803,0.222907,0.120117,0.145668,0.050514,0.671953


In [3]:
# a single column of a DataFrame is called a Series in pandas;
# here we pull out just total_enrollment
enrollment = df["total_enrollment"]

# pair each label with the statistic it describes, then print them in order
# (mode() can return multiple values, so it is shown as a list)
summary = (
    ("The largest school:", enrollment.max()),
    ("The smallest school:", enrollment.min()),
    ("Avg (mean) school size:", enrollment.mean()),
    ("Avg (median) school size:", enrollment.median()),
    ("Avg (mode, can return multiple values) school size:", list(enrollment.mode())),
)
for label, value in summary:
    print(label, value)

The largest school: 5921
The smallest school: 7
Avg (mean) school size: 559.7328
Avg (median) school size: 456.0
Avg (mode, can return multiple values) school size: [373]


In [4]:
# the built-in describe() method calculates several descriptive statistics
# (count, mean, std, min, quartiles and max) for each numeric column
# in the data frame and returns them as a new dataframe
df.describe()

Unnamed: 0,district,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
count,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0
mean,27.983467,559.7328,0.11683,0.302142,0.43504,0.116964,0.23953,0.14158,0.774975
std,26.80586,468.41833,0.165721,0.256789,0.247839,0.16888,0.157192,0.140997,0.191384
min,1.0,7.0,0.0,0.0,0.014963,0.0,0.0,0.0,0.052941
25%,10.0,302.0,0.013957,0.079164,0.208437,0.015811,0.166461,0.050651,0.721908
50%,20.0,456.0,0.043257,0.243655,0.408451,0.034783,0.211699,0.102484,0.833919
75%,30.0,658.5,0.14776,0.473335,0.639441,0.139874,0.263433,0.186553,0.908411
max,84.0,5921.0,0.925735,0.935385,1.0,0.945137,1.0,1.0,0.96


In [5]:
# describe() also works on a single Series (one column of the data frame):
df["swd_pct"].describe()

count    1875.000000
mean        0.239530
std         0.157192
min         0.000000
25%         0.166461
50%         0.211699
75%         0.263433
max         1.000000
Name: swd_pct, dtype: float64

In [6]:
# we can also call the corr() method to show correlations between columns
# we will take out "district" from this data because the district number
# is categorical -- not the measure of a value

# correlations close to 1 or negative one show high correlations
# closer to zero items are not closely correlated
data = df.drop(columns=["district"])

# corr() is only defined for numbers, so keep just the numeric columns first;
# leaving the text columns (dbn, boro, school_name) in produces a
# FutureWarning in pandas 1.5 and a ValueError in pandas 2.0 and later
data.select_dtypes(include="number").corr()

  data.corr()


Unnamed: 0,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
total_enrollment,1.0,0.351473,-0.246863,-0.113577,0.190804,-0.183169,-0.017371,-0.164646
asian_pct,0.351473,1.0,-0.450517,-0.37028,0.214854,-0.204568,0.116053,-0.309208
black_pct,-0.246863,-0.450517,1.0,-0.421608,-0.447071,0.128486,-0.35813,0.305314
hispanic_pct,-0.113577,-0.37028,-0.421608,1.0,-0.397966,0.059059,0.451864,0.496305
white_pct,0.190804,0.214854,-0.447071,-0.397966,1.0,-0.062842,-0.180826,-0.800583
swd_pct,-0.183169,-0.204568,0.128486,0.059059,-0.062842,1.0,-0.012036,0.222089
ell_pct,-0.017371,0.116053,-0.35813,0.451864,-0.180826,-0.012036,1.0,0.353824
poverty_pct,-0.164646,-0.309208,0.305314,0.496305,-0.800583,0.222089,0.353824,1.0


In [7]:
# last, we can use styles to make the correlation table easier to read
# note: you need to run this cell to see the colors -- the notebook gets saved without the styled output

# corr() is only defined for numbers, so restrict to the numeric columns;
# text columns cause a FutureWarning in pandas 1.5 and a ValueError in pandas 2.0 and later
corr = data.select_dtypes(include="number").corr()

# a coolwarm color map will show values in a gradient where -1 is the deepest blue and 1 is deepest red
corr.style.background_gradient(cmap='coolwarm')

  corr = data.corr()


Unnamed: 0,total_enrollment,asian_pct,black_pct,hispanic_pct,white_pct,swd_pct,ell_pct,poverty_pct
total_enrollment,1.0,0.351473,-0.246863,-0.113577,0.190804,-0.183169,-0.017371,-0.164646
asian_pct,0.351473,1.0,-0.450517,-0.37028,0.214854,-0.204568,0.116053,-0.309208
black_pct,-0.246863,-0.450517,1.0,-0.421608,-0.447071,0.128486,-0.35813,0.305314
hispanic_pct,-0.113577,-0.37028,-0.421608,1.0,-0.397966,0.059059,0.451864,0.496305
white_pct,0.190804,0.214854,-0.447071,-0.397966,1.0,-0.062842,-0.180826,-0.800583
swd_pct,-0.183169,-0.204568,0.128486,0.059059,-0.062842,1.0,-0.012036,0.222089
ell_pct,-0.017371,0.116053,-0.35813,0.451864,-0.180826,-0.012036,1.0,0.353824
poverty_pct,-0.164646,-0.309208,0.305314,0.496305,-0.800583,0.222089,0.353824,1.0
