## Import Dependencies

In [1]:
# Dependencies
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
import json
import requests

# Educationdata.urban.org
## Postsecondary Enrollment Headcounts

In [2]:
# Get higher ed info from api
ipeds_baseurl = 'https://educationdata.urban.org/api/v1/college-university/ipeds/enrollment-headcount/summaries?'
params = {
    'var': 'headcount',
    'stat': 'sum',
    'by': 'unitid',
    # 1—Undergraduate 2—Graduate 3—First professional 4—Postbaccalaureate 99—Total
    'level_of_study': '1',
    'year': ''
}

years = ['2017', '2018', '2019', '2020', '2021']
undergrad_headcount_data = {}

for year in years:
    params['year'] = year
    print(f'Getting {year} data.')
    response = requests.get(ipeds_baseurl, params=params)
    data = response.json()
    undergrad_headcount_data[f'{year}'] = pd.DataFrame(data['results'])['headcount'].sum()

undergrad_series = pd.Series(undergrad_headcount_data, name='Enrolled in Undergrad')
undergrad_series

Getting 2017 data.
Getting 2018 data.
Getting 2019 data.
Getting 2020 data.
Getting 2021 data.


2017    22962495
2018    22723047
2019    22723047
2020    22525259
2021    21687926
Name: Enrolled in Undergrad, dtype: int64

In [3]:
params['level_of_study'] = 99

total_headcount_data = {}

for year in years:
    params['year'] = year
    print(f'Getting {year} data.')
    response = requests.get(ipeds_baseurl, params=params)
    data = response.json()
    total_headcount_data[f'{year}'] = pd.DataFrame(data['results'])['headcount'].sum()

total_ps_series = pd.Series(total_headcount_data, name='Total Enrolled in Postsecondary')
total_ps_series

Getting 2017 data.
Getting 2018 data.
Getting 2019 data.
Getting 2020 data.
Getting 2021 data.


2017    26887067
2018    26685592
2019    26685592
2020    26497087
2021    25762172
Name: Total Enrolled in Postsecondary, dtype: int64

In [5]:
ps_df = pd.concat([
    undergrad_series,
    total_ps_series],
    axis=1
)
ps_df

Unnamed: 0,Enrolled in Undergrad,Total Enrolled in Postsecondary
2017,22962495,26887067
2018,22723047,26685592
2019,22723047,26685592
2020,22525259,26497087
2021,21687926,25762172


In [6]:
ps_df['Enrolled Past Undergrad'] = ps_df['Total Enrolled in Postsecondary']-ps_df['Enrolled in Undergrad']
ps_df = ps_df[['Enrolled in Undergrad', 'Enrolled Past Undergrad', 'Total Enrolled in Postsecondary']]
ps_df

Unnamed: 0,Enrolled in Undergrad,Enrolled Past Undergrad,Total Enrolled in Postsecondary
2017,22962495,3924572,26887067
2018,22723047,3962545,26685592
2019,22723047,3962545,26685592
2020,22525259,3971828,26497087
2021,21687926,4074246,25762172


## Completed Higher Ed counts

In [4]:
baseurl = 'https://educationdata.urban.org/api/v1/college-university/ipeds/completers/summaries?'

params = {
    'var': 'completers',
    'stat': 'sum',
    'by': 'unitid',
    'year': ''
}

years = ['2017', '2018', '2019', '2020', '2021']
completers_data = {}

for year in years:
    params['year'] = year
    print(f'Getting {year} data.')
    response = requests.get(baseurl, params=params)
    data = response.json()['results']
    df = pd.DataFrame(data)
    completers_data[f'{year}'] = df['completers'].sum()

completers_series = pd.Series(completers_data, name='Completed Higher Ed')
completers_series

# 2017    4720982
# 2018    4765256
# 2019    4742858
# 2020    4742858
# 2021    4843181

Getting 2017 data.
Getting 2018 data.
Getting 2019 data.
Getting 2020 data.
Getting 2021 data.


2017    4720982
2018    4765256
2019    4742858
2020    4742858
2021    4843181
Name: Completed Higher Ed, dtype: int64

## Fall Retention Rates

In [7]:
baseurl = 'https://educationdata.urban.org/api/v1/college-university/ipeds/fall-retention/summaries?'

params = {
    'var': 'retention_rate',
    'stat': 'avg',
    'by': 'ftpt',
    'year': ''
}

years = ['2017', '2018', '2019', '2020']
retention_data = {}

for year in years:
    params['year'] = year
    print(f'Getting {year} data.')
    response = requests.get(baseurl, params=params)
    data = response.json()['results']
    datadict = {}
    datadict['Full-Time Fall Retention'] = data[0]['retention_rate']
    datadict['Part-Time Fall Retention'] = data[1]['retention_rate']
    datadict['Avg Fall Retention'] = data[2]['retention_rate']
    retention_data[f'{year}'] = datadict

retention_df = pd.DataFrame(retention_data)
retention_df = retention_df.transpose()
retention_df

Getting 2017 data.
Getting 2018 data.
Getting 2019 data.
Getting 2020 data.
Getting 2020 data.


Unnamed: 0,Full-Time Fall Retention,Part-Time Fall Retention,Avg Fall Retention
2017,0.704055,0.505465,0.683817
2018,0.706103,0.509421,0.68701
2019,0.714044,0.51685,0.696807
2020,0.706521,0.510495,0.688849


## K-12 enrollment data

In [8]:
baseurl = 'https://educationdata.urban.org/api/v1/schools/ccd/enrollment/summaries?'

params = {
    'var': 'enrollment',
    'stat': 'sum',
    'by': 'grade',
    'year': ''
}

years = ['2017', '2018', '2019', '2020']
enrollment_by_grade = {}

for year in years:
    params['year'] = year
    print(f'Getting {year} data.')
    response = requests.get(baseurl, params=params)
    data = response.json()
    enrollment = {}
    for x in data['results']:
        grade = x['grade']
        enrollment[f'{grade}'] = x['enrollment']
    enrollment_by_grade[f'{year}'] = enrollment

Getting 2017 data.
Getting 2018 data.
Getting 2019 data.
Getting 2020 data.


In [9]:
grade_numbers = ['-1','1','2','3','4','5','6','7','8','9','10','11','12']
grade_strings = ['Enrolled in K', 'Enrolled in 1st', 'Enrolled in 2nd', 'Enrolled in 3rd', 'Enrolled in 4th', 'Enrolled in 5th', 'Enrolled in 6th', 'Enrolled in 7th', 'Enrolled in 8th', 'Enrolled in 9th', 'Enrolled in 10th', 'Enrolled in 11th', 'Enrolled in 12th']
grade_dict = dict(zip(grade_numbers, grade_strings))

k_12_enrollmentdata = pd.DataFrame(enrollment_by_grade)
k_12_enrollmentdata = k_12_enrollmentdata.transpose()
k_12_enrollmentdata = k_12_enrollmentdata[grade_numbers]
k_12_enrollmentdata = k_12_enrollmentdata.rename(columns=grade_dict)

k_12_enrollmentdata

Unnamed: 0,Enrolled in K,Enrolled in 1st,Enrolled in 2nd,Enrolled in 3rd,Enrolled in 4th,Enrolled in 5th,Enrolled in 6th,Enrolled in 7th,Enrolled in 8th,Enrolled in 9th,Enrolled in 10th,Enrolled in 11th,Enrolled in 12th
2017,1255505,3697231,3711904,3816334,3887802,3905590,3854926,3805355,3798345,4018689,3855133,3695341,3640445
2018,1318877,3666344,3678424,3732953,3802452,3901341,3916316,3872358,3811121,4025850,3866284,3667744,3654021
2019,1360941,3669432,3661781,3709415,3728758,3824609,3919004,3940340,3884860,4065478,3885743,3685420,3624904
2020,1080497,3538666,3545463,3568901,3624353,3663730,3765069,3878814,3907716,4031746,3912087,3714309,3665497


In [18]:
complete_ps_df = pd.concat([ps_df,
                            completers_series,
                            retention_df],
                           axis=1)
complete_ps_df.index.name = 'Year'
complete_ps_df.to_csv("data_output/postsecondary_data.csv", encoding="utf-8", index=True)

In [19]:
k_12_enrollmentdata.index.name = 'Year'
k_12_enrollmentdata.to_csv("data_output/k_12_data.csv", encoding="utf-8", index=True)