# collegescorecard.ed.gov

To run this code, you'll need to get an [API key](https://collegescorecard.ed.gov/data/api-documentation/) and store it in `../../keys/data_gov_api.txt`.

In [16]:
import pandas as pd
import requests
import time

In [2]:
with open("../../keys/data_gov_api.txt") as f:
    api_key = f.read().strip()

In [6]:
def get_scorecards_by_state(state: str = "CA") -> list[dict]: 
    """ 
    Sends a `GET` request to the College Scorecard API to retrieve a list of 
    colleges in a given state.

    :param state: The state to retrieve colleges from. Defaults to "CA".
    :return list: A list of dictionaries containing the college data.
    """
    
    url = "https://api.data.gov/ed/collegescorecard/v1/schools"
    params = {
        "api_key": api_key,
        "school.state": state,
        "page": 1,
        "per_page": 100
    }

    time.sleep(1)
    response = requests.get(url, params=params)
    print(response.json()["metadata"])
    data = response.json()
    
    number_of_pages = data["metadata"]["total"] // data["metadata"]["per_page"] + 1

    for page in range(2, number_of_pages + 1):
        time.sleep(1)
        params["page"] = page
        response = requests.get(url, params=params)
        print(response.json()["metadata"])
        data["results"] += response.json()["results"]

    return data["results"]

In [None]:
# ETC < 40 seconds

data = get_scorecards_by_state()

{'page': 1, 'total': 688, 'per_page': 100}
{'page': 2, 'total': 688, 'per_page': 100}
{'page': 3, 'total': 688, 'per_page': 100}
{'page': 4, 'total': 688, 'per_page': 100}
{'page': 5, 'total': 688, 'per_page': 100}
{'page': 6, 'total': 688, 'per_page': 100}
{'page': 7, 'total': 688, 'per_page': 100}


## Exploring the Scorecard Data

In [10]:
to_examine = data[0]
to_examine.keys()

dict_keys(['latest', 'school', 'location', 'id', 'ope6_id', 'ope8_id', 'fed_sch_cd'])

In [12]:
# Second level keys
for key in to_examine.keys():
    if isinstance(to_examine.get(key), dict):
        print(f"{key}: {to_examine.get(key).keys()}")
    else:
        print(f"{key}: {type(to_examine.get(key))}")

latest: dict_keys(['school', 'student', 'cost', 'aid', 'earnings', 'completion', 'repayment', 'admissions', 'academics', 'programs'])
school: dict_keys(['zip', 'city', 'name', 'alias', 'state', 'locale', 'address', 'dolflag', 'branches', 'men_only', 'operating', 'ownership', 'region_id', 'accreditor', 'school_url', 'women_only', 'main_campus', 'online_only', 'endowment', 'carnegie_basic', 'faculty_salary', 'ownership_peps', 'accreditor_code', 'ft_faculty_rate', 'carnegie_undergrad', 'degree_urbanization', 'under_investigation', 'price_calculator_url', 'carnegie_size_setting', 'minority_serving', 'religious_affiliation', 'open_admissions_policy', 'title_iv', 'degrees_awarded', 'tuition_revenue_per_fte', 'instructional_expenditure_per_fte', 'institutional_characteristics'])
location: dict_keys(['lat', 'lon'])
id: <class 'int'>
ope6_id: <class 'str'>
ope8_id: <class 'str'>
fed_sch_cd: <class 'str'>


In [13]:
# Even more keys
for key in to_examine.get("latest").keys():
    if isinstance(to_examine.get("latest").get(key), dict):
        print(f"latest.{key}: {to_examine.get("latest").get(key).keys()}")
    else:
        print(f"latest.{key}: {type(to_examine.get("latest").get(key))}")

latest.school: dict_keys(['zip', 'city', 'name', 'alias', 'state', 'locale', 'address', 'dolflag', 'branches', 'men_only', 'operating', 'ownership', 'region_id', 'accreditor', 'school_url', 'state_fips', 'women_only', 'main_campus', 'online_only', 'endowment', 'carnegie_basic', 'faculty_salary', 'ownership_peps', 'peps_ownership', 'accreditor_code', 'ft_faculty_rate', 'carnegie_undergrad', 'degree_urbanization', 'under_investigation', 'price_calculator_url', 'carnegie_size_setting', 'minority_serving', 'religious_affiliation', 'open_admissions_policy', 'title_iv', 'degrees_awarded', 'tuition_revenue_per_fte', 'instructional_expenditure_per_fte', 'institutional_characteristics'])
latest.student: dict_keys(['size', 'grad_students', 'enrollment', 'share_25_older', 'part_time_share', 'demographics', 'FAFSA_applications', 'fafsa_sent', 'part_time_share_2000', 'family_income', 'share_firstgeneration', 'parents_education_level', 'share_lowincome', 'valid_dependency_status', 'students_with_pel

## Wrangle Data

In [14]:
def flatten_dict(d: dict, parent_key: str = '', sep: str = '_') -> dict:
    """ 
    Flattens a nested dictionary with recursion. New keys will be the 
    concatenation of the parent key and the current key, separated by `sep`.

    :param d: The dictionary to flatten.
    :param parent_key: The base key to use for the new keys. Defaults to an 
        empty string.
    :param sep: The separator to use between the parent key and the current key.
        Defaults to "_".
    :return dict: A flattened dictionary.
    """

    items = []
    for k, v in d.items():
        new_key = f"{parent_key}{sep}{k}" if parent_key else k
        if isinstance(v, dict):
            items.extend(flatten_dict(v, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)

In [17]:
# Gather the student data from each college and transform it to a dataframe.

student = pd.DataFrame()
for college in data:
    df_temp = pd.DataFrame.from_dict(
        flatten_dict(college.get("latest").get("student")),
        orient="index"
    ).T
    df_temp["college"] = college.get("school").get("name")
    df_temp = df_temp[
        ["college"] + [col for col in df_temp.columns if col != "college"]
    ]
    student = pd.concat([student, df_temp])

student.head()

  student = pd.concat([student, df_temp])
  student = pd.concat([student, df_temp])
  student = pd.concat([student, df_temp])
  student = pd.concat([student, df_temp])
  student = pd.concat([student, df_temp])
  student = pd.concat([student, df_temp])


Unnamed: 0,college,size,grad_students,enrollment_all,enrollment_grad_12_month,enrollment_undergrad_12_month,share_25_older,part_time_share,demographics_men,demographics_women,...,share_independent_middleincome_48001_75000,undergrads_with_pell_grant_or_federal_student_loan,retention_rate_suppressed_four_year_full_time_pooled,retention_rate_suppressed_four_year_part_time_pooled,retention_rate_suppressed_lt_four_year_full_time_pooled,retention_rate_suppressed_lt_four_year_part_time_pooled,dcs_undergrads_with_pell_grant_or_federal_student_loan,ftft_undergrads_with_pell_grant_or_federal_student_loan,dcs_undergrads_with_pell_grant_or_federal_student_loan_pooled,ftft_undergrads_with_pell_grant_or_federal_student_loan_pooled
0,De Anza College,14294.0,,,,26779.0,0.3012,0.4638,0.5126,0.4874,...,0.03537,16707.0,,,0.7838,0.3722,14711.0,2418.0,14711.0,4926.0
0,International School of Beauty Inc,186.0,,,,288.0,0.3962,0.0,0.2849,0.7151,...,,288.0,,,0.8438,,288.0,94.0,288.0,184.0
0,Dell'Arte International School of Physical The...,8.0,,,,13.0,0.7692,0.0,0.125,0.875,...,,14.0,,,,,14.0,1.0,14.0,5.0
0,College of the Desert,8664.0,,,,14068.0,0.3241,0.5652,0.4198,0.5802,...,0.053652,10229.0,,,0.6572,0.4126,8634.0,1095.0,8634.0,2383.0
0,Design Institute of San Diego,90.0,11.0,,12.0,133.0,0.5882,0.2111,0.0889,0.9111,...,,119.0,,,,,119.0,3.0,119.0,17.0


In [25]:
for col in student.columns:
    print(f"{col}: {len(student[col].unique())} unique values")

college: 583 unique values
size: 416 unique values
grad_students: 146 unique values
enrollment_all: 1 unique values
enrollment_grad_12_month: 144 unique values
enrollment_undergrad_12_month: 439 unique values
share_25_older: 460 unique values
part_time_share: 281 unique values
demographics_men: 452 unique values
demographics_women: 453 unique values
demographics_married: 322 unique values
demographics_veteran: 141 unique values
demographics_age_entry: 26 unique values
demographics_dependent: 348 unique values
demographics_faculty_men: 242 unique values
demographics_faculty_women: 242 unique values
demographics_faculty_race_ethnicity_aian: 89 unique values
demographics_faculty_race_ethnicity_nhpi: 73 unique values
demographics_faculty_race_ethnicity_asian: 215 unique values
demographics_faculty_race_ethnicity_black: 185 unique values
demographics_faculty_race_ethnicity_white: 245 unique values
demographics_faculty_race_ethnicity_unknown: 153 unique values
demographics_faculty_race_ethni

In [22]:
student.describe()

Unnamed: 0,size,grad_students,enrollment_all,enrollment_grad_12_month,enrollment_undergrad_12_month,share_25_older,part_time_share,demographics_men,demographics_women,demographics_married,...,share_independent_middleincome_48001_75000,undergrads_with_pell_grant_or_federal_student_loan,retention_rate_suppressed_four_year_full_time_pooled,retention_rate_suppressed_four_year_part_time_pooled,retention_rate_suppressed_lt_four_year_full_time_pooled,retention_rate_suppressed_lt_four_year_part_time_pooled,dcs_undergrads_with_pell_grant_or_federal_student_loan,ftft_undergrads_with_pell_grant_or_federal_student_loan,dcs_undergrads_with_pell_grant_or_federal_student_loan_pooled,ftft_undergrads_with_pell_grant_or_federal_student_loan_pooled
count,500.0,154.0,0.0,158.0,500.0,488.0,493.0,500.0,500.0,423.0,...,229.0,489.0,87.0,21.0,271.0,114.0,494.0,494.0,494.0,500.0
mean,2463.08,1220.116883,,1544.772152,4158.99,0.484098,0.25137,0.354635,0.639366,0.180435,...,0.070392,2988.492843,0.776523,0.4712,0.725855,0.502738,2618.370445,346.09919,2618.370445,689.794
std,4802.840436,2969.434809,,3656.148445,8125.75558,0.228898,0.311442,0.236904,0.24049,0.113043,...,0.036918,5556.252181,0.134729,0.199843,0.130985,0.210352,4878.093473,589.09066,4878.093473,1149.068261
min,0.0,1.0,,2.0,1.0,0.0011,0.0,0.0,0.0,0.005046,...,0.019523,2.0,0.3029,0.0857,0.2258,0.1446,0.0,0.0,0.0,0.0
25%,99.0,96.5,,105.0,176.0,0.329375,0.0,0.156175,0.524575,0.112816,...,0.04878,175.0,0.7123,0.4032,0.665,0.3638,158.5,24.25,158.5,59.0
50%,333.0,232.0,,295.5,522.5,0.4636,0.0686,0.35535,0.6386,0.156969,...,0.057082,502.0,0.8,0.4545,0.7297,0.42925,484.0,103.5,484.0,204.0
75%,1751.75,848.5,,1088.5,2494.5,0.6365,0.5437,0.4739,0.8395,0.213254,...,0.085973,2246.0,0.8792,0.5882,0.8158,0.626875,2205.75,426.25,2205.75,894.5
max,31810.0,28246.0,,31467.0,49266.0,1.0,1.0,1.0,1.0,0.653631,...,0.210055,33084.0,0.9829,0.85,0.9888,1.0,31534.0,5233.0,31534.0,9973.0
