In [214]:
from sqlalchemy import create_engine
import os
import time
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load API key for data.gov

with open('data_gov_api.txt', 'r') as file:
    api_key = file.read().split('\n')[0] # Remove new line



## Helper functions

In [2]:
def find_max_string_length(pd_series):
    """
    For string column in pandas Series, find the maximum length of thes string
    """
    return pd_series.map(lambda x: len(str(x))).max()

## Download College Scorecard Data Dictionary

In [100]:
data_dir = os.path.join(os.path.curdir, "data")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

data_dict_url = "https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary.xlsx"

data_dict_file = data_dict_url.split('/')[-1]

data_dict_path = os.path.join(data_dir, data_dict_file)

if not os.path.exists(data_dict_path):
    req = requests.get(data_dict_url)
    
    with open(data_dict_path, 'wb') as file:
        file.write(req.content)


## Load into memory and create table to define variables

In [101]:
sc_data_dict = pd.read_excel(data_dict_path, sheetname='data_dictionary')

In [102]:
sc_data_dict.iloc[300]

NAME OF DATA ELEMENT       Percentage of degrees awarded in Architecture ...
dev-category                                                       academics
developer-friendly name                      program_percentage.architecture
API data type                                                          float
VARIABLE NAME                                                         PCIP04
VALUE                                                                    NaN
LABEL                                      Architecture and Related Services
SOURCE                                                                 IPEDS
NOTES                                        Shown/used on consumer website.
Name: 300, dtype: object

In [103]:
sc_data_dict.iloc[25:28, :]

Unnamed: 0,NAME OF DATA ELEMENT,dev-category,developer-friendly name,API data type,VARIABLE NAME,VALUE,LABEL,SOURCE,NOTES
25,Control of institution,school,ownership,integer,CONTROL,1.0,Public,IPEDS,Shown/used on consumer website.
26,,school,,,,2.0,Private nonprofit,,
27,,school,,,,3.0,Private for-profit,,


In [104]:
sc_data_dict_nv = sc_data_dict.drop(['VALUE', 'LABEL'], axis = 1)

In [105]:
sc_data_dict_nv.columns = ['Name', 'DevCategory', 'DeveloperName', 'DataType', 'VarName', 'Source', 'Notes']

In [106]:
sc_data_dict_nv = sc_data_dict_nv[sc_data_dict_nv['Name'].notnull()]

In [107]:
sc_data_dict_nv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1734 entries, 0 to 1974
Data columns (total 7 columns):
Name             1734 non-null object
DevCategory      1734 non-null object
DeveloperName    1734 non-null object
DataType         1734 non-null object
VarName          1734 non-null object
Source           1734 non-null object
Notes            274 non-null object
dtypes: object(7)
memory usage: 108.4+ KB


### Find maximum string length in each column so we can define SQL table

In [108]:
string_length_dict = dict()
for column in sc_data_dict_nv.columns:
    string_length_dict[column] = find_max_string_length(sc_data_dict_nv[column])

In [109]:
sc_data_dict_nv.to_csv('datadef.csv', index=False)

In [110]:

query = """CREATE TABLE IF NOT EXISTS datadefinitions (
"""
for column, length in string_length_dict.items():
    query += column + " varchar("+str(length)+") DEFAULT NULL,\n"
    
query += """
PRIMARY KEY (developername)
);"""

In [111]:
conn = create_engine('postgresql://ubuntu@52.53.236.232:5432/collegesc')

In [112]:
conn.execute(query)

<sqlalchemy.engine.result.ResultProxy at 0x7f54e18d8358>

In [130]:
query2 = """COPY datadefinitions FROM '/home/ubuntu/Notebooks/CollegeClassification/datadef.csv' DELIMITER ',' CSV HEADER;"""

In [131]:
conn.execute(query2)

<sqlalchemy.engine.result.ResultProxy at 0x7f54e2e7eda0>

In [132]:
query3 = """SELECT * FROM datadefinitions;"""

In [133]:
data_definitions = pd.read_sql_query(query3, conn)

## Use API to gather data for 2013

In [134]:
data_definitions.devcategory.unique() # categories

array(['root', 'school', 'admissions', 'academics', 'student', 'cost',
       'completion', 'aid', 'repayment', 'earnings'], dtype=object)

In [135]:
data_definitions[data_definitions.devcategory == 'earnings'].head()

Unnamed: 0,name,devcategory,developername,datatype,varname,source,notes
1594,Count of students in the earnings cohort,earnings,student_count,integer,COUNT_ED,Treasury,
1627,Number of students not working and not enrolle...,earnings,10_yrs_after_entry.not_working_not_enrolled.ov...,integer,COUNT_NWNE_P10,Treasury,
1628,Number of students working and not enrolled 10...,earnings,10_yrs_after_entry.working_not_enrolled.overall,integer,COUNT_WNE_P10,Treasury,
1629,Mean earnings of students working and not enro...,earnings,10_yrs_after_entry.working_not_enrolled.mean_e...,integer,MN_EARN_WNE_P10,Treasury,
1630,Median earnings of students working and not en...,earnings,10_yrs_after_entry.median,integer,MD_EARN_WNE_P10,Treasury,Shown/used on consumer website.


In [56]:
earnings_data = list(data_definitions[data_definitions.devcategory == 'earnings'].developername.values)[27:53]
earnings_data

['6_yrs_after_entry.not_working_not_enrolled.overall',
 '6_yrs_after_entry.working_not_enrolled.overall',
 '6_yrs_after_entry.working_not_enrolled.mean_earnings',
 '6_yrs_after_entry.median',
 '6_yrs_after_entry.working_not_enrolled.earnings_percentile.10',
 '6_yrs_after_entry.working_not_enrolled.earnings_percentile.25',
 '6_yrs_after_entry.working_not_enrolled.earnings_percentile.75',
 '6_yrs_after_entry.working_not_enrolled.earnings_percentile.90',
 '6_yrs_after_entry.working_not_enrolled.std_dev',
 '6_yrs_after_entry.working_not_enrolled.income.lowest_tercile',
 '6_yrs_after_entry.working_not_enrolled.income.middle_tercile',
 '6_yrs_after_entry.working_not_enrolled.income.highest_tercile',
 '6_yrs_after_entry.working_not_enrolled.dependent_students_lowest_tercile',
 '6_yrs_after_entry.working_not_enrolled.dependent_students',
 '6_yrs_after_entry.independent_students',
 '6_yrs_after_entry.female_students',
 '6_yrs_after_entry.male_students',
 '6_yrs_after_entry.percent_greater_than_

In [57]:
earnings_string = ',2013.earnings.'.join(earnings_data)
earnings_string

'6_yrs_after_entry.not_working_not_enrolled.overall,2013.earnings.6_yrs_after_entry.working_not_enrolled.overall,2013.earnings.6_yrs_after_entry.working_not_enrolled.mean_earnings,2013.earnings.6_yrs_after_entry.median,2013.earnings.6_yrs_after_entry.working_not_enrolled.earnings_percentile.10,2013.earnings.6_yrs_after_entry.working_not_enrolled.earnings_percentile.25,2013.earnings.6_yrs_after_entry.working_not_enrolled.earnings_percentile.75,2013.earnings.6_yrs_after_entry.working_not_enrolled.earnings_percentile.90,2013.earnings.6_yrs_after_entry.working_not_enrolled.std_dev,2013.earnings.6_yrs_after_entry.working_not_enrolled.income.lowest_tercile,2013.earnings.6_yrs_after_entry.working_not_enrolled.income.middle_tercile,2013.earnings.6_yrs_after_entry.working_not_enrolled.income.highest_tercile,2013.earnings.6_yrs_after_entry.working_not_enrolled.dependent_students_lowest_tercile,2013.earnings.6_yrs_after_entry.working_not_enrolled.dependent_students,2013.earnings.6_yrs_after_entry

In [165]:
collegesc_api = 'https://api.data.gov/ed/collegescorecard/v1/schools.json'
payload = {'api_key' : api_key, '_fields' : 'id,school.name,2013.earnings.'+earnings_string} #'school.ownership' : 3,

req = requests.get(collegesc_api, params=payload)

In [166]:
req.json()['results'][0]

{'2013.earnings.6_yrs_after_entry.female_students': None,
 '2013.earnings.6_yrs_after_entry.independent_students': None,
 '2013.earnings.6_yrs_after_entry.male_students': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.dependent_students': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.dependent_students_lowest_tercile': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.female_students': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.highest_tercile': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.independent_students': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.lowest_tercile': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.male_students': None,
 '2013.earnings.6_yrs_after_entry.mean_earnings.middle_tercile': None,
 '2013.earnings.6_yrs_after_entry.median': None,
 '2013.earnings.6_yrs_after_entry.not_working_not_enrolled.overall': None,
 '2013.earnings.6_yrs_after_entry.percent_greater_than_25000': None,
 '2013.earnings.6_yrs_after_entr

This part of the data is relatively new, so it is not available for most schools.

In [97]:
data_definitions[data_definitions.devcategory == 'school'].head(20)

Unnamed: 0,name,devcategory,developername,datatype,varname,source,notes
3,Institution name,school,name,autocomplete,INSTNM,IPEDS,Shown/used on consumer website.
4,City,school,city,autocomplete,CITY,IPEDS,Shown/used on consumer website.
5,State postcode,school,state,string,STABBR,IPEDS,Shown/used on consumer website.
6,ZIP code,school,zip,integer,ZIP,IPEDS,
7,Accreditor for institution,school,accreditor,string,ACCREDAGENCY,FSA,
8,URL for institution's homepage,school,school_url,string,INSTURL,IPEDS,Shown/used on consumer website.
9,URL for institution's net price calculator,school,price_calculator_url,string,NPCURL,IPEDS,Shown/used on consumer website.
10,Predominant degree awarded (recoded 0s and 4s),school,degrees_awarded.predominant_recoded,integer,SCH_DEG,IPEDS/NSLDS,"Missing values, 0s, and 4s from PREDDEG recode..."
11,Schools that are on Heightened Cash Monitoring...,school,under_investigation,integer,HCM2,FSA,Shown/used on consumer website; Flag (1=HCM2)
12,Flag for main campus,school,main_campus,integer,MAIN,IPEDS,


In [75]:
school_data = list(data_definitions[data_definitions.devcategory == 'school'].developername.values)

In [77]:
school_data[:15]

['name',
 'city',
 'state',
 'zip',
 'accreditor',
 'school_url',
 'price_calculator_url',
 'degrees_awarded.predominant_recoded',
 'under_investigation',
 'main_campus',
 'branches',
 'degrees_awarded.predominant',
 'degrees_awarded.highest',
 'ownership',
 'state_fips']

In [86]:
school_string = ",school.".join(school_data)

In [87]:
school_string

'name,school.city,school.state,school.zip,school.accreditor,school.school_url,school.price_calculator_url,school.degrees_awarded.predominant_recoded,school.under_investigation,school.main_campus,school.branches,school.degrees_awarded.predominant,school.degrees_awarded.highest,school.ownership,school.state_fips,school.region_id,school.locale,school.degree_urbanization,school.carnegie_basic,school.carnegie_undergrad,school.carnegie_size_setting,school.minority_serving.historically_black,school.minority_serving.predominantly_black,school.minority_serving.annh,school.minority_serving.tribal,school.minority_serving.aanipi,school.minority_serving.hispanic,school.minority_serving.nant,school.men_only,school.women_only,school.religious_affiliation,school.online_only,school.operating,school.tuition_revenue_per_fte,school.instructional_expenditure_per_fte,school.faculty_salary,school.ft_faculty_rate,school.alias,school.institutional_characteristics.level'

In [140]:
payload = {'api_key' : api_key, '_fields' : 'id,school.'+school_string, '_per_page' : 50} #'school.ownership' : 3,

req = requests.get(collegesc_api, params=payload)

In [153]:
school = pd.DataFrame(req.json()["results"])

In [145]:
metadata = req.json()["metadata"]

In [156]:
metadata

{'page': 0, 'per_page': 50, 'total': 7703}

In [147]:
metadata['total'] // metadata['per_page']

154

In [154]:
school.columns = school.columns.str.replace("school.", "")

In [155]:
school.head()

Unnamed: 0,id,accreditor,alias,branches,carnegie_basic,carnegie_size_setting,carnegie_undergrad,city,degree_urbanization,degrees_awarded.highest,...,price_calculator_url,region_id,religious_affiliation,url,state,state_fips,tuition_revenue_per_fte,under_investigation,women_only,zip
0,121983,Western Association of Schools and Colleges Se...,,28,26,6,5,Alameda,,4,...,tcc.noellevitz.com/edmc/Argosy University Net ...,8,-2,www.argosy.edu/sanfrancisco,CA,6,12380,0,0,94501
1,120838,Accrediting Council for Independent Colleges a...,,1,29,6,11,Los Angeles,,4,...,www.psuca.edu/node/100,8,-2,www.psuca.edu,CA,6,13784,0,0,90010
2,434973,North Central Association of Colleges and Scho...,,38,29,6,11,Columbia,,4,...,www.phoenix.edu/tuition_and_financial_options/...,2,-2,www.phoenix.edu,MD,24,10131,0,0,21045
3,436012,,,2,-2,-2,-2,Hempstead,,1,...,www.franklincareer.edu,2,-2,www.franklincareer.edu,NY,36,6935,0,0,11550
4,436021,Montessori Accreditation Council for Teacher E...,,1,-2,-2,-2,Milwaukee,,0,...,www.montessori6-12ami.org,3,-2,www.montessori6-12ami.org,WI,55,6063,0,0,53207


In [229]:
def call_collegesc_api(category, api_key, conn, per_page = 50, year = 2013):
    """
    category: the category to be downloaded
    api_key: key for the college sc api
    conn: sql_alchemy engine for querying data definitions
    per_page: number of entries per api call (max: 100)
    year: year to be called (default: 2013)
    calls the college scordcard api given a particular category
    and returns a dataframe transformed from json
    """
    sql_query = """SELECT
                    developername
                FROM 
                    datadefinitions 
                WHERE devcategory ='"""
    
    sql_query += category+"';"
    
    fields = [fields[0] for fields in conn.execute(sql_query).fetchall()]
    
    if(category == "school"):
        join_string = "," + category + "."
        fields_string = "id" + join_string + join_string.join(fields)
    else:
        join_string = "," + str(year) + "." + category + "."
        fields_string = "id" + join_string + join_string.join(fields)
    
    
    college_sc_url = 'https://api.data.gov/ed/collegescorecard/v1/schools.json'
    api_query = {'api_key' : api_key, '_fields' : fields_string, '_per_page' : per_page}
    
    req = requests.get(college_sc_url, params=api_query)
    
    pages = req.json()["metadata"]["total"] // per_page
    
    category_df = pd.DataFrame(req.json()["results"])
    
    for page in range(1, pages+1):
        
        api_query["_page"] = page
        
        req = requests.get(college_sc_url, params=api_query)
        
        temp_df = pd.DataFrame(req.json()["results"])
        
        category_df = category_df.append(temp_df)
        
        time.sleep(1)
    
    category_df = category_df.reset_index()
    
    if(category == "school"):
        category_df.columns = category_df.columns.str.replace(category+".", "")
    else:
        category_df.columns = category_df.columns.str.replace(str(year)+"."+category+".", "")
        
    
    return category_df
    

In [216]:
school_df = call_collegesc_api('school', api_key, conn)

In [224]:
school_df = school_df.reset_index()

In [227]:
school_df.columns = school_df.columns.str.replace("school.", "")

In [230]:
category_file = os.path.join(data_dir, "school.csv")
school_df.to_csv(category_file)

In [170]:
category = "school"
sql_query = """SELECT
                developername
            FROM 
                datadefinitions 
            WHERE devcategory ='"""

sql_query += category+"';"

In [171]:
sql_query

"SELECT\n                developername\n            FROM \n                datadefinitions \n            WHERE devcategory ='school';"

In [172]:
test = pd.read_sql_query(sql_query, conn)

In [177]:
fields = [fields[0] for fields in conn.execute(sql_query).fetchall()]

In [178]:
fields

['name',
 'city',
 'state',
 'zip',
 'accreditor',
 'school_url',
 'price_calculator_url',
 'degrees_awarded.predominant_recoded',
 'under_investigation',
 'main_campus',
 'branches',
 'degrees_awarded.predominant',
 'degrees_awarded.highest',
 'ownership',
 'state_fips',
 'region_id',
 'locale',
 'degree_urbanization',
 'carnegie_basic',
 'carnegie_undergrad',
 'carnegie_size_setting',
 'minority_serving.historically_black',
 'minority_serving.predominantly_black',
 'minority_serving.annh',
 'minority_serving.tribal',
 'minority_serving.aanipi',
 'minority_serving.hispanic',
 'minority_serving.nant',
 'men_only',
 'women_only',
 'religious_affiliation',
 'online_only',
 'operating',
 'tuition_revenue_per_fte',
 'instructional_expenditure_per_fte',
 'faculty_salary',
 'ft_faculty_rate',
 'alias',
 'institutional_characteristics.level']