Commit

Merge 8b6e08d into 642bb7f

higs4281 committed Nov 8, 2018
2 parents 642bb7f + 8b6e08d commit af96144

Showing 11 changed files with 36,929 additions and 822,948 deletions.
7,154 changes: 7,154 additions & 0 deletions paying_for_college/data_sources/ipeds/hd2017.csv

7,154 changes: 7,154 additions & 0 deletions paying_for_college/data_sources/ipeds/hd2017_cleaned.csv

6,882 changes: 6,882 additions & 0 deletions paying_for_college/data_sources/ipeds/ic2017.csv

4,282 changes: 4,282 additions & 0 deletions paying_for_college/data_sources/ipeds/ic2017_ay.csv

4,282 changes: 4,282 additions & 0 deletions paying_for_college/data_sources/ipeds/ic2017_ay_cleaned.csv

6,882 changes: 6,882 additions & 0 deletions paying_for_college/data_sources/ipeds/ic2017_cleaned.csv

216 changes: 216 additions & 0 deletions paying_for_college/data_sources/ipeds/schools_added_on_2018-09-27.csv

137 changes: 54 additions & 83 deletions paying_for_college/disclosures/scripts/api_utils.py
@@ -14,70 +14,48 @@
"""
from __future__ import print_function
import os
import sys
import json
import datetime
from copy import copy
from decimal import Decimal

import requests

from paying_for_college.models import ConstantCap

try:
LATEST_YEAR = ConstantCap.objects.get(slug='apiYear').value
except: # pragma: no cover
LATEST_YEAR = 2013
try:
LATEST_SALARY_YEAR = ConstantCap.objects.get(slug='salaryYear').value
except: # pragma: no cover
LATEST_SALARY_YEAR = 2011

try:
API_KEY = os.getenv('ED_API_KEY')
except: # pragma: no cover
API_KEY = '0123456789' * 4
API_KEY = os.getenv('ED_API_KEY', '')
API_ROOT = "https://api.data.gov/ed/collegescorecard/v1"
SCHOOLS_ROOT = "{0}/schools".format(API_ROOT)
# SCHEMA_ROOT = "{0}/data.json".format(API_ROOT)
SCHOOLS_ROOT = "{}/schools".format(API_ROOT)
PAGE_MAX = 100 # the max page size allowed as of 2015-09-14

MODEL_MAP = {
'ope6_id': 'ope6_id',
'ope8_id': 'ope8_id',
'{0}.student.size'.format(LATEST_YEAR): 'enrollment',
'latest.student.size': 'enrollment',
'school.accreditor': 'accreditor',
'school.school_url': 'url',
'school.degrees_awarded.predominant': 'degrees_predominant', # data guide says this is INDICATORGROUP
'school.degrees_awarded.predominant': 'degrees_predominant', # data guide says this is INDICATORGROUP # noqa
'school.degrees_awarded.highest': 'degrees_highest',
'school.ownership': 'ownership',
'school.main_campus': 'main_campus',
# 'school.branches', # ??
'school.online_only': 'online_only',
'school.operating': 'operating',
'school.under_investigation': 'under_investigation',
'school.zip': 'zip5',
'{0}.completion.completion_rate_4yr_150nt_pooled'.format(LATEST_YEAR): 'grad_rate_4yr',
'{0}.completion.completion_rate_less_than_4yr_150nt_pooled'.format(LATEST_YEAR): 'grad_rate_lt4',
'{0}.repayment.repayment_cohort.3_year_declining_balance'.format(LATEST_YEAR): 'repay_3yr', # NEW
'{0}.repayment.3_yr_default_rate'.format(LATEST_YEAR): 'default_rate',
'{0}.aid.median_debt_suppressed.overall'.format(LATEST_YEAR): 'median_total_debt',
'{0}.aid.median_debt_suppressed.completers.monthly_payments'.format(LATEST_YEAR): 'median_monthly_debt', # NEW
'{0}.cost.avg_net_price.overall'.format(LATEST_YEAR): 'avg_net_price',
'{0}.cost.tuition.out_of_state'.format(LATEST_YEAR): 'tuition_out_of_state',
'{0}.cost.tuition.in_state'.format(LATEST_YEAR): 'tuition_in_state',
'{0}.earnings.10_yrs_after_entry.median'.format(LATEST_SALARY_YEAR): 'median_annual_pay',
'latest.completion.completion_rate_4yr_150nt_pooled': 'grad_rate_4yr',
'latest.completion.completion_rate_less_than_4yr_150nt_pooled': 'grad_rate_lt4', # noqa
'latest.repayment.repayment_cohort.3_year_declining_balance': 'repay_3yr',
'latest.repayment.3_yr_default_rate': 'default_rate',
'latest.aid.median_debt_suppressed.overall': 'median_total_debt',
'latest.aid.median_debt_suppressed.completers.monthly_payments': 'median_monthly_debt', # noqa
'latest.cost.avg_net_price.overall': 'avg_net_price',
'latest.cost.tuition.out_of_state': 'tuition_out_of_state',
'latest.cost.tuition.in_state': 'tuition_in_state',
'latest.earnings.10_yrs_after_entry.median': 'median_annual_pay',
}

# JSON_MAP = {
# # '{0}.student.retention_rate.four_year.full_time'.format(LATEST_YEAR): 'RETENTRATE',
# # '{0}.student.retention_rate.lt_four_year.full_time'.format(LATEST_YEAR): 'RETENTRATELT4', # NEW
# # 'latest.student.retention_rate.four_year.full_time': 'RETENTRATE',
# # 'latest.student.retention_rate.lt_four_year.full_time': 'RETENTRATELT4', # noqa
# }

BASE_FIELDS = [
'id',
'ope6_id',
# 'ope8_id',
'school.name',
'school.city',
'school.state',
@@ -110,7 +88,7 @@
'student.fafsa_sent.overall',
'student.fafsa_sent.1_college',
'student.fafsa_sent.2_colleges',
'student.fafsa_sent.3_college', # yes, should be 'colleges'
'student.fafsa_sent.3_college', # yes, should be 'colleges' but isn't
'student.fafsa_sent.4_colleges',
'student.fafsa_sent.5_or_more_colleges',
'student.fafsa_sent.2_college_allyrs',
@@ -165,43 +143,39 @@
]


def build_field_string(YEAR=LATEST_YEAR):
def build_field_string():
"""assemble fields for an api query"""
fields = BASE_FIELDS + ['{0}.{1}'.format(YEAR, field)
fields = BASE_FIELDS + ['latest.{}'.format(field)
for field in YEAR_FIELDS]
field_string = ",".join([field for field in fields])
return field_string


# def get_schools_by_page(year, page=0):
# def get_schools_by_page(page=0):
# """get a page of schools for a single year as dict"""
# field_string = build_fields_string(year)
# url = "{0}?api_key={1}&page={2}&per_page={3}&fields={4}".format(SCHOOLS_ROOT,
# API_KEY,
# page,
# PAGE_MAX,
# field_string)
# import json
# field_string = build_field_string()
# url = "latest?api_key={1}&page={2}&per_page={3}&fields={4}".format(
# SCHOOLS_ROOT, API_KEY, page, PAGE_MAX, field_string)
# data = json.loads(requests.get(url).text)
# return data


def search_by_school_name(name):
"""search api by school name, return school name, id, city, state"""
fields = "id,school.name,school.city,school.state"
url = "{0}?api_key={1}&school.name={2}&fields={3}".format(SCHOOLS_ROOT,
API_KEY,
name,
fields)
url = "{0}?api_key={1}&school.name={2}&fields={3}".format(
SCHOOLS_ROOT, API_KEY, name, fields)
data = requests.get(url).json()['results']
return data


# def get_all_school_ids():
# """traverse pages, assemble all school ids and names and output as json."""
# collector = {0}
# url = '{0}?api_key={1}&fields=id,school.name'.format(SCHOOLS_ROOT, API_KEY)
# """traverse pages, assemble all school ids and names and output as json.""" # noqa
# collector = latest
# url = 'latest?api_key={1}&fields=id,school.name'.format(SCHOOLS_ROOT, API_KEY) # noqa
# for page in range(1, 391):
# next_url = "{0}&page={1}".format(url, page)
# next_url = "latest&page={1}".format(url, page)
# nextdata = json.loads(requests.get(next_url).text)
# for entry in nextdata['results']:
# collector[entry['id']] = entry['school.name']
@@ -211,11 +185,10 @@ def search_by_school_name(name):

# def unpack_alias(alist, school):
# "create alias objects from a list of aliases"
# from paying_for_college.models import Alias
# for alias in alist:
# new, created = Alias.objects.get_or_create(alias=alias,
# institution=school,
# defaults={'is_primary':
# False})
# new, created = Alias.objects.get_or_create(
# alias=alias, institution=school, defaults={'is_primary': False})
# #example from Penn's alias string
# ALIST = [
# 'Penn',
Expand All @@ -238,30 +211,28 @@ def calculate_group_percent(group1, group2):


# USF = 137351
def get_repayment_data(school_id, year):
def get_repayment_data(school_id):
"""return metric on student debt repayment"""
school_id = "{0}".format(school_id)
entrylist = [
'{0}.repayment.3_yr_repayment_suppressed.overall',
'{0}.repayment.repayment_cohort.1_year_declining_balance',
'{0}.repayment.1_yr_repayment.completers',
'{0}.repayment.1_yr_repayment.noncompleters',
'{0}.repayment.repayment_cohort.3_year_declining_balance',
'{0}.repayment.3_yr_repayment.completers',
'{0}.repayment.3_yr_repayment.noncompleters',
'{0}.repayment.repayment_cohort.5_year_declining_balance',
'{0}.repayment.5_yr_repayment.completers',
'{0}.repayment.5_yr_repayment.noncompleters',
'{0}.repayment.repayment_cohort.7_year_declining_balance',
'{0}.repayment.7_yr_repayment.completers',
'{0}.repayment.7_yr_repayment.noncompleters']
fields = ",".join([entry.format(year) for entry in entrylist])
url = "{0}?id={1}&api_key={2}&fields={3}".format(SCHOOLS_ROOT,
school_id,
API_KEY,
fields)
'latest.repayment.3_yr_repayment_suppressed.overall',
'latest.repayment.repayment_cohort.1_year_declining_balance',
'latest.repayment.1_yr_repayment.completers',
'latest.repayment.1_yr_repayment.noncompleters',
'latest.repayment.repayment_cohort.3_year_declining_balance',
'latest.repayment.3_yr_repayment.completers',
'latest.repayment.3_yr_repayment.noncompleters',
'latest.repayment.repayment_cohort.5_year_declining_balance',
'latest.repayment.5_yr_repayment.completers',
'latest.repayment.5_yr_repayment.noncompleters',
'latest.repayment.repayment_cohort.7_year_declining_balance',
'latest.repayment.7_yr_repayment.completers',
'latest.repayment.7_yr_repayment.noncompleters']
fields = ",".join(entrylist)
url = "{0}?id={1}&api_key={2}&fields={3}".format(
SCHOOLS_ROOT, school_id, API_KEY, fields)
data = requests.get(url).json()['results'][0]
repay_completers = data['{0}.repayment.5_yr_repayment.completers'.format(year)]
repay_non = data['{0}.repayment.5_yr_repayment.noncompleters'.format(year)]
data['completer_repayment_rate_after_5_yrs'] = calculate_group_percent(repay_completers, repay_non)
repay_completers = data['latest.repayment.5_yr_repayment.completers']
repay_non = data['latest.repayment.5_yr_repayment.noncompleters']
data['completer_repayment_rate_after_5_yrs'] = calculate_group_percent(
repay_completers, repay_non)
return data
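
For orientation, a minimal usage sketch of the refactored helpers after this change. It assumes the paying_for_college package is importable and that ED_API_KEY is set in the environment; the school ID 137351 is the USF example already noted in the module, and the printed values are illustrative, not part of the commit.

# Illustrative sketch only -- not part of the committed code.
from paying_for_college.disclosures.scripts import api_utils

# build_field_string() no longer takes a year; year-specific metrics now use
# the API's 'latest.' prefix.
fields = api_utils.build_field_string()
print(fields.split(',')[:3])  # ['id', 'ope6_id', 'school.name']

# get_repayment_data() likewise drops its year argument; this performs a live
# request against the College Scorecard API.
repayment = api_utils.get_repayment_data(137351)
print(repayment['completer_repayment_rate_after_5_yrs'])
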
26 changes: 14 additions & 12 deletions paying_for_college/disclosures/scripts/update_colleges.py
@@ -11,16 +11,14 @@
import requests

from paying_for_college.disclosures.scripts import api_utils
from paying_for_college.disclosures.scripts.api_utils import (
MODEL_MAP, LATEST_YEAR, LATEST_SALARY_YEAR)
from paying_for_college.disclosures.scripts.api_utils import MODEL_MAP
from paying_for_college.models import School, CONTROL_MAP

DATESTAMP = datetime.datetime.now().strftime("%Y-%m-%d")
HOME = os.path.expanduser("~")
NO_DATA_FILE = "{0}/no_data_YEAR{1}_{2}.json".format(
HOME, LATEST_YEAR, DATESTAMP)
NO_DATA_FILE = "{}/no_data_{}.json".format(HOME, DATESTAMP)
SCRIPTNAME = os.path.basename(__file__).partition('.')[0]
ID_BASE = "{0}?api_key={1}".format(api_utils.SCHOOLS_ROOT, api_utils.API_KEY)
ID_BASE = "{}?api_key={}".format(api_utils.SCHOOLS_ROOT, api_utils.API_KEY)
FIELDS = sorted(MODEL_MAP.keys())
FIELDSTRING = ",".join(FIELDS)

@@ -41,9 +39,12 @@ def update(exclude_ids=[], single_school=None):
FAILED = [] # failed to get a good API response
NO_DATA = [] # API responded, but with no data
CLOSED = 0 # schools that have closed since our last scrape
START_MSG = "Requesting school data from {0} and salary data from {1}."
JOB_MSG = "The job is paced for the Ed API, so it can take an hour to run."
print(START_MSG.format(LATEST_YEAR, LATEST_SALARY_YEAR))
START_MSG = "Requesting latest school data."
JOB_MSG = (
"The job is paced to be friendly to the Scorecard API, "
"so it can take an hour to run.\n"
"A dot means a school was updated; a dash means no data found.")
print(START_MSG)
if not single_school:
print(JOB_MSG)
UPDATED = False
@@ -60,8 +61,6 @@
base_query = base_query.exclude(pk__in=exclude_ids)
for school in base_query:
PROCESSED += 1
sys.stdout.write('.')
sys.stdout.flush()
if PROCESSED % 500 == 0: # pragma: no cover
print("\n{0}\n".format(PROCESSED))
if PROCESSED % 5 == 0:
@@ -70,13 +69,15 @@
# print(url)
try:
resp = requests.get(url)
except:
except Exception:
FAILED.append(school)
continue
else:
if resp.ok is True:
raw_data = resp.json()
if raw_data and raw_data['results']:
sys.stdout.write('.')
sys.stdout.flush()
data = raw_data['results'][0]
for key in MODEL_MAP:
if key in data.keys() and data[key] is not None:
@@ -120,7 +121,7 @@ def update(exclude_ids=[], single_school=None):
len(FAILED),
CLOSED,
SCRIPTNAME,
(datetime.datetime.now()-STARTER))
(datetime.datetime.now() - STARTER))
if NO_DATA:
data_note = "\nA list of schools that had no API data was saved to {0}"
endmsg += data_note.format(NO_DATA_FILE)
@@ -132,5 +133,6 @@ def update(exclude_ids=[], single_school=None):
# print(endmsg)
return (FAILED, NO_DATA, endmsg)


if __name__ == '__main__':
(failed, no_data, endmsg) = update()
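
A hedged sketch of invoking the updated script; whether it is normally run through a Django management command or directly is not shown in this diff, so the shell-style call below is only an assumption.

# Illustrative sketch only -- assumes a configured Django environment and a
# populated School table.
from paying_for_college.disclosures.scripts.update_colleges import update

# Update every school, or pass single_school=<Scorecard id> to refresh one.
failed, no_data, endmsg = update()
print(endmsg)        # summary string returned by the script
print(len(failed))   # schools whose API request errored
print(len(no_data))  # schools the API returned no data for
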