In [1]:
from sqlalchemy import create_engine
import os
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load API key for data.gov

with open('data_gov_api.txt', 'r') as file:
    api_key = file.read().split('\n')[0] # Remove new line



## Helper functions

In [45]:
def find_max_string_length(pd_series):
    """
    For string column in pandas Series, find the maximum length of thes string
    """
    return pd_series.map(lambda x: len(str(x))).max()

## Download College Scorecard Data Dictionary

In [2]:
data_dir = os.path.join(os.path.curdir, "data")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

data_dict_url = "https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary.xlsx"

data_dict_file = data_dict_url.split('/')[-1]

data_dict_path = os.path.join(data_dir, data_dict_file)

if not os.path.exists(data_dict_path):
    req = requests.get(data_dict_url)
    
    with open(data_dict_path, 'wb') as file:
        file.write(req.content)


## Load into memory and create table to define variables

In [3]:
sc_data_dict = pd.read_excel(data_dict_path, sheetname='data_dictionary')

In [4]:
sc_data_dict.iloc[300]

NAME OF DATA ELEMENT       Percentage of degrees awarded in Architecture ...
dev-category                                                       academics
developer-friendly name                      program_percentage.architecture
API data type                                                          float
VARIABLE NAME                                                         PCIP04
VALUE                                                                    NaN
LABEL                                      Architecture and Related Services
SOURCE                                                                 IPEDS
NOTES                                        Shown/used on consumer website.
Name: 300, dtype: object

In [5]:
sc_data_dict.iloc[25:28, :]

Unnamed: 0,NAME OF DATA ELEMENT,dev-category,developer-friendly name,API data type,VARIABLE NAME,VALUE,LABEL,SOURCE,NOTES
25,Control of institution,school,ownership,integer,CONTROL,1.0,Public,IPEDS,Shown/used on consumer website.
26,,school,,,,2.0,Private nonprofit,,
27,,school,,,,3.0,Private for-profit,,


In [38]:
sc_data_dict_nv = sc_data_dict.drop(['VALUE', 'LABEL'], axis = 1)

In [48]:
sc_data_dict_nv.columns = ['Name', 'DevCategory', 'DeveloperName', 'DataType', 'VarName', 'Source', 'Notes']

In [49]:
sc_data_dict_nv = sc_data_dict_nv[sc_data_dict_nv['Name'].notnull()]

In [50]:
sc_data_dict_nv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1734 entries, 0 to 1974
Data columns (total 7 columns):
Name             1734 non-null object
DevCategory      1734 non-null object
DeveloperName    1734 non-null object
DataType         1734 non-null object
VarName          1734 non-null object
Source           1734 non-null object
Notes            274 non-null object
dtypes: object(7)
memory usage: 108.4+ KB


### Find maximum string length in each column so we can define SQL table

In [51]:
string_length_dict = dict()
for column in sc_data_dict_nv.columns:
    string_length_dict[column] = find_max_string_length(sc_data_dict_nv[column])

In [65]:
sc_data_dict_nv.to_csv('datadef.csv', index=False)

In [57]:

query = """CREATE TABLE IF NOT EXISTS DataDefinitions (
"""
for column, length in string_length_dict.items():
    query += column + " varchar("+str(length)+") DEFAULT NULL,\n"
    
query += """
PRIMARY KEY (VarName)
);"""

In [60]:
conn = create_engine('postgresql://ubuntu@52.53.236.232:5432/collegesc')

In [61]:
conn.execute(query)

<sqlalchemy.engine.result.ResultProxy at 0x7f5232749828>

In [108]:
sc_data_dict_nv.to_sql('datadefinitions', conn, index=False, if_exists='fail')

In [116]:
sc_data_dict_nv[sc_data_dict_nv.Name.str.contains('academic year')]

Unnamed: 0,Name,DevCategory,DeveloperName,DataType,VarName,Source,Notes
336,Certificate of less than one academic year in ...,academics,program.certificate_lt_1_yr.agriculture,integer,CIP01CERT1,IPEDS,
337,Certificate of at least one but less than two ...,academics,program.certificate_lt_2_yr.agriculture,integer,CIP01CERT2,IPEDS,
339,Awards of at least two but less than four acad...,academics,program.certificate_lt_4_yr.agriculture,integer,CIP01CERT4,IPEDS,
341,Certificate of less than one academic year in ...,academics,program.certificate_lt_1_yr.resources,integer,CIP03CERT1,IPEDS,
342,Certificate of at least one but less than two ...,academics,program.certificate_lt_2_yr.resources,integer,CIP03CERT2,IPEDS,
344,Award of at least two but less than four acade...,academics,program.certificate_lt_4_yr.resources,integer,CIP03CERT4,IPEDS,
346,Certificate of less than one academic year in ...,academics,program.certificate_lt_1_yr.architecture,integer,CIP04CERT1,IPEDS,
347,Certificate of at least one but less than two ...,academics,program.certificate_lt_2_yr.architecture,integer,CIP04CERT2,IPEDS,
349,Award of more than two but less than four acad...,academics,program.certificate_lt_4_yr.architecture,integer,CIP04CERT4,IPEDS,
351,Certificate of less than one academic year in ...,academics,program.certificate_lt_1_yr.ethnic_cultural_ge...,integer,CIP05CERT1,IPEDS,


In [111]:
query3 = """SELECT * FROM datadefinitions LIMIT 10;"""

In [112]:
pd.read_sql_query(query3, conn)

Unnamed: 0,name,devcategory,developername,datatype,varname,source,notes
0,Unit ID for institution,root,id,integer,UNITID,IPEDS,Shown/used on consumer website.
1,8-digit OPE ID for institution,root,ope8_id,integer,OPEID,IPEDS,Shown/used on consumer website.
2,6-digit OPE ID for institution,root,ope6_id,integer,OPEID6,IPEDS,Shown/used on consumer website.
3,Institution name,school,name,autocomplete,INSTNM,IPEDS,Shown/used on consumer website.
4,City,school,city,autocomplete,CITY,IPEDS,Shown/used on consumer website.
5,State postcode,school,state,string,STABBR,IPEDS,Shown/used on consumer website.
6,ZIP code,school,zip,integer,ZIP,IPEDS,
7,Accreditor for institution,school,accreditor,string,ACCREDAGENCY,FSA,
8,URL for institution's homepage,school,school_url,string,INSTURL,IPEDS,Shown/used on consumer website.
9,URL for institution's net price calculator,school,price_calculator_url,string,NPCURL,IPEDS,Shown/used on consumer website.


## Use API to gather data

In [15]:
collegesc_api = 'https://api.data.gov/ed/collegescorecard/v1/schools.json'
payload = {'api_key' : api_key}#,  '_fields' : 'id,school.name,school.ownership'} #'school.ownership' : 3,

req = requests.get(collegesc_api, params=payload)

In [117]:
#req.json()['results'][0]['2014']

In [94]:
test

Unnamed: 0,id,school.name,school.ownership
0,121983,Argosy University-San Francisco Bay Area,3
1,434973,University of Phoenix-Maryland,3
2,436030,Hair Academy II,3
3,436067,Pryor Beauty College,3
4,436191,Vatterott College-Sunset Hills,3
5,436270,Utah College of Massage Therapy-Utah Valley,3
6,149499,Tri-County Beauty Academy,3
7,108065,Velvatex College of Beauty Culture,3
8,436483,National American University-Bloomington,3
9,436599,FINE Mortuary College,3


In [96]:
req.json()['results']

[{'id': 121983,
  'school.name': 'Argosy University-San Francisco Bay Area',
  'school.ownership': 3},
 {'id': 434973,
  'school.name': 'University of Phoenix-Maryland',
  'school.ownership': 3},
 {'id': 436030, 'school.name': 'Hair Academy II', 'school.ownership': 3},
 {'id': 436067, 'school.name': 'Pryor Beauty College', 'school.ownership': 3},
 {'id': 436191,
  'school.name': 'Vatterott College-Sunset Hills',
  'school.ownership': 3},
 {'id': 436270,
  'school.name': 'Utah College of Massage Therapy-Utah Valley',
  'school.ownership': 3},
 {'id': 149499,
  'school.name': 'Tri-County Beauty Academy',
  'school.ownership': 3},
 {'id': 108065,
  'school.name': 'Velvatex College of Beauty Culture',
  'school.ownership': 3},
 {'id': 436483,
  'school.name': 'National American University-Bloomington',
  'school.ownership': 3},
 {'id': 436599, 'school.name': 'FINE Mortuary College', 'school.ownership': 3},
 {'id': 436632,
  'school.name': 'The Medical Arts School',
  'school.ownership': 3}