In [22]:
from sqlalchemy import create_engine
import os
import time
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load API key for data.gov

with open('data_gov_api.txt', 'r') as key_file:
    # The key is the first line of the file. strip() removes the newline and
    # also a trailing "\r" or stray spaces, which would otherwise be sent to
    # the API as part of the key.
    api_key = key_file.readline().strip()

# SQLAlchemy engine used by every SQL query in this notebook
conn = create_engine('postgresql://ubuntu@52.53.236.232:5432/collegesc')

# Directory for downloaded files; exist_ok makes the cell safe to re-run
data_dir = os.path.join(os.path.curdir, "data")
os.makedirs(data_dir, exist_ok=True)

## Helper functions and dictionaries

In [86]:
def find_max_string_length(pd_series):
    """
    Return the length of the longest value in a pandas Series, measured
    on the string form (str(value)) of every entry.
    """
    def _text_length(value):
        return len(str(value))

    lengths = pd_series.map(_text_length)
    return lengths.max()

def _fetch_collegesc_page(url, query, max_attempts=10, retry_delay=2):
    """
    Request one page from the API and return the parsed JSON payload,
    retrying when the request fails or the answer has no "results".

    url: API endpoint
    query: dict of query-string parameters for this page
    max_attempts: number of tries before giving up
    retry_delay: seconds to wait between tries

    Raises RuntimeError when every attempt failed, so that a page is never
    silently left out of the downloaded data.
    """
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            req = requests.get(url, params=query, timeout=60)
            # Turn HTTP error statuses into exceptions so they are retried
            req.raise_for_status()
            payload = req.json()
            if "results" not in payload:
                raise KeyError("results")
            return payload
        except (requests.RequestException, ValueError, KeyError) as error:
            print("Connection or query error")
            print(error)
            last_error = error
            if attempt < max_attempts:
                time.sleep(retry_delay)

    raise RuntimeError(
        "College Scorecard request failed after " + str(max_attempts)
        + " attempts (page " + str(query.get("_page", 0)) + ")"
    ) from last_error


def call_collegesc_api(category, api_key, conn, per_page = 50, year = 2013):
    """
    Download all records of one category from the college scorecard api.

    category: the category to be downloaded
    api_key: key for the college sc api
    conn: sql_alchemy engine for querying data definitions
    per_page: number of entries per api call (max: 100)
    year: year to be called (default: 2013)

    Looks up the field names of the category in the datadefinitions table,
    requests them page by page and returns one dataframe built from the
    json results. The "<category>." / "<year>.<category>." prefix that was
    added to each requested field is stripped from the column names.

    Raises RuntimeError if a page cannot be fetched after repeated retries.
    """
    # NOTE(review): category is concatenated into the SQL text, so callers
    # must only pass trusted values.
    sql_query = """SELECT
                    developername
                FROM 
                    datadefinitions 
                WHERE devcategory ='"""

    sql_query += category+"';"

    field_names = [row[0] for row in conn.execute(sql_query).fetchall()]

    # school and academics fields are requested without a year prefix
    if category in ("school", "academics"):
        prefix = category + "."
    else:
        prefix = str(year) + "." + category + "."

    requested = ["id"]
    if category == "school":
        requested += ["location.lat", "location.lon"]
    requested += [prefix + name for name in field_names]
    fields_string = ",".join(requested)

    college_sc_url = 'https://api.data.gov/ed/collegescorecard/v1/schools.json'
    api_query = {'api_key' : api_key, '_fields' : fields_string, '_per_page' : per_page}

    # The first request is sent without "_page" and is treated as page 0
    first_page = _fetch_collegesc_page(college_sc_url, api_query)
    total = first_page["metadata"]["total"]

    # Ceiling division: no extra empty request when total is an exact
    # multiple of per_page.
    page_count = (total + per_page - 1) // per_page

    # Collect the pages in a list and concatenate once at the end
    frames = [pd.DataFrame(first_page["results"])]
    for page in range(1, page_count):
        time.sleep(2)  # pause between successive API calls
        page_query = dict(api_query, _page=page)
        payload = _fetch_collegesc_page(college_sc_url, page_query)
        frames.append(pd.DataFrame(payload["results"]))

    category_df = pd.concat(frames, ignore_index=True)

    def strip_prefix(column):
        # Remove only the exact leading prefix. A pattern-based replace would
        # treat "." as a wildcard and also hit matches inside the field name
        # (e.g. "school.school_url" would end up as "url").
        if isinstance(column, str) and column.startswith(prefix):
            return column[len(prefix):]
        return column

    category_df = category_df.rename(columns=strip_prefix)

    return category_df
    
def check_highnull_columns(df, threshold=1000):
    """
    Print the .info() summary of the columns of df that have fewer than
    threshold null values.

    df: dataframe to inspect
    threshold: a column is kept only if its null count is below this value

    Returns the result of DataFrame.info(), which is None because info()
    prints its report instead of returning it.
    """
    null_counts = df.isnull().sum()
    below_threshold = null_counts < threshold
    low_null_df = df.loc[:, below_threshold]
    return low_null_df.info()

# API data type (datatype column of datadefinitions) -> SQL column type
sql_type_dict = {
    'integer': 'integer',
    'float': 'real',
    'string': 'text',
    'autocomplete': 'text',
}

def create_datetype_dict(conn, category, type_dict):
    """
    Build the column -> SQL type mapping for one category.

    conn: sql_alchemy engine used to query datadefinitions
    category: devcategory whose fields are looked up
    type_dict: maps API data types to Postgres types

    Queries developername and datatype for the category, translates each
    datatype through type_dict and returns the resulting dictionary with
    two extra integer entries, 'id' and 'index', for generating the
    SQL table.
    """
    select_sql = """SELECT 
                        developername, datatype
                    FROM
                        datadefinitions
                    WHERE
                        devcategory = '"""
    select_sql = select_sql + category + "';"

    rows = conn.execute(select_sql).fetchall()

    # row[0] is the field name, row[1] its API data type
    sql_type_def_dict = {row[0]: type_dict[row[1]] for row in rows}
    sql_type_def_dict.update(id='integer', index='integer')

    return sql_type_def_dict
    

## Download College Scorecard Data Dictionary

In [3]:


data_dict_url = "https://collegescorecard.ed.gov/assets/CollegeScorecardDataDictionary.xlsx"

# File name is the last path component of the URL
data_dict_file = data_dict_url.split('/')[-1]

data_dict_path = os.path.join(data_dir, data_dict_file)

# Download only when there is no local copy yet
if not os.path.exists(data_dict_path):
    req = requests.get(data_dict_url)
    # Stop on an HTTP error; otherwise the error page would be written to
    # disk and reused as the spreadsheet on every later run.
    req.raise_for_status()

    with open(data_dict_path, 'wb') as file:
        file.write(req.content)


## Load into memory and create table to define variables

In [101]:
# `sheet_name` is the current keyword; the old `sheetname` spelling was
# deprecated and later removed from pandas.read_excel.
sc_data_dict = pd.read_excel(data_dict_path, sheet_name='data_dictionary')

In [102]:
sc_data_dict.iloc[300]

NAME OF DATA ELEMENT       Percentage of degrees awarded in Architecture ...
dev-category                                                       academics
developer-friendly name                      program_percentage.architecture
API data type                                                          float
VARIABLE NAME                                                         PCIP04
VALUE                                                                    NaN
LABEL                                      Architecture and Related Services
SOURCE                                                                 IPEDS
NOTES                                        Shown/used on consumer website.
Name: 300, dtype: object

In [103]:
sc_data_dict.iloc[25:28, :]

Unnamed: 0,NAME OF DATA ELEMENT,dev-category,developer-friendly name,API data type,VARIABLE NAME,VALUE,LABEL,SOURCE,NOTES
25,Control of institution,school,ownership,integer,CONTROL,1.0,Public,IPEDS,Shown/used on consumer website.
26,,school,,,,2.0,Private nonprofit,,
27,,school,,,,3.0,Private for-profit,,


In [104]:
sc_data_dict_nv = sc_data_dict.drop(['VALUE', 'LABEL'], axis = 1)

In [105]:
# Positional rename of the seven columns left after dropping VALUE and LABEL:
# NAME OF DATA ELEMENT, dev-category, developer-friendly name, API data type,
# VARIABLE NAME, SOURCE, NOTES (in that order).
sc_data_dict_nv.columns = ['Name', 'DevCategory', 'DeveloperName', 'DataType', 'VarName', 'Source', 'Notes']

In [106]:
# Keep only the rows that name a data element. Rows with a null Name (such as
# rows 26-27 shown above) only list further VALUE/LABEL codes for the element
# on the row before them.
sc_data_dict_nv = sc_data_dict_nv[sc_data_dict_nv['Name'].notnull()]

In [107]:
sc_data_dict_nv.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1734 entries, 0 to 1974
Data columns (total 7 columns):
Name             1734 non-null object
DevCategory      1734 non-null object
DeveloperName    1734 non-null object
DataType         1734 non-null object
VarName          1734 non-null object
Source           1734 non-null object
Notes            274 non-null object
dtypes: object(7)
memory usage: 108.4+ KB


### Find maximum string length in each column so we can define SQL table

In [108]:
# Longest string form found in each column, keyed by column name
string_length_dict = {
    column: find_max_string_length(sc_data_dict_nv[column])
    for column in sc_data_dict_nv.columns
}

In [109]:
sc_data_dict_nv.to_csv('datadef.csv', index=False)

In [110]:

# One "<column> varchar(<max length>) DEFAULT NULL," line per column
column_definitions = "".join(
    column + " varchar("+str(length)+") DEFAULT NULL,\n"
    for column, length in string_length_dict.items()
)

query = (
    "CREATE TABLE IF NOT EXISTS datadefinitions (\n"
    + column_definitions
    + "\nPRIMARY KEY (developername)\n);"
)

In [112]:
conn.execute(query)

<sqlalchemy.engine.result.ResultProxy at 0x7f54e18d8358>

In [130]:
# Bulk-load a CSV file into the datadefinitions table, skipping its header row.
# NOTE(review): presumably this is the datadef.csv written by to_csv() above;
# the absolute path is hard-coded, so confirm it matches where the notebook runs.
# NOTE(review): COPY ... FROM '<file>' is read by the database server process,
# so the file has to exist on the server host - confirm.
# NOTE(review): re-running this against an already loaded table will likely
# fail on the developername primary key - verify before re-executing.
query2 = """COPY datadefinitions FROM '/home/ubuntu/Notebooks/CollegeClassification/datadef.csv' DELIMITER ',' CSV HEADER;"""

In [131]:
conn.execute(query2)

<sqlalchemy.engine.result.ResultProxy at 0x7f54e2e7eda0>

In [14]:
query3 = """SELECT * FROM datadefinitions;"""

In [15]:
data_definitions = pd.read_sql_query(query3, conn)

## Use API to gather data for 2013

In [73]:
datatype_query = """SELECT
                        *
                    FROM 
                        datadefinitions
                    WHERE
                        devcategory='root';"""

In [74]:
conn.execute(datatype_query).fetchall()

[('Unit ID for institution', 'root', 'id', 'integer', 'UNITID', 'IPEDS', 'Shown/used on consumer website.'),
 ('8-digit OPE ID for institution', 'root', 'ope8_id', 'integer', 'OPEID', 'IPEDS', 'Shown/used on consumer website.'),
 ('6-digit OPE ID for institution', 'root', 'ope6_id', 'integer', 'OPEID6', 'IPEDS', 'Shown/used on consumer website.'),
 ('Latitude', 'root', 'location.lat', 'float', 'LATITUDE', 'IPEDS', None),
 ('Longitude', 'root', 'location.lon', 'float', 'LONGITUDE', 'IPEDS', None)]

In [76]:
autocomplete_query = """SELECT
                            *
                        FROM
                            datadefinitions
                        WHERE
                            datatype='autocomplete'"""

In [77]:
conn.execute(autocomplete_query).fetchall()

[('Institution name', 'school', 'name', 'autocomplete', 'INSTNM', 'IPEDS', 'Shown/used on consumer website.'),
 ('City', 'school', 'city', 'autocomplete', 'CITY', 'IPEDS', 'Shown/used on consumer website.'),
 ('Institution name aliases', 'school', 'alias', 'autocomplete', 'ALIAS', 'IPEDS', None)]

In [17]:
category_query = """SELECT DISTINCT
                        devcategory
                    FROM
                        datadefinitions;"""

In [18]:
categories = [x[0] for x in conn.execute(category_query).fetchall()]

In [19]:
# Drop the 'root' category by name. SELECT DISTINCT gives no ordering
# guarantee, so slicing off the first element could discard a different
# category, and would discard one more on every re-run of this cell.
categories = [name for name in categories if name != 'root']

In [71]:
datatype_query = """SELECT DISTINCT
                        datatype
                    FROM
                        datadefinitions;"""

In [72]:
conn.execute(datatype_query).fetchall()

[('integer',), ('string',), ('autocomplete',), ('float',)]

#### Remove categories with mostly missing data

In [20]:
# Filter into a new list instead of calling .remove(): re-running the cell
# then yields the same list rather than raising ValueError for a name that
# was already removed.
mostly_missing = ('academics', 'earnings', 'completion')
categories = [name for name in categories if name not in mostly_missing]
categories

['repayment', 'admissions', 'aid', 'student', 'cost', 'school']

#### Gather CSV data using API calls

In [23]:
# Categories that are never downloaded, even if still present in the list
excluded_categories = ('academics', 'earnings', 'completion')

for category in categories:
    if category in excluded_categories:
        continue
    print("Starting "+category)
    category_file = os.path.join(data_dir, category + ".csv")
    # An existing CSV is treated as already downloaded and is not refreshed
    csv_is_cached = os.path.exists(category_file)
    if not csv_is_cached:
        category_df = call_collegesc_api(category, api_key, conn)
        category_df.to_csv(category_file)
    print("Completed "+category)

Starting repayment
Completed repayment
Starting admissions
Completed admissions
Starting aid
Completed aid
Starting student
Completed student
Starting cost
Completed cost
Starting school
Completed school


## Check resulting CSV files before filling out SQL Database

In [93]:
for category in categories:
    cat_file = os.path.join(data_dir, category+'.csv')
    category_df = pd.read_csv(cat_file)
    print(category)
    # The helper prints the info() report itself and returns None, so its
    # result is not passed to print() (that only added a stray "None" line).
    check_highnull_columns(category_df, threshold=1000)
    print()
    

repayment
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 3 columns):
3_yr_default_rate          7488 non-null float64
3_yr_default_rate_denom    7488 non-null float64
id                         7703 non-null int64
dtypes: float64(2), int64(1)
memory usage: 180.6 KB
None

admissions
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 1 columns):
id    7703 non-null int64
dtypes: int64(1)
memory usage: 60.3 KB
None

aid
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 3 columns):
federal_loan_rate    6806 non-null float64
pell_grant_rate      6806 non-null float64
id                   7703 non-null int64
dtypes: float64(2), int64(1)
memory usage: 180.6 KB
None

student
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7703 entries, 0 to 7702
Data columns (total 29 columns):
FAFSA_applications                                7204 non-null float64
avg_depende

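Admissions and cost are not worth loading into SQL: once the high-null columns are filtered out they keep almost nothing usable (admissions, for example, retains only `id`).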

In [68]:
sql_table_cat = list(categories)

In [69]:
# Filter into a new list instead of calling .remove(), so re-running this
# cell does not raise ValueError once the names are already gone.
sql_table_cat = [name for name in sql_table_cat if name not in ('admissions', 'cost')]

In [70]:
sql_table_cat

['repayment', 'aid', 'student', 'school']

In [79]:
datatype_query = """SELECT 
                        developername, datatype
                    FROM
                        datadefinitions
                    WHERE
                        devcategory = 'aid'
                 """

In [81]:
type_defs = conn.execute(datatype_query).fetchall()

In [82]:
fields = [x[0] for x in type_defs]
entries = [sql_type_dict[x[1]] for x in type_defs]

In [83]:
type_def_dict = dict(zip(fields, entries))

In [84]:
type_def_dict

{'cumulative_debt.10th_percentile': 'integer',
 'cumulative_debt.25th_percentile': 'integer',
 'cumulative_debt.75th_percentile': 'integer',
 'cumulative_debt.90th_percentile': 'integer',
 'cumulative_debt.number': 'integer',
 'federal_loan_rate': 'real',
 'loan_principal': 'real',
 'median_debt.completers.monthly_payments': 'real',
 'median_debt.completers.overall': 'real',
 'median_debt.dependent_students': 'real',
 'median_debt.female_students': 'real',
 'median_debt.first_generation_students': 'real',
 'median_debt.income.0_30000': 'real',
 'median_debt.income.30001_75000': 'real',
 'median_debt.income.greater_than_75000': 'real',
 'median_debt.independent_students': 'real',
 'median_debt.male_students': 'real',
 'median_debt.no_pell_grant': 'real',
 'median_debt.non_first_generation_students': 'real',
 'median_debt.noncompleters': 'real',
 'median_debt.number.completers': 'integer',
 'median_debt.number.dependent_students': 'integer',
 'median_debt.number.female_students': 'intege