# Indiana Commission for Higher Education's College Readiness Reports


In [1]:
import pandas as pd
import numpy as np
import pdfquery                   #Import pdfquery to extract data from structured pdf
import os                         #Import OS to extract basename from filepath
import re                         #Import regex to split data from basename
import requests                   #Import requests to fetch url
from bs4 import BeautifulSoup     #Import BS to extract pdf url list from webpage

## Step 1: Scrape web page for list of pdf files

*When changing the year being scraped:*
    1. Change the url link.
    2. Change the year in the loop that creates pdf_list: current_link.startswith(...
    3. In Step 4, add the year column.
    4. In Step 4, modify the .csv file.

In [2]:
#Page to scrape:
#Indiana Commission for Higher Education, School Level College Readiness Reports 2015
url = "http://www.in.gov/che/4557.htm"

In [3]:
#Fetch the page and parse it with BeautifulSoup
#The timeout keeps the cell from hanging forever on an unresponsive server, and
#raise_for_status() stops here on an HTTP error instead of silently parsing an error page.
html = requests.get(url, timeout=30)
html.raise_for_status()
html_doc = html.text
soup = BeautifulSoup(html_doc, "lxml")

In [4]:
#Collect the href of every anchor that points at a graduation-counts pdf
a_tags = soup.find_all('a')
pdf_list = []
for link in a_tags:
    current_link = link.get('href')
    #Anchors without an href attribute give None; skip them instead of failing on .startswith
    if current_link is not None and current_link.startswith('http://www.ai.org/che/files/graduation_counts'):
        pdf_list.append(current_link)

In [5]:
#Strip the trailing '\n' from entries in url list
#Only drop the last character when it really is a newline, so a clean url is never truncated
pdf_list = [x[:-1] if x.endswith('\n') else x for x in pdf_list]

In [6]:
#Evaluate pdf_list for inspection; the trailing semicolon suppresses the notebook's cell output
pdf_list;

## Step 2: Create DataFrame and data extraction function for each table in pdf

In [7]:
#High School Graduation Stats: accumulator DataFrame (one row is added per
#call of HSGE_Graduation_Stats) and the data extraction function that fills it.
HS_Graduation_Stats = pd.DataFrame(columns=['School',
                                            'ID',
                                            'Diploma_Honors',
                                            'Diploma_Core40',
                                            'Diploma_General',
                                            'Waiver_Yes',
                                            'Waiver_No',
                                            'AP_Passed',
                                            'AP_NotPassed',
                                            'AP_NotTaken',
                                            'DualCredit_Yes',
                                            'DualCredit_No',
                                            '21Century_Yes',
                                            '21Century_No',
                                            'LunchFR_Yes',
                                            'LunchFR_No',
                                            'Race_White',
                                            'Race_Black',
                                            'Race_Hispanic',
                                            'Race_Asian',
                                            'Race_Other'])


def HSGE_Graduation_Stats(file_path, pdf_file):
    """Extract High School Graduation Statistics and add them as one row.

    Page 0 of ``pdf_file`` is loaded. For every (column, label) pair below,
    the text line containing the label is located and its ``x0``/``y0``
    attributes are used as a reference point; the value is the text found
    in the box from (x0+274, y0) to (x0+346, y0+16).

    Args:
        file_path: Path of the pdf. Its basename without extension must be
            ``<high school>_<id>``; it is split on ``'_'``.
        pdf_file: Object offering ``load(page)`` and ``pq(selector)``
            (presumably a ``pdfquery.PDFQuery`` -- confirm against caller).

    Returns:
        None. The row is appended to the global ``HS_Graduation_Stats``
        DataFrame, which is rebound to the enlarged frame.

    Raises:
        ValueError: if the basename does not split into exactly two parts.
    """
    #(DataFrame column, label text searched for in the document)
    #NOTE(review): ':contains' looks like a substring match, so e.g.
    #"Free or Reduced Lunch" may also hit the "Non Free or Reduced Lunch"
    #line -- verify the extracted values against a sample pdf.
    labelled_columns = (
        ('Diploma_Honors', 'Honors'),
        ('Diploma_Core40', 'Core 40'),
        ('Diploma_General', 'General'),
        ('Waiver_Yes', 'Graduated with Waiver'),
        ('Waiver_No', 'Graduated without Waiver'),
        ('AP_Passed', 'Took and Passed an AP Test'),
        ('AP_NotPassed', 'Took but Did Not Pass an AP Test'),
        ('AP_NotTaken', 'Did Not Take an AP Test'),
        ('DualCredit_Yes', 'Earned Dual Credit from an Indiana Public College'),
        ('DualCredit_No', 'Did Not Earn Dual Credit from an Indiana Public College'),
        ('21Century_No', 'Non 21st Century Scholar'),
        ('LunchFR_Yes', 'Free or Reduced Lunch'),
        ('LunchFR_No', 'Non Free or Reduced Lunch'),
        ('Race_White', 'White'),
        ('Race_Black', 'Black'),
        ('Race_Hispanic', 'Hispanic'),
        ('Race_Asian', 'Asian'),
        ('Race_Other', 'Other'),
    )

    #Select pdf page to reduce search time
    pdf_file.load(0)

    #Extract high school name and ID from path
    basename = os.path.splitext(os.path.basename(file_path))[0]
    high_school, id_tag = basename.split('_')

    row = {'School': high_school, 'ID': id_tag}
    for column, label_text in labelled_columns:
        #The label establishes the reference point for the data search
        label = pdf_file.pq('LTTextLineHorizontal:contains("%s")' % label_text)
        left_corner = float(label.attr('x0'))
        bottom_corner = float(label.attr('y0'))
        #Search box: bottom-left and top-right corners relative to the label
        row[column] = pdf_file.pq(
            'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")'
            % (left_corner+274, bottom_corner, left_corner+346, bottom_corner+16)).text()

    #21st Century Scholar Status: True
    #Searching the label "21st Century Scholar" returned a blank result, so the
    #exact bbox (found by exploring the XML file) is hard-coded instead.
    row['21Century_Yes'] = pdf_file.pq(
        'LTTextLineHorizontal:in_bbox("366.12, 349.029, 371.179, 361.738")').text()

    #DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    #way to add the row and works on older versions as well.
    global HS_Graduation_Stats
    new_row = pd.DataFrame([row], columns=HS_Graduation_Stats.columns)
    HS_Graduation_Stats = pd.concat([HS_Graduation_Stats, new_row],
                                    ignore_index=True)

In [8]:
#High School Graduates College Enrollment Stats: accumulator DataFrame (one row
#is added per call of HSGE_College_Enrollment_Stats) and the data extraction
#function that fills it.
HS_College_Enrollment_Stats = pd.DataFrame(columns=['School',
                                                 'ID',
                                                 'Diploma_Honors_EC',
                                                 'Diploma_Core40_EC',
                                                 'Diploma_General_EC',
                                                 'Waiver_Yes_EC',
                                                 'Waiver_No_EC',
                                                 'AP_Passed_EC',
                                                 'AP_NotPassed_EC',
                                                 'AP_NotTaken_EC',
                                                 'DualCredit_Yes_EC',
                                                 'DualCredit_No_EC',
                                                 '21Century_Yes_EC',
                                                 '21Century_No_EC',
                                                 'LunchFR_Yes_EC',
                                                 'LunchFR_No_EC',
                                                 'Race_White_EC',
                                                 'Race_Black_EC',
                                                 'Race_Hispanic_EC',
                                                 'Race_Asian_EC',
                                                 'Race_Other_EC'])


def HSGE_College_Enrollment_Stats(file_path, pdf_file):
    """Extract High School Graduate Enrollment Statistics and add them as one row.

    Page 0 of ``pdf_file`` is loaded. For every (column, label) pair below,
    the text line containing the label is located and its ``x0``/``y0``
    attributes are used as a reference point; the value is the text found
    in the box from (x0+346, y0) to (x0+418, y0+16).

    Args:
        file_path: Path of the pdf. Its basename without extension must be
            ``<high school>_<id>``; it is split on ``'_'``.
        pdf_file: Object offering ``load(page)`` and ``pq(selector)``
            (presumably a ``pdfquery.PDFQuery`` -- confirm against caller).

    Returns:
        None. The row is appended to the global
        ``HS_College_Enrollment_Stats`` DataFrame, which is rebound to the
        enlarged frame.

    Raises:
        ValueError: if the basename does not split into exactly two parts.
    """
    #(DataFrame column, label text searched for in the document)
    #NOTE(review): ':contains' looks like a substring match, so e.g.
    #"Free or Reduced Lunch" may also hit the "Non Free or Reduced Lunch"
    #line -- verify the extracted values against a sample pdf.
    labelled_columns = (
        ('Diploma_Honors_EC', 'Honors'),
        ('Diploma_Core40_EC', 'Core 40'),
        ('Diploma_General_EC', 'General'),
        ('Waiver_Yes_EC', 'Graduated with Waiver'),
        ('Waiver_No_EC', 'Graduated without Waiver'),
        ('AP_Passed_EC', 'Took and Passed an AP Test'),
        ('AP_NotPassed_EC', 'Took but Did Not Pass an AP Test'),
        ('AP_NotTaken_EC', 'Did Not Take an AP Test'),
        ('DualCredit_Yes_EC', 'Earned Dual Credit from an Indiana Public College'),
        ('DualCredit_No_EC', 'Did Not Earn Dual Credit from an Indiana Public College'),
        ('21Century_No_EC', 'Non 21st Century Scholar'),
        ('LunchFR_Yes_EC', 'Free or Reduced Lunch'),
        ('LunchFR_No_EC', 'Non Free or Reduced Lunch'),
        ('Race_White_EC', 'White'),
        ('Race_Black_EC', 'Black'),
        ('Race_Hispanic_EC', 'Hispanic'),
        ('Race_Asian_EC', 'Asian'),
        ('Race_Other_EC', 'Other'),
    )

    #Select pdf page to reduce search time
    pdf_file.load(0)

    #Extract high school name and ID from path
    basename = os.path.splitext(os.path.basename(file_path))[0]
    high_school, id_tag = basename.split('_')

    row = {'School': high_school, 'ID': id_tag}
    for column, label_text in labelled_columns:
        #The label establishes the reference point for the data search
        label = pdf_file.pq('LTTextLineHorizontal:contains("%s")' % label_text)
        left_corner = float(label.attr('x0'))
        bottom_corner = float(label.attr('y0'))
        #Search box: bottom-left and top-right corners relative to the label
        row[column] = pdf_file.pq(
            'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")'
            % (left_corner+346, bottom_corner, left_corner+418, bottom_corner+16)).text()

    #21st Century Scholar Status: True
    #Searching the label "21st Century Scholar" returned a blank result, so the
    #exact bbox (found by exploring the XML file) is hard-coded instead.
    row['21Century_Yes_EC'] = pdf_file.pq(
        'LTTextLineHorizontal:in_bbox("433.152, 349.029, 448.091, 361.738")').text()

    #DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    #way to add the row and works on older versions as well.
    global HS_College_Enrollment_Stats
    new_row = pd.DataFrame([row], columns=HS_College_Enrollment_Stats.columns)
    HS_College_Enrollment_Stats = pd.concat(
        [HS_College_Enrollment_Stats, new_row], ignore_index=True)

In [9]:
#Graduate Enrollment Type: accumulator DataFrame (one row is added per call of
#HSGE_Type) and the data extraction function that fills it.
Graduate_Enrollment_Type = pd.DataFrame(columns=['School',
                                                'ID',
                                                'Indiana_Public_College',
                                                'Indiana_Private_College_np',
                                                'Indiana_Private_College_fp',
                                                'Out_of_State_Public_College',
                                                'Out_of_State_Private_College_(non-profit)',
                                                'Out_of_State_Private_College_(for-profit)',
                                                'Non_degree_Granting_School',
                                                'Did_Not_Enroll_in_College'])


def HSGE_Type(file_path, pdf_file):
    """Extract High School Graduate Enrollment by College Type as one row.

    Page 1 of ``pdf_file`` is loaded. For every (column, label) pair below,
    the text line containing the label is located and its ``x0``/``y0``
    attributes are used as a reference point; the value is the text found
    in the box from (x0+317, y0) to (x0+403, y0+16).

    Args:
        file_path: Path of the pdf. Its basename without extension must be
            ``<high school>_<id>``; it is split on ``'_'``.
        pdf_file: Object offering ``load(page)`` and ``pq(selector)``
            (presumably a ``pdfquery.PDFQuery`` -- confirm against caller).

    Returns:
        None. The row is appended to the global ``Graduate_Enrollment_Type``
        DataFrame, which is rebound to the enlarged frame.

    Raises:
        ValueError: if the basename does not split into exactly two parts.
    """
    #(DataFrame column, label text searched for in the document)
    labelled_columns = (
        ('Indiana_Public_College', 'Indiana Public College'),
        ('Indiana_Private_College_np', 'Indiana Private College (non-profit)'),
        ('Indiana_Private_College_fp', 'Indiana Private College (for-profit)'),
        ('Out_of_State_Public_College', 'Out-of-State Public College'),
        ('Out_of_State_Private_College_(non-profit)', 'Out-of-State Private College (non-profit)'),
        ('Out_of_State_Private_College_(for-profit)', 'Out-of-State Private College (for-profit)'),
        ('Non_degree_Granting_School', 'Non-degree Granting School'),
        ('Did_Not_Enroll_in_College', 'Did Not Enroll in College'),
    )

    #Select pdf page to reduce search time
    pdf_file.load(1)

    #Extract high school name and ID from path
    basename = os.path.splitext(os.path.basename(file_path))[0]
    high_school, id_tag = basename.split('_')

    row = {'School': high_school, 'ID': id_tag}
    for column, label_text in labelled_columns:
        #The label establishes the reference point for the data search
        label = pdf_file.pq('LTTextLineHorizontal:contains("%s")' % label_text)
        left_corner = float(label.attr('x0'))
        bottom_corner = float(label.attr('y0'))
        #Search box: bottom-left and top-right corners relative to the label
        row[column] = pdf_file.pq(
            'LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")'
            % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    #way to add the row and works on older versions as well.
    global Graduate_Enrollment_Type
    new_row = pd.DataFrame([row], columns=Graduate_Enrollment_Type.columns)
    Graduate_Enrollment_Type = pd.concat([Graduate_Enrollment_Type, new_row],
                                         ignore_index=True)

In [10]:
#Creates High School Graduate Enrollment by College DF and data extraction function
#Empty accumulator: the two school identifier columns, then one column per college
Graduate_Enrollment_College = pd.DataFrame(
    columns=['School', 'ID'] + [
        'Ball State University',
        'Indiana State University',
        'University of Southern Indiana',
        'IU-Bloomington',
        'IU-East',
        'IU-Kokomo',
        'IU-Northwest',
        'IU-Purdue University-Indianapolis',
        'IU-South Bend',
        'IU-Southeast',
        'IU-Purdue University-Fort Wayne',
        'PU-Calumet Campus',
        'PU-North Central Campus',
        'PU-Polytechnic Statewide',
        'PU-West Lafayette',
        'Ivy Tech Community College',
        'Vincennes University',
    ]
)

def HSGE_College(file_path, pdf_file):
    #Function to extract High School Graduate Enrollment by College
    
    #Select pdf page to reduce search time
    pdf_file.load(1)
    
    #Extract college name and ID from path
    basename = os.path.splitext(os.path.basename(file_path))[0]
    high_school, id_tag = basename.split('_')
    
    #VAR_01: Ball State University
    #Define label to be searched in document. This label establishes reference point for data search.
    label = pdf_file.pq('LTTextLineHorizontal:contains("Ball State University")')
    
    #Define bottom (y) left (x) corner according to label coordinates. pdfquery uses this reference to find data. 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    
    #Define search box for desired data. Starting from the reference point in previous step, define bottom left and top right coordinates for search cuadrant
    data_01 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_02: Indiana State University
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana State University")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_02 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_03: University of Southern Indiana
    label = pdf_file.pq('LTTextLineHorizontal:contains("University of Southern Indiana")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_03 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_04: Indiana University-Bloomington
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Bloomington")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_04 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_05: Indiana University-East
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-East")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_05 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_06: Indiana University-Kokomo
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Kokomo")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_06 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_07: Indiana University-Northwest
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Northwest")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_07 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()
    
    #VAR_08: Indiana University-Purdue University-Indianapolis
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Purdue University-Indianapolis")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_08 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_09: Indiana University-South Bend
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-South Bend")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_09 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_10: Indiana University-Southeast
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Southeast")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_10 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_11: Indiana University-Purdue University-Fort Wayne
    label = pdf_file.pq('LTTextLineHorizontal:contains("Indiana University-Purdue University-Fort Wayne")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_11 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_12: Purdue University-Calumet Campus
    label = pdf_file.pq('LTTextLineHorizontal:contains("Purdue University-Calumet Campus")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_12 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_13: Purdue University-North Central Campus
    label = pdf_file.pq('LTTextLineHorizontal:contains("Purdue University-North Central Campus")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_13 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_14: Purdue University-Polytechnic Statewide (Statewide Technology for year 2014)
    label = pdf_file.pq('LTTextLineHorizontal:contains("Purdue University-Statewide Technology")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_14 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_15: Purdue University-West Lafayette
    label = pdf_file.pq('LTTextLineHorizontal:contains("Purdue University-West Lafayette")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_15 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_16: Ivy Tech Community College
    label = pdf_file.pq('LTTextLineHorizontal:contains("Ivy Tech Community College")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_16 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()

    #VAR_17: Vincennes University
    label = pdf_file.pq('LTTextLineHorizontal:contains("Vincennes University")') 
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))
    data_17 = pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()


    global Graduate_Enrollment_College
    Graduate_Enrollment_College = Graduate_Enrollment_College.append({
             'School': high_school,
             'ID': id_tag,
             'Ball State University': data_01,
             'Indiana State University': data_02,
             'University of Southern Indiana': data_03,
             'IU-Bloomington': data_04,
             'IU-East': data_05,
             'IU-Kokomo': data_06,
             'IU-Northwest': data_07,
             'IU-Purdue University-Indianapolis': data_08,
             'IU-South Bend': data_09,
             'IU-Southeast': data_10,
             'IU-Purdue University-Fort Wayne': data_11,
             'PU-Calumet Campus': data_12,
             'PU-North Central Campus': data_13,
             'PU-Polytechnic Statewide': data_14,
             'PU-West Lafayette': data_15,
             'Ivy Tech Community College': data_16,
             'Vincennes University': data_17}, ignore_index=True)


In [11]:
#Creates High School Graduate Enrollment by Degree DF and data extraction function

#(output column, label text searched in the pdf) pairs.
#The order of this table defines both the DataFrame column order and the lookup order.
_HSGE_DEGREE_LABELS = (
    ('Bachelors Degree', 'Degree (four-year)'),
    ('Associate Degree', 'Associate Degree (two-year)'),
    ('Award between 1 and 2 years', 'Award of at least 1 but less than 2 academic years'),
    ('Award less than 1 year', 'Award of less than 1 academic year'),
    ('Unclassified Undergraduate', 'Unclassified undergraduate'),
)

Graduate_Enrollment_Degree = pd.DataFrame(
    columns=['School', 'ID'] + [column for column, _ in _HSGE_DEGREE_LABELS])


def _hsge_degree_value(pdf_file, label_text):
    #Returns the text located to the right of the text line containing label_text.

    #The label establishes the reference point for the data search.
    label = pdf_file.pq('LTTextLineHorizontal:contains("%s")' % label_text)

    #Bottom (y) left (x) corner of the label. float() raises if the label's
    #coordinates cannot be read, so a missing label is never recorded silently.
    left_corner = float(label.attr('x0'))
    bottom_corner = float(label.attr('y0'))

    #Search box for the value: bottom left and top right corners, offset from the label.
    #NOTE(review): the 317/403/16 offsets presumably match the 2014 report layout - confirm for other years.
    return pdf_file.pq('LTTextLineHorizontal:in_bbox("%s, %s, %s, %s")' % (
        left_corner+317, bottom_corner, left_corner+403, bottom_corner+16)).text()


def HSGE_Degree(file_path, pdf_file):
    #Function to extract High School Graduate Enrollment by Degree.
    #
    #file_path: path or url of the pdf; its basename must look like "<School>_<ID>.pdf".
    #pdf_file:  pdfquery object for that pdf (must provide .load() and .pq()).
    #
    #Appends one row to the global Graduate_Enrollment_Degree and returns None.
    #Any lookup failure raises; the Step 3 loop is expected to handle it.
    global Graduate_Enrollment_Degree

    #Select pdf page to reduce search time
    pdf_file.load(2)

    #Extract school name and ID from path. Split on the LAST underscore so that
    #school names which themselves contain underscores are still parsed.
    basename = os.path.splitext(os.path.basename(file_path))[0]
    high_school, id_tag = basename.rsplit('_', 1)

    row = {'School': high_school, 'ID': id_tag}
    for column, label_text in _HSGE_DEGREE_LABELS:
        row[column] = _hsge_degree_value(pdf_file, label_text)

    #DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
    Graduate_Enrollment_Degree = pd.concat(
        [Graduate_Enrollment_Degree, pd.DataFrame([row])], ignore_index=True)

## Step 3: Run iteration to scrape pdf list

*Note: because the connection times out when iterating over the full pdf list, run the scrape in chunks of 100 urls.*

In [12]:
#Keep a copy of the full url list; pdf_list itself is overwritten with one chunk at a time below
pdf_list_backup = pdf_list.copy()

In [62]:
#Select the chunk of urls to scrape in this run (positions 400-499 of the full list); edit the slice for each run
pdf_list = pdf_list_backup[400:500]

*Note: exception handling commands for the iteration below should be turned into functions to simplify reading*

In [63]:
#pdf_url = pdf_list[0]
#r = requests.get(pdf_url, stream = True)

In [64]:
#with open("temp_pdf.pdf",'wb') as temp_pdf:
#    temp_pdf.write(r.content)

In [65]:
#local_path = 'temp_pdf.pdf'
#pdf = pdfquery.PDFQuery(local_path)


In [66]:
#HSGE_Graduation_Stats(pdf_url, pdf)

In [67]:
#HSGE_College_Enrollment_Stats(pdf_url, pdf)

In [68]:
#HSGE_Type(pdf_url, pdf)

In [69]:
#HSGE_College(pdf_url, pdf)

In [70]:
#HSGE_Degree(pdf_url, pdf)

In [73]:
%%time
def _with_placeholder_row(frame, rows_before, identity):
    # Returns `frame` unchanged when extraction already added a row for the
    # current pdf (it has more rows than `rows_before`). Otherwise returns a new
    # frame with one extra row holding only School and ID; pd.concat fills every
    # other column with NaN, so the keys can never drift from the real columns.
    if len(frame) > rows_before:
        return frame
    return pd.concat([frame, pd.DataFrame([identity])], ignore_index=True)


for pdf_url in pdf_list:

    # Open stream to url
    r = requests.get(pdf_url, stream = True)

    with open("temp_pdf.pdf",'wb') as temp_pdf:

        # Saving received content as a pdf file in binary format
        # write the contents of the response (r.content) to a new file in binary mode.
        temp_pdf.write(r.content)

    # Row counts before extraction, used to tell which tables already received
    # a row for this pdf if one of the extraction steps fails part-way.
    rows_before = (len(HS_Graduation_Stats),
                   len(HS_College_Enrollment_Stats),
                   len(Graduate_Enrollment_Type),
                   len(Graduate_Enrollment_College),
                   len(Graduate_Enrollment_Degree))

    try:
        local_path = 'temp_pdf.pdf'
        pdf = pdfquery.PDFQuery(local_path)
        HSGE_Graduation_Stats(pdf_url, pdf)
        HSGE_College_Enrollment_Stats(pdf_url, pdf)
        HSGE_Type(pdf_url, pdf)
        HSGE_College(pdf_url, pdf)
        HSGE_Degree(pdf_url, pdf)
    except Exception:
        # `except Exception` (not a bare except) so that interrupting the kernel
        # stops the loop instead of being recorded as a failed pdf.
        basename = os.path.splitext(os.path.basename(pdf_url))[0]
        # Split on the last underscore: the ID is the final component of the name.
        high_school, id_tag = basename.rsplit('_', 1)
        identity = {'School': high_school, 'ID': id_tag}

        # Only tables that did NOT get a row for this pdf receive a NaN placeholder;
        # tables filled before the failure keep their real row (no duplicates).
        HS_Graduation_Stats = _with_placeholder_row(
            HS_Graduation_Stats, rows_before[0], identity)
        HS_College_Enrollment_Stats = _with_placeholder_row(
            HS_College_Enrollment_Stats, rows_before[1], identity)
        Graduate_Enrollment_Type = _with_placeholder_row(
            Graduate_Enrollment_Type, rows_before[2], identity)
        Graduate_Enrollment_College = _with_placeholder_row(
            Graduate_Enrollment_College, rows_before[3], identity)
        Graduate_Enrollment_Degree = _with_placeholder_row(
            Graduate_Enrollment_Degree, rows_before[4], identity)

Wall time: 11min 39s


In [74]:
HS_Graduation_Stats

Unnamed: 0,School,ID,Diploma_Honors,Diploma_Core40,Diploma_General,Waiver_Yes,Waiver_No,AP_Passed,AP_NotPassed,AP_NotTaken,...,DualCredit_No,21Century_Yes,21Century_No,LunchFR_Yes,LunchFR_No,Race_White,Race_Black,Race_Hispanic,Race_Asian,Race_Other
0,21stCenturyCharterSchoolofGary,4164,0,33,0,0,33,0,6,27,...,11,,20,27,6,0,33,0,0,0
1,AcademyforInnovativeStudies,8270,0,11,58,24,45,0,0,69,...,56,3,66,46,23,44,17,2,0,6
2,AchieveVirtualEducationAcademy,5288,3,16,4,3,20,0,0,23,...,20,4,19,0,23,22,1,0,0,0
3,AdamsCentralHighSchool,0021,48,50,8,3,103,10,35,61,...,32,,89,15,91,104,0,2,0,0
4,AdamsHighSchool,7505,91,158,43,32,260,4,2,286,...,215,,243,130,162,149,83,25,11,24
5,AlexandriaMonroeHighSchool,5041,42,44,10,6,90,10,38,48,...,48,,80,33,63,91,0,3,0,2
6,AndersonChristianSchool,C250,***,***,***,***,***,***,***,***,...,***,,***,***,***,***,***,***,***,***
7,AndersonHighSchool,4945,81,181,109,62,309,14,59,298,...,245,,288,232,139,240,75,29,5,22
8,AndersonPreparatoryAcademy,5092,16,41,0,3,54,1,7,49,...,33,,39,24,33,38,11,6,0,2
9,AndreanHighSchool,B760,83,59,1,0,143,19,14,110,...,76,,131,0,143,90,21,23,8,1


In [75]:
HS_College_Enrollment_Stats

Unnamed: 0,School,ID,Diploma_Honors_EC,Diploma_Core40_EC,Diploma_General_EC,Waiver_Yes_EC,Waiver_No_EC,AP_Passed_EC,AP_NotPassed_EC,AP_NotTaken_EC,...,DualCredit_No_EC,21Century_Yes_EC,21Century_No_EC,LunchFR_Yes_EC,LunchFR_No_EC,Race_White_EC,Race_Black_EC,Race_Hispanic_EC,Race_Asian_EC,Race_Other_EC
0,21stCenturyCharterSchoolofGary,4164,--,23,--,--,23,--,***,***,...,7,9,14,***,***,--,23,--,--,--
1,AcademyforInnovativeStudies,8270,--,6,11,3,14,--,--,17,...,16,***,***,13,4,10,5,***,--,***
2,AchieveVirtualEducationAcademy,5288,***,7,***,***,***,--,--,10,...,***,***,***,--,10,***,***,--,--,--
3,AdamsCentralHighSchool,0021,***,27,***,***,***,10,31,29,...,9,47,60,7,63,***,--,***,--,--
4,AdamsHighSchool,7505,79,106,11,15,181,***,***,190,...,138,,157,75,121,109,51,15,7,14
5,AlexandriaMonroeHighSchool,5041,36,21,0,***,***,7,33,17,...,21,33,44,14,43,56,--,***,--,***
6,AndersonChristianSchool,C250,***,***,***,***,***,***,***,***,...,***,***,***,***,***,***,***,***,***,***
7,AndersonHighSchool,4945,76,94,18,16,172,13,51,124,...,87,87,126,107,81,123,44,7,***,***
8,AndersonPreparatoryAcademy,5092,15,26,--,***,***,***,***,34,...,19,17,26,18,23,25,10,***,--,***
9,AndreanHighSchool,B760,79,***,***,--,134,16,12,106,...,67,94,122,--,134,84,20,22,***,***


In [76]:
Graduate_Enrollment_Type

Unnamed: 0,School,ID,Indiana_Public_College,Indiana_Private_College_np,Indiana_Private_College_fp,Out_of_State_Public_College,Out_of_State_Private_College_(non-profit),Out_of_State_Private_College_(for-profit),Non_degree_Granting_School,Did_Not_Enroll_in_College
0,21stCenturyCharterSchoolofGary,4164,21,0,0,1,1,0,0,10
1,AcademyforInnovativeStudies,8270,15,0,1,1,0,0,0,52
2,AchieveVirtualEducationAcademy,5288,7,1,1,0,1,0,0,13
3,AdamsCentralHighSchool,0021,53,10,0,0,7,0,0,36
4,AdamsHighSchool,7505,134,15,0,20,23,1,3,96
5,AlexandriaMonroeHighSchool,5041,42,13,0,2,0,0,0,39
6,AndersonChristianSchool,C250,***,***,***,***,***,***,***,***
7,AndersonHighSchool,4945,151,17,5,9,6,0,0,183
8,AndersonPreparatoryAcademy,5092,34,4,0,0,3,0,0,16
9,AndreanHighSchool,B760,86,25,0,9,13,1,0,9


In [77]:
Graduate_Enrollment_College

Unnamed: 0,School,ID,Ball State University,Indiana State University,University of Southern Indiana,IU-Bloomington,IU-East,IU-Kokomo,IU-Northwest,IU-Purdue University-Indianapolis,IU-South Bend,IU-Southeast,IU-Purdue University-Fort Wayne,PU-Calumet Campus,PU-North Central Campus,PU-Polytechnic Statewide,PU-West Lafayette,Ivy Tech Community College,Vincennes University
0,21stCenturyCharterSchoolofGary,4164,2,4,0,1,0,0,0,7,0,0,0,1,0,0,0,3,3
1,AcademyforInnovativeStudies,8270,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,10,3
2,AchieveVirtualEducationAcademy,5288,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***
3,AdamsCentralHighSchool,0021,9,1,0,2,0,0,0,0,0,0,26,0,0,0,6,9,0
4,AdamsHighSchool,7505,5,2,0,30,0,0,0,4,53,0,1,0,0,0,10,22,7
5,AlexandriaMonroeHighSchool,5041,17,2,0,0,0,4,0,3,0,0,0,0,0,1,1,14,0
6,AndersonChristianSchool,C250,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***,***
7,AndersonHighSchool,4945,30,10,5,15,1,0,0,5,0,0,0,0,0,2,7,69,7
8,AndersonPreparatoryAcademy,5092,1,4,0,3,0,0,0,2,0,0,1,0,0,1,2,17,3
9,AndreanHighSchool,B760,6,4,1,23,0,0,7,6,0,0,0,14,1,0,19,5,0


In [78]:
Graduate_Enrollment_Degree

Unnamed: 0,School,ID,Bachelors Degree,Associate Degree,Award between 1 and 2 years,Award less than 1 year,Unclassified Undergraduate
0,21stCenturyCharterSchoolofGary,4164,14,7,0,0,0
1,AcademyforInnovativeStudies,8270,2,13,0,0,0
2,AchieveVirtualEducationAcademy,5288,***,***,***,***,***
3,AdamsCentralHighSchool,0021,34,9,0,0,10
4,AdamsHighSchool,7505,102,31,0,0,1
5,AlexandriaMonroeHighSchool,5041,27,14,0,1,0
6,AndersonChristianSchool,C250,***,***,***,***,***
7,AndersonHighSchool,4945,74,76,0,0,1
8,AndersonPreparatoryAcademy,5092,13,20,0,1,0
9,AndreanHighSchool,B760,79,6,0,0,1


## Step 4: Save DataFrames to .csv files

In [79]:
#Report year written to the 'Year' column of every table below; update when scraping another year
year = 2014

In [82]:
#Tag every row with the report year
HS_Graduation_Stats['Year'] = year

In [84]:
#os.path.join keeps the path portable: a hard-coded backslash is only a folder separator on Windows
#and '\H' is an invalid escape sequence. The ICHE_scrape folder must already exist.
HS_Graduation_Stats.to_csv(os.path.join('ICHE_scrape', 'HS_Grad_Stats_2014.csv'), index=False)

In [85]:
#Tag every row with the report year (column is named 'Year', consistent with the other four tables)
HS_College_Enrollment_Stats['Year'] = year

In [86]:
#os.path.join keeps the path portable: a hard-coded backslash is only a folder separator on Windows
#and '\H' is an invalid escape sequence. The ICHE_scrape folder must already exist.
HS_College_Enrollment_Stats.to_csv(os.path.join('ICHE_scrape', 'HS_College_Enrollment_Stats_2014.csv'), index=False)

In [87]:
#Tag every row with the report year
Graduate_Enrollment_Type['Year'] = year

In [88]:
#os.path.join keeps the path portable: a hard-coded backslash is only a folder separator on Windows
#and '\G' is an invalid escape sequence. The ICHE_scrape folder must already exist.
Graduate_Enrollment_Type.to_csv(os.path.join('ICHE_scrape', 'Grad_Enroll_Type_2014.csv'), index=False)

In [89]:
#Tag every row with the report year
Graduate_Enrollment_College['Year'] = year

In [90]:
#os.path.join keeps the path portable: a hard-coded backslash is only a folder separator on Windows
#and '\G' is an invalid escape sequence. The ICHE_scrape folder must already exist.
Graduate_Enrollment_College.to_csv(os.path.join('ICHE_scrape', 'Grad_Enroll_College_2014.csv'), index=False)

In [91]:
#Tag every row with the report year
Graduate_Enrollment_Degree['Year'] = year

In [92]:
#os.path.join keeps the path portable: a hard-coded backslash is only a folder separator on Windows
#and '\G' is an invalid escape sequence. The ICHE_scrape folder must already exist.
Graduate_Enrollment_Degree.to_csv(os.path.join('ICHE_scrape', 'Grad_Enroll_Degree_2014.csv'), index=False)