# Webscrape college applications

This notebook was developed following the `ColEarlyDecScrape.ipynb` prototype.

## Step One: Prepare Directory Information

In [1]:
import os
import sys
import datetime
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load the IPEDS directory data.
IPEDSfile = pd.read_stata('../../statadata/IPEDSDirInfo02to16smlr.dta', preserve_dtypes=False)

# Keep public and private not-for-profit 2-year and 4-year institutions.
# The 0/1 'filter' column stays on the frame, exactly as before.
keep_sectors = ['Public, 4-year or above',
                'Public, 2-year',
                'Private not-for-profit, 4-year or above',
                'Private not-for-profit, 2-year']
IPEDSfile['filter'] = np.where(IPEDSfile['sector'].isin(keep_sectors), 1, 0)
# .copy() gives an independent frame, so adding 'rootdom' below does not
# raise pandas' SettingWithCopyWarning.
IPEDSfile = IPEDSfile[IPEDSfile['filter'] == 1].copy()

# Reduce each web address to a root domain by deleting, in order:
cleanup_patterns = [
    r'www\.',        # the "www." prefix; the dot is escaped so only a literal
                     # "www." matches (an unescaped dot also matched "www2", "wwwx", ...)
    r'/',            # every slash
    r'(?i)https?:',  # the scheme left behind once slashes are gone: http: or https:, any case
    r'about',        # the literal text "about"
                     # NOTE(review): presumably a trailing "/about" path; this also removes
                     # "about" from inside a domain name - confirm against the data.
]
rootdom = IPEDSfile['webaddr']
for pattern in cleanup_patterns:
    rootdom = rootdom.replace(regex=True, to_replace=pattern, value='')
IPEDSfile['rootdom'] = rootdom

# Reset the data frame's index so row labels run 0..n-1 over the kept rows.
# The previous labels are kept in a new 'index' column.
IPEDSfile = IPEDSfile.reset_index()

In [3]:
# This cell copes with unanticipated errors and/or connection time-out problems.
# It lets a scrape resume from the school at which an earlier attempt stopped.
# Keep record of last unsuccessful iteration here:
#    Encountered CAPTCHA at stillman.edu               April 17, 2018
#    Encountered NaN after alaskapacific.edu           April 18, 2018
#    Encountered CAPTCHA at buc.edu                    April 18, 2018
#    Program STALLED at arapahoe.edu                   April 18, 2018
#    Program STALLED at howard.edu                     April 19, 2018
#    Took a break from project at babson.edu           April 19, 2018
#    Took a break from project at merrimack.edu        April 25, 2018

# Get user input regarding previous scrape attempts / error log above.
print('Enter the school starting school root domain name. No entry will start at beginning of the list.')
# Strip stray whitespace so a pasted domain still matches.
start_school = input().strip()

# If user provided starting school, remove preceding schools.
if start_school != '':
    # Compare case-insensitively: the directory holds mixed-case entries.
    is_start = IPEDSfile['rootdom'].str.lower() == start_school.lower()
    matching_labels = IPEDSfile.index[is_start]
    if len(matching_labels) == 0:
        # Fail with an explanation instead of a bare IndexError.
        raise ValueError('Root domain {!r} was not found in the directory list; '
                         'check the spelling and re-run this cell.'.format(start_school))
    # Slice by index label (.loc) rather than by position, so the cell still
    # selects the right rows if the frame was already trimmed by an earlier run.
    IPEDSfile = IPEDSfile.loc[matching_labels[0]:]

Enter the school starting school root domain name. No entry will start at beginning of the list.
babson.edu


In [4]:
# Create list of domains to scrape & check results.
# Missing values are dropped: the scrape loop joins each domain into a search
# URL string, which raises TypeError for a non-string entry such as NaN.
weblist = IPEDSfile['rootdom'].dropna()
weblist.head()

1146     babson.edu
1147     Boston.edu
1148    baypath.edu
1149     becker.edu
1150    bentley.edu
Name: rootdom, dtype: object

## Step Two: Prepare Browser Object

In [5]:
from time import sleep
from selenium import webdriver
import os
import requests

# Short list of url domains set aside for testing purposes.
# weblist = ['babson.edu', 'boston.edu', 'uwec.edu', 'wisc.edu']
# weblist

In [6]:
# Start the Selenium-controlled browser used for every search below.
# Firefox is the active driver; the Chrome line is kept as the alternative.
# browser = webdriver.Chrome()
startup_message = 'Loaded Browser Here'
browser = webdriver.Firefox()
print(startup_message)

Loaded Browser Here


## Step Three: Scrape For Each School

In [7]:
# Scrape settings.
OUTPUT_DIR = 'pprapps'        # folder that receives the downloaded files
MAX_DOCS = 3                  # download at most this many search results per school
DOWNLOAD_TIMEOUT = (10, 60)   # (connect, read) seconds; without a timeout requests can wait forever

# Define a list to store record of errors.
log_list = []

# Make sure the output folder exists; otherwise every save fails.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Iteratively loop through each institution's website.
for school in weblist:
    # Skip entries that cannot form a search URL (e.g. NaN or an empty string)
    # instead of letting a TypeError end the whole run.
    if not isinstance(school, str) or school.strip() == '':
        print('Skipping unusable root domain entry:', repr(school))
        log_list.append('Skipping unusable root domain entry: ' + repr(school))
        continue

    print('STARTING SCHOOL', school.upper())
    log_list.append('STARTING SCHOOL ' + school.upper())
    searchstr = ''.join((r'https://www.google.com/search?q=',
                         r'application+admission+AND+(printable+OR+paper+OR+mail)+site:',
                         school, r'+filetype:pdf'))
    log_list.append('Search string is ' + searchstr)
    browser.get(searchstr)
    results = browser.find_elements_by_css_selector('h3 > a')

    # Test if there were results.
    if len(results) == 0:
        # Occasionally Google responds with a CAPTCHA verification challenge.
        # find_elements_* returns an empty list when nothing matches, so no
        # try/except is needed and Ctrl-C is not mistaken for "no results".
        captcha_links = browser.find_elements_by_partial_link_text('Why did this happen')
        if captcha_links:
            # Wait for the user to complete the challenge in the browser and
            # press Enter, then re-read the result links.
            print('At school, ', school.lower(), 'Google responded with CAPTCHA - Will wait for user input.', end='\n\n')
            log_list.append(school + '-' + '.' + '-EncounteredCAPTCHA-' + '.')
            input()
            results = browser.find_elements_by_css_selector('h3 > a')
        else:
            # Occasionally Google responds with no results.
            print('No results for school ', school.lower())
            log_list.append(school + '-' + '.' + '-NoResults-' + '.')
            sleep(.5)

    # Test the number of results. Download up to the first MAX_DOCS results.
    doc_count = min(len(results), MAX_DOCS)
    if len(results) < MAX_DOCS:
        log_list.append('There were {} results from Google. Will download all.'.format(len(results)))
        sleep(.1)
    else:
        # Report the actual count (the old message always claimed "10 or more").
        log_list.append('There were {} results from Google. Will download first {}.'.format(len(results), MAX_DOCS))
        sleep(.2)

    # If len(results) == 0 then doc_count is also zero, which skips this loop.
    for i in range(doc_count):
        filelink = results[i].get_attribute('href')
        fname = ''.join((school, str(i), '.pdf'))
        try:
            pdf = requests.get(filelink, timeout=DOWNLOAD_TIMEOUT)
            # The with-block closes the file even if the write fails.
            with open(os.path.join(OUTPUT_DIR, fname), 'wb') as pdf_file:
                pdf_file.write(pdf.content)
            log_list.append('Filelink = ' + filelink)
            log_list.append('Saved as : ' + fname)
            log_list.append('Header info :' + str(pdf.headers))
        except requests.exceptions.ConnectionError:
            # requests raises its own ConnectionError, which is not a subclass
            # of the builtin one, so the qualified name is required here.
            print('There was a ConnectionError on the {}th iteration at : {}'.format(str(i), school.lower()))
            log_list.append('{}-{}-ConnectionError-{}'.format(school, i, filelink))
            sleep(.1)
        except requests.exceptions.Timeout:
            # The server accepted the connection but stalled past the read timeout.
            print('There was a Timeout on the {}th iteration at : {}'.format(str(i), school.lower()))
            log_list.append('{}-{}-Timeout-{}'.format(school, i, filelink))
            sleep(.1)
        except Exception as err:
            # Best effort: record anything else and move on. Catching Exception
            # rather than using a bare except lets Ctrl-C stop the run.
            print('There was an UnspecifiedError on the {}th iteration at : {}'.format(str(i), school))
            log_list.append('{}-{}-UnspecifiedError-{}'.format(school, i, filelink))
            log_list.append('Error detail : ' + repr(err))
            sleep(.1)

STARTING SCHOOL BABSON.EDU
http://www.babson.edu/Academics/centers/blank-center/bcerc/Documents/2017%20BCERC-University%20of%20Oklahoma%20Call%20for%20Papers.pdf
http://www.babson.edu/program/graduate/Documents/PDF/babson-application-steps.pdf
http://www.babson.edu/program/graduate/Documents/PDF/Babson%20MBA%20Application%20Steps.pdf
STARTING SCHOOL BOSTON.EDU
http://resources.boston.edu/docs/admission_application.pdf
There was an UnspecifiedError on the 0th iteration at : Boston.edu
STARTING SCHOOL BAYPATH.EDU
http://web.baypath.edu/ir/CDS_2011-2012(FINAL).pdf
http://www.baypath.edu/academics/~/media/Files/PDF/Academics/Graduate%20Programs%201516.ashx
http://www.baypath.edu/academics/~/media/Files/PDF/Academics/TAWC%20Catalog%2020142015%20FINAL%20012115.ashx
STARTING SCHOOL BECKER.EDU
http://www.becker.edu/wp-content/uploads/2011/12/International-Student-Application-Addendum1.pdf
http://www.becker.edu/wp-content/uploads/2011/11/International-Student-Application-Addendum.pdf
http://www

STARTING SCHOOL FRAMINGHAM.EDU
https://www.framingham.edu/Assets/uploads/about-fsu/office-of-institutional-research/_documents/cds-2015-2016.pdf
https://www.framingham.edu/Assets/uploads/admissions-and-aid/admissions/_documents/undergraduate/readmission-application.pdf
https://www.framingham.edu/Assets/uploads/student-life/veteran-services/_documents/incoming-veteran-checklist.pdf
STARTING SCHOOL BFIT.EDU
http://www.bfit.edu/ArticleDocuments/160/International%20Students_Application%202012-1013.pdf.aspx
http://www.bfit.edu/ArticleDocuments/167/Catalog%202005-06.pdf.aspx
http://www.bfit.edu/ArticleDocuments/167/Catalog%202010-11.pdf.aspx
STARTING SCHOOL GORDON.EDU
http://www.gordon.edu/download.cfm?id=2276
http://www.gordon.edu/download.cfm?id=1848
http://www.gordon.edu/download.cfm?id=858
STARTING SCHOOL GORDONCONWELL.EDU
http://www.gordonconwell.edu/charlotte/future/documents/cha-application.pdf
http://www.gordonconwell.edu/hmp/future/documents/Admissions-Application.pdf
http://www.gor

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



STARTING SCHOOL MERRIMACK.EDU
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-b54ceceee3f2>", line 12, in <module>
    browser.get(searchstr)
  File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 324, in get
    self.execute(Command.GET, {'url': url})
  File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 310, in execute
    response = self.command_executor.execute(driver_command, params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 466, in execute
    return self._request(command_info[0], url, body=data)
  File "C:\ProgramData\Anaconda3\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 489, in _request
    self._conn.request(method, parsed_url.path,

KeyboardInterrupt: 

In [8]:
# Save the error log to a timestamped file for later reference.
# One timestamp is taken so the file name and the header line agree.
log_time = datetime.datetime.now()
# Same name as before: log_YY-MM-DD-HHMM.log
log_name = 'log_' + log_time.strftime('%y-%m-%d-%H%M') + '.log'

# UTF-8 keeps non-ASCII characters in URLs or response headers from failing
# under the platform default encoding. The with-block closes the file, so no
# explicit close is needed.
with open(log_name, mode='w', encoding='utf-8') as logfile:
    print('This is the error log file from {}'.format(str(log_time)), file=logfile)
    for log_lines in log_list:
        print(log_lines, file=logfile)

<function TextIOWrapper.close>