# Webscrape college applications
## Step One: Prepare Directory Information

In [1]:
import sys

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# Load directory data.
IPEDSfile = pd.read_stata('../../statadata/IPEDSDirInfo02to16smlr.dta', preserve_dtypes=False)
# Reduce to 2 and 4yr institutions.
IPEDSfile['filter'] = np.where((IPEDSfile['sector']=='Public, 4-year or above') | 
                               (IPEDSfile['sector']=='Public, 2-year') |
                               (IPEDSfile['sector']=='Private not-for-profit, 4-year or above') |
                               (IPEDSfile['sector']=='Private not-for-profit, 2-year'), 1, 0)
IPEDSfile = IPEDSfile[IPEDSfile['filter']==1]

In [4]:
# Remove www. prefix from webaddress.
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'webaddr'].replace(regex=True, to_replace='www.', value='')
# Remove miscellaneous slashes & other from webaddress.
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'rootdom'].replace(regex=True, to_replace=r'/', value='')
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'rootdom'].replace(regex=True, to_replace=r'HTTPS:', value='')

In [5]:
# IPEDSfile[['unitid','instnm','rootdom','filter','sector','iclevel']].head(n=15)

In [6]:
# IPEDSfile.describe()

In [7]:
# Create list of domains to scrape
weblist = IPEDSfile['rootdom']

## Step Two: Prepare Browser Object

In [8]:
import webbrowser
from time import sleep
from selenium import webdriver
import os
import requests

In [9]:
# browser = webdriver.Chrome()
browser = webdriver.Firefox()
print('Loaded Browser Here')

Loaded Browser Here


# Step Three: Scrape For Each School

In [10]:
error_list = []
for school in weblist:
    print('STARTING SCHOOL', str(school).upper())
    searchstr = ''.join((r'https://www.google.com/search?q=',
                         r'application+admission+AND+(printable+OR+paper+OR+mail)+site:', 
                         school, r'+filetype:pdf'))
    print('Search string is ', searchstr)
    browser.get(searchstr)
    results = browser.find_elements_by_css_selector('h3 > a')
    
    if len(results) < 3:
        doc_count = len(results)
        print('There were {} results from Google. Will download all.'.format(len(results)))
    else:
        doc_count = 3
        print('There were 10 or more results from Google. Will download first three.')
        
    if len(results) == 0:
        try:
            check_captcha = browser.find_element_by_partial_link_text('Why did this happen')
            print('At school, ', school.lower(), 'Google responded with CAPTCHA - will exit program.', end='\n\n')
            error_list.append(''.join((school, '-', '.', '-EncounteredCAPTCHA-', '.')))
            sys.exit()
        except:
            print('No results for school ', school.lower(), end='\n\n')
            error_list.append(''.join((school, '-', '.', '-NoResults-', '.')))
            sleep(.5)

    for i in range(doc_count):
        filelink = results[i].get_attribute('href')
        try:
            pdf = requests.get(filelink)
            fname = ''.join((school, str(i), '.pdf'))
            open(''.join((r'pprapps/', fname)), 'wb').write(pdf.content)
            print('Filelink = ' + filelink)
            print('Saved as : ' + fname)
            print('Header info :' + str(pdf.headers), end='\n\n')
        except ConnectionError:
            print('There was a ConnectionError on the {}th iteration at : {}'.format(str(i), school.lower()))
            error_list.append(''.join((school, '-', str(i), '-ConnectionError-', filelink)))
        except:
            print('There was an UnspecifiedError on the {}th iteration at : {}'.format(str(i), school))
            error_list.append(''.join((school, '-', str(i), '-UnspecifiedError-', filelink)))

STARTING SCHOOL AAMU.EDU
Search string is  https://www.google.com/search?q=application+admission+AND+(printable+OR+paper+OR+mail)+site:aamu.edu+filetype:pdf
There were 10 or more results from Google. Will download first three.
Filelink = http://www.aamu.edu/Admissions/UndergraduateAdmissions/apply/Documents/AdmApplication09.pdf
Saved as : aamu.edu0.pdf
Header info :{'Cache-Control': 'private,max-age=0', 'Content-Length': '12281215', 'Content-Type': 'application/pdf', 'Expires': 'Mon, 02 Apr 2018 22:34:42 GMT', 'Last-Modified': 'Mon, 06 Jun 2011 22:08:09 GMT', 'ETag': '"{4740143A-DBE2-4B20-AAFB-3440D0AE6BD0},2"', 'Server': 'Microsoft-IIS/7.5', 'SPRequestGuid': '274fdc24-0a35-46b2-be74-af69bf73d0e4', 'X-SharePointHealthScore': '0', 'ResourceTag': 'rt:4740143A-DBE2-4B20-AAFB-3440D0AE6BD0@00000000002', 'X-Content-Type-Options': 'nosniff', 'Content-Disposition': 'attachment; filename=AdmApplication09.pdf', 'X-Download-Options': 'noopen', 'Public-Extension': 'http://schemas.microsoft.com/rep

Filelink = http://www.alasu.edu/about-asu/institutional-planning-and-effectiveness/office-institutional-research/institutional-data/common-data-set-/download.aspx?id=12801
Saved as : alasu.edu2.pdf
Header info :{'Cache-Control': 'public', 'Content-Length': '128282', 'Content-Type': 'application/pdf', 'Server': 'Microsoft-IIS/7.0', 'X-AspNet-Version': '2.0.50727', 'Content-Disposition': 'attachment; filename="CDS2010_2011.pdf"', 'Set-Cookie': 'ASP.NET_SessionId=0eirnsr5mrhajlzakg4zi32h; path=/; HttpOnly', 'X-Powered-By': 'ASP.NET', 'Date': 'Tue, 17 Apr 2018 22:27:51 GMT'}

STARTING SCHOOL UA.EDU
Search string is  https://www.google.com/search?q=application+admission+AND+(printable+OR+paper+OR+mail)+site:ua.edu+filetype:pdf
There were 10 or more results from Google. Will download first three.
Filelink = https://www.law.ua.edu/misc/LLMInstructions2018.pdf
Saved as : ua.edu0.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:35:08 GMT', 'Server': 'Apache', 'Last-Modified': 'Wed, 26 Jul 2017 17

There were 10 or more results from Google. Will download first three.
Filelink = http://www.auburn.edu/administration/business-finance/finaid/pdf/application-form.pdf
Saved as : auburn.edu0.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:35:18 GMT', 'Server': 'Apache/2.2.15 (Red Hat)', 'Last-Modified': 'Thu, 31 Jan 2008 22:17:10 GMT', 'Accept-Ranges': 'bytes', 'Content-Length': '490193', 'Cache-Control': 'max-age=604800', 'Expires': 'Tue, 24 Apr 2018 22:35:18 GMT', 'Keep-Alive': 'timeout=15, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'application/pdf'}

Filelink = https://onlinedegrees.auburn.edu/business/documents/BSBA-Checklist.pdf
Saved as : auburn.edu1.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:35:19 GMT', 'Server': 'Apache', 'Set-Cookie': 'grav-site-62d64ed=rhb2eun44im1q3bodod2otc6j2; expires=Tue, 17-Apr-2018 23:05:19 GMT; Max-Age=1800; path=/business; domain=onlinedegrees.auburn.edu; HttpOnly, grav-site-62d64ed=rhb2eun44im1q3bodod2otc6j2; expires=Tue, 17-Apr-2018 23

There was an UnspecifiedError on the 2th iteration at : faulkner.edu
STARTING SCHOOL GADSDENSTATE.EDU
Search string is  https://www.google.com/search?q=application+admission+AND+(printable+OR+paper+OR+mail)+site:gadsdenstate.edu+filetype:pdf
There were 10 or more results from Google. Will download first three.
There was an UnspecifiedError on the 0th iteration at : gadsdenstate.edu
Filelink = http://www.gadsdenstate.edu/sites/default/files/u23/Admission%20Policies%20and%20Procedures.pdf
Saved as : gadsdenstate.edu1.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:35:30 GMT', 'Server': 'Apache/2.4.6 (CentOS)', 'X-Content-Type-Options': 'nosniff', 'Last-Modified': 'Mon, 11 Jul 2016 13:30:14 GMT', 'ETag': '"61fa8-5375c26ffb512"', 'Accept-Ranges': 'bytes', 'Content-Length': '401320', 'Cache-Control': 'max-age=1209600', 'Expires': 'Tue, 01 May 2018 22:35:30 GMT', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'application/pdf', 'Set-Cookie': 'Coyote-2-509ba8c0

Filelink = http://www.hcu.edu/wp-content/uploads/2015/05/2017-18-Catalog-04.24.17-PDF.pdf
Saved as : hcu.edu2.pdf
Header info :{'Server': 'nginx/1.12.2', 'Date': 'Tue, 17 Apr 2018 22:35:42 GMT', 'Content-Type': 'application/pdf', 'Content-Length': '560820', 'Connection': 'keep-alive', 'Last-Modified': 'Wed, 26 Apr 2017 13:46:28 GMT', 'Accept-Ranges': 'bytes'}

STARTING SCHOOL DRAKESTATE.EDU
Search string is  https://www.google.com/search?q=application+admission+AND+(printable+OR+paper+OR+mail)+site:drakestate.edu+filetype:pdf
There were 10 or more results from Google. Will download first three.
Filelink = http://drakestate.edu/Uploads/files/About/Complete%20Admissions%20Policy%20and%20Procedure%20Manaual%202010-2011.pdf
Saved as : drakestate.edu0.pdf
Header info :{'Content-Type': 'application/pdf', 'Last-Modified': 'Tue, 23 Aug 2011 18:54:16 GMT', 'Accept-Ranges': 'bytes', 'ETag': '"48b6e3ec661cc1:0"', 'Server': 'Microsoft-IIS/8.5', 'X-Powered-By': 'ASP.NET', 'Date': 'Tue, 17 Apr 2018 

Filelink = https://helen.jeffstateonline.com/JSCC/1314Dependent_Verification_worksheet.pdf
Saved as : jeffstateonline.com2.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:35:58 GMT', 'ETag': '"360002-57f9f-4e9f60ad80b00"', 'Accept-Ranges': 'bytes', 'Content-Length': '360351', 'Content-Type': 'application/pdf', 'Content-Language': 'en', 'Connection': 'Keep-Alive', 'Keep-Alive': 'timeout=5, max=999', 'Server': 'Oracle-Application-Server-11g Oracle-Web-Cache-11g/11.1.1.4.0 (N;ecid=3230173396417227,0:1)', 'Last-Modified': 'Wed, 30 Oct 2013 14:21:00 GMT'}

STARTING SCHOOL CALHOUN.EDU
Search string is  https://www.google.com/search?q=application+admission+AND+(printable+OR+paper+OR+mail)+site:calhoun.edu+filetype:pdf
There were 10 or more results from Google. Will download first three.
Filelink = http://www.calhoun.edu/Content/Uploads/calhouninnovate.edu/files/Application.pdf
Saved as : calhoun.edu0.pdf
Header info :{'Content-Type': 'application/pdf', 'Last-Modified': 'Thu, 17 Sep 2015 18:03:

Filelink = https://marionmilitary.edu/wp-content/uploads/2017/11/2017-18-Registration-Instructions.pdf
Saved as : marionmilitary.edu1.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:14 GMT', 'Server': 'Apache', 'X-Powered-By': 'PHP/5.6.34', 'Expires': 'Wed, 11 Jan 1984 05:00:00 GMT', 'Cache-Control': 'no-cache, must-revalidate, max-age=0', 'Link': '<https://marionmilitary.edu/wp-json/>; rel="https://api.w.org/"', 'X-TEC-API-VERSION': 'v1', 'X-TEC-API-ROOT': 'https://marionmilitary.edu/wp-json/tribe/events/v1/', 'X-TEC-API-ORIGIN': 'https://marionmilitary.edu', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Transfer-Encoding': 'chunked', 'Content-Type': 'text/html; charset=UTF-8'}

Filelink = https://marionmilitary.edu/wp-content/uploads/2017/08/MMI-catalog-2010_11.pdf
Saved as : marionmilitary.edu2.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:15 GMT', 'Server': 'Apache', 'Last-Modified': 'Thu, 17 Aug 2017 13:27:19 GMT', 'Accept-Ranges': 'bytes', 'Content-Length

Filelink = https://www.una.edu/international/admissions/International%20Student%20Application%20Form.pdf
Saved as : una.edu1.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:25 GMT', 'Server': 'Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/5.4.16', 'Strict-Transport-Security': 'max-age=15768000', 'Last-Modified': 'Thu, 12 Mar 2015 04:35:41 GMT', 'ETag': '"fc633-5110fea8f6140"', 'Accept-Ranges': 'bytes', 'Content-Length': '1033779', 'Cache-Control': 's-maxage=10', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'application/pdf'}

Filelink = https://www.una.edu/graduate/docs/AppGradSch9-22-11a.pdf
Saved as : una.edu2.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:26 GMT', 'Server': 'Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/5.4.16', 'Strict-Transport-Security': 'max-age=15768000', 'Last-Modified': 'Thu, 22 Mar 2018 05:17:05 GMT', 'ETag': '"44427-567f96b250bae"', 'Accept-Ranges': 'bytes', 'Content-Length': '279591', 'Cache-Control': 's-maxage=10', 

Filelink = https://www.bishop.edu/pdfs/nursing/Nursing_Application.pdf
Saved as : bishop.edu1.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:42 GMT', 'Server': 'Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/5.6.33', 'Strict-Transport-Security': 'max-age=63072000; includeSubDomains;', 'Last-Modified': 'Fri, 09 Dec 2016 21:37:16 GMT', 'ETag': '"5bc55-543408ec81b00"', 'Accept-Ranges': 'bytes', 'Content-Length': '375893', 'Cache-Control': 'max-age=2592000', 'Expires': 'Thu, 17 May 2018 22:36:42 GMT', 'X-Frame-Options': 'DENY', 'X-Content-Type-Options': 'nosniff', 'X-XSS-Protection': '1; mode=block;', 'Vary': 'User-Agent', 'Keep-Alive': 'timeout=5, max=100', 'Connection': 'Keep-Alive', 'Content-Type': 'application/pdf'}

Filelink = https://www.bishop.edu/pdfs/admissions/Enrollment%20Guide7-6_4web.pdf
Saved as : bishop.edu2.pdf
Header info :{'Date': 'Tue, 17 Apr 2018 22:36:43 GMT', 'Server': 'Apache/2.4.6 (CentOS) OpenSSL/1.0.2k-fips PHP/5.6.33', 'Strict-Transport-Security': 'max-age=6307

Filelink = http://snead.edu/sites/www/Uploads/files/About%20Us/ACS%20Nursing%20Application%20-%20Generic2018.pdf
Saved as : snead.edu0.pdf
Header info :{'Content-Type': 'application/pdf', 'Last-Modified': 'Thu, 01 Feb 2018 21:47:39 GMT', 'Accept-Ranges': 'bytes', 'ETag': '"a028dc47a69bd31:0"', 'Server': 'Microsoft-IIS/7.5', 'X-Powered-By': 'ASP.NET', 'Date': 'Tue, 17 Apr 2018 22:37:03 GMT', 'Content-Length': '359293'}

Filelink = http://www.snead.edu/sites/www/Uploads/files/Current%20Students/Admission%20Information%20Addendum.pdf
Saved as : snead.edu1.pdf
Header info :{'Content-Type': 'application/pdf', 'Last-Modified': 'Thu, 13 Jul 2017 22:52:39 GMT', 'Accept-Ranges': 'bytes', 'ETag': '"7b1d48ba2afcd21:0"', 'Server': 'Microsoft-IIS/7.5', 'X-Powered-By': 'ASP.NET', 'Date': 'Tue, 17 Apr 2018 22:37:03 GMT', 'Content-Length': '412806'}

Filelink = http://www.snead.edu/sites/www/Uploads/files/Financial%20Aid/15-16_Scholarship_Application.pdf
Saved as : snead.edu2.pdf
Header info :{'Conten

NameError: name 'time' is not defined

In [15]:
for errors in range(len(error_list)):
    print(error_list[errors])
    
# Encountered CAPTCHA at stillman.edu

faulknerstate.edu-0-UnspecifiedError-http://www.faulknerstate.edu/media/faulkner_state/content_assets/documents/admissions/UPDATED_APPLICATION_1-26-15.pdf
faulknerstate.edu-1-UnspecifiedError-http://www.faulknerstate.edu/media/faulkner_state/content_assets/documents/programs/majors_and_departments/nursing/Coastal_Alabama_Nursing_Application_1_15_17[1].pdf
faulknerstate.edu-2-UnspecifiedError-http://www.faulknerstate.edu/sites/www/Uploads/files/Programs/Application_for_Admission.pdf
faulkner.edu-2-UnspecifiedError-https://ww2.faulkner.edu/sharedmedia/Admissions/TRAD_ReferenceForms.pdf
gadsdenstate.edu-0-UnspecifiedError-https://www.gadsdenstate.edu/sites/default/files/u36/intlapp.pdf
gadsdenstate.edu-2-UnspecifiedError-https://www.gadsdenstate.edu/sites/default/files/u34/Catalog/Enrollment%20Services.pdf
uwa.edu-1-UnspecifiedError-https://secure.uwa.edu/printapps/undecamp.pdf
stillman.edu-.-EncounteredCAPTCHA-.
stillman.edu-.-NoResults-.
