# Webscrape college applications

This notebook has been prepared as a prototype designed to test processes that will search college & university websites for PDF files that relate to early decision applications.

## Step One: Prepare Directory Information

This repository's Stata do file `ColAppScrape.do` uses the directory file produced by the [StataIPEDSAll repository](https://github.com/adamrossnelson/StataIPEDSAll).

In [None]:
import sys
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Load directory data.
IPEDSfile = pd.read_stata('../../statadata/IPEDSDirInfo02to16smlr.dta', preserve_dtypes=False)

# Keep 2 and 4yr institutions.
# Only public and private not-for-profit sectors are retained; the 'filter'
# column records the decision (1 = keep, 0 = drop) so it can be inspected later.
keep_sectors = [
    'Public, 4-year or above',
    'Public, 2-year',
    'Private not-for-profit, 4-year or above',
    'Private not-for-profit, 2-year',
]
IPEDSfile['filter'] = np.where(IPEDSfile['sector'].isin(keep_sectors), 1, 0)
# Take an explicit copy: new columns are assigned to this frame afterwards,
# and assigning to a filtered slice raises pandas' SettingWithCopyWarning.
IPEDSfile = IPEDSfile[IPEDSfile['filter'] == 1].copy()

In [None]:
# Build 'rootdom': the bare host name taken from each institution's webaddr.
# Strip leading whitespace and an optional http:// or https:// scheme
# (case-insensitive; the scheme is removed first so 'www.' is then at the start).
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'webaddr'].replace(
    regex=True, to_replace=r'(?i)^\s*(?:https?:/*)?', value='')
# Remove www. prefix from webaddress. The dot is escaped and the pattern is
# anchored so only a literal leading 'www.' is removed, not any 'www' + char.
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'rootdom'].replace(
    regex=True, to_replace=r'(?i)^www\.', value='')
# Drop everything from the first slash or whitespace onward, so a path such as
# 'foo.edu/admissions' yields 'foo.edu' instead of being fused into the host.
IPEDSfile['rootdom'] = IPEDSfile.loc[:, 'rootdom'].replace(
    regex=True, to_replace=r'[/\s].*$', value='')

In [None]:
# Preview the first 15 rows of the identifying columns to check the cleanup.
preview_cols = ['unitid', 'instnm', 'rootdom', 'filter', 'sector', 'iclevel']
IPEDSfile.loc[:, preview_cols].head(15)

In [None]:
# Show pandas' descriptive summary of the directory data as a sanity check.
IPEDSfile.describe()

In [None]:
# Create list of domains
# Missing addresses are dropped because the scraping loop concatenates each
# entry into a search URL, which fails on NaN; blank entries would search the
# whole web instead of one site. Repeated domains are dropped because a repeat
# would run the same search and overwrite the same pdfs/<domain><i>.pdf files.
weblist = IPEDSfile['rootdom'].dropna()
weblist = weblist[weblist != ''].drop_duplicates()

## Step Two: Prepare Browser Object

In [None]:
import datetime
import os
import webbrowser
from time import sleep

import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException

In [None]:
# Start the Firefox browser instance that Selenium drives for the searches below.
browser = webdriver.Firefox()
print('Loaded Browser Here')
# Alternative driver, left disabled:
# browser = webdriver.Chrome()

## Step Three: Scrape For Each School

In [None]:
# Define a list to store record of errors.
# Each entry has the form '<school>-<result index>-<error label>-<link>'.
error_list = []

# Make sure the download directory exists; without it every save below would
# fail and be recorded as an error.
os.makedirs('pdfs', exist_ok=True)

# Iteratively loop through each institution's website.
for school in weblist:
    print('STARTING SCHOOL', str(school).upper())
    # Google advanced search for "early decision", restricted to the school's
    # domain (as_sitesearch) and to pdf results (as_filetype).
    searchstr = ''.join((r'https://www.google.com/search?as_q=early+decision&as_epq=&as_sitesearch=',
                         school, '&as_filetype=pdf'))
    browser.get(searchstr)
    print('Search string is ', searchstr)
    # NOTE(review): find_elements_by_* is the Selenium 3 style API; confirm it
    # is available in the installed Selenium version.
    results = browser.find_elements_by_css_selector('h3 > a')

    if not results:
        # An empty result list means either a genuine "no hits" page or a
        # CAPTCHA page; the CAPTCHA page is recognized by its help link.
        try:
            browser.find_element_by_partial_link_text('Why did this happen')
        except NoSuchElementException:
            print('No results for school ', school.lower(), end='\n\n')
            error_list.append(''.join((school, '-', '.', '-NoResults-', '.')))
        else:
            print('At school, ', school.lower(), 'Google responded with CAPTCHA - will exit program.', end='\n\n')
            error_list.append(''.join((school, '-', '.', '-EncounteredCAPTCHA-', '.')))
            # Further searches would also be blocked, so stop here.
            sys.exit()
        continue

    for i, result in enumerate(results):
        filelink = result.get_attribute('href')
        try:
            # A timeout keeps one unresponsive server from stalling the run.
            pdf = requests.get(filelink, timeout=60)
            fname = ''.join((school, str(i), '.pdf'))
            print('Filelink = ' + filelink)
            print('Saved as : ' + fname)
            print('Header info :' + str(pdf.headers), end='\n\n')
            # Context manager so the file handle is closed after each write.
            with open(''.join((r'pdfs/', fname)), 'wb') as pdffile:
                pdffile.write(pdf.content)
        except requests.exceptions.ConnectionError:
            # requests raises its own ConnectionError class, which the builtin
            # ConnectionError does not match.
            print('There was a ConnectionError on the {}th iteration at : {}'.format(str(i), school.lower()))
            error_list.append(''.join((school, '-', str(i), '-ConnectionError-', str(filelink))))
        except Exception:
            # Best-effort: record any other failure and move on to the next
            # link. Exception (not a bare except) leaves Ctrl-C and sys.exit
            # working. str() guards against a missing href (None).
            print('There was an UnspecifiedError on the {}th iteration at : {}'.format(str(i), school))
            error_list.append(''.join((school, '-', str(i), '-UnspecifiedError-', str(filelink))))

In [None]:
# Save the error log to a file for later reference.
# One timestamp is taken so the file name and the header line agree.
log_time = datetime.datetime.now()
# File name pattern: er_log_YY-MM-DD-HHMM.log
log_name = 'er_log_{}.log'.format(log_time.strftime('%y-%m-%d-%H%M'))
# error_list is written as collected by the scraping loop (one entry per
# line); the with-block closes the file, so no explicit close is needed.
with open(log_name, mode='w') as logfile:
    print('This is the error log file from {}'.format(str(log_time)), file=logfile)
    for error_lines in error_list:
        print(error_lines, file=logfile)