# Webscrape college applications
## Step One: Prepare Directory Information

In [None]:
import sys

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [None]:
# Load the IPEDS directory extract and derive a clean root domain per school.
pd.set_option('display.max_rows', 200)
IPEDSfile = pd.read_stata('../../statadata/IPEDSDirInfo02to16smlr.dta', preserve_dtypes=False)

# Reduce to public / private not-for-profit 2- and 4-year institutions.
# The 0/1 'filter' column is kept so the frame has the same columns as before.
kept_sectors = [
    'Public, 4-year or above',
    'Public, 2-year',
    'Private not-for-profit, 4-year or above',
    'Private not-for-profit, 2-year',
]
IPEDSfile['filter'] = np.where(IPEDSfile['sector'].isin(kept_sectors), 1, 0)
# .copy() so the column assignment below writes to an independent frame
# rather than a slice of the original (avoids SettingWithCopyWarning).
IPEDSfile = IPEDSfile[IPEDSfile['filter'] == 1].copy()

# Reduce each web address to its bare host name, applied in this order:
#   1. drop a leading http:/https: scheme (any case) and its slashes;
#   2. drop a leading "www." -- the dot is escaped and the match anchored so
#      that hosts such as "www2.example.edu" are left intact;
#   3. drop everything from the first remaining slash onward (the path, e.g.
#      "/about"), instead of deleting slashes and the word "about" wherever
#      they occur, which glued paths onto the host and damaged domains that
#      merely contain "about";
#   4. drop any stray whitespace.
# Missing addresses (NaN) pass through unchanged.
cleanup_patterns = [
    r'(?i)^\s*https?:/*',
    r'(?i)^www\.',
    r'/.*$',
    r'\s+',
]
rootdom = IPEDSfile['webaddr']
for pattern in cleanup_patterns:
    rootdom = rootdom.replace(regex=True, to_replace=pattern, value='')
IPEDSfile['rootdom'] = rootdom

# Renumber rows 0..n-1; the previous index is retained as an 'index' column.
IPEDSfile = IPEDSfile.reset_index()

In [None]:
# Ask where to resume after a previous (interrupted) scrape attempt.
print('Enter the school starting school root domain name. No entry will start at beginning of the list.')
# Strip stray whitespace so a trailing space does not defeat the lookup.
start_school = input().strip()
# If the user provided a starting school, drop the schools listed before it.
if start_school != '':
    matching_labels = IPEDSfile.index[IPEDSfile['rootdom'] == start_school]
    if len(matching_labels) == 0:
        # Fail with an explicit message instead of an opaque IndexError.
        raise ValueError(
            'Starting school {!r} was not found in the rootdom column.'.format(start_school))
    # Slice by index label (.loc) rather than by position: the label found
    # above only equals the row position while the index still starts at 0,
    # which stops being true once this cell has already trimmed the frame.
    IPEDSfile = IPEDSfile.loc[matching_labels[0]:]

In [None]:
# Create list of domains to scrape & check results.
weblist = IPEDSfile['rootdom']
weblist.head(n=20)

## Step Two: Prepare Browser Object

In [None]:
import webbrowser
from time import sleep
from selenium import webdriver
import os
import requests

In [None]:
# Start the Selenium-controlled browser session used by the scraping loop.
# Firefox is used here; the Chrome driver is kept as a commented-out alternative.
# browser = webdriver.Chrome()
browser = webdriver.Firefox()
print('Loaded Browser Here')

## Step Three: Scrape For Each School

In [None]:
# For every school, run a Google search restricted to the school's domain for
# printable/paper application PDFs and download the top hits into pprapps/.
# Every problem is appended to error_list as
# '<school>-<hit index or .>-<ErrorKind>-<link or .>'.
error_list = []
max_docs = 3            # download at most this many search hits per school
request_timeout = 30    # seconds; without a timeout requests.get can block forever
output_dir = 'pprapps'
# Make sure the download directory exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for school in weblist:
    # Missing web addresses arrive as NaN (a float); joining one into the
    # search string raises TypeError, so record and skip such entries.
    if not isinstance(school, str) or not school.strip():
        print('Skipping entry without a usable root domain: ', repr(school), end='\n\n')
        error_list.append(''.join((str(school), '-', '.', '-NoDomain-', '.')))
        continue

    print('STARTING SCHOOL', school.upper())
    searchstr = ''.join((r'https://www.google.com/search?q=',
                         r'application+admission+AND+(printable+OR+paper+OR+mail)+site:',
                         school, r'+filetype:pdf'))
    print('Search string is ', searchstr)
    browser.get(searchstr)
    results = browser.find_elements_by_css_selector('h3 > a')

    if not results:
        # The plural find_elements_* call returns an empty list when nothing
        # matches, so no exception handling is needed to tell a CAPTCHA page
        # apart from a genuinely empty result page.
        if browser.find_elements_by_partial_link_text('Why did this happen'):
            print('At school, ', school.lower(), 'Google responded with CAPTCHA - Will wait for user input.', end='\n\n')
            error_list.append(''.join((school, '-', '.', '-EncounteredCAPTCHA-', '.')))
            # Block until the user has solved the CAPTCHA and pressed Enter,
            # then re-read the result links from the refreshed page.
            input()
            results = browser.find_elements_by_css_selector('h3 > a')
        else:
            print('No results for school ', school.lower(), end='\n\n')
            error_list.append(''.join((school, '-', '.', '-NoResults-', '.')))
            sleep(.5)

    doc_count = min(len(results), max_docs)
    if len(results) < max_docs:
        print('There were {} results from Google. Will download all.'.format(len(results)))
        sleep(.1)
    else:
        print('There were {} results from Google. Will download first {}.'.format(len(results), doc_count))
        sleep(.2)

    for i, result in enumerate(results[:doc_count]):
        filelink = None
        try:
            filelink = result.get_attribute('href')
            pdf = requests.get(filelink, timeout=request_timeout)
            fname = ''.join((school, str(i), '.pdf'))
            # The context manager closes the file even if the write fails.
            with open(os.path.join(output_dir, fname), 'wb') as outfile:
                outfile.write(pdf.content)
            print('Filelink = ' + filelink)
            print('Saved as : ' + fname)
            print('Header info :' + str(pdf.headers), end='\n\n')
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # requests raises its own ConnectionError/Timeout classes; the
            # builtin ConnectionError does not match them.
            print('There was a ConnectionError on the {}th iteration at : {}'.format(str(i), school.lower()))
            error_list.append(''.join((school, '-', str(i), '-ConnectionError-', str(filelink))))
            sleep(.1)
        except Exception as err:
            # Best effort: log and move on to the next hit. Catching Exception
            # (not a bare except) lets Ctrl-C still interrupt the scrape.
            print('There was an UnspecifiedError on the {}th iteration at : {}'.format(str(i), school))
            print('Error detail : ' + repr(err))
            error_list.append(''.join((school, '-', str(i), '-UnspecifiedError-', str(filelink))))
            sleep(.1)

In [None]:
# Print every problem recorded during the scrape, one entry per line.
for entry in error_list:
    print(entry)
    
# Encountered CAPTCHA at stillman.edu               April 17, 2018
# Encountered NaN after alaskapacific.edu           April 18, 2018
# Encountered CAPTCHA at buc.edu                    April 18, 2018
# Program STALLED at arapahoe.edu                   April 18, 2018