## Scraping of DOD pubs from DTIC


In [1]:
import os
import requests

locations = {'dod_directives':'dod_dir',
            'dod_instructions':'dod_instr'}

def ensure_dir(dir_name):
    INSTR_PATH = os.path.join(os.getcwd(),dir_name)
    try:
        os.mkdir(INSTR_PATH)
    except:
        pass
    
for loc in locations.items():
    ensure_dir(loc[1])

In [2]:
def save_file(doctype, url):
    fname = url.split("/")[-1]
    fpath = os.path.join(os.getcwd(), locations[doctype], fname)

    # check if already downloaded
    if not os.path.isfile(fpath):
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            with open(fpath, 'wb') as handle:
                for block in response.iter_content(1024):
                    handle.write(block)
        except Exception as e:
            print(e)
            pass
    else:
        print("Skipping {}".format(fpath))

In [54]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep

driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
driver.implicitly_wait(0.5)


### We have to tell our crawler where to start - these are the links we want.

http://www.e-publishing.af.mil/#/?view=pubs&orgID=10141&catID=1&series=-1&modID=449&tabID=71

In [55]:
dodd_url = 'http://www.esd.whs.mil/Directives/issuances/dodd/'
dodi_url = 'http://www.esd.whs.mil/Directives/issuances/dodi/'

driver.get(dodi_url)

### Here we tell the crawler how to discover more search results and detect when it is done.

In [63]:
def rip_table():
    headers = get_hdrs()
    return get_recs(headers)

def get_hdrs():
    table = driver.find_element_by_class_name('dnnGrid')
    # get headers
    header_row = table.find_element_by_class_name('dnnGridHeader')
    headers = header_row.find_elements_by_css_selector('th')
    # get records
    ret = []
    for header in headers:
        ret.append(header.find_element_by_css_selector('a').text)
    return ret

def get_recs(headers):
    objs = []
    get_hdrs()
    table = driver.find_element_by_class_name('dnnGrid')        
    recs = table.find_elements_by_class_name('dnnGridItem')
    recs_alt = table.find_elements_by_class_name('dnnGridAltItem')
    for rec in recs:
        new_obj = {}
        tds = rec.find_elements_by_css_selector('td')
        new_obj['url'] = tds[0].find_element_by_css_selector('a').get_attribute('href')
        
        for td in zip(headers, tds):
            new_obj[td[0]] = td[1].text
        objs.append(new_obj)
    return objs
    #return [link.get_attribute('href') for link in links]

def get_next_button():
    pagination_buttons = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('a')
    next_button = pagination_buttons[-1]
    spans = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('span')
    if spans[0].text == 'First':
        return next_button
    elif spans[-1:].text == 'Last':
        return None

In [64]:
driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('span')

[<selenium.webdriver.remote.webelement.WebElement (session="453567e0-e68b-11e7-9e09-1b765e82268d", element=":wdc:1513887158744")>,
 <selenium.webdriver.remote.webelement.WebElement (session="453567e0-e68b-11e7-9e09-1b765e82268d", element=":wdc:1513887158745")>,
 <selenium.webdriver.remote.webelement.WebElement (session="453567e0-e68b-11e7-9e09-1b765e82268d", element=":wdc:1513887158746")>]

### Tie it all together..

In [67]:
def crawl():
    # Start with initial links..
    links = []
    links += rip_table()
    
    # Loop until it thinks it is done
    while True:
        nextbutton = get_next_button()
        if nextbutton:
            nextbutton.click()
            links += rip_table()
        else:
            return links


## Now we can download all the documents

In [68]:
all_links = crawl()

AttributeError: 'list' object has no attribute 'text'

In [None]:
all_links[3]

In [None]:
[save_file(link) for link in all_links]

In [None]:
print("We downloaded {} files".format(len(all_links)))

### Intermediate PDF -> txt not shown.. uninteresting.. here's a file

In [None]:
with open('/home/brian/usaf_instructiondestruction/afi_txt/afh10-222v14.txt') as fp:
    print(fp.read())