## Scraping DoD publications (directives and instructions) from DTIC


In [1]:
import os
import requests



def ensure_dir(dir_name):
    """Make sure ``dir_name`` exists as a directory under the current working directory.

    Missing intermediate directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Args:
        dir_name: Directory path, relative to ``os.getcwd()``.

    Raises:
        OSError: If the directory cannot be created for any reason other than
            it already existing (e.g. permissions, or a regular file is in
            the way).
    """
    target_path = os.path.join(os.getcwd(), dir_name)
    try:
        os.makedirs(target_path)
    except OSError:
        # "Already exists" is fine; anything else would only surface later
        # as a confusing failure when writing into the directory.
        if not os.path.isdir(target_path):
            raise
    


NameError: name 'locations' is not defined

In [None]:
import pickle

# Per-document-type settings, keyed by the name passed to get_directives():
#   'pickle' - file the scraped listing is cached in
#   'dir'    - directory name associated with this document type
#   'url'    - listing page the scraper starts from
locations = {
    'dod_directives': {
        'pickle': 'dod_dir.pickle',
        'dir': 'dod_dir',
        'url': 'http://www.esd.whs.mil/Directives/issuances/dodd/',
    },
    'dod_instructions': {
        'pickle': 'dod_instr.pickle',
        'dir': 'dod_instr',
        'url': 'http://www.esd.whs.mil/Directives/issuances/dodi/',
    },
}

#for loc in locations.items():
#    ensure_dir(loc['dir'])

def get_directives(directive_type):
    """Return the scraped listing for ``directive_type``, using the pickle cache if present.

    The cached pickle named in ``locations[directive_type]['pickle']`` is
    loaded when it can be read. Otherwise the listing is scraped from
    ``locations[directive_type]['url']`` with ``DirectiveThief``, written to
    the cache, and returned.

    Args:
        directive_type: A key of ``locations`` (e.g. ``'dod_directives'``).

    Returns:
        Whatever ``load_directive_data`` / ``DirectiveThief.scarf_data`` produce.

    Raises:
        KeyError: If ``directive_type`` is not a key of ``locations``.
    """
    try:
        data = load_directive_data(directive_type)
    except (IOError, OSError, EOFError, pickle.UnpicklingError):
        # Only a missing, unreadable or truncated cache counts as a cache miss.
        # A bare ``except`` here would also swallow NameError and
        # KeyboardInterrupt and silently start an expensive re-scrape.
        print("Didn't find data.. retrieving..")
        thief = DirectiveThief()
        data = thief.scarf_data(locations[directive_type]['url'])
        save_directive_data(data, directive_type)
    else:
        print("Found existing data.. loading from {}".format(locations[directive_type]['pickle']))
    return data
    


def save_directive_data(data, directive_type):
    """Pickle ``data`` into the cache file configured for ``directive_type``.

    The destination is ``locations[directive_type]['pickle']``; it is opened
    in binary write mode, so an existing file is overwritten. Returns None.
    """
    pickle_path = locations[directive_type]['pickle']
    with open(pickle_path, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_directive_data(directive_type):
    """Unpickle and return the cached data for ``directive_type``.

    Reads ``locations[directive_type]['pickle']`` in binary mode. Errors from
    ``open`` or ``pickle.load`` (for example a missing file) propagate to the
    caller.
    """
    pickle_path = locations[directive_type]['pickle']
    with open(pickle_path, 'rb') as source:
        cached = pickle.load(source)
    return cached

In [5]:
def save_file(doctype, url, timeout=30):
    """Download ``url`` into the directory configured for ``doctype``.

    The file is named after the last path segment of ``url`` and stored under
    ``<cwd>/<locations[doctype]['dir']>/``. A file that already exists is
    skipped. Download and write failures are printed rather than raised so
    that a batch of downloads keeps going.

    Args:
        doctype: A key of ``locations`` (e.g. ``'dod_instructions'``).
        url: Address of the document to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot hang the notebook indefinitely.
    """
    fname = url.split("/")[-1]
    if not fname:
        # A URL ending in "/" has no file name to save under.
        print("Skipping {} (no file name in URL)".format(url))
        return

    # locations[doctype] is a dict of settings; the directory name is its 'dir' entry.
    dir_name = locations[doctype]['dir']
    fpath = os.path.join(os.getcwd(), dir_name, fname)

    # check if already downloaded
    if os.path.isfile(fpath):
        print("Skipping {}".format(fpath))
        return

    # Stream into a temporary name first so an interrupted download is never
    # mistaken for a complete file by the "already downloaded" check above.
    partial_path = fpath + '.part'
    try:
        ensure_dir(dir_name)
        response = requests.get(url, stream=True, timeout=timeout)
        try:
            response.raise_for_status()
            with open(partial_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    handle.write(block)
        finally:
            # Release the streamed connection even if writing fails.
            response.close()
        os.rename(partial_path, fpath)
    except (requests.RequestException, OSError, IOError) as e:
        print(e)
        if os.path.exists(partial_path):
            os.remove(partial_path)

### We have to tell our crawler where to start - these are the links we want.

http://www.e-publishing.af.mil/#/?view=pubs&orgID=10141&catID=1&series=-1&modID=449&tabID=71

In [6]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class DirectiveThief(object):
    """Scrape a paginated listing table with a headless PhantomJS browser.

    The page is expected to contain a table with CSS class ``dnnGrid`` (one
    header row followed by data rows) and, when there is more than one page,
    an element with CSS class ``PagingTable`` holding the pager controls.

    Args:
        executable_path: Location of the PhantomJS binary used by
            ``setup_driver``.
    """

    def __init__(self, executable_path='/usr/local/phantomjs/bin/phantomjs'):
        self.executable_path = executable_path

    def scarf_data(self, url):
        """Open ``url`` and return the rows from every page of the listing.

        The browser is always shut down afterwards, including when the crawl
        raises, so no PhantomJS process is left behind.
        """
        driver = self.setup_driver()
        try:
            driver.get(url)
            return self.crawl(driver)
        finally:
            driver.quit()

    def setup_driver(self):
        """Create the PhantomJS driver with a short implicit wait."""
        driver = webdriver.PhantomJS(executable_path=self.executable_path)
        driver.implicitly_wait(0.25)
        return driver

    def crawl(self, driver):
        """Collect rows from the current page, then follow "next" until it runs out.

        Returns:
            A list of row dicts as produced by ``get_data``, in page order.
        """
        # Start with initial links..
        links = list(self.get_data(driver))

        # Keep paging until get_next_button reports there is nothing to follow.
        while True:
            next_button = self.get_next_button(driver)
            if next_button is None:
                return links
            next_button.click()
            links += self.get_data(driver)

    def get_data(self, driver):
        """Read the ``dnnGrid`` table on the current page.

        Returns:
            One dict per data row. ``'url'`` holds the href of the link in the
            row's first cell; every other key is a header's link text mapped
            to the text of the cell in the same column.
        """
        table = driver.find_element_by_class_name('dnnGrid')
        table_rows = table.find_elements_by_css_selector('tr')
        if not table_rows:
            return []

        # first is headers
        headers = [
            header.find_element_by_css_selector('a').text
            for header in table_rows[0].find_elements_by_css_selector('th')
        ]

        objs = []
        for row in table_rows[1:]:
            tds = row.find_elements_by_css_selector('td')
            if not tds:
                # Rows without cells carry no record; indexing tds[0] would crash.
                continue
            new_obj = {'url': tds[0].find_element_by_css_selector('a').get_attribute('href')}
            for header_text, td in zip(headers, tds):
                new_obj[header_text] = td.text
            objs.append(new_obj)
        return objs

    def get_next_button(self, driver):
        """Return the pager's "next" link, or None when there is no further page.

        The last page is recognised by the final ``span`` in the pager reading
        ``'Last'``. Otherwise the second-to-last link is taken to be "next".
        A missing pager, or one with fewer than two links, also means there
        is nothing to follow.
        """
        # find_elements_* returns an empty list instead of raising when a
        # single-page listing has no pager at all.
        paging_tables = driver.find_elements_by_class_name('PagingTable')
        if not paging_tables:
            return None
        pager = paging_tables[0]

        spans = pager.find_elements_by_css_selector('span')
        if spans and spans[-1].text == 'Last':
            return None

        pagination_buttons = pager.find_elements_by_css_selector('a')
        if len(pagination_buttons) < 2:
            return None
        return pagination_buttons[-2]


### Here we tell the crawler how to discover more search results and detect when it is done.

In [7]:
# Load the DoD instructions listing (from the pickle cache, or by scraping on a cache miss).
instr = get_directives('dod_instructions')



Found existing data.. loading from dod_instr.pickle


### Tie it all together..

In [None]:
# Load the DoD directives listing (from the pickle cache, or by scraping on a cache miss).
directives = get_directives('dod_directives')

Found existing data.. loading from dod_dir.pickle
Didn't find data.. retrieving..


## Now we can download all the documents

In [None]:
# NOTE(review): `data` is never assigned at notebook scope in any visible cell, so this
# fails with NameError on a fresh kernel. Presumably `instr` or `directives` was meant — confirm.
data[5]

In [None]:
#[save_file(link) for link in all_links]

In [None]:
# NOTE(review): `all_links` is not defined in any visible cell, so this fails with NameError
# on a fresh kernel. Presumably it was the list of scraped URLs — confirm and define it first.
print("We downloaded {} files".format(len(all_links)))

### The intermediate PDF → text conversion is not shown (it is uninteresting); here is one converted file

In [None]:
# Print the full text of one converted document.
# NOTE(review): this is an absolute path on one machine; the cell will not run elsewhere.
with open('/home/brian/usaf_instructiondestruction/afi_txt/afh10-222v14.txt') as fp:
    print(fp.read())