## Scraping of DOD pubs from DTIC


In [1]:
import os
import requests





In [2]:
import re

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(inpath, outpath=None):
    """Extract the text of a PDF with pdfminer and strip its newline characters.

    Args:
        inpath: Path of the PDF file to read.
        outpath: Optional path of a text file to write the result to.

    Returns:
        ``outpath`` when it was given (the text is written to that file),
        otherwise the extracted text itself.

    Raises:
        Whatever ``open`` or pdfminer raise for an unreadable or
        non-extractable PDF; the converter and buffer are still released.
    """
    rsrcmgr = PDFResourceManager()
    buffer = StringIO()
    device = TextConverter(rsrcmgr, buffer, codec='utf-8', laparams=LAParams())
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(inpath, 'rb') as fd:
                # An empty page set with maxpages=0 is pdfminer's way of asking
                # for every page of the document.
                pages = PDFPage.get_pages(fd, set(), maxpages=0, caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            # Close the converter even when parsing fails part-way through.
            device.close()
        text = buffer.getvalue()
    finally:
        buffer.close()

    # NOTE(review): newlines are dropped, not replaced by a space, so words that
    # were separated only by a line break end up joined - confirm this is wanted.
    output = text.replace('\n', '')
    if outpath:
        with open(outpath, 'w') as outf:
            outf.write(output)
        return outpath
    else:
        return output
    

In [3]:
import pickle

# Per-issuance-type configuration, keyed by the name passed to get_directives():
#   pickle  - file the scraped listing is cached in
#   dir     - directory the downloaded PDFs are stored in
#   txt_dir - directory the converted text files are stored in
#   url     - listing page the crawler starts from
locations = dict(
    dod_directives=dict(
        pickle='dod_dir.pickle',
        dir='dod_dir',
        txt_dir='dod_dir-txt',
        url='http://www.esd.whs.mil/Directives/issuances/dodd/',
    ),
    dod_instructions=dict(
        pickle='dod_instr.pickle',
        dir='dod_instr',
        txt_dir='dod_instr-txt',
        url='http://www.esd.whs.mil/Directives/issuances/dodi/',
    ),
    dod_manpubs=dict(
        pickle='dod_manpub.pickle',
        dir='dod_manpub',
        txt_dir='dod_manpub-txt',
        url='http://www.esd.whs.mil/Directives/issuances/dodm/',
    ),
    dod_dtms=dict(
        pickle='dod_dtm.pickle',
        dir='dod_dtm',
        txt_dir='dod_dtm-txt',
        url='http://www.esd.whs.mil/DD/DoD-Issuances/DTM/',
    ),
    dod_ais=dict(
        pickle='dod_ai.pickle',
        dir='dod_ai',
        txt_dir='dod_ai-txt',
        url='http://www.esd.whs.mil/Directives/issuances/admin_inst/',
    ),
)

#for loc in locations.items():
#    ensure_dir(loc['dir'])

def get_directives(directive_type):
    """Return the listing for one issuance type, scraping it only when needed.

    The cached pickle named in ``locations`` is used when it can be read.
    Otherwise the listing page is crawled with ``DirectiveThief`` and the
    result is cached for the next call.

    Args:
        directive_type: A key of ``locations`` (e.g. ``'dod_ais'``).

    Returns:
        The cached or freshly scraped listing data.

    Raises:
        KeyError: If ``directive_type`` is not a key of ``locations``.
    """
    # Resolve the configuration first so an unknown type fails straight away
    # instead of being reported as a missing cache.
    location = locations[directive_type]
    try:
        data = load_directive_data(directive_type)
    except (OSError, EOFError, pickle.UnpicklingError):
        # Only a missing or unreadable cache triggers a crawl; KeyboardInterrupt
        # and programming errors are no longer swallowed here.
        print("Didn't find data for {}.. retrieving..".format(directive_type))
        thief = DirectiveThief()
        data = thief.scarf_data(location['url'])
        save_directive_data(data, directive_type)
        return data
    print("Found existing data.. loading from {}".format(location['pickle']))
    return data
    


def save_directive_data(data, directive_type):
    """Pickle ``data`` into the cache file configured for ``directive_type``.

    The file name is ``locations[directive_type]['pickle']``. Returns the
    result of ``pickle.dump``.
    """
    cache_path = locations[directive_type]['pickle']
    with open(cache_path, 'wb') as cache_file:
        result = pickle.dump(data, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    return result
        
def load_directive_data(directive_type):
    """Read back the pickled listing cached for ``directive_type``.

    The file name is ``locations[directive_type]['pickle']``; errors raised
    while opening or unpickling it propagate to the caller.
    """
    cache_path = locations[directive_type]['pickle']
    with open(cache_path, 'rb') as cache_file:
        # SECURITY: unpickling can execute arbitrary code - only load cache
        # files that this notebook wrote itself.
        cached = pickle.load(cache_file)
    return cached

### We have to tell our crawler where to start - these are the links we want.

http://www.e-publishing.af.mil/#/?view=pubs&orgID=10141&catID=1&series=-1&modID=449&tabID=71

In [9]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class DirectiveThief(object):
    """Crawls a paginated issuance listing and collects one record per table row.

    Each record is a dict holding the row's cell texts keyed by the column
    header, plus a ``'url'`` entry with the link found in the first cell.
    """

    def __init__(self, phantomjs_path='/usr/local/phantomjs/bin/phantomjs'):
        """Remember where the PhantomJS executable lives.

        Args:
            phantomjs_path: Path of the PhantomJS binary handed to Selenium.
        """
        self.phantomjs_path = phantomjs_path

    def scarf_data(self, url):
        """Open ``url`` in a fresh browser and return everything ``crawl`` finds.

        The browser is always shut down afterwards, including when the crawl
        fails, so no PhantomJS process is left behind.
        """
        driver = self.setup_driver()
        try:
            driver.get(url)
            return self.crawl(driver)
        finally:
            driver.quit()

    def setup_driver(self):
        """Create the PhantomJS driver with a short implicit wait."""
        driver = webdriver.PhantomJS(executable_path=self.phantomjs_path)
        driver.implicitly_wait(0.25)
        return driver

    def crawl(self, driver):
        """Collect the rows of every result page reachable via the pager.

        Returns:
            A dict mapping each record's file name (last path segment of its
            URL, query string removed) to the record. Records sharing a file
            name overwrite one another, the later one winning.
        """
        # Start with the page the driver is already on.
        records = list(self.get_data(driver))

        # Keep paging until get_next_button reports there is nothing left.
        while True:
            next_button = self.get_next_button(driver)
            if not next_button:
                break
            next_button.click()
            records += self.get_data(driver)

        return {record['url'].split("/")[-1].split("?")[0]: record for record in records}

    def get_data(self, driver):
        """Turn the rows of the ``dnnGrid`` table on the current page into dicts."""
        table = driver.find_element_by_class_name('dnnGrid')
        table_rows = table.find_elements_by_css_selector('tr')
        if not table_rows:
            # Nothing to read, not even a header row.
            return []

        # The first row carries the column headers, each wrapped in a link.
        headers = [header.find_element_by_css_selector('a').text
                   for header in table_rows[0].find_elements_by_css_selector('th')]

        objs = []
        for row in table_rows[1:]:
            tds = row.find_elements_by_css_selector('td')
            new_obj = {'url': tds[0].find_element_by_css_selector('a').get_attribute('href')}
            for header, td in zip(headers, tds):
                new_obj[header] = td.text
            objs.append(new_obj)
        return objs

    def get_next_button(self, driver):
        """Return the pager link to click next, or ``None`` when crawling is done.

        The second-to-last link inside the ``PagingTable`` element is taken as
        the "next" control. Crawling is considered finished when there is no
        pager, when it has fewer than two links, or when its last ``<span>``
        reads ``'Last'``.
        """
        # The plural lookup returns an empty list instead of raising, so a
        # listing that fits on one page (no pager at all) ends the crawl cleanly.
        paging_tables = driver.find_elements_by_class_name('PagingTable')
        if not paging_tables:
            return None
        paging = paging_tables[0]

        pagination_buttons = paging.find_elements_by_css_selector('a')
        try:
            next_button = pagination_buttons[-2]
        except IndexError:
            return None

        # NOTE(review): presumably the pager renders 'Last' as a plain span
        # once the final page is showing - confirm against the live site.
        spans = paging.find_elements_by_css_selector('span')
        if spans[-1].text == 'Last':
            return None
        else:
            return next_button


### Here we tell the crawler how to discover more search results and detect when it is done.

In [10]:
from tqdm import tqdm


class DirectiveCorpus(object):
    """Downloads the PDFs of a scraped listing, converts them to text and loads them.

    ``datadict`` may be the dict produced by ``DirectiveThief.crawl`` (file
    name -> record) or a plain list of records; every record is a dict with at
    least a ``'url'`` entry. Methods taking a ``record`` accept either the
    record dict itself or its key/index in ``self.records``.
    """

    # Seconds to wait on the server before a download attempt is abandoned.
    REQUEST_TIMEOUT = 60

    def __init__(self, datadict, directive_type):
        """Store the records and make sure both working directories exist.

        Args:
            datadict: Scraped records (dict keyed by file name, or a list).
            directive_type: Key of ``locations`` selecting the directories.
        """
        self.records = datadict
        self.raw_data_dir = os.path.join(os.getcwd(), locations[directive_type]['dir'])
        self.txt_data_dir = os.path.join(os.getcwd(), locations[directive_type]['txt_dir'])
        self.ensure_dir(self.raw_data_dir)
        # The converter writes into this directory, so it has to exist as well.
        self.ensure_dir(self.txt_data_dir)

    def ensure_dir(self, dir_name):
        """Create ``dir_name`` (relative to the working directory) if it is missing.

        An existing directory is fine; real failures such as missing
        permissions now raise ``OSError`` instead of being ignored.
        """
        os.makedirs(os.path.join(os.getcwd(), dir_name), exist_ok=True)

    def _iter_records(self):
        """Return the record dicts as a list, whether records is a dict or a list."""
        if isinstance(self.records, dict):
            return list(self.records.values())
        return list(self.records)

    def _resolve(self, record):
        """Return the record dict for ``record``, which may already be one or be its key."""
        if isinstance(record, dict):
            return record
        return self.records[record]

    def _filename(self, record):
        """Return the file name of a record: last URL path segment, query string removed."""
        return self._resolve(record)['url'].split("/")[-1].split("?")[0]

    def assemble(self):
        """Download everything, convert everything, then return the loaded corpus."""
        self.download_all()
        self.convert_all()
        return self.assemble_to_corpus()

    def download_all(self):
        """Download the PDF of every record that is not on disk yet."""
        for record in tqdm(self._iter_records()):
            self.save_record(record)

    def convert_all(self):
        """Convert the downloaded PDF of every record to text."""
        for record in tqdm(self._iter_records()):
            self.convert_to_txt(record)

    def load_record(self, record):
        """Return the converted text of ``record`` with whitespace controls flattened.

        Raises:
            OSError: If the record's text file does not exist or cannot be read.
        """
        txt_fpath = os.path.join(self.txt_data_dir, self._filename(record))
        return self.load_file(txt_fpath)

    def load_file(self, fname):
        """Read ``fname`` and replace tabs, newlines and form feeds with spaces."""
        with open(fname, 'r') as myfile:
            return re.sub(r'[\t\n\r\x0b\x0c]', ' ', myfile.read())

    def assemble_to_corpus(self):
        """Attach each record's text under ``'contents'`` and return the records as a list.

        The record dicts are updated in place.
        """
        new_recs = []
        for record in tqdm(self._iter_records()):
            record['contents'] = self.load_record(record)
            new_recs.append(record)
        return new_recs

    def set_pdf_fpath(self, record, fpath):
        """Remember where the PDF of ``record`` was stored."""
        self._resolve(record)['pdf_path'] = fpath

    def set_txt_fpath(self, record, fpath):
        """Remember where the text version of ``record`` was stored."""
        self._resolve(record)['txt_path'] = fpath

    def convert_to_txt(self, record):
        """Convert the downloaded PDF of ``record`` to a text file of the same name.

        Conversion problems are printed and otherwise ignored so one broken
        document does not stop a batch run.
        """
        fname = self._filename(record)
        fpath = os.path.join(self.txt_data_dir, fname)
        pdf_path = os.path.join(self.raw_data_dir, fname)
        try:
            convert_pdf_to_txt(pdf_path, fpath)
        except Exception as e:
            print(e)

    def save_record(self, record):
        """Download the PDF of ``record`` unless it is already on disk.

        Download problems are printed and otherwise ignored. The data is
        written to a ``.part`` file first and only renamed once complete, so an
        interrupted download is retried next time instead of being mistaken
        for a finished file.
        """
        record = self._resolve(record)
        # Some URLs carry a version query string; it is not part of the file name.
        fpath = os.path.join(self.raw_data_dir, self._filename(record))
        if os.path.isfile(fpath):
            # Already downloaded.
            return

        part_path = fpath + '.part'
        try:
            response = requests.get(record['url'], stream=True,
                                    timeout=self.REQUEST_TIMEOUT)
            try:
                response.raise_for_status()
                with open(part_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)
            finally:
                # Streamed responses hold their connection until closed.
                response.close()
            os.replace(part_path, fpath)
        except Exception as e:
            print(e)
            try:
                os.remove(part_path)
            except OSError:
                pass


### Tie it all together..

In [11]:
ais = get_directives('dod_ais')

Found existing data.. loading from dod_ai.pickle


In [12]:
ais

[{'CH. #': ' ',
  'CH. Date': ' ',
  'EXP. Date': ' ',
  'Issuance #': 'AI 1',
  'Issuance Date': '10/19/2006',
  'Issuance Subject': 'Telecommunications Service for National Capital Region',
  'OPR': 'WHS',
  'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/ai/a001p.pdf'},
 {'CH. #': 'CH 1',
  'CH. Date': '4/20/2017',
  'EXP. Date': ' ',
  'Issuance #': 'AI 2',
  'Issuance Date': '2/22/2012',
  'Issuance Subject': 'Employment of Experts and Consultants',
  'OPR': 'WHS',
  'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/ai/ai002p.pdf'},
 {'CH. #': ' ',
  'CH. Date': ' ',
  'EXP. Date': ' ',
  'Issuance #': 'AI 8',
  'Issuance Date': '12/16/2016',
  'Issuance Subject': 'Disciplinary and Adverse Actions',
  'OPR': 'WHS',
  'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/ai/AI08_2016.pdf'},
 {'CH. #': 'CH 1',
  'CH. Date': '7/6/2017',
  'EXP. Date': ' ',
  'Issuance #': 'AI 9',
  'Issuance Date': '11/6/2013',
  'Issuance Subject': 'Process

In [6]:
# Load the listing for every issuance type configured in `locations`.
# Each call reads the cached pickle when there is one and otherwise starts a
# live crawl of the listing page, which can take a long time.
instr = get_directives('dod_instructions')
directives = get_directives('dod_directives')
manpubs = get_directives('dod_manpubs')
dtm = get_directives('dod_dtms')
ais = get_directives('dod_ais')

Didn't find data for dod_instructions.. retrieving..


KeyboardInterrupt: 

## Now we can download all the documents

In [None]:
# Wrap the administrative-instruction and DTM listings in corpus objects.
# Constructing a DirectiveCorpus creates its download directory if needed;
# nothing is downloaded until download_all() / assemble() is called.
ais_corpus = DirectiveCorpus(ais,'dod_ais')
dtm_corpus = DirectiveCorpus(get_directives('dod_dtms'),'dod_dtms')

In [None]:
dtm_corpus.download_all()