## Scraping of DOD pubs from DTIC


In [1]:
import os
import requests
import re

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(inpath, outpath=None):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    with open(inpath, 'rb') as fd:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        maxpages = 0
        caching = True
        pagenos=set()
        for page in PDFPage.get_pages(fd, pagenos, maxpages=maxpages,caching=caching, check_extractable=True):
            interpreter.process_page(page)
        device.close()
        str = retstr.getvalue()
        retstr.close()
        output = str.replace('\n', '')
    if outpath:
        with open(outpath, 'w') as outf:
            outf.write(output)
        return outpath
    else:
        return output
    

In [2]:
import pickle

locations = {'dod_directives': {'pickle':'dod_dir.pickle',
                                'dir':'dod_dir',
                                'txt_dir':'dod_dir-txt',
                                'url':'http://www.esd.whs.mil/Directives/issuances/dodd/'},
            'dod_instructions':{'pickle':'dod_instr.pickle',
                                'dir':'dod_instr',
                                'txt_dir':'dod_instr-txt',
                                'url':'http://www.esd.whs.mil/Directives/issuances/dodi/'},
             'dod_manpubs':{'pickle':'dod_manpub.pickle',
                               'dir':'dod_manpub',
                               'txt_dir':'dod_manpub-txt',
                               'url':'http://www.esd.whs.mil/Directives/issuances/dodm/'},
             'dod_dtms':{'pickle':'dod_dtm.pickle',
                        'dir':'dod_dtm',
                        'txt_dir':'dod_dtm-txt',
                        'url':'http://www.esd.whs.mil/DD/DoD-Issuances/DTM/'},
             'dod_ais':{'pickle':'dod_ai.pickle',
                       'dir':'dod_ai',
                       'txt_dir':'dod_ai-txt',
                       'url':'http://www.esd.whs.mil/Directives/issuances/admin_inst/'}
            }


def get_directives(directive_type):
    if os.path.exists(locations[directive_type]['pickle']):
        print("Found existing data.. loading from {}".format(locations[directive_type]['pickle']))    
        return load_directive_data(directive_type)
    else:
        print("Didn't find data for {}.. retrieving..".format(directive_type))
        thief = DirectiveThief()
        data = thief.scarf_data(locations[directive_type]['url'])
        save_directive_data(data,directive_type)
        return data

def save_directive_data(data, directive_type):
    with open(locations[directive_type]['pickle'],'wb') as fp:
        return pickle.dump(data, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_directive_data(directive_type):
    with open(locations[directive_type]['pickle'],'rb') as fp:
        return pickle.load(fp)    

### We have to tell our crawler where to start - these are the links we want.

http://www.e-publishing.af.mil/#/?view=pubs&orgID=10141&catID=1&series=-1&modID=449&tabID=71

In [3]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class DirectiveThief(object):
    def __init__(self):
        pass
    
    def scarf_data(self, url):
        driver = self.setup_driver()
        driver.get(url)
        return self.crawl(driver)
    
    def setup_driver(self):
        driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
        driver.implicitly_wait(0.25)
        return driver

    def crawl(self, driver):
        # Start with initial links..
        records = []
        records += self.get_data(driver)

        # Loop until it thinks it is done
        while True:
            nextbutton = self.get_next_button(driver)
            if nextbutton:
                nextbutton.click()
                records += self.get_data(driver)
            else:
                return  {record['url'].split("/")[-1].split("?")[0]:record for record in records}
            

    def get_data(self, driver):
        objs = []

        table = driver.find_element_by_class_name('dnnGrid')        
        table_rows = table.find_elements_by_css_selector('tr')
        # first is headers
        headers = []
        for header in table_rows[0].find_elements_by_css_selector('th'):
            headers.append(header.find_element_by_css_selector('a').text)

        for row in table_rows[1:]:
            new_obj = {}
            tds = row.find_elements_by_css_selector('td')
            new_obj['url'] = tds[0].find_element_by_css_selector('a').get_attribute('href')
            new_obj['fname'] = new_obj['url'].split("/")[-1].split("?")[0]

            for td in zip(headers, tds):
                new_obj[td[0]] = td[1].text
            objs.append(new_obj)
        return objs
    
    def get_next_button(self, driver):
        pagination_buttons = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('a')
        
        try:
            next_button = pagination_buttons[-2]
            spans = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('span')
        except IndexError:
            return None
        if spans[-1].text == 'Last':
            return None
        else:
            return next_button


### Here we tell the crawler how to discover more search results and detect when it is done.

In [4]:
from tqdm import tqdm


class DirectiveCorpus(object):
    def __init__(self, datadict, directive_type):
        self.records = datadict
        self.raw_data_dir = os.path.join(os.getcwd(),locations[directive_type]['dir'])
        self.txt_data_dir = os.path.join(os.getcwd(),locations[directive_type]['txt_dir'])
        self.ensure_dir(self.raw_data_dir)
        
        
    def ensure_dir(self, dir_name):
        INSTR_PATH = os.path.join(os.getcwd(), dir_name)
        try:
            os.makedirs(INSTR_PATH)
        except:
            pass
    
    def assemble(self):
        self.download_all()
        self.convert_all()
        self.load_contents()
        return self.as_list()
    
    def download_all(self):
        for k,v in tqdm(self.records.items()):
            self.save_record(k,v)

    def convert_all(self):
        for k,v in tqdm(self.records.items(),desc='Converting to Txt'):
            self.convert_to_txt(k,v)            

    def load_record(self, record):
        return self.load_file(record['txt_path'])
        
        
    def load_file(self, fname):       
        with open(fname, 'r') as myfile:
            contents = re.sub(r'[\t\n\r\x0b\x0c]',' ', myfile.read())
            return contents
            #return re.sub("\s+",' ', contents)
        
    def load_contents(self):
        for k,v in tqdm(self.records.items(),desc='Loading'):
            try:
                self.records[k]['contents'] = self.load_record(v)
            except KeyError as e:
                print("error loading {} - contents are {}".format(k,v))
                
    
    def as_list(self):
        thelist = []
        for k,v in tqdm(self.records.items(), desc='Exporting'):
            thelist.append(v)
        return thelist

                         
    def set_pdf_fpath(self, record_id, fpath):
        self.records[record_id]['pdf_path']= fpath

    def set_txt_fpath(self, record_id, fpath):
        self.records[record_id]['txt_path']= fpath
        
        
    def convert_to_txt(self, record_id, record):
        txt_name = record['fname'].split(".pdf")[0] + ".txt"
        fpath = os.path.join(self.txt_data_dir, txt_name)
        pdf_path = os.path.join(self.raw_data_dir, record['fname'])

        if not os.path.exists(fpath):
            try:
                try:
                    os.makedirs(self.txt_data_dir)
                except FileExistsError:
                    pass
                convert_pdf_to_txt(pdf_path, fpath)
                self.set_txt_fpath(record_id, fpath)
            except Exception as e:
                print(e)
        else:
            self.set_txt_fpath(record_id, fpath)
            pass
        
            #print("Skipping {}".format(fpath))
        
    def save_record(self, record_id, record):
        
        fpath = os.path.join(self.raw_data_dir, record['fname'])
        #print(fpath)
        # check if already downloaded
        if not os.path.isfile(fpath):
            try:
                response = requests.get(record['url'], stream=True)
                response.raise_for_status()
                with open(fpath, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)
                #self.set_pdf_fpath(record, fpath)
            except Exception as e:
                print(e)
                pass
        else:
            pass
            #print("Skipping {}".format(fpath))


### Tie it all together..

In [5]:

def save_pickle(data, path, fname):
    if not os.path.isdir(path):
        os.makedirs(path)
    fpath = os.path.join(path, fname)
    with open(fpath,'wb') as fp:
        return pickle.dump(data, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_pickle(fpath):
    with open(fpath,'rb') as fp:
        return pickle.load(fp)
    
    

In [6]:
instr = get_directives('dod_instructions')
instr_corpus = DirectiveCorpus(instr,'dod_instructions').assemble()
save_pickle(instr_corpus,"/home/brian/corpora","dod_instructions.pickle")

Didn't find data for dod_instructions.. retrieving..


 26%|██▌       | 198/772 [00:59<02:53,  3.31it/s]

[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_instr/'


 29%|██▉       | 222/772 [01:22<03:24,  2.70it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 29%|██▉       | 223/772 [01:42<04:12,  2.18it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 29%|██▉       | 226/772 [02:02<04:55,  1.85it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 30%|██▉       | 231/772 [02:22<05:33,  1.62it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 32%|███▏      | 250/772 [02:52<06:00,  1.45it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 33%|███▎      | 253/772 [03:12<06:34,  1.31it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 33%|███▎      | 258/772 [03:32<07:03,  1.21it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 34%|███▍      | 263/772 [03:52<07:29,  1.13it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 37%|███▋      | 288/772 [04:12<07:04,  1.14it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 41%|████      | 318/772 [04:32<06:29,  1.17it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 44%|████▍     | 340/772 [04:52<06:11,  1.16it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 44%|████▍     | 341/772 [05:12<06:35,  1.09it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▍     | 344/772 [05:32<06:53,  1.03it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▍     | 345/772 [05:52<07:16,  1.02s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▌     | 348/772 [06:12<07:34,  1.07s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 62%|██████▏   | 476/772 [07:02<04:22,  1.13it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 62%|██████▏   | 477/772 [07:22<04:33,  1.08it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 63%|██████▎   | 486/772 [07:25<04:22,  1.09it/s]

404 Client Error: Not Found for url: http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/510581p.pdf


 63%|██████▎   | 489/772 [07:42<04:27,  1.06it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 70%|██████▉   | 540/772 [08:12<03:31,  1.10it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 73%|███████▎  | 564/772 [08:32<03:09,  1.10it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 73%|███████▎  | 567/772 [08:52<03:12,  1.06it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 772/772 [09:58<00:00,  1.29it/s]
Converting to Txt:  25%|██▌       | 196/772 [09:54<29:05,  3.03s/it]

[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_instr/'


Converting to Txt:  29%|██▊       | 221/772 [11:09<27:50,  3.03s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200016v1_dodi_2016.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200016v2_dodi_2016.pdf'


Converting to Txt:  29%|██▉       | 225/772 [11:12<27:15,  2.99s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200022p.pdf'


Converting to Txt:  30%|██▉       | 230/772 [11:20<26:42,  2.96s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200027p.pdf'


Converting to Txt:  32%|███▏      | 249/772 [11:59<25:10,  2.89s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300008p.pdf'


Converting to Txt:  33%|███▎      | 252/772 [12:04<24:54,  2.87s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300013_dodi_2017.pdf'


Converting to Txt:  33%|███▎      | 256/772 [12:09<24:31,  2.85s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300205p.pdf'


Converting to Txt:  34%|███▍      | 262/772 [12:23<24:08,  2.84s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O302043p.pdf'


Converting to Txt:  37%|███▋      | 287/772 [13:20<22:33,  2.79s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O311507p.pdf'


Converting to Txt:  41%|████      | 317/772 [14:14<20:26,  2.70s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O330004p.pdf'


Converting to Txt:  44%|████▍     | 340/772 [14:38<18:36,  2.58s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O360003p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O360702p.pdf'


Converting to Txt:  44%|████▍     | 343/772 [14:39<18:20,  2.56s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O371002p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O371003_dodi_2017.pdf'


Converting to Txt:  45%|████▍     | 347/772 [14:42<18:00,  2.54s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O378001p.pdf'


Converting to Txt:  59%|█████▉    | 456/772 [20:19<14:05,  2.67s/it]

Unknown operator: '\x06'


Converting to Txt:  61%|██████▏   | 473/772 [20:55<13:13,  2.65s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O510094p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O510095p.pdf'


Converting to Txt:  63%|██████▎   | 484/772 [21:07<12:34,  2.62s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/510581p.pdf'


Converting to Txt:  63%|██████▎   | 487/772 [21:12<12:24,  2.61s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O511011p.pdf'


Converting to Txt:  70%|██████▉   | 539/772 [22:52<09:53,  2.55s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O521063p.pdf'


Converting to Txt:  71%|███████   | 545/772 [23:04<09:36,  2.54s/it]

Unknown operator: '\x06'


Converting to Txt:  71%|███████   | 546/772 [23:04<09:33,  2.54s/it]

Unknown operator: '\x06'


Converting to Txt:  73%|███████▎  | 563/772 [23:44<08:48,  2.53s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O524021p.pdf'


Converting to Txt:  73%|███████▎  | 565/772 [23:46<08:42,  2.52s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O524024p.pdf'


Converting to Txt:  75%|███████▌  | 580/772 [24:05<07:58,  2.49s/it]

Unknown operator: '\x06'


Converting to Txt:  80%|███████▉  | 616/772 [25:18<06:24,  2.46s/it]

Unknown operator: '\x06'


Converting to Txt:  81%|████████▏ | 628/772 [25:42<05:53,  2.46s/it]

Unknown operator: '\x06'


Converting to Txt:  82%|████████▏ | 636/772 [25:53<05:32,  2.44s/it]

Unknown operator: '\x06'


Converting to Txt: 100%|██████████| 772/772 [34:25<00:00,  2.68s/it]
Loading: 100%|██████████| 772/772 [00:00<00:00, 2875.12it/s]
Exporting: 100%|██████████| 772/772 [00:00<00:00, 815617.81it/s]

error loading  - contents are {'url': 'http://www.esd.whs.mil/Directives/issuances/140025/', 'fname': '', 'Issuance #': 'DoDI 1400.25', 'Issuance Date': ' ', 'Issuance Subject': 'DoD Civilian Personnel Management System', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P&R)'}
error loading O200016v1_dodi_2016.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O200016v1_dodi_2016.pdf', 'fname': 'O200016v1_dodi_2016.pdf', 'Issuance #': 'DoDI O-2000.16 Volume 1', 'Issuance Date': '11/17/2016', 'Issuance Subject': 'DoD Antiterrorism (AT) Program Implementation: DoD At Standards\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': 'CH 1', 'CH. Date': '5/5/2017', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P) 703-571-9255'}
error loading O200016v2_dodi_2016.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O200016v2_dodi_2016.pdf', 'fname': 'O200




In [7]:
ais = get_directives('dod_ais')
ais_corpus = DirectiveCorpus(ais,'dod_ais').assemble()
save_pickle(ais_corpus,"/home/brian/corpora","dod_ais.pickle")

  0%|          | 0/50 [00:00<?, ?it/s]

Found existing data.. loading from dod_ai.pickle


100%|██████████| 50/50 [00:14<00:00,  3.55it/s]
Converting to Txt: 100%|██████████| 50/50 [00:00<00:00, 38374.24it/s]
Loading: 100%|██████████| 50/50 [00:00<00:00, 2185.54it/s]
Exporting: 100%|██████████| 50/50 [00:00<00:00, 145939.60it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_ai/a116p_FOUO.pdf'
error loading a116p_FOUO.pdf - contents are {'url': 'https://directives.whs.mil/issuances/a116p_FOUO.pdf', 'fname': 'a116p_FOUO.pdf', 'Issuance #': 'AI O-116', 'Issuance Date': '10/21/2014', 'Issuance Subject': 'Post-balanced Survivability Assessment (P-BSA) Program\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'OPR': 'DCMO\n703-614-8888'}





In [8]:
manpubs = get_directives('dod_manpubs')
manpubs_corpus = DirectiveCorpus(manpubs,'dod_manpubs').assemble()
save_pickle(manpubs_corpus,"/home/brian/corpora","dod_manpubs.pickle")

  0%|          | 0/175 [00:00<?, ?it/s]

Found existing data.. loading from dod_manpub.pickle


  2%|▏         | 3/175 [00:19<19:04,  6.65s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 11%|█▏        | 20/175 [00:29<03:51,  1.50s/it]

[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_manpub/'


 14%|█▎        | 24/175 [00:49<05:14,  2.08s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 51%|█████▏    | 90/175 [01:30<01:25,  1.00s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 71%|███████   | 124/175 [02:00<00:49,  1.03it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 98%|█████████▊| 172/175 [02:30<00:02,  1.15it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 175/175 [02:31<00:00,  1.15it/s]
Converting to Txt:   1%|          | 2/175 [00:25<36:09, 12.54s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O100021_dodm_2017.pdf'


Converting to Txt:  11%|█         | 19/175 [03:22<27:44, 10.67s/it]

[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_manpub/'


Converting to Txt:  13%|█▎        | 22/175 [03:38<25:21,  9.94s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O302044M.pdf'


Converting to Txt:  51%|█████     | 89/175 [15:25<14:54, 10.40s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O465011_dodm_2017.pdf'


Converting to Txt:  70%|███████   | 123/175 [20:24<08:37,  9.96s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O520513p.pdf'


Converting to Txt:  74%|███████▎  | 129/175 [21:25<07:38,  9.97s/it]

Unknown operator: '\x03'


Converting to Txt:  98%|█████████▊| 171/175 [31:00<00:43, 10.88s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O853001M.pdf'


Converting to Txt: 100%|██████████| 175/175 [31:38<00:00, 10.85s/it]
Loading: 100%|██████████| 175/175 [00:00<00:00, 950.85it/s] 
Exporting: 100%|██████████| 175/175 [00:00<00:00, 533432.56it/s]


error loading O100021_dodm_2017.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O100021_dodm_2017.pdf', 'fname': 'O100021_dodm_2017.pdf', 'Issuance #': 'DoDM O-1000.21', 'Issuance Date': '3/6/2017', 'Issuance Subject': 'Passport And Passport Agent Services\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'DCMO 703-601-6128'}
error loading  - contents are {'url': 'http://www.esd.whs.mil/Directives/issuances/414025m/', 'fname': '', 'Issuance #': 'DoD 4140.25-M\nVolume 1-3\n(Date Varies)', 'Issuance Date': ' ', 'Issuance Subject': 'DoD Management Of Bulk Petroleum Products, Natural Gas, And Coal', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(AT&L) 703-697-2525'}
error loading O302044M.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O302044M.pdf', 'fname': 'O302044M.pdf', 

In [9]:
dtm = get_directives('dod_dtms')
dtms_corpus = DirectiveCorpus(dtm,'dod_dtms').assemble()
save_pickle(dtms_corpus,"/home/brian/corpora","dod_dtms.pickle")

Didn't find data for dod_dtms.. retrieving..


100%|██████████| 17/17 [00:09<00:00,  1.80it/s]
Converting to Txt: 100%|██████████| 17/17 [00:27<00:00,  1.64s/it]
Loading: 100%|██████████| 17/17 [00:00<00:00, 3995.69it/s]
Exporting: 100%|██████████| 17/17 [00:00<00:00, 180514.35it/s]


In [10]:
directives = get_directives('dod_directives')
dir_corpus = DirectiveCorpus(directives,'dod_directives').assemble()
save_pickle(dir_corpus,"/home/brian/corpora","dod_directives.pickle")

Didn't find data for dod_directives.. retrieving..


 30%|██▉       | 94/314 [00:00<00:00, 546.34it/s]

404 Client Error: Not Found for url: http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf


100%|██████████| 314/314 [00:20<00:00, 15.42it/s]
Converting to Txt:   0%|          | 0/314 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


Converting to Txt:  30%|███       | 95/314 [02:01<04:40,  1.28s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_dir/332505p.pdf'


Converting to Txt:  41%|████▏     | 130/314 [02:47<03:56,  1.29s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_dir/O510019p.pdf'


Converting to Txt: 100%|██████████| 314/314 [07:10<00:00,  1.37s/it]
Loading: 100%|██████████| 314/314 [00:00<00:00, 4601.57it/s]
Exporting: 100%|██████████| 314/314 [00:00<00:00, 681330.29it/s]

error loading 332505p.pdf - contents are {'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf', 'fname': '332505p.pdf', 'Issuance #': 'DoDD C-3325.05', 'Issuance Date': '2/25/2000', 'Issuance Subject': 'Classified Title', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P) 703-571-9255'}
error loading O510019p.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O510019p.pdf', 'fname': 'O510019p.pdf', 'Issuance #': 'DoDD O-5100.19', 'Issuance Date': '11/12/2014', 'Issuance Subject': 'Critical Information Communications (CRITICOMM) System\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(I)'}



