## Scraping of DOD pubs from DTIC


In [1]:
import os
import requests
import re

from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def convert_pdf_to_txt(inpath, outpath=None):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    with open(inpath, 'rb') as fd:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        maxpages = 0
        caching = True
        pagenos=set()
        for page in PDFPage.get_pages(fd, pagenos, maxpages=maxpages,caching=caching, check_extractable=True):
            interpreter.process_page(page)
        device.close()
        str = retstr.getvalue()
        retstr.close()
        output = str.replace('\n', '')
    if outpath:
        with open(outpath, 'w') as outf:
            outf.write(output)
        return outpath
    else:
        return output
    

In [16]:
import pickle

locations = {'dod_directives': {'pickle':'dod_dir.pickle',
                                'dir':'dod_dir',
                                'txt_dir':'dod_dir-txt',
                                'url':'http://www.esd.whs.mil/Directives/issuances/dodd/'},
            'dod_instructions':{'pickle':'dod_instr.pickle',
                                'dir':'dod_instr',
                                'txt_dir':'dod_instr-txt',
                                'url':'http://www.esd.whs.mil/Directives/issuances/dodi/'},
             'dod_manpubs':{'pickle':'dod_manpub.pickle',
                               'dir':'dod_manpub',
                               'txt_dir':'dod_manpub-txt',
                               'url':'http://www.esd.whs.mil/Directives/issuances/dodm/'},
             'dod_dtms':{'pickle':'dod_dtm.pickle',
                        'dir':'dod_dtm',
                        'txt_dir':'dod_dtm-txt',
                        'url':'http://www.esd.whs.mil/DD/DoD-Issuances/DTM/'},
             'dod_ais':{'pickle':'dod_ai.pickle',
                       'dir':'dod_ai',
                       'txt_dir':'dod_ai-txt',
                       'url':'http://www.esd.whs.mil/Directives/issuances/admin_inst/'},
             'dod_issuances': {'pickle':'dod_issuances.pickle',
                        'dir':'dod_issuances',
                        'txt_dir':'dod_issuances-txt',
                        'url':'http://www.esd.whs.mil/DD/DoD-Issuances/140025/'}
            }


def get_directives(directive_type):
    if os.path.exists(locations[directive_type]['pickle']):
        print("Found existing data.. loading from {}".format(locations[directive_type]['pickle']))    
        return load_directive_data(directive_type)
    else:
        print("Didn't find data for {}.. retrieving..".format(directive_type))
        thief = DirectiveThief()
        data = thief.scarf_data(locations[directive_type]['url'])
        save_directive_data(data,directive_type)
        return data

def save_directive_data(data, directive_type):
    with open(locations[directive_type]['pickle'],'wb') as fp:
        return pickle.dump(data, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_directive_data(directive_type):
    with open(locations[directive_type]['pickle'],'rb') as fp:
        return pickle.load(fp)    

### We have to tell our crawler where to start - these are the links we want.

http://www.e-publishing.af.mil/#/?view=pubs&orgID=10141&catID=1&series=-1&modID=449&tabID=71

In [3]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class DirectiveThief(object):
    def __init__(self):
        pass
    
    def scarf_data(self, url):
        driver = self.setup_driver()
        driver.get(url)
        return self.crawl(driver)
    
    def setup_driver(self):
        driver = webdriver.PhantomJS(executable_path='/usr/local/phantomjs/bin/phantomjs')
        driver.implicitly_wait(0.25)
        return driver

    def crawl(self, driver):
        # Start with initial links..
        records = []
        records += self.get_data(driver)

        # Loop until it thinks it is done
        while True:
            nextbutton = self.get_next_button(driver)
            if nextbutton:
                nextbutton.click()
                records += self.get_data(driver)
            else:
                return  {record['url'].split("/")[-1].split("?")[0]:record for record in records}
            

    def get_data(self, driver):
        objs = []

        table = driver.find_element_by_class_name('dnnGrid')        
        table_rows = table.find_elements_by_css_selector('tr')
        # first is headers
        headers = []
        for header in table_rows[0].find_elements_by_css_selector('th'):
            headers.append(header.find_element_by_css_selector('a').text)

        for row in table_rows[1:]:
            new_obj = {}
            tds = row.find_elements_by_css_selector('td')
            new_obj['url'] = tds[0].find_element_by_css_selector('a').get_attribute('href')
            new_obj['fname'] = new_obj['url'].split("/")[-1].split("?")[0]

            for td in zip(headers, tds):
                new_obj[td[0]] = td[1].text
            objs.append(new_obj)
        return objs
    
    def get_next_button(self, driver):
        pagination_buttons = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('a')
        
        try:
            next_button = pagination_buttons[-2]
            spans = driver.find_element_by_class_name('PagingTable').find_elements_by_css_selector('span')
        except IndexError:
            return None
        if spans[-1].text == 'Last':
            return None
        else:
            return next_button


### Here we tell the crawler how to discover more search results and detect when it is done.

In [4]:
from tqdm import tqdm


class DirectiveCorpus(object):
    def __init__(self, datadict, directive_type):
        self.records = datadict
        self.raw_data_dir = os.path.join(os.getcwd(),locations[directive_type]['dir'])
        self.txt_data_dir = os.path.join(os.getcwd(),locations[directive_type]['txt_dir'])
        self.ensure_dir(self.raw_data_dir)
        
        
    def ensure_dir(self, dir_name):
        INSTR_PATH = os.path.join(os.getcwd(), dir_name)
        try:
            os.makedirs(INSTR_PATH)
        except:
            pass
    
    def assemble(self):
        self.download_all()
        self.convert_all()
        self.load_contents()
        return self.as_list()
    
    def download_all(self):
        for k,v in tqdm(self.records.items()):
            self.save_record(k,v)

    def convert_all(self):
        for k,v in tqdm(self.records.items(),desc='Converting to Txt'):
            self.convert_to_txt(k,v)            

    def load_record(self, record):
        return self.load_file(record['txt_path'])
        
        
    def load_file(self, fname):       
        with open(fname, 'r') as myfile:
            contents = re.sub(r'[\t\n\r\x0b\x0c]',' ', myfile.read())
            return contents
            #return re.sub("\s+",' ', contents)
        
    def load_contents(self):
        for k,v in tqdm(self.records.items(),desc='Loading'):
            try:
                self.records[k]['contents'] = self.load_record(v)
            except KeyError as e:
                print("error loading {} - contents are {}".format(k,v))
                
    
    def as_list(self):
        thelist = []
        for k,v in tqdm(self.records.items(), desc='Exporting'):
            thelist.append(v)
        return thelist

                         
    def set_pdf_fpath(self, record_id, fpath):
        self.records[record_id]['pdf_path']= fpath

    def set_txt_fpath(self, record_id, fpath):
        self.records[record_id]['txt_path']= fpath
        
        
    def convert_to_txt(self, record_id, record):
        txt_name = record['fname'].split(".pdf")[0] + ".txt"
        fpath = os.path.join(self.txt_data_dir, txt_name)
        pdf_path = os.path.join(self.raw_data_dir, record['fname'])

        if not os.path.exists(fpath):
            try:
                try:
                    os.makedirs(self.txt_data_dir)
                except FileExistsError:
                    pass
                convert_pdf_to_txt(pdf_path, fpath)
                self.set_txt_fpath(record_id, fpath)
            except Exception as e:
                print(e)
        else:
            self.set_txt_fpath(record_id, fpath)
            pass
        
            #print("Skipping {}".format(fpath))
        
    def save_record(self, record_id, record):
        
        fpath = os.path.join(self.raw_data_dir, record['fname'])
        #print(fpath)
        # check if already downloaded
        if not os.path.isfile(fpath):
            try:
                response = requests.get(record['url'], stream=True)
                response.raise_for_status()
                with open(fpath, 'wb') as handle:
                    for block in response.iter_content(1024):
                        handle.write(block)
                #self.set_pdf_fpath(record, fpath)
            except Exception as e:
                print(e)
                pass
        else:
            pass
            #print("Skipping {}".format(fpath))


### Tie it all together..

In [5]:

def save_pickle(data, path, fname):
    if not os.path.isdir(path):
        os.makedirs(path)
    fpath = os.path.join(path, fname)
    with open(fpath,'wb') as fp:
        return pickle.dump(data, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
def load_pickle(fpath):
    with open(fpath,'rb') as fp:
        return pickle.load(fp)
    
    

In [6]:
instr = get_directives('dod_instructions')
instr_corpus = DirectiveCorpus(instr,'dod_instructions').assemble()
save_pickle(instr_corpus,"/home/brian/corpora","dod_instructions.pickle")

 26%|██▌       | 197/772 [00:00<00:00, 1069.68it/s]

Found existing data.. loading from dod_instr.pickle
[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_instr/'
('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 29%|██▉       | 223/772 [00:36<01:29,  6.16it/s]  

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 29%|██▉       | 226/772 [00:56<02:15,  4.02it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 30%|██▉       | 231/772 [01:16<02:58,  3.03it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 32%|███▏      | 250/772 [01:36<03:21,  2.60it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 33%|███▎      | 253/772 [01:56<03:58,  2.18it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 33%|███▎      | 258/772 [02:16<04:31,  1.89it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 34%|███▍      | 263/772 [02:36<05:02,  1.68it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 37%|███▋      | 288/772 [02:56<04:56,  1.63it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 41%|████      | 318/772 [03:16<04:40,  1.62it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 44%|████▍     | 339/772 [03:36<04:36,  1.57it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 44%|████▍     | 341/772 [03:56<04:58,  1.44it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▍     | 344/772 [04:16<05:19,  1.34it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▍     | 345/772 [04:36<05:42,  1.25it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 45%|████▌     | 348/772 [04:56<06:01,  1.17it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 62%|██████▏   | 476/772 [05:16<03:16,  1.50it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 62%|██████▏   | 477/772 [05:36<03:28,  1.42it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 63%|██████▎   | 486/772 [05:36<03:18,  1.44it/s]

404 Client Error: Not Found for url: http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/510581p.pdf


 63%|██████▎   | 488/772 [05:56<03:27,  1.37it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 70%|██████▉   | 540/772 [06:16<02:41,  1.43it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 73%|███████▎  | 564/772 [06:36<02:26,  1.42it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 772/772 [06:56<00:00,  1.85it/s]
Converting to Txt:   0%|          | 0/772 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_instr/'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200016v1_dodi_2016.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200016v2_dodi_2016.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200022p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O200027p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300008p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300013_dodi_2017.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O300205p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O302043p

Converting to Txt:  59%|█████▉    | 456/772 [00:00<00:00, 2056.07it/s]

Unknown operator: '\x06'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O510094p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O510095p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/510581p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O511011p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O521063p.pdf'


Converting to Txt:  71%|███████   | 545/772 [00:00<00:00, 711.30it/s] 

Unknown operator: '\x06'
Unknown operator: '\x06'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O524021p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_instr/O524024p.pdf'


Converting to Txt:  77%|███████▋  | 594/772 [00:01<00:00, 392.55it/s]

Unknown operator: '\x06'
Unknown operator: '\x06'


Converting to Txt: 100%|██████████| 772/772 [00:02<00:00, 355.14it/s]
Loading:   0%|          | 0/772 [00:00<?, ?it/s]

Unknown operator: '\x06'
Unknown operator: '\x06'


Loading:  70%|███████   | 541/772 [00:00<00:00, 2682.12it/s]

error loading  - contents are {'url': 'http://www.esd.whs.mil/Directives/issuances/140025/', 'fname': '', 'Issuance #': 'DoDI 1400.25', 'Issuance Date': ' ', 'Issuance Subject': 'DoD Civilian Personnel Management System', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P&R)'}
error loading O200016v1_dodi_2016.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O200016v1_dodi_2016.pdf', 'fname': 'O200016v1_dodi_2016.pdf', 'Issuance #': 'DoDI O-2000.16 Volume 1', 'Issuance Date': '11/17/2016', 'Issuance Subject': 'DoD Antiterrorism (AT) Program Implementation: DoD At Standards\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': 'CH 1', 'CH. Date': '5/5/2017', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P) 703-571-9255'}
error loading O200016v2_dodi_2016.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O200016v2_dodi_2016.pdf', 'fname': 'O200

Loading: 100%|██████████| 772/772 [00:00<00:00, 2498.36it/s]
Exporting: 100%|██████████| 772/772 [00:00<00:00, 410861.91it/s]


In [14]:
ais = get_directives('dod_ais')
ais_corpus = DirectiveCorpus(ais,'dod_ais').assemble()
save_pickle(ais_corpus,"/home/brian/corpora","dod_ais.pickle")

  0%|          | 0/50 [00:00<?, ?it/s]

Found existing data.. loading from dod_ai.pickle


100%|██████████| 50/50 [00:19<00:00,  2.62it/s]
Converting to Txt: 100%|██████████| 50/50 [00:00<00:00, 11933.94it/s]
Loading: 100%|██████████| 50/50 [00:00<00:00, 2350.49it/s]
Exporting: 100%|██████████| 50/50 [00:00<00:00, 252061.54it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_ai/a116p_FOUO.pdf'
error loading a116p_FOUO.pdf - contents are {'url': 'https://directives.whs.mil/issuances/a116p_FOUO.pdf', 'fname': 'a116p_FOUO.pdf', 'Issuance #': 'AI O-116', 'Issuance Date': '10/21/2014', 'Issuance Subject': 'Post-balanced Survivability Assessment (P-BSA) Program\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'OPR': 'DCMO\n703-614-8888'}





In [13]:
manpubs = get_directives('dod_manpubs')
manpubs_corpus = DirectiveCorpus(manpubs,'dod_manpubs').assemble()
save_pickle(manpubs_corpus,"/home/brian/corpora","dod_manpubs.pickle")

  0%|          | 0/175 [00:00<?, ?it/s]

Found existing data.. loading from dod_manpub.pickle


  2%|▏         | 3/175 [00:19<18:59,  6.62s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 11%|█▏        | 20/175 [00:20<02:38,  1.03s/it]

[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_manpub/'


 14%|█▎        | 24/175 [00:39<04:10,  1.66s/it]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 51%|█████▏    | 90/175 [00:59<00:56,  1.50it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


 71%|███████   | 124/175 [01:19<00:32,  1.55it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 175/175 [01:39<00:00,  1.75it/s]
Converting to Txt:   0%|          | 0/175 [00:00<?, ?it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O100021_dodm_2017.pdf'
[Errno 21] Is a directory: '/home/brian/usaf_instructiondestruction/dod_manpub/'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O302044M.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O465011_dodm_2017.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O520513p.pdf'


Converting to Txt: 100%|██████████| 175/175 [00:00<00:00, 284.17it/s]
Loading:  53%|█████▎    | 92/175 [00:00<00:00, 905.72it/s]

Unknown operator: '\x03'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_manpub/O853001M.pdf'
error loading O100021_dodm_2017.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O100021_dodm_2017.pdf', 'fname': 'O100021_dodm_2017.pdf', 'Issuance #': 'DoDM O-1000.21', 'Issuance Date': '3/6/2017', 'Issuance Subject': 'Passport And Passport Agent Services\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'DCMO 703-601-6128'}
error loading  - contents are {'url': 'http://www.esd.whs.mil/Directives/issuances/414025m/', 'fname': '', 'Issuance #': 'DoD 4140.25-M\nVolume 1-3\n(Date Varies)', 'Issuance Date': ' ', 'Issuance Subject': 'DoD Management Of Bulk Petroleum Products, Natural Gas, And Coal', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(AT&L) 703-697-2525'}


Loading: 100%|██████████| 175/175 [00:00<00:00, 825.24it/s]
Exporting: 100%|██████████| 175/175 [00:00<00:00, 490972.04it/s]

error loading O853001M.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O853001M.pdf', 'fname': 'O853001M.pdf', 'Issuance #': 'DoD O-8530.1-M', 'Issuance Date': '12/17/2003', 'Issuance Subject': 'Department Of Defense Computer Network Defense (CND) Service Provider Certification And Accreditation Program\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'Exp. Date': ' ', 'Related Memo.': ' ', 'OPR': 'DoD CIO'}





In [11]:
dtm = get_directives('dod_dtms')
dtms_corpus = DirectiveCorpus(dtm,'dod_dtms').assemble()
save_pickle(dtms_corpus,"/home/brian/corpora","dod_dtms.pickle")

100%|██████████| 17/17 [00:00<00:00, 10484.22it/s]
Converting to Txt: 100%|██████████| 17/17 [00:00<00:00, 11826.70it/s]
Loading: 100%|██████████| 17/17 [00:00<00:00, 2803.35it/s]
Exporting: 100%|██████████| 17/17 [00:00<00:00, 58782.50it/s]

Found existing data.. loading from dod_dtm.pickle





In [12]:
directives = get_directives('dod_directives')
dir_corpus = DirectiveCorpus(directives,'dod_directives').assemble()
save_pickle(dir_corpus,"/home/brian/corpora","dod_directives.pickle")

 30%|██▉       | 94/314 [00:00<00:00, 583.84it/s]

Found existing data.. loading from dod_dir.pickle
404 Client Error: Not Found for url: http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf


100%|██████████| 314/314 [00:12<00:00, 24.48it/s]
Converting to Txt: 100%|██████████| 314/314 [00:00<00:00, 30365.48it/s]
Loading: 100%|██████████| 314/314 [00:00<00:00, 3557.38it/s]
Exporting: 100%|██████████| 314/314 [00:00<00:00, 505376.61it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_dir/332505p.pdf'
[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_dir/O510019p.pdf'
error loading 332505p.pdf - contents are {'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf', 'fname': '332505p.pdf', 'Issuance #': 'DoDD C-3325.05', 'Issuance Date': '2/25/2000', 'Issuance Subject': 'Classified Title', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P) 703-571-9255'}
error loading O510019p.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O510019p.pdf', 'fname': 'O510019p.pdf', 'Issuance #': 'DoDD O-5100.19', 'Issuance Date': '11/12/2014', 'Issuance Subject': 'Critical Information Communications (CRITICOMM) System\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access 




In [17]:
issuances = get_directives('dod_issuances')
issuances_corpus = DirectiveCorpus(directives,'dod_issuances').assemble()
save_pickle(dir_corpus,"/home/brian/corpora","dod_issuances.pickle")


Didn't find data for dod_issuances.. retrieving..


 30%|███       | 95/314 [00:24<00:56,  3.85it/s]

404 Client Error: Not Found for url: http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf


 42%|████▏     | 131/314 [00:46<01:05,  2.81it/s]

('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))


100%|██████████| 314/314 [01:36<00:00,  3.24it/s]
Converting to Txt:  30%|███       | 95/314 [02:00<04:38,  1.27s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_issuances/332505p.pdf'


Converting to Txt:  41%|████▏     | 130/314 [02:46<03:55,  1.28s/it]

[Errno 2] No such file or directory: '/home/brian/usaf_instructiondestruction/dod_issuances/O510019p.pdf'


Converting to Txt: 100%|██████████| 314/314 [07:05<00:00,  1.35s/it]
Loading: 100%|██████████| 314/314 [00:00<00:00, 4078.44it/s]
Exporting: 100%|██████████| 314/314 [00:00<00:00, 526383.48it/s]

error loading 332505p.pdf - contents are {'url': 'http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/332505p.pdf', 'fname': '332505p.pdf', 'Issuance #': 'DoDD C-3325.05', 'Issuance Date': '2/25/2000', 'Issuance Subject': 'Classified Title', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(P) 703-571-9255'}
error loading O510019p.pdf - contents are {'url': 'https://directives.whs.mil/issuances/O510019p.pdf', 'fname': 'O510019p.pdf', 'Issuance #': 'DoDD O-5100.19', 'Issuance Date': '11/12/2014', 'Issuance Subject': 'Critical Information Communications (CRITICOMM) System\n(This website is not authorized to post controlled documents. DoD PKI certificate required to access this document.)', 'CH. #': ' ', 'CH. Date': ' ', 'EXP. Date': ' ', 'Related Memo.': ' ', 'OPR': 'USD(I)'}



