# Pulls DoDDs and DODIs Out Of PDFs

#### This reads in PDFs, pulls certain phrases from them which correspond to DoDDs and DODIs, and then finds those documents on google, downloads them, and puts the names and links in an Excel file

### Libraries

In [3]:
import PyPDF2
import os
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
#for this specific enviroment
#newDir='/home/abigailhaddad/PDFs'
newDir=r"C:\Users\admin\Documents\Python Scripts\cleaningPDFs"
#os.mkdir(newDir)
os.chdir(newDir)


### Functions

In [4]:
def pullTextFromPDF(fileToRead):
	pdfFileObj = open(fileToRead, 'rb')
	pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
	pageCount=pdfReader.numPages
	textOutput=""
	for pageNumber in range(0, pageCount):
		pageOutput=pdfReader.getPage(pageNumber).extractText()
		textOutput=textOutput+pageOutput
	return(textOutput)

def cleanText(textString):
	cleanedText=textString.replace("\n", "")
	return(cleanedText)

def cleanString(myString):
	myString=re.sub(r'[\(\)]', '', str(myString))
	return(myString)

def extractPhrases(cleanedText):
	listOfWords=cleanedText.split()
	regPhrases=[]
	ourWantPhrases=["dodi", "dodd", "directive", "instruction"]
	for wordNumber in range(0, len(listOfWords)):
		if any(phrase in listOfWords[wordNumber].lower() for phrase in ourWantPhrases):
			miniList=listOfWords[wordNumber-1:wordNumber+2]
			cleanPhrase=" ".join(miniList)
			if len(re.sub("\D", "", cleanPhrase))>0 and 'DoD' in cleanPhrase:
				startNumber=cleanPhrase.find('DoD')
				endnumber=re.match('.+([0-9])[^0-9]*$', cleanPhrase).start(1)+1
				regPhrases.append(cleanPhrase[startNumber:endnumber])
	regPhrases=list(map(cleanString, regPhrases))
	return(list(set(regPhrases)))


def takeFileNameGetPhrases(fileToRead):
	textOutput=pullTextFromPDF(fileToRead)
	cleanedText=cleanText(textOutput)
	phrases=extractPhrases(cleanedText)
	return(phrases)

def makeSumChartbyAgency(dictOfPhrases):
    listOfAgencies=[i.split()[0] for i in dictOfPhrases.keys()]
    listOfLists=[]
    for phrases in dictOfPhrases.values():
        longPhrase=[i for i in phrases if len(i)>0]
        phrase=[i.split()[-1] for i in longPhrase]
        listOfLists.append(phrase)
    countOfRegs=[len(set(i)) for i in listOfLists]
    dictOfCounts=dict(zip(listOfAgencies, countOfRegs))
    s  = pd.Series(dictOfCounts,index=dictOfCounts.keys())
    return(s)

def getUniqueList(dictOfPhrases):
    oneList=[]
    for phrases in dictOfPhrases.values():
        longPhrase=[i for i in phrases if len(i)>0]
        phrase=[i.split()[-1] for i in longPhrase]
        for bit in phrase:
            oneList.append(bit)
    oneUniqueList=list(set(oneList))
    return(oneUniqueList)

def getFirstPDFLink(fileName):
    linkSearch=fileName+"site:www.esd.whs.mil"
    search = "http://google.com/search?q=" + linkSearch
    r = requests.get(search)
    soup = BeautifulSoup(r.text, "html.parser")
    found=0
    for link in soup.findAll('a'):
        if "pdf" in (link.get('href')) and found==0:
            found=1
            pdfLink=(link.get('href'))
    return(pdfLink)
    
def cleanBadLink(pdfLink):
    startNumber=pdfLink.find('http')
    endNumber=pdfLink.find('.pdf')+4
    return(pdfLink[startNumber:endNumber])

def pullPDF(cleanLink):
    r = requests.get(cleanLink, allow_redirects=True)
    name=cleanLink.split("/")[-1]
    open(name, 'wb').write(r.content)

def fromNameGetFile(fileName):
    try:
        pdfLink=getFirstPDFLink(fileName)
        print(pdfLink)
        cleanLink=cleanBadLink(pdfLink)
        print(cleanLink)
        pullPDF(cleanLink)
        print("got pdf")
        print("")
    except:
        pass


### Get Phrases From PDFs

In [6]:
listOfPDFs=[i for i in os.listdir() if "pdf" in i.lower()]
listOfPhrases=list(map(takeFileNameGetPhrases, listOfPDFs))
dictOfPhrases=dict(zip(listOfPDFs, listOfPhrases))
s=makeSumChartbyAgency(dictOfPhrases)
s

203008p.pdf        12
204002_2014.pdf    18
416550e.pdf         2
501502p.pdf        13
502501p.pdf        15
511121p.pdf         2
512409p.pdf         7
514402p.pdf         9
523020p.pdf         7
dtype: int64

In [None]:
### Write List do Excel File

In [8]:
writer = pd.ExcelWriter('output.xlsx')
for PDF in listOfPDFs:
    s.to_excel(writer, "Summary")
    tabName=PDF.split(" ")[0]
    print(tabName)
    mySeries=pd.Series(dictOfPhrases[PDF])
    print(mySeries)
    if len(mySeries)>0:
        mySeries.to_excel(writer, tabName)
writer.save()

203008p.pdf
0     DoD Instruction 5230.29
1                DoDD 5106.01
2     DoD Instruction 4140.01
3     DoD Instruction 2040.02
4     DoD Instruction 2030.08
5                 DoDI 5505.2
6       DoD Directive 5105.72
7                DoDI 4140.01
8                DoDI 2040.02
9        DoD Directive 5111.1
10               DoDI 2030.08
11    DoD Instruction 5505.02
12               DoDI 5230.29
13               DoDI 5230.24
14    DoD Instruction 5230.24
15                DoDD 5111.1
16               DoDD 5105.72
17               DoDI 4160.28
18      DoD Directive 5106.01
19    DoD Instruction 4160.28
20      DoD Directive 5230.25
21               DoDD 5230.25
dtype: object
204002_2014.pdf
0         DoD Directive 5230.11
1          DoD Directive 5530.3
2                  DoDI 5200.39
3                  DoDI 5025.01
4       DoD Instruction 2010.06
5       DoD Instruction 2040.02
6     DoD Instruction O-5240.24
7       DoD Instruction 2000.25
8       DoD Instruction 5000.02
9         

### Look at Unique Count

In [13]:
oneList=getUniqueList(dictOfPhrases)
(len(oneList))



60

### Download Sample Names

In [15]:
testNames=mySeries
testNames

0    DoD Directive 5230.11
1     DoD Directive 5530.3
2                DoDD 5230
3     DoD Directive 5230.9
4    DoD Directive 5230.20
5    DoD Directive 5230.25
6             DoDD 5230.20
7    DoD Directive 5230.24
dtype: object

In [16]:
for thing in testNames:
    fromNameGetFile(thing)



/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540011p.pdf&sa=U&ved=0ahUKEwiH6LDN0fbhAhW2D2MBHYEEAycQFggoMAM&usg=AOvVaw1sdGKrJ0EO4um1Yh8pxHZK
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540011p.pdf
got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540007p.pdf&sa=U&ved=0ahUKEwid1fHN0fbhAhWB2-AKHTj-CM8QFggjMAM&usg=AOvVaw0kkuiN7LkMuzL5lnb_ZZ2e
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540007p.pdf
got pdf

/url?q=http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/416550e.pdf&sa=U&ved=0ahUKEwiE6r_O0fbhAhUC8hQKHaxIBgAQFgghMAQ&usg=AOvVaw2dWvmUJJFdzqxq1DcpNLkc
http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/416550e.pdf
got pdf

/url?q=http://www.esd.whs.mil/Portals/54/Documents/DD/Issuances/dodi/523029p.pdf&sa=U&ved=0ahUKEwirpo3P0fbhAhV98OAKHQmSCToQFggdMAE&usg=AOvVaw0UnfX2tq_AVbxUvPGnBlDW
http://www.esd.whs.mil/Portals/54/Documents/DD/Issuances/dodi/523029p.pdf
got pdf

/url?q=h

### Flatten Dictionary With All of the Document Names and Pull Them

In [19]:
allPhrases = list(set(list([item for sublist in listOfPhrases for item in sublist])))
allCleanLinks=list(map(fromNameGetFile, allPhrases))

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/502501p.pdf&sa=U&ved=0ahUKEwibpIOA0vbhAhXdUBUIHfJSCP0QFggtMAQ&usg=AOvVaw0e-qAVdpF-uFdxikEqhsJ_
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/502501p.pdf
got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/891001p.pdf&sa=U&ved=0ahUKEwjOkbuA0vbhAhUBonEKHaDmBXIQFggUMAA&usg=AOvVaw3aoFbuHuXWWmPZcliz2dhG
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodi/891001p.pdf
got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540007p.pdf&sa=U&ved=0ahUKEwjknvqA0vbhAhWTTxUIHTmyCcYQFggjMAM&usg=AOvVaw2qbjCwyXpMO-PyQ-YvAN-p
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540007p.pdf
got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540011p.pdf&sa=U&ved=0ahUKEwjkpaSB0vbhAhXCRBUIHQ6dBvEQFggjMAI&usg=AOvVaw3PC5uG4OUyOmWT0pddItJ3
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/540011p.pdf
got pdf

/url

got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/510582p.pdf&sa=U&ved=0ahUKEwjamLiO0vbhAhXtVRUIHTrfAI4QFggUMAA&usg=AOvVaw3pIf5jQiK_hSfBYYhRw49_
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/510582p.pdf
got pdf

/url?q=https://strategicmedicaltraining.com/2018/03/28/http-www-esd-whs-mil-portals-54-documents-dd-issuances-dodi-132224p-pdfver2018-03-16-074408-137/&sa=U&ved=0ahUKEwiV6_SO0vbhAhXZRhUIHQJ8BsAQwW4IPjAK&usg=AOvVaw2zK8Bn-zBWP_XmwQNUiI1J

/url?q=http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/416550e.pdf&sa=U&ved=0ahUKEwj53I6P0vbhAhWCURUIHaa9BvIQFgghMAQ&usg=AOvVaw0HcvusLHXp0o67mFUgtUDV
http://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/416550e.pdf
got pdf

/url?q=https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/510582p.pdf&sa=U&ved=0ahUKEwiphMyP0vbhAhURuHEKHXLsBOcQFggUMAA&usg=AOvVaw0vBaksQuoU4Fgs62K75GFx
https://www.esd.whs.mil/Portals/54/Documents/DD/issuances/dodd/510582p.pdf
got pdf

/url?q=ht

### Puts The Report Names and Links in an Excel File

In [20]:
df=pd.DataFrame({"Report": allPhrases, "Link": allCleanLinks})
df.to_excel("Links.xlsx")
