In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
import urllib.request
import io
import re
import pandas as pd 
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
import time

In [2]:
def is_valid(url):
    """
    Tell whether `url` has both a scheme and a network location.

    The string is split with `urlparse`; the result is True only when
    neither the scheme (e.g. "https") nor the netloc (e.g. "example.com")
    is empty.
    """
    parts = urlparse(url)
    if not parts.netloc:
        # no host part, e.g. a relative path or an empty string
        return False
    return bool(parts.scheme)

def get_all_website_links(url,urlSearch,timeout=30):
    """
    Returns the new same-site links found on `url` that contain `urlSearch`.

    Each <a href> on the page is resolved against `url` and reduced to
    scheme://host/path. A link is skipped when it has no scheme or host,
    does not contain `urlSearch`, contains "pdf", or is already recorded
    in `internal_urls`. A link counts as same-site when its host equals the
    host of `url` or is a subdomain of it.

    Side effects: every newly seen link is appended to the global
    `listofurls` and added to the global `internal_urls` (same site) or
    `external_urls` (other sites) set.

    params:
        url (str): absolute URL of the page to download and scan.
        urlSearch (str): substring a link must contain to be kept.
        timeout (float): seconds to wait for the server, default is 30.
    returns:
        set: the new same-site links; external links are recorded in the
        globals but are not part of the returned set.
    raises:
        requests.exceptions.RequestException: if the page cannot be
        downloaded, including when `timeout` expires.
    """
    # new same-site URLs found on `url`
    urls = set()
    # host of the start URL without the protocol, lower-cased for comparison
    domain_name = urlparse(url).netloc.lower()
    # Without a timeout a single unresponsive page would stall the crawl forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # Validate BEFORE rebuilding the URL: gluing "://" onto a host-less
        # link such as "mailto:a@b.c" would turn it into "mailto://a@b.c",
        # which then wrongly parses as having a host.
        if not is_valid(href):
            # not a valid URL
            continue
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if urlSearch not in href:
            continue
        if "pdf" in href:
            continue
        if href in internal_urls:
            # already in the set
            continue
        # Compare hosts rather than searching the whole URL, so a foreign
        # link that merely mentions the domain in its path stays external.
        link_host = parsed_href.netloc.lower()
        same_site = link_host == domain_name or link_host.endswith("." + domain_name)
        if not same_site:
            # external link
            if href not in external_urls:
                listofurls.append(href)
                external_urls.add(href)
            continue
        listofurls.append(href)
        urls.add(href)
        internal_urls.add(href)
    return urls

def crawl(url,max_urls=30,urlSearch=""):
    """
    Crawls a web page and recursively follows the new links found on it.
    You'll find all links in `external_urls` and `internal_urls` global set variables.

    The page at `url` is always fetched. Its links are followed only while
    the global `total_urls_visited` counter is below `max_urls`. The counter
    is never reset here, so successive calls share one budget.

    params:
        url (str): page to start from.
        max_urls (int): number of max urls to crawl, default is 30.
        urlSearch (str): substring a link must contain to be collected;
            the default "" matches every link.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url,urlSearch)
    for link in links:
        # The counter already includes the current page, so the test must be
        # `>=`; with `>` one page more than `max_urls` would be fetched.
        if total_urls_visited >= max_urls:
            break
        crawl(link,max_urls,urlSearch)


def convert_pdf_to_txt(path):
    """
    Download the PDF at `path` and return the text of all of its pages.

    The file is read fully into memory and each page is laid out with
    pdfminer. The text of every LTTextBox / LTTextLine object is
    concatenated in page order.

    params:
        path (str): URL of the PDF (opened with `urllib.request.urlopen`).
    returns:
        str: the extracted text; "" when the document has no pages or no
        text objects.
    """
    url = path
    # `with` closes the HTTP response as soon as the bytes are read.
    with urllib.request.urlopen(url) as response:
        memoryFile = io.BytesIO(response.read())
    parser = PDFParser(memoryFile)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Collect text across ALL pages. The accumulator lives outside the page
    # loop so earlier pages are not discarded and a page-less document
    # still yields a defined (empty) result.
    chunks = []
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                chunks.append(lt_obj.get_text())
    return "".join(chunks)

def convert_txt_to_df(txt,search):
    """
    Split `txt` into lines and keep only the lines matching `search`.

    params:
        txt (str): text to split on newline characters.
        search (str): regular expression handed to pandas `str.contains`.
    returns:
        DataFrame: a single column named 0 holding the matching lines; the
        row labels are the positions of those lines within `txt`.
    """
    lines = txt.split("\n")
    frame = pd.DataFrame(lines)
    # boolean mask, one entry per line
    matches = frame[0].str.contains(search, regex=True)
    return frame[matches]

def time_convert(sec):
    """
    Print a duration given in seconds as hours:minutes:seconds.

    Hours and minutes are printed as integers; the seconds part is printed
    as-is, so a float input keeps its fractional digits.
    """
    total_minutes, seconds = divmod(sec, 60)
    whole_hours, minutes = divmod(total_minutes, 60)
    print("Time Lapsed = {0}:{1}:{2}".format(int(whole_hours), int(minutes), seconds))
    
def get_pdf_FlightAware(airportlist,max_urls,urlSearch):
    """
    Crawl the FlightAware airport-diagram page of each airport in `airportlist`.

    For every airport code the diagram URL is built and handed to `crawl`
    together with `max_urls` and `urlSearch`. A "Completed" line is printed
    once that airport's crawl returns.
    """
    for apt in airportlist:
        start_page = "https://flightaware.com/resources/airport/" + apt + "/APD/AIRPORT+DIAGRAM"
        crawl(start_page, max_urls, urlSearch)
        print("Completed: " + apt)
        
def parase_pdf_FlightAware(listofurls,search):
    """
    Extract the lines matching `search` from the PDF behind every URL.

    For each entry of `listofurls` the document at `<url>/pdf` is converted
    to text and filtered with `convert_txt_to_df`. The matching lines are
    tagged with the PDF URL in a "path" column. Progress is printed as a
    percentage after each URL.

    params:
        listofurls (list): page URLs; "/pdf" is appended to each one.
        search (str): regular expression selecting the lines to keep.
    returns:
        DataFrame: the matches of all PDFs stacked row-wise (column 0 holds
        the text line, "path" the PDF URL); an empty DataFrame when
        `listofurls` is empty.
    """
    frames = []
    lenofUrls = len(listofurls)
    for ct, url in enumerate(listofurls, start=1):
        pdf_url = url + '/pdf'
        textdata = convert_pdf_to_txt(pdf_url)
        # assign() builds a new frame, so the filtered slice returned by
        # convert_txt_to_df is never written to (no SettingWithCopyWarning).
        frames.append(convert_txt_to_df(textdata,search).assign(path=pdf_url))
        status = (ct/lenofUrls) * 100
        print('Completed: ' + str(round(status,2)) + '%')
    if not frames:
        # pd.concat rejects an empty list; keep the empty-frame result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per URL.
    return pd.concat(frames, axis=0)

In [3]:
# Wall-clock reference point for the elapsed-time report in the last cell
start_time = time.time()

# Set up colorama and keep short aliases for the colour codes
colorama.init()
GREEN, GRAY, RESET = (
    colorama.Fore.GREEN,
    colorama.Fore.LIGHTBLACK_EX,
    colorama.Fore.RESET,
)

# Crawl state shared (as globals) by the helper functions:
# unique links seen so far, split into same-site and other-site
internal_urls, external_urls = set(), set()
# every collected link, in discovery order
listofurls = []
# number of pages crawled so far
total_urls_visited = 0

In [4]:
# Collect the IAP page URLs for the selected airports.
# Reference list of airport codes (commented out, not used by this cell):
#"BIKF","EINN",
#"KABQ","KACY","KAFW","KAMA","KATL","KAUS","KAVL","KBDL","KBHM","KBNA","KBOS","KBTR","KBUR"
#,"KBWI","KCAK","KCHS","KCLE","KCLT","KCMH","KCOS","KCRW","KDAL","KDEN","KDFW","KDSM","KDTW","KELP","KEWR"
#,"KFAR","KFLL","KFSD","KGJT","KGSO","KGSP","KGUS","KIAD","KIAG","KIAH","KICT","KIND","KJAX","KLAS","KLAX"
#,"KLBB","KLBE","KLCQ","KLGA","KLIT","KLWV","KMCI","KMCN","KMCO","KMGM","KMIA","KMOB","KMSP","KMSY","KMYR"
#,"KOAK","KOKC","KOMA","KORD","KPBG","KPBI","KPDX","KPHL","KPHX","KPIT","KPUB","KPVD","KRDU","KRFD","KRIC"
#,"KRNO","KROW","KRST","KRSW","KSAN","KSEA","KSGU","KSMF","KSTL","KTLH","KTPA","KTUL","KTYS","KYIP"
#,"MBPV","MDPC","MDSD","MDST","MGGT","MHLM","MKJP","MKJS","MMSD","MMUN","MNMG","MPTO","MROC","MSLP","MTCH"
#,"MTPP","MWCR","MYNN","SEGU","SKAR","SKBO","SKCG","SKCL","SKRG","SPJC","TIST","TISX","TJBQ","TJSJ","TNCA"
#,"TNCM"

# airportlist = ["KABQ","KACY","KAFW","KAMA","KATL","KAUS","KAVL","KBDL","KBHM","KBNA","KBOS","KBTR","KBUR"
# ,"KBWI","KCAK","KCHS","KCLE","KCLT","KCMH","KCOS","KCRW","KDAL","KDEN","KDFW","KDSM","KDTW","KELP","KEWR"
# ,"KFAR","KFLL","KFSD","KGJT","KGSO","KGSP","KGUS","KIAD","KIAG","KIAH","KICT","KIND","KJAX","KLAS","KLAX"
# ,"KLBB","KLBE","KLCQ","KLGA","KLIT","KLWV","KMCI","KMCN","KMCO","KMGM","KMIA","KMOB","KMSP","KMSY","KMYR"
# ,"KOAK","KOKC","KOMA","KORD","KPBG","KPBI","KPDX","KPHL","KPHX","KPIT","KPUB","KPVD","KRDU","KRFD","KRIC"
# ,"KRNO","KROW","KRST","KRSW","KSAN","KSEA","KSGU","KSMF","KSTL","KTLH","KTPA","KTUL","KTYS","KYIP"]

# Airport identifiers to crawl; each one is inserted into the FlightAware URL
airportlist = ["KDEN", "KMCO"]
# urlSearch is the url search pattern: only links containing it are collected
get_pdf_FlightAware(airportlist=airportlist, max_urls=50, urlSearch="IAP")

Completed: KDEN
Completed: KMCO


In [5]:
# Parse the PDF behind every collected URL and keep the lines matching the
# search regex, then save the result and show it
df = parase_pdf_FlightAware(
    listofurls=listofurls,
    search='TCH|GP 3.|GS 3.|.[.]..°',  # specify pdf search
)
df.to_csv("output.csv")
display(df)

Completed: 1.49%
Completed: 2.99%
Completed: 4.48%
Completed: 5.97%
Completed: 7.46%
Completed: 8.96%
Completed: 10.45%
Completed: 11.94%
Completed: 13.43%
Completed: 14.93%
Completed: 16.42%
Completed: 17.91%
Completed: 19.4%
Completed: 20.9%
Completed: 22.39%
Completed: 23.88%
Completed: 25.37%
Completed: 26.87%
Completed: 28.36%
Completed: 29.85%
Completed: 31.34%
Completed: 32.84%
Completed: 34.33%
Completed: 35.82%
Completed: 37.31%
Completed: 38.81%
Completed: 40.3%
Completed: 41.79%
Completed: 43.28%
Completed: 44.78%
Completed: 46.27%
Completed: 47.76%
Completed: 49.25%
Completed: 50.75%
Completed: 52.24%
Completed: 53.73%
Completed: 55.22%
Completed: 56.72%
Completed: 58.21%
Completed: 59.7%
Completed: 61.19%
Completed: 62.69%
Completed: 64.18%
Completed: 65.67%
Completed: 67.16%
Completed: 68.66%
Completed: 70.15%
Completed: 71.64%
Completed: 73.13%
Completed: 74.63%
Completed: 76.12%
Completed: 77.61%
Completed: 79.1%
Completed: 80.6%
Completed: 82.09%
Completed: 83.58%
Comp

Unnamed: 0,0,path
259,(VGSI Angle 3.00/TCH 68).,https://flightaware.com/resources/airport/KDEN...
278,GS 3.00°,https://flightaware.com/resources/airport/KDEN...
279,TCH 55,https://flightaware.com/resources/airport/KDEN...
293,GS 3.00°,https://flightaware.com/resources/airport/KDEN...
294,TCH 52,https://flightaware.com/resources/airport/KDEN...
298,(VGSI Angle 3.00/TCH 70).,https://flightaware.com/resources/airport/KDEN...
189,(VGSI Angle 3.00/TCH 71).,https://flightaware.com/resources/airport/KDEN...
322,GS 3.00°,https://flightaware.com/resources/airport/KDEN...
323,TCH 60,https://flightaware.com/resources/airport/KDEN...
205,(VGSI Angle 3.00/TCH 71).,https://flightaware.com/resources/airport/KDEN...


In [6]:
# End timer: `start_time` was taken in the state-initialisation cell, so when
# the cells are run in order this spans the crawl and the PDF parsing.
end_time = time.time()
time_lapsed = end_time - start_time  # elapsed wall-clock time in seconds
# Prints the elapsed time as hours:minutes:seconds.
time_convert(time_lapsed)

Time Lapsed = 0:13:52.23050570487976
