In [67]:
# packages to import

from tika import parser 
import pandas as pd 
import os
import re
import folium

#from pathlib import Path

In [36]:
# Step 1: parse the PDF text and pull out the GPS coordinate pairs
def extract(pdf_path):
    """
    Parse a PDF into text and extract the GPS coordinate pairs it contains.

    Coordinates are matched in degrees / minutes / decimal-seconds form with a
    hemisphere letter, latitude first and longitude after a comma, for example
    ``21°52′09.3′′S, 43°21′16.1′′E``.

    Parameters
    ----------
    pdf_path : str
        Path of the PDF file handed to the tika parser.

    Returns
    -------
    pandas.DataFrame
        One row per matched pair, every value kept as a string, with the
        columns Latitude, Longitude, Lat_deg, Lat_min, Lat_sec, Lat_hem,
        Lon_deg, Lon_min, Lon_sec, Lon_hem (in that order). The frame is
        empty, with the same columns, when nothing matches.
    """
    # open the PDF with the tika parser
    parsed_pdf = parser.from_file(pdf_path)

    # Guard against a missing or None 'content' entry so that such a PDF
    # yields an empty frame instead of an AttributeError on .split()
    pdf_data = parsed_pdf.get('content') or ""

    # Drop all whitespace so a coordinate broken across a line still matches.
    # Caveat: a digit directly before the degrees gets merged into them.
    cleaned_data = "".join(pdf_data.split())

    # One DMS value: degrees, minutes, decimal seconds, hemisphere letter.
    # Raw string with an escaped dot: only a literal "." is a decimal point.
    dms = r"(\d{1,3})°(\d{1,3})′(\d{1,3}\.\d{1,3})′′([A-Za-z])"
    pattern = re.compile("(" + dms + "),(" + dms + ")")

    # Capture the full strings and their components in a single pass
    rows = []
    for match in pattern.finditer(cleaned_data):
        (lat, lat_deg, lat_min, lat_sec, lat_hem,
         lon, lon_deg, lon_min, lon_sec, lon_hem) = match.groups()
        rows.append([lat, lon,
                     lat_deg, lat_min, lat_sec, lat_hem,
                     lon_deg, lon_min, lon_sec, lon_hem])

    columns = ['Latitude', 'Longitude',
               'Lat_deg', 'Lat_min', 'Lat_sec', 'Lat_hem',
               'Lon_deg', 'Lon_min', 'Lon_sec', 'Lon_hem']

    # Build the frame once from the list of rows
    return pd.DataFrame(rows, columns=columns)
    

In [60]:
# Resolve the location of the source PDF, then extract its coordinate table
pdf_path = os.path.realpath("../project/documents/MurphyRTL2017.pdf")
coords_df = extract(pdf_path)

In [55]:
# Step 2: convert DMS (degrees, minutes, seconds) to decimal degrees
def decimaldegree(degree, minutes, seconds, hemisphere):
    """
    Convert a GPS coordinate in degrees, minutes, seconds to decimal degrees.

    Parameters
    ----------
    degree, minutes, seconds : str or number
        The three DMS components; each is passed through ``float()``.
    hemisphere : str
        "N", "E", "S" or "W", or the spelled-out "north", "east", "south",
        "west". Case and surrounding whitespace are ignored.

    Returns
    -------
    float
        The coordinate in decimal degrees, negative for south and west.

    Raises
    ------
    ValueError
        If ``hemisphere`` is not one of the accepted values.
    """
    hemi = hemisphere.strip().lower()

    # south and west are negative in decimal degrees
    if hemi in ("w", "s", "west", "south"):
        factor = -1.0
    elif hemi in ("n", "e", "north", "east"):
        factor = 1.0
    else:
        raise ValueError("invalid hemisphere")

    # the sign applies to the whole sum, not just the degrees
    return factor * (float(degree) + float(minutes) / 60 + float(seconds) / 3600)

In [64]:
# Apply the conversion row by row; the lambdas pick the input columns.
# The function is defined as `decimaldegree` (all lower-case); the previous
# `decimalDegree` spelling raised NameError on a fresh kernel.
coords_df['Lat_dd'] = coords_df.apply(
    lambda row: decimaldegree(row['Lat_deg'], row['Lat_min'], row['Lat_sec'], row['Lat_hem']),
    axis=1, result_type='expand'
)

coords_df['Lon_dd'] = coords_df.apply(
    lambda row: decimaldegree(row['Lon_deg'], row['Lon_min'], row['Lon_sec'], row['Lon_hem']),
    axis=1, result_type='expand'
)

print(coords_df)

          Latitude       Longitude Lat_deg Lat_min Lat_sec Lat_hem Lon_deg  \
0    21°52′09.3′′S   43°21′16.1′′E      21      52    09.3       S      43   
1    23°10′28.2′′S   43°57′42.2′′E      23      10    28.2       S      43   
2   23°34′52.93′′S  43°49′57.24′′E      23      34   52.93       S      43   
3   23°29′12.79′′S   44°4′26.52′′E      23      29   12.79       S      44   
4   23°31′46.15′′S   44°5′31.27′′E      23      31   46.15       S      44   
5   23°31′23.79′′S   44°9′38.34′′E      23      31   23.79       S      44   
6   23°34′28.73′′S  44°19′41.53′′E      23      34   28.73       S      44   
7   25°10′15.07′′S   44°32′8.32′′E      25      10   15.07       S      44   
8    25°5′56.38′′S   44°37′3.07′′E      25       5   56.38       S      44   
9    25°5′11.88′′S  44°37′15.40′′E      25       5   11.88       S      44   
10   24°57′7.18′′S  45°10′15.67′′E      24      57    7.18       S      45   
11   24°36′8.14′′S  45°33′17.81′′E      24      36    8.14      

In [82]:
# Step 3: plot the decimal-degree points on an interactive map
# (only the Lat_dd and Lon_dd columns are needed)
points = coords_df[['Lat_dd', 'Lon_dd']]

# Create the map, centred on the mean of all points
pdf_map = folium.Map(location=points.mean().values.tolist())

# Add one marker per coordinate pair
for lat, lon in zip(points['Lat_dd'], points['Lon_dd']):
    folium.Marker([lat, lon]).add_to(pdf_map)

# Zoom so that the south-west and north-east corners are both visible
sw = points.min().values.tolist()
ne = points.max().values.tolist()
pdf_map.fit_bounds([sw, ne])

pdf_map

# To export a standalone HTML file instead:
#pdf_map.save('pdf_map.html')
