In [231]:
# import required libraries
import re
from geopy import Nominatim
nom = Nominatim(user_agent="something") # geopy needs an user_agent argument to work

In [232]:
# set up rd2wgs function
def rd2wgs (x,y):
    """Calculate WGS84 coordinates"""
    x = int(x)
    y = int(y)

    dX = (x - 155000) * pow(10, - 5)
    dY = (y - 463000) * pow(10, - 5)

    SomN = (3235.65389 * dY) + (- 32.58297 * pow(dX, 2)) + (- 0.2475 * pow(dY, 2)) + (- 0.84978 * pow(dX, 2) * dY) + (- 0.0655 * pow(dY, 3)) + (- 0.01709 * pow(dX, 2) *pow(dY, 2)) + (- 0.00738 * dX) + (0.0053 * pow(dX, 4)) + (- 0.00039 * pow(dX, 2) *pow(dY, 3)) + (0.00033 * pow(dX, 4) * dY) + (- 0.00012 * dX * dY)

    SomE = (5260.52916 * dX) + (105.94684 * dX * dY) + (2.45656 * dX * pow(dY, 2)) + (- 0.81885 * pow(dX, 3)) + (0.05594 * dX * pow(dY, 3)) + (- 0.05607 * pow(dX, 3) * dY) + (0.01199 * dY) + (- 0.00256 * pow(dX, 3) *pow(dY, 2)) + (0.00128 * dX * pow(dY, 4)) + (0.00022 * pow(dY,2)) + (- 0.00022 * pow(dX, 2)) + (0.00026 * pow(dX, 5))

    lat = 52.15517 + (SomN / 3600)
    lon = 5.387206 + (SomE / 3600)

    return lat,lon

In [233]:
placenameList = []

In [234]:
# Set up placename function
def placenameFinder(title, text):
    for name in placenameList:
        if name in title:
            location = nom.geocode(name)
            docCoordinates.append([location.latitude, location.longitude])
        else:
            if name in text:
                location = nom.geocode(name)
                docCoordinates.append([location.latitude, location.longitude])

In [235]:
#load data
docs = ["D20650_RAAP-NOTITIE_274.txt",
        "D30384_Rapport_Heeg_Lytshuzen_35.txt",
        "D33465_rapport_vindplaatsen19-20-21.txt",
        "D38139_arcrapport-2009-129.txt",
        "D38273_ADC_73.txt",
        "D38275_RAP_937_4107029_Neerijnen_Waardenburg_Brugconstructies.txt",
        "D38312_arcrapport-2003-24_Middelburg_St._Laurens.txt",
        "D47635_Rap_2938_4131273_Hilversum_Jacob_van_Campenlaan_147_tot_193.txt",
        "D56558_12126389_IJS_HAM_ARC_Eindrapportage_archeologisch_karterend.txt",
        "D58053_GAR_1060_Klein_Canada_Beekbergen.txt"]

In [236]:
# set up location list
docLocations = []

In [237]:
# main script
for doc in docs:
    with open(doc, "r", encoding="utf-8") as f:
        docCoordinates = []

        # Finding all coordinates in text:
        text = f.read()
        docCoordinates += re.findall(r"[yxXY]\D{,4}\d{,4}[.,]\d{3},?\d?.{,4}[yxXY]\D{,4}\d{,4}[.,]\d{3},?\d?", text)
        docCoordinates += re.findall(r"\d{2,3}[.,]\d{2,3}.?/.?\d{2,3}[.,]\d{2,3}", text)
        docCoordinates += re.findall(r"\d{5,6}.?/.?\d{5,6}", text)

        if len(docCoordinates) == 0:
            placenameFinder(doc, text)
        else:
            # Converting coordinates to usable format
            for i, coordinate in enumerate(docCoordinates):
                coordinate = coordinate.replace(",", "").replace(".", "")
                coordinate = re.findall(r"\d+", coordinate)
                docCoordinates[i] = coordinate

        # Averaging coordinates of documents with multiple coordinates
        if len(docCoordinates) != 1:
            Xsum = 0
            Ysum = 0
            for c in docCoordinates:
                Xsum += int(c[0])
                Ysum += int(c[1])
            docCoordinates = [[Xsum/len(docCoordinates), Ysum/len(docCoordinates)]]

        # Converting Rijksdriehoek coordinates to WGS 84 coordinates
        docWGS84 = rd2wgs(docCoordinates[0][0], docCoordinates[0][1])
        # Listing langtide and longitude of each document
        docLocations.append(str(doc) + ": lan: "+ str(docWGS84[0]) + ", long: " + str(docWGS84[1]))

In [238]:
# print output
for location in docLocations:
    print(location)

D20650_RAAP-NOTITIE_274.txt: lan: 52.23386199592088, long: 4.443157261934848
D30384_Rapport_Heeg_Lytshuzen_35.txt: lan: 52.977527603081725, long: 5.605656684474073
D33465_rapport_vindplaatsen19-20-21.txt: lan: 52.014658412660275, long: 5.180357590248186
D38139_arcrapport-2009-129.txt: lan: 51.52947086217353, long: 5.981876906634259
D38273_ADC_73.txt: lan: 52.08456542507743, long: 5.0443514919783
D38275_RAP_937_4107029_Neerijnen_Waardenburg_Brugconstructies.txt: lan: 51.83186627770101, long: 5.265587575190208
D38312_arcrapport-2003-24_Middelburg_St._Laurens.txt: lan: 51.5267476533225, long: 3.6031095573987324
D47635_Rap_2938_4131273_Hilversum_Jacob_van_Campenlaan_147_tot_193.txt: lan: 83.34749158885991, long: 80.6228793383521
D56558_12126389_IJS_HAM_ARC_Eindrapportage_archeologisch_karterend.txt: lan: 52.033138921412075, long: 5.039455105047961
D58053_GAR_1060_Klein_Canada_Beekbergen.txt: lan: 52.1623491336831, long: 5.968168531167361
