In [2]:
import csv
import os
import subprocess
import shlex
import tempfile

# Locations used throughout this cell: the Tika jars and NER model directory
# (joined into a ':'-separated Java classpath) plus the input and output TSVs.
home = os.environ.get('HOME')
downloads_path = os.path.join(home, 'Downloads')
classpath = ":".join([
    f"{downloads_path}/tika-app-2.9.1.jar",
    f"{downloads_path}/tika-parser-nlp-package-2.9.1.jar",
    f"{home}/src/location-ner-model",
])
sightings_file_path = os.path.join(downloads_path, 'bfro_with_new_cols.tsv')
output_file_path = os.path.join(downloads_path, 'part7_bfro_with_new_cols.tsv')

# Function to call Tika GeoTopicParser and parse the output
def parse_sighting_with_tika(file_path):
    """Run the Tika CLI on one file and pull the geographic fields from its output.

    Invokes ``java -classpath <classpath> org.apache.tika.cli.TikaCLI -m <file_path>``
    using the module-level ``classpath`` and scans the process's stdout line by
    line for the three ``Geographic_*`` entries.

    Args:
        file_path: Path of the file handed to Tika (the caller passes a
            temporary ``.geot`` file).

    Returns:
        A dict containing whichever of ``Geographic_NAME``,
        ``Geographic_LATITUDE`` and ``Geographic_LONGITUDE`` appeared in the
        output, each mapped to the stripped text after the first ``:`` on its
        line. Keys that never appear are absent; the dict is empty when Tika
        printed none of them (including when the process failed — the exit
        status and stderr are captured but not inspected).
    """
    # Build argv as a list instead of formatting a string and re-splitting it:
    # a classpath or temp path containing spaces or quotes would otherwise be
    # split into the wrong arguments.
    args = [
        "java",
        "-classpath", classpath,
        "org.apache.tika.cli.TikaCLI",
        "-m", str(file_path),
    ]
    result = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Replace undecodable bytes rather than aborting a long batch on one bad byte.
    output = result.stdout.decode(errors="replace")

    wanted_keys = ('Geographic_NAME', 'Geographic_LATITUDE', 'Geographic_LONGITUDE')
    location_data = {}
    for line in output.splitlines():
        # First matching key wins for a line, in the order listed above.
        for key in wanted_keys:
            if f"{key}:" in line:
                location_data[key] = line.split(':', 1)[1].strip()
                break
    return location_data

# Process the sightings and write the results to a new TSV.
# For every input row: build a short text description from the location-related
# columns, write it to a temporary .geot file, run Tika on that file, and append
# the returned Geographic_* values to the row in the output TSV.
with open(sightings_file_path, mode='r', encoding='utf-8', newline='') as infile, \
     open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:

    reader = csv.DictReader(infile, delimiter='\t')
    # reader.fieldnames is None for an empty input file; fall back to no columns
    # so the header can still be written instead of raising TypeError.
    fieldnames = list(reader.fieldnames or []) + [
        'Geographic_NAME', 'Geographic_LATITUDE', 'Geographic_LONGITUDE'
    ]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()

    # Columns whose text is fed to the geoparser, in the order they are joined.
    description_columns = (
        'County', 'State', 'Location Details',
        'Headline', 'Nearest Town', 'Nearest Road',
    )

    for row in reader:
        # Combine relevant columns from the TSV into a string to write to a .geot file.
        # DictReader fills missing trailing cells with None; treat those as empty
        # so the literal text "None" is not sent to the geoparser.
        sighting_description = " ".join(row[column] or '' for column in description_columns)

        # Create a temporary .geot file with this sighting's description.
        # delete=False because Tika opens the file by name after we close it;
        # UTF-8 is explicit so non-ASCII text does not depend on the locale.
        temp_file = tempfile.NamedTemporaryFile(
            mode='w', suffix='.geot', delete=False, encoding='utf-8'
        )
        temp_file_path = temp_file.name
        try:
            with temp_file:
                temp_file.write(sighting_description)

            # Call the function to parse this .geot file
            location_data = parse_sighting_with_tika(temp_file_path)
        finally:
            # Clean up the temporary file even if writing or parsing failed
            os.remove(temp_file_path)

        # Merge the original row data with the new location data
        row.update(location_data)

        # Write the merged data to the new TSV
        writer.writerow(row)

print(f"Done parsing all sightings. Enhanced data written to {output_file_path}")


Done parsing all sightings. Enhanced data written to /Users/pardibedirian/Downloads/bfro_with_new_cols_enhanced.tsv
