# Master ingest workflow for new data

In [None]:
import os
import shutil
import geopandas as gpd
import pandas as pd
from getpass import getpass
import sys
import CSVtoISO19139
import ISO19139toGBLjson
from glob import glob
from zipfile import ZipFile
import requests
import json
import Utilities as utils

## Provide password and file locations

In [None]:
# Interactive configuration for the whole workflow: input locations, access
# rights, and the credentials / service names used by the later steps.
csvfile_metadata_file = utils.checkInput("Location of metadata CSV file on Sequoia:  ")
dataset_location = utils.checkInput("Location of dataset: ")

print(f"\nUsing metadata in file: {csvfile_metadata_file}")
print(f"Using dataset: {dataset_location}\n")

# access_rights is later used to build the archive directory and the iRODS
# path, and is passed to the PostGIS and metadata helpers, so anything other
# than the two supported values is rejected here instead of failing later.
# (assumes utils.checkInput returns the entered text as a str -- TODO confirm)
valid_access_rights = ("public", "restricted")
access_rights = utils.checkInput("provide access rights must be public or restricted: ", directory_exists=False)
while access_rights not in valid_access_rights:
    print(f"Invalid access rights '{access_rights}'. Must be one of: {', '.join(valid_access_rights)}")
    access_rights = utils.checkInput("provide access rights must be public or restricted: ", directory_exists=False)

# GeoServer / PostgreSQL targets and the local checkout of the metadata repository.
geoserver_postgis_store = "UA_Library_geospatialData"
geoserver_primary_workspace = "UniversityLibrary"
metadata_repository_loc = "../gitrepos/edu.uarizona/"

# Passwords are read with getpass so they are not echoed into the notebook output.
postgresql_pw = getpass("Provide password for PostgreSQL Database 'geo':")
geoserver_user = "OGPAdmin"
geoserver_pw = getpass(f"Provide password for GeoServer user {geoserver_user}:  ")

## 1. Build dataset MD (csv -> xml)

Converts a CSV file of metadata information into a metadata file in the ISO 19139 schema.

In [None]:
# Build the ISO 19139 XML record from the metadata CSV. The helper returns a
# mapping from which the dataset path ("dataset") and the generated XML path
# ("metadata") are read.
new_files = CSVtoISO19139.csvtoISO(csvfile_metadata_file, dataset_location)
renamed_ds, xml_file = new_files["dataset"], new_files["metadata"]

# Base name shared by later artifacts: the XML file name up to its first ".".
base_file_name = os.path.basename(xml_file).partition(".")[0]

## 2. Build Solr Metadata
Creates a new search metadata file in the GeoBlacklight schema and moves it into the metadata folder. The layer ID is hashed using the fnv32 algorithm, and the hash is then split up to create a folder structure (e.g. `UniversityLibrary:Arizona_CAPAqueduct_2002` -> `9ZQLzg7g5y` -> `9ZQ/Lzg/7g/5y/`).

The XML file is also copied to a new file (`iso19139.xml`) and placed in the same directory.


In [None]:
# Build the GeoBlacklight (Solr) record from the ISO 19139 XML. The returned
# value is kept because the final step wraps it in the Solr "add" payload.
# The workspace comes from geoserver_primary_workspace so the metadata and the
# GeoServer publishing step cannot drift apart.
# NOTE(review): tosolr is passed as the *string* "False", which is truthy in
# Python -- confirm that isoToGBL compares against a string rather than
# testing truthiness.
geoblacklightMD_dict = ISO19139toGBLjson.isoToGBL(
    metadata_repository_loc,
    xml_file,
    renamed_ds,
    rights=access_rights,
    institution="UArizona",
    geoserver_workspace=geoserver_primary_workspace,
    tosolr="False",
)

## 3. Ingest data to PostGIS DB

For preview and subset download services from the geoportal, all vector data is ingested into a PostGIS database. New tables are placed in a schema named after their access permissions ('public' or 'restricted'). Raster files are not ingested, but read out of sequoia storage directly.

In [None]:
# Determine the dataset's EPSG code, ingesting vector data into PostGIS on the way.
# Anything that is not a GeoTIFF or NetCDF file is treated as vector data and
# sent to PostGIS; the helper's return value is used as the EPSG code.
if not renamed_ds.endswith((".tif", ".nc")):
    epsg_code = utils.sendFileToPostGIS(renamed_ds, postgresql_pw, access_rights)
else:
    # `rio` was used here without ever being imported, so the raster branch
    # failed with a NameError. Imported locally because only this branch needs it.
    # NOTE(review): assumes `rio` is the conventional alias for rasterio -- confirm.
    import rasterio as rio

    # Rasters are not ingested; only their CRS is read from the file.
    with rio.open(renamed_ds) as raster:
        crs = raster.crs.to_string()  # e.g. "+init=epsg:26912"
        # Keep whatever follows the last ":" as the EPSG code.
        epsg_code = crs.split(":")[-1]
    

## 4. Package Metadata (XML) & Data in ZIP
The new XML file and the dataset are packaged together in a zip file.

In [None]:
# Collect the files that make up the dataset and zip them with the XML metadata.
if renamed_ds.endswith(".shp"):
    # A shapefile is several sibling files (prj, dbf, sbx, etc.) sharing one
    # stem, so gather every file in the same directory that starts with "<stem>.".
    shapefile_parent_dir = os.path.dirname(os.path.abspath(renamed_ds))
    shapefile_fullname = os.path.basename(renamed_ds)  # filename without path
    # splitext removes only the final ".shp"; splitting on the first "." would
    # truncate names that contain dots and make the glob match unrelated files.
    shapefile_name = os.path.splitext(shapefile_fullname)[0]
    search_files = os.path.join(shapefile_parent_dir, shapefile_name) + ".*"
    datafiles = glob(search_files)
elif renamed_ds.endswith((".gpkg", ".tif", ".nc")):
    # Single-file formats are packaged as-is.
    datafiles = [renamed_ds]
else:
    # Carry the explanation in the exception instead of printing it and
    # raising a bare ValueError.
    raise ValueError(
        f"Unknown datasets type for file {renamed_ds}. Must be shapefile, geopackage, netcdf, or geotiff."
    )

# Write the data files and the XML metadata file to a new zip file in the
# current working directory, stored flat (no directory structure).
zip_file_name = base_file_name + ".zip"
with ZipFile(zip_file_name, 'w') as myzip:
    for file in datafiles:
        myzip.write(file, os.path.basename(file))
    myzip.write(xml_file, os.path.basename(xml_file))

## 6. Move to archival destination on sequoia
The zip file is placed on sequoia in a directory corresponding to the originator's name and the file name. If desired, a customized directory can be specified.

In [None]:
# Work out the archive directory for the zip file, let the operator confirm or
# override it, then move the zip there.
sequoia_loc = os.path.join("/sequoia/UAL_Vault/GeoArchive", access_rights)
originator_abbreviation = utils.checkInput("Provide dataset originator abbreviation with no spaces (e.g. USGS or UA_Libraries):",  directory_exists=False)
sequoia_loc = os.path.join(sequoia_loc, originator_abbreviation)

# The default sub-directory is built from the first three "_"-separated parts
# of the base file name, in the order second / first / third.
sub_directories = base_file_name.split("_")
if len(sub_directories) < 3:
    # Fail with an explanation rather than an opaque IndexError.
    raise ValueError(
        f"Cannot derive an archive directory from '{base_file_name}': "
        "expected at least three '_'-separated parts in the file name."
    )
subdirectory = os.path.join(sub_directories[1], sub_directories[0], sub_directories[2])
out_path = os.path.join(sequoia_loc, subdirectory)

# Ask until the operator accepts the directory or types "exit"; any other
# answer is treated as "no" and a replacement directory is requested.
while True:
    confirm = utils.checkInput(f"Archive directory for {zip_file_name} is set to: {out_path}\n Is this okay (Y or N or exit)?", directory_exists=False)
    if confirm in ("Y", "YES", "y", "yes", "exit"):
        break
    # The original prompt contained an unfilled "{}" placeholder; name the file.
    out_path = utils.checkInput(f"Specify correct archive directory for file {zip_file_name}: ", directory_exists=False)

if confirm == "exit":
    raise ValueError(f"Archiving of {zip_file_name} cancelled by user.")

os.makedirs(out_path, exist_ok=True)
zip_file_opath = os.path.join(out_path, zip_file_name)

print(f"Moving zip archive {zip_file_name} to directory {zip_file_opath} for storage")
shutil.move(zip_file_name, zip_file_opath)

## 7. Register file in iRods 
Now that the file is archived on sequoia, register the file location to iRods.

In [None]:
# Register the archived zip in iRODS, but only when the dataset location is
# not a directory (i.e. a single vector or raster dataset). Directory-based
# datasets such as image pyramids (e.g. NAIP data) must be registered manually.
if not os.path.isdir(dataset_location):
    import subprocess

    irods_virtualized_location = f"/UAL_dataZone/geospatial/{access_rights}/single_layer_datasets/" + zip_file_name
    # Pass the arguments as a list so paths containing spaces or shell
    # metacharacters reach ireg unchanged (no shell is involved).
    ireg_args = ["ireg", "-V", zip_file_opath, irods_virtualized_location]
    ireg_command = " ".join(ireg_args)
    print("Registering zip archive in irods")
    print(ireg_command)
    try:
        ireg_result = subprocess.run(ireg_args)
    except OSError as err:
        # e.g. the ireg executable is not installed or not on PATH.
        print(f"WARNING: could not run ireg ({err}); the archive was NOT registered in iRODS.")
    else:
        # Registration stays best-effort, but a failure is no longer silent.
        if ireg_result.returncode != 0:
            print(f"WARNING: ireg exited with status {ireg_result.returncode}; the archive may not be registered in iRODS.")

## 8. Publish Layer in GeoServer
Publish the layer in GeoServer through the GeoServer REST API. If it is a vector dataset, the layer is exposed through an existing PostGIS data store. If it is a raster, we'll create the store for the tif file and publish the layer from that store. Raster datasets should have their data retiled (internally) for faster reading and [COG optimization](https://github.com/cogeotiff/cog-spec/blob/master/spec.md). For single GeoTIFFs less than 40GB in size, generate internal overviews. Larger tiffs, or sets of files (e.g. NAIP imagery), should be served as an image pyramid, which will need to be optimized outside of this workflow.

In [None]:
# Publish the new layer through the GeoServer helpers in utils, choosing the
# route by the dataset's file extension.
auth = (geoserver_user, geoserver_pw)
headers = {'Content-Type': 'text/xml'}

if renamed_ds.endswith(".tif"):
    # GeoTIFF: first create a store pointing at the archived data, then
    # publish the layer from that store.
    # On the server geo, sequoia is mounted from the GeoArchive folder as
    # /sequoia, so the archive prefix is rewritten to match.
    # NOTE(review): the location is derived from zip_file_opath, i.e. the path
    # of the zip archive -- confirm this is what the GeoTIFF store expects.
    geo_relative_location = zip_file_opath.replace("/sequoia/UAL_Vault/GeoArchive", "/sequoia")
    utils.createGeoTiffDataStore(base_file_name, geoserver_primary_workspace, geo_relative_location, auth, headers)
    utils.publishTiffLayer(base_file_name, geoserver_primary_workspace, epsg_code, auth, headers)
elif renamed_ds.endswith((".shp", ".gpkg")):
    # Vector data: publish through the existing PostGIS store.
    utils.postVectorLayer(base_file_name, epsg_code, geoserver_postgis_store, geoserver_primary_workspace, auth, headers)
else:
    # Nothing is published for any other type; only a notice is printed.
    print("Unknown dataset store type. NetCDF?")
        


## 9. Push to github repository (unfinished)

Search and dataset metadata that were built in step 2 were placed in the metadata repository which is also a github repo on OpenGeoMetadata. Commit the changes and push the new files and changes to layers.json. This section of code hasn't been tested yet.

In [None]:
# Commit and push the new metadata files from inside the metadata repository.
# Each git command is run with an argument list (no shell string building) and
# its exit status is reported; like the original, a failing step does not stop
# the following steps.
import subprocess

# NOTE: this changes the notebook's working directory for all later cells.
os.chdir(metadata_repository_loc)

commit_msg = "Added metadata files for layer {}".format(base_file_name)
git_steps = [
    ("Adding all changed files...", ["git", "add", "."]),
    ("Commiting changes to github repository...", ["git", "commit", "-m", commit_msg]),
    ("Pushing changes to https://github.com/OpenGeoMetadata/edu.uarizona...", ["git", "push", "origin", "master"]),
]

for step_message, git_args in git_steps:
    print(step_message)
    try:
        step_result = subprocess.run(git_args)
    except OSError as err:
        # git itself could not be started; the remaining steps cannot succeed.
        print(f"WARNING: could not run {' '.join(git_args)} ({err}); skipping remaining git steps.")
        break
    if step_result.returncode != 0:
        # `git commit` also exits non-zero when there is nothing to commit.
        print(f"WARNING: '{' '.join(git_args)}' exited with status {step_result.returncode}.")

## 10. Ingest to Solr

Now that all data is ingested into the archive and exposed through GeoServer and iRods, ingest the metadata file into Solr to make it discoverable in the portal.

In [None]:
# Push the GeoBlacklight record built earlier to Solr and commit it immediately.
solr_loc = "http://geo.library.arizona.edu:8983/solr/UAL_GeospatialRecords"
solrURL = solr_loc + "/update?commit=true"
# Wrap the record in the {"add": {"doc": ...}} structure sent to the update endpoint.
solrDict = {"add": {"doc": geoblacklightMD_dict}}
# Serialize the python dictionary to a JSON string.
solrString = json.dumps(solrDict, indent=4, sort_keys=False)

# NOTE: this rebinds the notebook-level `headers` used by the GeoServer step.
headers = {"content-type": "application/json"}
print("Pushing record to Solr at {} ...".format(solrURL))
# A (connect, read) timeout keeps the cell from hanging forever if Solr is unreachable.
r = requests.post(solrURL, data=solrString, headers=headers, timeout=(10, 120))

# Report the outcome; previously a rejected record went unnoticed.
if r.ok:
    print(f"Solr accepted the record (HTTP {r.status_code}).")
else:
    print(f"WARNING: Solr rejected the record (HTTP {r.status_code}):\n{r.text}")

Register Data to CrossRef (Ask Jeff)
in XML file that gets generated
get filepath from SDE Dataset URI (at the bottom)

[https://doi.crossref.org/servlet/useragent](https://doi.crossref.org/servlet/useragent)