This notebook lays out and runs the process for loading USNVC source data into a MongoDB store.

In [1]:
import os,requests,uuid
import pandas as pd
from IPython.display import display
from bis2 import dd

For now, I dumped the files here into the working code repo to deal with them locally. Eventually, we will want to put these into their own source code repo or somehow deal with the version control dynamic on the files. I output the file list here to see what all I'm dealing with.

In [2]:
# Source folders, relative to the notebook's working directory: the USNVC
# export itself and the separately kept definition (code lookup) tables.
usnvcExportFolder = "USNVC v2.02 export 2018-03"
usnvcDefinitionTablesFolder = "usnvc_definition_tables"

# List the contents of each folder so the available tables are visible up front.
display(os.listdir(usnvcExportFolder))
display(os.listdir(usnvcDefinitionTablesFolder))

['unitObsoleteParent.txt',
 'reference.txt',
 'unit.txt',
 'UnitXReference.txt',
 'unitObsoleteName.txt',
 'UnitXEcoregionUsfs2007.txt',
 'UnitXSubnation.txt',
 'unitPredecessor.txt',
 'd_usfs_ecoregion1994.txt',
 'unitDescription.txt',
 'd_dist_confidence.txt',
 'd_occurrence_status.txt',
 'UnitXSimilarUnit.txt',
 'd_curr_presence_absence.txt',
 'UnitXEcoregionUsfs1994.txt',
 'd_usfs_ecoregion_level.txt',
 'd_usfs_ecoregion2007.txt']

['d_spatial_pattern.txt',
 'd_classif_confidence.txt',
 'd_classification_level.txt',
 'd_subnation.txt']

# Unit Attributes, Hierarchy, and Descriptions
The following code block merges the unit and unit description tables into one dataframe that serves as the core data for processing.

In [3]:
# Read the unit table, its descriptions, and the classification confidence
# lookup. The unit identifiers are forced to strings on the way in so pandas
# does not treat them as numbers.
units = pd.read_csv(
    usnvcExportFolder+"/unit.txt",
    sep='\t',
    encoding="ISO-8859-1",
    dtype={"element_global_id": str, "parent_id": str, "classif_confidence_id": int},
)
unitDescriptions = pd.read_csv(
    usnvcExportFolder+"/unitDescription.txt",
    sep='\t',
    encoding="ISO-8859-1",
    dtype={"element_global_id": str},
)
codes_classificationConfidence = pd.read_csv(
    usnvcDefinitionTablesFolder+"/d_classif_confidence.txt",
    sep='\t',
    encoding="ISO-8859-1",
)

# Attach the descriptions to the units, then decode the confidence id through
# the lookup table. Both joins use pandas' default join type.
nvcsUnits = (
    units
    .merge(unitDescriptions, on='element_global_id')
    .merge(codes_classificationConfidence, left_on='classif_confidence_id', right_on='D_CLASSIF_CONFIDENCE_ID')
)
print(nvcsUnits.dtypes)

# The unmerged unit tables are no longer needed once nvcsUnits exists.
del units, unitDescriptions

element_global_id             object
parent_id                     object
d_classification_level_id      int64
elementuid                   float64
classificationcode            object
databasecode                  object
status                        object
colloquialname                object
scientificname                object
formattedscientificname       object
translatedname                object
hierarchylevel                object
unitsort                      object
usstatus                      object
typeconceptsentence           object
parentkey                     object
parentname                    object
typeconcept                   object
diagnosticcharacteristics     object
rationale                     object
classificationcomments        object
similarnvctypescomments       object
physiognomy                   object
floristics                    object
plotcount                    float64
dynamics                      object
environment                   object
r

# Unit References
The following dataframes assemble the unit by unit references into a merged dataframe for later query and processing when building the unit documents.

In [11]:
# Read the unit-to-reference link table and the reference (citation) table,
# keeping both identifier columns as strings.
unitByReference = pd.read_csv(
    usnvcExportFolder+"/UnitXReference.txt",
    sep='\t',
    encoding="ISO-8859-1",
    dtype={"element_global_id": str, "reference_id": str},
)
references = pd.read_csv(
    usnvcExportFolder+"/reference.txt",
    sep='\t',
    encoding="ISO-8859-1",
    dtype={"reference_id": str},
)

# One row per (unit, reference) pair, carrying the citation text with it.
unitReferences = unitByReference.merge(references, on='reference_id')

print(unitReferences.dtypes)

# Only the merged frame is used from here on.
del unitByReference, references

element_global_id    object
reference_id         object
shortcitation        object
fullcitation         object
dtype: object


# Unit Predecessors
The following codeblock retrieves the unit predecessors for processing.

In [5]:
# Read the unit predecessor table, keeping both identifier columns as strings.
# The file name is spelled exactly as it appears in the export folder listing
# ("unitPredecessor.txt", lower-case "u") so the read also works on
# case-sensitive filesystems.
unitPredecessors = pd.read_csv(usnvcExportFolder+"/unitPredecessor.txt", sep='\t', encoding = "ISO-8859-1", dtype={"element_global_id":str,"predecessor_id":str})

print(unitPredecessors.dtypes)

element_global_id            object
predecessor_id               object
predecessorcode              object
predecessorname              object
predecessorsciname           object
predecessorcolloquialname    object
lineagedate                  object
lineagenote                  object
lineageauthorizedby          object
dtype: object


# Obsolete records
The following codeblock retrieves the two tables that contain references to obsolete units or names. We may want to examine this in future versions to move from simply capturing notes about obsolescence to keeping track of what is actually changing. Alternatively, we can keep with a whole dataset versioning construct if that works better for the community, but as soon as we start minting individual DOIs for the units, making them citable, that changes the dynamic in how we manage the data moving forward.

In [6]:
# Read the two obsolescence tables, keeping the unit identifier as a string.
# Both file names are spelled exactly as they appear in the export folder
# listing (lower-case leading "u") so the reads also work on case-sensitive
# filesystems.
obsoleteUnits = pd.read_csv(usnvcExportFolder+"/unitObsoleteName.txt", sep='\t', encoding = "ISO-8859-1", dtype={"element_global_id":str})
print (obsoleteUnits.dtypes)

obsoleteParents = pd.read_csv(usnvcExportFolder+"/unitObsoleteParent.txt", sep='\t', encoding = "ISO-8859-1", dtype={"element_global_id":str})
print (obsoleteParents.dtypes)


element_global_id    object
obsoletename         object
obsoletenote         object
obsoletedate         object
obsoleteauthority    object
dtype: object
element_global_id     object
obsoleteparentcode    object
obsoletedivision      object
obsoleteparentname    object
obsoletenote          object
obsoletedate          object
obsoleteauthority     object
dtype: object


# Unit Distribution - Nations and Subnations
The following codeblock assembles the four tables that make up all the code references for the unit by unit distribution at the national level and then in North American states and provinces. I played around with adding a little bit of value to the nations structure by looking up names and setting up objects that contain name, abbreviation, uncertainty (true/false), and an info API reference. But I also kept the original raw string/list of national abbreviations. That process would be a lot smarter if I did it here by pulling together a distinct list of all referenced nation codes/abbreviations and then building a lookup dataframe on those. I'll revisit at some point or if the code bogs down, but the REST API call is pretty quick.

In [7]:
# Read the unit-by-subnation link table and its three code tables. Every
# column is read as a string so the id columns join without type surprises.
unitXSubnation = pd.read_csv(
    usnvcExportFolder+"/UnitXSubnation.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)
codes_CurrentPresAbs = pd.read_csv(
    usnvcExportFolder+"/d_curr_presence_absence.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)
codes_DistConfidence = pd.read_csv(
    usnvcExportFolder+"/d_dist_confidence.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)
codes_Subnations = pd.read_csv(
    usnvcDefinitionTablesFolder+"/d_subnation.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)

# Decode presence/absence, distribution confidence, and subnation in turn.
nvcsDistribution = (
    unitXSubnation
    .merge(codes_CurrentPresAbs, left_on='d_curr_presence_absence_id', right_on='D_CURR_PRESENCE_ABSENCE_ID')
    .merge(codes_DistConfidence, left_on='d_dist_confidence_id', right_on='D_DIST_CONFIDENCE_ID')
    .merge(codes_Subnations, on='subnation_id')
)

# .size is the total number of cells (rows x columns), not the row count.
print(nvcsDistribution.dtypes, nvcsDistribution.size)

# Only the merged frame is used from here on.
del unitXSubnation, codes_CurrentPresAbs, codes_DistConfidence, codes_Subnations

element_global_id             object
subnation_id                  object
d_curr_presence_absence_id    object
d_dist_confidence_id          object
D_CURR_PRESENCE_ABSENCE_ID    object
CURR_PRESENCE_ABSENCE_DESC    object
CURR_PRESENCE_ABSENCE_CD      object
D_DIST_CONFIDENCE_ID          object
DIST_CONFIDENCE_CD            object
DIST_CONFIDENCE_DESC          object
iso_nation_cd                 object
subnation_code                object
subnation_name                object
dtype: object 427336


# USFS Ecoregions
There is a coded list of USFS Ecoregion information in the unit descriptions, but this would have to be parsed and referenced out anyway and the base information seems to come through a "unitX..." set of tables. This codeblock sets those data up for processing.

In [8]:
# Read the unit-by-ecoregion link tables and ecoregion code tables for both
# the 1994 and 2007 USFS ecoregion sets, plus the occurrence status codes
# they share. Every column is read as a string.
unitXUSFSEcoregion1994 = pd.read_csv(
    usnvcExportFolder+"/UnitXEcoregionUsfs1994.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)
codes_USFSEcoregions1994 = pd.read_csv(
    usnvcExportFolder+"/d_usfs_ecoregion1994.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)

unitXUSFSEcoregion2007 = pd.read_csv(
    usnvcExportFolder+"/UnitXEcoregionUsfs2007.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)
codes_USFSEcoregions2007 = pd.read_csv(
    usnvcExportFolder+"/d_usfs_ecoregion2007.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)

codes_OccurrenceStatus = pd.read_csv(
    usnvcExportFolder+"/d_occurrence_status.txt",
    sep='\t', encoding="ISO-8859-1", dtype=str,
)

# 1994 set: the code table spells its key column in upper case.
usfsEcoregionDistribution1994 = (
    unitXUSFSEcoregion1994
    .merge(codes_USFSEcoregions1994, left_on='usfs_ecoregion_id', right_on='USFS_ECOREGION_ID')
    .merge(codes_OccurrenceStatus, left_on='d_occurrence_status_id', right_on='D_OCCURRENCE_STATUS_ID')
)

# 2007 set: the key column has the same name on both sides.
usfsEcoregionDistribution2007 = (
    unitXUSFSEcoregion2007
    .merge(codes_USFSEcoregions2007, on='usfs_ecoregion_2007_id')
    .merge(codes_OccurrenceStatus, left_on='d_occurrence_status_id', right_on='D_OCCURRENCE_STATUS_ID')
)

print(usfsEcoregionDistribution1994.dtypes)
print("----------")
print(usfsEcoregionDistribution2007.dtypes)

# Only the two merged frames are used from here on.
del unitXUSFSEcoregion1994, codes_USFSEcoregions1994
del unitXUSFSEcoregion2007, codes_USFSEcoregions2007
del codes_OccurrenceStatus

element_global_id            object
usfs_ecoregion_id            object
d_occurrence_status_id       object
USFS_ECOREGION_ID            object
PARENT_USFS_ECOREGION_ID     object
D_USFS_ECOREGION_LEVEL_ID    object
USFS_ECOREGION_NAME          object
USFS_ECOREGION_CLASS_CD      object
USFS_ECOREGION_CONCAT_CD     object
D_OCCURRENCE_STATUS_ID       object
OCCURRENCE_STATUS_CD         object
OCCURRENCE_STATUS_DESC       object
dtype: object
----------
element_global_id                object
usfs_ecoregion_2007_id           object
d_occurrence_status_id           object
parent_usfs_ecoregion_2007_id    object
d_usfs_ecoregion_level_id        object
usfs_ecoregion_2007_name         object
usfs_ecoregion_2007_concat_cd    object
D_OCCURRENCE_STATUS_ID           object
OCCURRENCE_STATUS_CD             object
OCCURRENCE_STATUS_DESC           object
dtype: object


I'm concluding (for the moment) that we should use element_global_id as a unique and persistent opaque identifier for a given unit in this data system over time. We still need to decide if we will adopt this as the core identifier for our purposes or assign our own for absolute certainty since we don't really know what all rules apply in the NatureServe Biotics system but suspect some level of squishiness given the multiple identifiers that seem to be in their data now. The "elementuid" is just a "2." with the element_global_id - don't know what that means, so maybe we can ignore it unless someone tells us something useful.

Both databasecode and classificationcode (don't understand the significance of the attribute names) have some part to play in establishing display titles for the units in the application. Certain databasecode values also appear in the one posted proceedings document from the ESA panel, where they seem to be used as meaningful shorthand (kind of a PITA, but as long as it's consistent...). It appears that classificationcode is prefixed on Class, Subclass, Formation, and Division names and databasecode is prefixed on Macrogroup, Group, Alliance, and Association units to produce working full titles.

A different set of rules seems to apply for the Cultural units where the current app seems to be conflicting in terms of what is shown at a unit document level and what shows up in search results. Argh! We'll see if we can get a consistent ruleset nailed down that relates the identifiers to title and shorthand conventions so we better know how to package and present.

parent_id appears to be a reference to a parent element_global_id, establishing the hierarchy, though we need to be careful about number handling with Pandas so I explicitly made both those IDs strings on the way in.

Given what we've discovered so far, we probably have enough information to start laying out the unit data structure we want. Assuming we might make one overall pass at establishing initial records, at least, I build this into a list of documents. A few conventions I'm playing with here:

* Assign more human-friendly attribute names to the things that we will display to people, but retain a few of the "ugly names" for things that have special meaning in the data assembly process
* Build logical buckets of information into sub-documents, but don't go too crazy in deeply nesting the structure (at least for now as we're working with better understanding the data)

There appears to be some messiness in the values, so the cleanString function can be tweaked as we work through to fix problems.

In [24]:
def cleanString(text):
    """Decode the HTML entities &amp;, &lt; and &gt; found in the source text.

    Parameters
    ----------
    text : str or any
        The cell value to clean. Values that are not strings (for example the
        float NaN pandas uses for an empty cell, or None) are returned
        unchanged instead of raising.

    Returns
    -------
    The decoded string, or the original value when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # "&amp;" is decoded last so an escaped entity such as "&amp;lt;" comes out
    # as the literal text "&lt;" instead of being decoded a second time to "<".
    for entity, character in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        text = text.replace(entity, character)
    return text

def getHierarchyFromDF(element_global_id, unitsDF=None):
    """Build the cached hierarchy (ancestors, the unit itself, children) for a unit.

    Parameters
    ----------
    element_global_id : str or int
        Identifier of the unit; it is compared as a string.
    unitsDF : pandas.DataFrame, optional
        Units table to search. Defaults to the notebook-level ``nvcsUnits``
        dataframe, which must already exist in memory in that case.

    Returns
    -------
    list of dict
        One record per unit, in this order: ancestors from the immediate
        parent upward, then the unit itself, then its immediate children.
        Each record carries the hierarchy columns plus a "Display Title".
        An unknown element_global_id yields an empty list.

    The ancestor walk stops quietly when a parent_id points at a unit that is
    not in the table, or when a parent chain loops back on itself, so one bad
    reference cannot abort or hang the whole load.
    """
    if unitsDF is None:
        unitsDF = nvcsUnits

    hierarchyColumns = ["element_global_id","parent_id","hierarchylevel","classificationcode","databasecode","translatedname","colloquialname","unitsort","DISPLAY_ORDER"]

    def selectRecords(column, value):
        # Rows whose `column` equals the given id, reduced to the hierarchy columns.
        return unitsDF.loc[unitsDF[column] == str(value), hierarchyColumns].to_dict("records")

    thisUnitRecords = selectRecords("element_global_id", element_global_id)
    if not thisUnitRecords:
        return []

    childRecords = selectRecords("parent_id", element_global_id)

    # A missing parent_id arrives as a non-string (NaN/None), which ends the walk.
    ancestorRecords = []
    visitedIDs = {str(element_global_id)}
    parentID = thisUnitRecords[0]["parent_id"]
    while isinstance(parentID, str) and parentID not in visitedIDs:
        visitedIDs.add(parentID)
        parentRecords = selectRecords("element_global_id", parentID)
        if not parentRecords:
            break
        ancestorRecords += parentRecords
        parentID = parentRecords[0]["parent_id"]

    hierarchyList = ancestorRecords + thisUnitRecords + childRecords
    for record in hierarchyList:
        level = record["hierarchylevel"]
        # NOTE(review): the notebook prose says Macrogroup and Group titles use
        # databasecode, while this rule uses classificationcode - confirm which
        # is intended.
        if level in ("Class","Subclass","Formation","Division"):
            titleParts = (record["classificationcode"], record["colloquialname"], level)
        elif level in ("Macrogroup","Group"):
            titleParts = (record["classificationcode"], record["translatedname"])
        else:
            titleParts = (record["databasecode"], record["translatedname"])
        # Missing (non-string) parts are left out rather than raising on concatenation.
        record["Display Title"] = " ".join(part for part in titleParts if isinstance(part, str))

    return hierarchyList

In [29]:
# Build one MongoDB document per row of nvcsUnits. Each document groups the
# unit's attributes into named sections and pulls in the related distribution,
# ecoregion, predecessor, obsolescence, and reference rows for that unit.

def _copyTextFields(source, target, fieldMap, clean=False):
    """Copy string-valued columns of `source` into `target` under display labels.

    `fieldMap` is a sequence of (label, column) pairs. Empty cells come through
    pandas as non-string values (float NaN), so anything that is not a str is
    skipped and its label is left out of the document. With clean=True the
    text is passed through cleanString first.
    """
    for label, column in fieldMap:
        value = source[column]
        if isinstance(value, str):
            target[label] = cleanString(value) if clean else value


def _copyPresentValues(source, target, fieldMap):
    """Copy every non-missing column of `source` into `target`.

    The earlier `type(value) is int` test could never match a float64 column
    such as plotcount (float because it has gaps), so those values were never
    stored. Any non-missing value is accepted here, and whole-number floats
    are stored as ints.
    """
    for label, column in fieldMap:
        value = source[column]
        if pd.notnull(value):
            if isinstance(value, float) and value.is_integer():
                value = int(value)
            target[label] = value


def _joinTitle(*parts):
    """Join the string parts of a display title with spaces, skipping missing parts."""
    return " ".join(part for part in parts if isinstance(part, str))


def _recordsByUnit(frame):
    """Index a related table once: element_global_id -> list of row records."""
    return {unitID: group.to_dict("records") for unitID, group in frame.groupby("element_global_id", sort=False)}


# Nation names are looked up once per distinct URL for the whole run instead
# of once per nation per unit. Failed lookups are remembered as None so an
# unreachable service is not retried for every unit.
_nationNameCache = {}

def _lookupNationName(infoURL):
    """Return the nation name reported by the country info API, or None."""
    if infoURL not in _nationNameCache:
        try:
            payload = requests.get(infoURL+"?fields=name", timeout=30).json()
        except (requests.exceptions.RequestException, ValueError):
            # Network failure or a body that is not JSON: the name is optional.
            payload = None
        _nationNameCache[infoURL] = payload.get("name") if isinstance(payload, dict) else None
    return _nationNameCache[infoURL]


# Group each related table by unit up front so the loop does a dictionary
# lookup per unit rather than scanning every table for every unit.
distributionByUnit = _recordsByUnit(nvcsDistribution)
usfs1994ByUnit = _recordsByUnit(usfsEcoregionDistribution1994)
usfs2007ByUnit = _recordsByUnit(usfsEcoregionDistribution2007)
predecessorsByUnit = _recordsByUnit(unitPredecessors)
obsoleteUnitsByUnit = _recordsByUnit(obsoleteUnits)
obsoleteParentsByUnit = _recordsByUnit(obsoleteParents)
referencesByUnit = _recordsByUnit(unitReferences)

nvcsUnitDocs = []
for _, row in nvcsUnits.iterrows():
    unitID = row["element_global_id"]
    unitDoc = {"Identifiers":{},"Overview":{},"Hierarchy":{},"Vegetation":{},"Environment":{},"Distribution":{},"Plot Sampling and Analysis":{},"Confidence Level":{},"Conservation Status":{},"Concept History":{},"Synonymy":{},"Authorship":{},"References":[]}

    # A fresh random document id is minted on every run.
    unitDoc["_id"] = str(uuid.uuid4())

    unitDoc["Identifiers"]["element_global_id"] = unitID
    unitDoc["Identifiers"]["Database Code"] = row["databasecode"]
    unitDoc["Identifiers"]["Classification Code"] = row["classificationcode"]

    overview = unitDoc["Overview"]
    overview["Scientific Name"] = row["scientificname"]
    _copyTextFields(row, overview, [("Formatted Scientific Name","formattedscientificname")], clean=True)
    overview["Translated Name"] = row["translatedname"]
    _copyTextFields(row, overview, [("Colloquial Name","colloquialname")])
    # NOTE(review): "Nonimal" looks like a typo for "Nominal"; the key is kept
    # unchanged because anything already reading the collection may rely on it.
    _copyTextFields(row, overview, [
        ("Type Concept Sentence","typeconceptsentence"),
        ("Type Concept","typeconcept"),
        ("Diagnostic Characteristics","diagnosticcharacteristics"),
        ("Rationale for Nonimal Species or Physiognomic Features","rationale"),
        ("Classification Comments","classificationcomments"),
        ("Similar NVC Types","similarnvctypescomments"),
        ("Other Comments","othercomments"),
    ], clean=True)

    # Same title rules as the cached hierarchy records.
    if row["hierarchylevel"] in ["Class","Subclass","Formation","Division"]:
        overview["Display Title"] = _joinTitle(row["classificationcode"], row["colloquialname"], row["hierarchylevel"])
    elif row["hierarchylevel"] in ["Macrogroup","Group"]:
        overview["Display Title"] = _joinTitle(row["classificationcode"], row["translatedname"])
    else:
        overview["Display Title"] = _joinTitle(row["databasecode"], row["translatedname"])

    _copyTextFields(row, unitDoc["Vegetation"], [
        ("Physiognomy and Structure","physiognomy"),
        ("Floristics","floristics"),
        ("Dynamics","dynamics"),
    ], clean=True)

    _copyTextFields(row, unitDoc["Environment"], [
        ("Environmental Description","environment"),
        ("Spatial Pattern","spatialpattern"),
    ], clean=True)

    distribution = unitDoc["Distribution"]
    _copyTextFields(row, distribution, [("Geographic Range","range")])

    if isinstance(row["nations"], str):
        nationInfo = []
        for rawNation in row["nations"].split(","):
            rawNation = rawNation.strip()
            abbreviation = rawNation.replace("?","").strip()
            if not abbreviation:
                # Nothing to look up (e.g. a stray comma in the raw list).
                continue
            # A trailing "?" in the raw list marks an uncertain occurrence.
            thisNation = {"Abbreviation":abbreviation,"Uncertainty":rawNation.endswith("?")}
            # NOTE(review): confirm this endpoint is still live.
            thisNation["Info API"] = "https://restcountries.eu/rest/v2/alpha/"+abbreviation
            nationName = _lookupNationName(thisNation["Info API"])
            if nationName is not None:
                thisNation["Name"] = nationName
            nationInfo.append(thisNation)
        distribution["Nations"] = {"Raw List":row["nations"],"Nation Info":nationInfo}

    if isinstance(row["subnations"], str):
        distribution["Subnations"] = {"Raw List":row["subnations"]}

    if unitID in distributionByUnit:
        distribution["States/Provinces Raw Data"] = distributionByUnit[unitID]
    if unitID in usfs1994ByUnit:
        distribution["1994 USFS Ecoregion Raw Data"] = usfs1994ByUnit[unitID]
    if unitID in usfs2007ByUnit:
        distribution["2007 USFS Ecoregion Raw Data"] = usfs2007ByUnit[unitID]

    _copyPresentValues(row, distribution, [
        ("TNC Ecoregions","tncecoregions"),
        ("Omernik Ecoregions","omernikecoregions"),
        ("Federal Lands","federallands"),
    ])

    plots = unitDoc["Plot Sampling and Analysis"]
    _copyPresentValues(row, plots, [("Plot Count","plotcount")])
    _copyTextFields(row, plots, [
        ("Plot Summary","plotsummary"),
        ("Plot Type","plottypal"),
        ("Plot Archive","plotarchived"),
        ("Plot Consistency","plotconsistency"),
        ("Plot Size","plotsize"),
        ("Plot Methods","plotmethods"),
    ])

    unitDoc["Confidence Level"]["Confidence Level"] = row["CLASSIF_CONFIDENCE_DESC"]
    _copyTextFields(row, unitDoc["Confidence Level"], [("Confidence Level Comments","confidencecomments")], clean=True)

    _copyTextFields(row, unitDoc["Conservation Status"], [
        ("Global Rank","grank"),
        ("Global Rank Review Date","grankreviewdate"),
        ("Global Rank Author","grankauthor"),
        ("Global Rank Reasons","grankreasons"),
    ])

    # NOTE(review): str() turns a missing parent_id into the text "nan" for
    # top-level units; kept as-is pending a decision on how consumers read it.
    unitDoc["Hierarchy"]["parent_id"] = str(row["parent_id"])
    unitDoc["Hierarchy"]["hierarchylevel"] = row["hierarchylevel"]
    unitDoc["Hierarchy"]["d_classification_level_id"] = row["d_classification_level_id"]
    unitDoc["Hierarchy"]["unitsort"] = row["unitsort"]
    unitDoc["Hierarchy"]["parentkey"] = row["parentkey"]
    unitDoc["Hierarchy"]["parentname"] = row["parentname"]
    unitDoc["Hierarchy"]["Cached Hierarchy"] = getHierarchyFromDF(unitID)

    history = unitDoc["Concept History"]
    _copyTextFields(row, history, [("Concept Lineage","lineage")])
    if unitID in predecessorsByUnit:
        history["Predecessors Raw Data"] = predecessorsByUnit[unitID]
    if unitID in obsoleteUnitsByUnit:
        history["Obsolete Units Raw Data"] = obsoleteUnitsByUnit[unitID]
    if unitID in obsoleteParentsByUnit:
        history["Obsolete Parents Raw Data"] = obsoleteParentsByUnit[unitID]

    _copyTextFields(row, unitDoc["Synonymy"], [("Synonymy","synonymy")])

    _copyTextFields(row, unitDoc["Authorship"], [
        ("Concept Author","primaryconceptsource"),
        ("Description Author","descriptionauthor"),
        ("Acknowledgements","acknowledgements"),
        ("Version Date","versiondate"),
    ])

    # The reference rows get their own loop variable so the unit row is not rebound.
    unitDoc["References"] = [
        {"Short Citation":reference["shortcitation"],"Full Citation":reference["fullcitation"]}
        for reference in referencesByUnit.get(unitID, [])
    ]

    nvcsUnitDocs.append(unitDoc)

In [13]:
# Get a handle on the "bis" database through the bis2 dd helper, then pick the
# "NVCS" collection that the unit documents are written to.
# NOTE(review): presumably dd.getDB returns a pymongo Database and reads its
# connection details from local configuration - confirm against bis2.
bis = dd.getDB("bis")
nvcsCollection = bis["NVCS"]

In [30]:
# Replace the whole collection with the freshly built unit documents.
# Refuse to run with an empty document list: the delete would otherwise wipe
# the collection before the insert has a chance to fail.
if not nvcsUnitDocs:
    raise ValueError("nvcsUnitDocs is empty; refusing to clear the NVCS collection")

nvcsCollection.delete_many({})
nvcsCollection.insert_many(nvcsUnitDocs)

<pymongo.results.InsertManyResult at 0x112657cf0>