Since I've revamped all the TIR processors (yet again), I've eliminated the old notes here that are no longer fully relevant. This script now does what most of the TIR processors do: it acts continuously (or within a set limit) on every registered item in the TIR that does not yet have ITIS information cached.

I also significantly simplified this whole process by switching from the hstore data structure to JSON for the different "buckets" of cached information in the TIR. This allows me to simply retrieve a matching document from the ITIS Solr service in its JSON format, pop out the properties that we don't want or need (or that were causing undue issues with the GC2 API and PostgreSQL), and repackage some of the information (hierarchy with ranks, and vernacular names) into a more usable structure that takes advantage of JSON rather than a text string in need of constant parsing.

### To Do
Next, I need to add a different route to this code that retrieves information from ITIS when the registration info in the TIR contains an already identified ITIS TSN. This will be used for GAP species and other such cases. On that route the code will not follow the taxonomic information to a valid TSN; it will simply record when the supplied TSN is not the valid one.

In [1]:
import requests,json
from IPython.display import display
from bis import bis
from bis import itis
from bis import tir
from bis2 import gc2

In [2]:
# Run configuration: which GC2 instance/database to work against, whether
# results are written back, and how many records this run may process.
thisRun = {
    "instance": "DataDistillery",
    "db": "BCB",
    "commitToDB": True,
    "totalRecordsToProcess": 5,
    "totalRecordsProcessed": 0,
}
# The SQL API base URL is derived from the instance and database chosen above.
thisRun["baseURL"] = gc2.sqlAPI(thisRun["instance"], thisRun["db"])

def _fetchITISDocs(searchURL):
    """Return the list of documents from an ITIS search URL, or None on failure.

    Request, JSON-decoding and response-shape errors are printed rather than
    raised so that a single bad response does not stop the whole run.
    """
    try:
        return requests.get(searchURL).json()["response"]["docs"]
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as e:
        print (e)
        return None


numberWithoutTIRData = 1

# Each pass pulls one TIR record that has no ITIS data yet, looks it up in
# ITIS, and (optionally) caches the packaged result back to the TIR. The loop
# stops when no such record is returned or the per-run limit is reached.
# NOTE(review): when commitToDB is False nothing fills the itis column, so the
# query below presumably returns the same record on every pass - confirm.
while numberWithoutTIRData == 1 and thisRun["totalRecordsProcessed"] < thisRun["totalRecordsToProcess"]:

    q_recordToSearch = "SELECT id, \
        registration->>'source' AS source, \
        registration->>'followTaxonomy' AS followtaxonomy, \
        registration->>'taxonomicLookupProperty' AS taxonomiclookupproperty, \
        registration->>'scientificname' AS scientificname, \
        registration->>'tsn' AS tsn \
        FROM tir.tir \
        WHERE itis IS NULL \
        LIMIT 1"
    recordToSearch  = requests.get(thisRun["baseURL"]+"&q="+q_recordToSearch).json()

    numberWithoutTIRData = len(recordToSearch["features"])

    # Nothing left to process
    if numberWithoutTIRData != 1:
        break

    tirRecord = recordToSearch["features"][0]

    # Set up a local data structure for storage and processing
    thisRecord = {}

    # Set data from query results
    thisRecord["id"] = tirRecord["properties"]["id"]
    thisRecord["source"] = tirRecord["properties"]["source"]
    thisRecord["followTaxonomy"] = tirRecord["properties"]["followtaxonomy"]
    thisRecord["taxonomicLookupProperty"] = tirRecord["properties"]["taxonomiclookupproperty"]
    thisRecord["tsn"] = tirRecord["properties"]["tsn"]
    thisRecord["scientificname"] = tirRecord["properties"]["scientificname"]
    thisRecord["scientificname_search"] = bis.cleanScientificName(thisRecord["scientificname"])

    # Set defaults for thisRecord
    thisRecord["matchMethod"] = "Not Matched"
    thisRecord["matchString"] = thisRecord["scientificname_search"]
    thisRecord["itisData"] = itis.packageITISJSON(thisRecord["matchMethod"],thisRecord["matchString"],0)
    thisRecord["numResults"] = 0
    itisDoc = {}

    # followTaxonomy is selected with ->> and so arrives as text ('true'/'false');
    # any non-empty string is truthy, so compare against 'true' explicitly.
    followTaxonomy = str(thisRecord["followTaxonomy"]).strip().lower() == "true"

    if thisRecord["taxonomicLookupProperty"] == "scientificname" and thisRecord["scientificname_search"]:

        # Try an exact match search
        thisRecord["itisSearchURL"] = itis.getITISSearchURL(thisRecord["scientificname_search"],False,True)
        docs = _fetchITISDocs(thisRecord["itisSearchURL"])
        if docs is not None:
            thisRecord["numResults"] = len(docs)

        # If we got only a single record on an exact match search, set the method and proceed
        if thisRecord["numResults"] == 1:
            thisRecord["matchMethod"] = "Exact Match"
            itisDoc = docs[0]

        # If we found nothing on an exact match search (or it failed), try a fuzzy match
        elif thisRecord["numResults"] == 0:
            thisRecord["itisSearchURL"] = itis.getITISSearchURL(thisRecord["scientificname_search"],True,True)
            docs = _fetchITISDocs(thisRecord["itisSearchURL"])
            if docs is not None:
                thisRecord["numResults"] = len(docs)
            if thisRecord["numResults"] == 1:
                thisRecord["matchMethod"] = "Fuzzy Match"
                itisDoc = docs[0]

        # If we got a result but the usage is not accepted/invalid and we should follow
        # taxonomy for this record, then retrieve the record for the accepted TSN
        if itisDoc and itisDoc.get("usage") in ["not accepted","invalid"] and followTaxonomy:
            acceptedTSNs = itisDoc.get("acceptedTSN") or []
            if acceptedTSNs:
                thisRecord["itisSearchURL"] = itis.getITISSearchURL(acceptedTSNs[0],False,False)
                acceptedDocs = _fetchITISDocs(thisRecord["itisSearchURL"])
                # Only switch documents when the accepted-TSN lookup itself returned
                # something; otherwise keep the document matched by name.
                if acceptedDocs:
                    thisRecord["matchMethod"] = "Followed Accepted TSN"
                    itisDoc = acceptedDocs[0]

        # If we got an ITIS Doc returned, package the results
        if itisDoc:
            thisRecord["itisData"] = itis.packageITISJSON(thisRecord["matchMethod"],thisRecord["matchString"],itisDoc)

    elif thisRecord["taxonomicLookupProperty"] == "tsn" and thisRecord["tsn"] is not None:
        thisRecord["itisSearchURL"] = itis.getITISSearchURL(thisRecord["tsn"],False,False)
        thisRecord["matchString"] = thisRecord["tsn"]
        docs = _fetchITISDocs(thisRecord["itisSearchURL"])
        if docs:
            thisRecord["matchMethod"] = "TSN Query"
            itisDoc = docs[0]
            thisRecord["itisData"] = itis.packageITISJSON(thisRecord["matchMethod"],thisRecord["matchString"],itisDoc)
        else:
            # No document for this TSN (or the request failed): keep the
            # "Not Matched" method but record the TSN as the string searched on.
            thisRecord["itisData"] = itis.packageITISJSON(thisRecord["matchMethod"],thisRecord["matchString"],0)

    display (thisRecord)
    if thisRun["commitToDB"]:
        print (tir.cacheToTIR(thisRun["baseURL"],thisRecord["id"],"itis",json.dumps(thisRecord["itisData"])))
    thisRun["totalRecordsProcessed"] = thisRun["totalRecordsProcessed"] + 1

        

{'followTaxonomy': 'true',
 'id': 20760,
 'itisData': {'MatchMethod': 'Exact Match',
  'MatchString': 'Potamogeton illinoensis',
  'cacheDate': '2017-09-08T12:52:08.899298',
  'commonnames': [{'language': 'English', 'name': 'Illinois pondweed'},
   {'language': 'French', 'name': "potamot de l''Illinois"}],
  'createDate': '1996-06-13 14:51:08',
  'hierarchy': ['Plantae',
   'Viridiplantae',
   'Streptophyta',
   'Embryophyta',
   'Tracheophyta',
   'Spermatophytina',
   'Magnoliopsida',
   'Lilianae',
   'Alismatales',
   'Potamogetonaceae',
   'Potamogeton',
   'Potamogeton illinoensis'],
  'kingdom': 'Plantae',
  'nameWInd': 'Potamogeton illinoensis',
  'nameWOInd': 'Potamogeton illinoensis',
  'parentTSN': '39005',
  'rank': 'Species',
  'synonymTSNs': ['39035:$504551$519504$519516$'],
  'synonyms': ['39035:$Potamogeton methyensis$Potamogeton angustifolius$Potamogeton lucens$'],
  'taxonomy': [{'name': 'Plantae', 'rank': 'Kingdom'},
   {'name': 'Viridiplantae', 'rank': 'Subkingdom'}

{'auth_check': {'session': None, 'success': True, 'auth_level': 'Write', 'checked_relations': ['tir.tir']}, 'success': True, '_execution_time': 0.082, 'affected_rows': 1}


{'followTaxonomy': 'true',
 'id': 20761,
 'itisData': {'MatchMethod': 'Not Matched',
  'MatchString': 'Stenotrema macgregori',
  'cacheDate': '2017-09-08T12:52:09.816863'},
 'itisSearchURL': 'http://services.itis.gov/?wt=json&rows=10&q=nameWOInd:Stenotrema\\%20macgregori~0.5%20AND%20(usage:accepted%20OR%20usage:valid)',
 'matchMethod': 'Not Matched',
 'matchString': 'Stenotrema macgregori',
 'numResults': 0,
 'scientificname': 'Stenotrema macgregori',
 'scientificname_search': 'Stenotrema macgregori',
 'source': 'SGCN',
 'taxonomicLookupProperty': 'scientificname',
 'tsn': None}

{'auth_check': {'session': None, 'success': True, 'auth_level': 'Write', 'checked_relations': ['tir.tir']}, 'success': True, '_execution_time': 0.076, 'affected_rows': 1}


{'followTaxonomy': 'true',
 'id': 20762,
 'itisData': {'MatchMethod': 'Not Matched',
  'MatchString': 'Prunus angustifolia var angustifolia',
  'cacheDate': '2017-09-08T12:52:10.948017'},
 'itisSearchURL': 'http://services.itis.gov/?wt=json&rows=10&q=nameWOInd:Prunus\\%20angustifolia\\%20var\\%20angustifolia~0.5%20AND%20(usage:accepted%20OR%20usage:valid)',
 'matchMethod': 'Not Matched',
 'matchString': 'Prunus angustifolia var angustifolia',
 'numResults': 0,
 'scientificname': 'Prunus angustifolia var. angustifolia',
 'scientificname_search': 'Prunus angustifolia var angustifolia',
 'source': 'SGCN',
 'taxonomicLookupProperty': 'scientificname',
 'tsn': None}

{'auth_check': {'session': None, 'success': True, 'auth_level': 'Write', 'checked_relations': ['tir.tir']}, 'success': True, '_execution_time': 0.087, 'affected_rows': 1}


{'followTaxonomy': 'true',
 'id': 20763,
 'itisData': {'MatchMethod': 'Exact Match',
  'MatchString': 'Juncus brachycarpus',
  'cacheDate': '2017-09-08T12:52:12.264962',
  'commonnames': [{'language': 'English', 'name': 'whiteroot rush'}],
  'createDate': '1996-06-13 14:51:08',
  'hierarchy': ['Plantae',
   'Viridiplantae',
   'Streptophyta',
   'Embryophyta',
   'Tracheophyta',
   'Spermatophytina',
   'Magnoliopsida',
   'Lilianae',
   'Poales',
   'Juncaceae',
   'Juncus',
   'Juncus brachycarpus'],
  'kingdom': 'Plantae',
  'nameWInd': 'Juncus brachycarpus',
  'nameWOInd': 'Juncus brachycarpus',
  'parentTSN': '39220',
  'rank': 'Species',
  'taxonomy': [{'name': 'Plantae', 'rank': 'Kingdom'},
   {'name': 'Viridiplantae', 'rank': 'Subkingdom'},
   {'name': 'Streptophyta', 'rank': 'Infrakingdom'},
   {'name': 'Embryophyta', 'rank': 'Superdivision'},
   {'name': 'Tracheophyta', 'rank': 'Division'},
   {'name': 'Spermatophytina', 'rank': 'Subdivision'},
   {'name': 'Magnoliopsida', 'r

{'auth_check': {'session': None, 'success': True, 'auth_level': 'Write', 'checked_relations': ['tir.tir']}, 'success': True, '_execution_time': 0.08, 'affected_rows': 1}


{'followTaxonomy': 'true',
 'id': 20764,
 'itisData': {'MatchMethod': 'Exact Match',
  'MatchString': 'Euphagus carolinus',
  'cacheDate': '2017-09-08T12:52:13.914835',
  'commonnames': [{'language': 'Spanish', 'name': 'Tordo canadiense'},
   {'language': 'English', 'name': 'Rusty Blackbird'},
   {'language': 'French', 'name': 'quiscale rouilleux'}],
  'createDate': '1996-06-13 14:51:08',
  'hierarchy': ['Animalia',
   'Bilateria',
   'Deuterostomia',
   'Chordata',
   'Vertebrata',
   'Gnathostomata',
   'Tetrapoda',
   'Aves',
   'Passeriformes',
   'Icteridae',
   'Euphagus',
   'Euphagus carolinus'],
  'kingdom': 'Animalia',
  'nameWInd': 'Euphagus carolinus',
  'nameWOInd': 'Euphagus carolinus',
  'parentTSN': '179090',
  'rank': 'Species',
  'taxonomy': [{'name': 'Animalia', 'rank': 'Kingdom'},
   {'name': 'Bilateria', 'rank': 'Subkingdom'},
   {'name': 'Deuterostomia', 'rank': 'Infrakingdom'},
   {'name': 'Chordata', 'rank': 'Phylum'},
   {'name': 'Vertebrata', 'rank': 'Subphylu

{'auth_check': {'session': None, 'success': True, 'auth_level': 'Write', 'checked_relations': ['tir.tir']}, 'success': True, '_execution_time': 0.086, 'affected_rows': 1}
