In [1]:
import codecs
import re
from uuid import uuid4 as UUID
import uuid

from nquads import NQuads
from prestongraph import *

In [2]:
# Parse a statement whose object is a language-tagged literal ("..."@en).
# In the displayed result the object tuple holds only the literal text; the @en tag does not appear.
NQuads.Parse('<https://preston.guoda.bio> <http://purl.org/dc/terms/description> "Preston is a software program that finds, archives and provides access to biodiversity datasets."@en .')

[[('https://preston.guoda.bio',),
  ('http://purl.org/dc/terms/description',),
  ('Preston is a software program that finds, archives and provides access to biodiversity datasets.',)]]

In [3]:
# Parse a statement whose object is a bare UUID written between angle brackets.
NQuads.Parse('<https://idigbio.org> <http://www.w3.org/ns/prov#wasAssociatedWith> <daf3ee3f-8f3e-495e-b57f-bc93c8fccb2c> .')

[[('https://idigbio.org',),
  ('http://www.w3.org/ns/prov#wasAssociatedWith',),
  ('daf3ee3f-8f3e-495e-b57f-bc93c8fccb2c',)]]

In [4]:
# Parse a statement whose object is a typed literal ("..."^^<datatype>).
# In the displayed result the object tuple has two elements: the literal text and the datatype IRI.
NQuads.Parse('<hash://sha256/844e59241f5d0f44891ce46ea1816394baf49184698005c66de73e6163d49d3b> <http://www.w3.org/ns/prov#generatedAtTime> "2019-02-04T16:34:10.865Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .')

[[('hash://sha256/844e59241f5d0f44891ce46ea1816394baf49184698005c66de73e6163d49d3b',),
  ('http://www.w3.org/ns/prov#generatedAtTime',),
  ('2019-02-04T16:34:10.865Z', 'http://www.w3.org/2001/XMLSchema#dateTime')]]

In [5]:
# Crawl log to ingest. Another log from the same archive that can be swapped in:
#   ../preston.acis.ufl.edu/data/05/a8/05a877bdb8617144fe166a13bf51828d4ad1bc11631c360b9e648a9f7df2bbcd
path = "../preston.acis.ufl.edu/data/20/d3/20d36a6f879ba1dd797d4288a4f2e32719d3c674156194c2765a3ec6b43f5e17"

# Every n-quad parsed from the log, in file order.
allNQuads = []
# Built-in open() replaces the deprecated codecs.open(). errors='ignore' skips
# undecodable bytes, and newline='' leaves line endings untranslated, which is
# how codecs.open() (binary mode underneath) delivered the text.
with open(path, 'r', encoding='utf-8', errors='ignore', newline='') as file:
    nQuads = NQuads.Parse(file.read())
    allNQuads += nQuads

In [6]:
# Build an Index over all parsed n-quads (Index is presumably provided by the
# `prestongraph` star import -- confirm). Later cells read its `.verbs`,
# `.verbLookup` and `.triples` attributes.
fullIndex = Index(allNQuads)

List verbs read from the ingested logs

In [7]:
# Print each distinct verb on its own line, in sorted order.
for verb in sorted(fullIndex.verbs):
    print(verb)

http://purl.org/dc/elements/1.1/format
http://purl.org/dc/terms/description
http://purl.org/pav/createdBy
http://purl.org/pav/hasVersion
http://purl.org/pav/previousVersion
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.w3.org/ns/prov#generatedAtTime
http://www.w3.org/ns/prov#hadMember
http://www.w3.org/ns/prov#startedAtTime
http://www.w3.org/ns/prov#usedBy
http://www.w3.org/ns/prov#wasAssociatedWith
http://www.w3.org/ns/prov#wasGeneratedBy
http://www.w3.org/ns/prov#wasStartedBy


Find the UUID of the crawl activity

In [8]:
# The crawl activity is the subject of the first rdf:type triple whose object
# is prov#Activity; crawlNode stays None when the log contains no such triple.
crawlNode = next(
    (
        t.subject
        for t in fullIndex.verbLookup["http://www.w3.org/1999/02/22-rdf-syntax-ns#type"].triples
        if t.object == "http://www.w3.org/ns/prov#Activity"
    ),
    None,
)
# Keep "not found" as None: str(None) would produce the truthy string "None",
# which is indistinguishable from a real identifier further down.
crawlUUID = str(crawlNode) if crawlNode is not None else None
print(crawlUUID)

e871efcd-c2f9-4e8e-ac3a-bc45943c3e65


In [9]:
def _isUrlHasVersionContent(candidate):
    """Tell whether `candidate` has the shape (URL, pav:hasVersion, CONTENT)."""
    if candidate.subject.Type() == Value.Type.URL:
        if candidate.verb == "http://purl.org/pav/hasVersion":
            return candidate.object.Type() == Value.Type.CONTENT
    return False


# All triples in the index that link a URL to a content hash.
exampleQuery = [t for t in fullIndex.triples if _isUrlHasVersionContent(t)]

# Show the first three matches as a sample.
for sample in exampleQuery[:3]:
    print(sample)

https://search.idigbio.org/v2/search/publishers	http://purl.org/pav/hasVersion	hash://sha256/3eff98d4b66368fd8d1f8fa1af6a057774d8a407a4771490beeb9e7add76f362
https://api.gbif.org/v1/dataset	http://purl.org/pav/hasVersion	hash://sha256/184886cc6ae4490a49a70b6fd9a3e1dfafce433fc8e3d022c89e0b75ea3cda0b
https://bms.gfbio.org/services/data-sources/	http://purl.org/pav/hasVersion	hash://sha256/ba4f1de1f97ef57c90d321b7bf36426ac4031fa3a312af2c22a538d0f4387a4c


To construct qualified generations from past logs:
1. Start at (`URL hasVersion HASH`)
1. Collect (`HASH X Y`) triples that follow
1. Follow the `previousVersion` chain in reverse to find the actual content generated by the crawl (the "latest version")

NOTE: Recursively following the `previousVersion` chain does not always work: the chain is sometimes circular, in which case the recursion never terminates.

In [10]:
# def GetLatestVersion(node):
#     previousVersions = [x for x in node.inwardTriples if x.verb == "http://purl.org/pav/previousVersion"]
#     if len(previousVersions) > 0:
#         return GetLatestVersion(previousVersions[0].subject)
#     else:
#         return node

# def MakeQualifiedGeneration(url, context, crawlUUID):
#     index = Index(context)

#     urlNode = index.nodeLookup[url]
#     firstVersion = [x for x in urlNode.outwardTriples if x.verb == "http://purl.org/pav/hasVersion"][0].object
#     latestVersion = GetLatestVersion(firstVersion)
    
#     qualGenUUID = UUID()

#     newLines = [
#         "<%s> <%s> <%s> ." % \
#             (str(latestVersion), "http://www.w3.org/ns/prov#qualifiedGeneration", str(qualGenUUID)),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/ns/prov#Generation"),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/ns/prov#activity", str(crawlUUID)),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/ns/prov#used", str(url)),
#     ]
    
#     return newLines

For now, let's assume the version furthest down the list of n-quads is the most recent. Is this a safe assumption?

In [11]:
def PrintQualifiedGeneration(url, latestVersion, crawlUUID):
    """Print four N-Quads statements describing a qualified generation.

    A fresh random UUID identifies the generation. The statements, in order:
      1. `latestVersion` prov#qualifiedGeneration <generation>
      2. <generation> rdf:type prov#Generation
      3. <generation> prov#activity `crawlUUID`
      4. <generation> prov#used `url`

    Every argument is rendered through its string form. Nothing is returned;
    the statements are written to standard output, one per line.
    """
    generationId = UUID()

    statements = (
        (latestVersion, "http://www.w3.org/ns/prov#qualifiedGeneration", generationId),
        (generationId, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/ns/prov#Generation"),
        (generationId, "http://www.w3.org/ns/prov#activity", crawlUUID),
        (generationId, "http://www.w3.org/ns/prov#used", url),
    )

    # %s applies str() to each term at the moment its line is formatted.
    for subject, predicate, target in statements:
        print("<%s> <%s> <%s> ." % (subject, predicate, target))

In [12]:
import sys
import io

path = "../preston.acis.ufl.edu/data/20/d3/20d36a6f879ba1dd797d4288a4f2e32719d3c674156194c2765a3ec6b43f5e17"

# Simulate piping the log into a script: load the whole file into an in-memory
# text stream and install it as sys.stdin.
# The `with` block closes the handle as soon as the text is read, so it does
# not stay open across cells; calling file.close() again later is harmless.
# utf-8 with errors="ignore" matches how the same log is decoded when it is
# loaded for indexing, instead of relying on the platform default encoding.
with open(path, "r", encoding="utf-8", errors="ignore") as file:
    sys.stdin = io.StringIO(file.read())

In [13]:
# Single-pass scan over the log lines on sys.stdin. A qualified generation is
# printed for a URL once all triples of its download event have been read,
# i.e. when the next download starts, a new crawl begins, or the input ends.
url = None            # URL whose download event is being read (None = nothing pending)
latestVersion = None  # newest content hash seen so far for `url`
crawlUUID = None      # subject of the most recent prov#Activity triple
didOne = False        # demo guard: stop once a generation has been printed
for line in sys.stdin:
    nQuads = NQuads.Parse(line)
    for nQuad in nQuads:
        triple = Triple.FromNQuad(nQuad)

        # Watch for newer versions for the current URL
        if (
            triple.subject.Type() == Value.Type.CONTENT and
            triple.verb == "http://purl.org/pav/previousVersion" and
            triple.object.Type() == Value.Type.CONTENT
        ):
            # The version furthest down the log is assumed to be the latest.
            latestVersion = str(triple.subject)

        # Watch for existing qualified generations
        elif (
            triple.subject.Type() == Value.Type.CONTENT and
            triple.verb == "http://www.w3.org/ns/prov#qualifiedGeneration" and
            triple.object.Type() == Value.Type.UUID
        ):
            # No need to log a new download event
            url = None

        # Check for the start of a download event
        elif (
            crawlUUID and
            triple.subject.Type() == Value.Type.URL and
            triple.verb == "http://purl.org/pav/hasVersion" and
            triple.object.Type() == Value.Type.CONTENT
        ):
            # Create a generation for the previous
            if url and latestVersion:
                PrintQualifiedGeneration(url, latestVersion, crawlUUID)
                didOne = True

            # Start reading triples for the next URL
            url = str(triple.subject)
            latestVersion = str(triple.object)

        # Check for a new crawl UUID
        elif triple.object == "http://www.w3.org/ns/prov#Activity":
            # Emit the download still pending from the previous crawl before
            # its state is discarded; otherwise the last URL of every crawl
            # would never get a generation.
            if url and latestVersion and crawlUUID:
                PrintQualifiedGeneration(url, latestVersion, crawlUUID)
                didOne = True

            crawlUUID = str(triple.subject)
            url = None
            latestVersion = None

    if didOne:
        break
else:
    # Reached only when the input ran out without the demo break above.
    # The final download has no following hasVersion triple to trigger its
    # generation, so flush it here.
    if url and latestVersion and crawlUUID:
        PrintQualifiedGeneration(url, latestVersion, crawlUUID)
        didOne = True

<hash://sha256/3eff98d4b66368fd8d1f8fa1af6a057774d8a407a4771490beeb9e7add76f362> <http://www.w3.org/ns/prov#qualifiedGeneration> <0631b15f-225f-460e-b9ca-c2f906f4cbe0> .
<0631b15f-225f-460e-b9ca-c2f906f4cbe0> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Generation> .
<0631b15f-225f-460e-b9ca-c2f906f4cbe0> <http://www.w3.org/ns/prov#activity> <e871efcd-c2f9-4e8e-ac3a-bc45943c3e65> .
<0631b15f-225f-460e-b9ca-c2f906f4cbe0> <http://www.w3.org/ns/prov#used> <https://search.idigbio.org/v2/search/publishers> .


In [14]:
# Make sure the log file handle is closed (closing an already-closed file is a no-op).
file.close()