In [1]:
import codecs
import re
from uuid import uuid4 as UUID
import uuid

Given that `patch-qualified-generations/logs-to-patch` contains a list of hashes to patch, run
```shell
cat patch-qualified-generations/logs-to-patch | sed -r 's/^hash:\/\/sha256\/(\w{2})(\w{2})(\w{60})/data\/\1\/\2\/\1\2\3/' > patch-qualified-generations/filepaths
```
to get the filepaths for the hashes. Then run
```shell
cat patch-qualified-generations/filepaths | xargs -L 1 python3 patch-qualified-generations.py > patch-qualified-generations/new-lines
```

to save the new qualified generations to `patch-qualified-generations/new-lines` in n-quads format.

It takes a while.

The dream was to pass `preston ls` to the script, but I couldn't decide on a safe way to prevent redundant generations.

In [2]:
class Value:
    """A piece of n-quad text paired with a classification of what it denotes.

    Attributes:
        text: the raw string.
        type: a `Value.Type` member describing the string.
    """
    from enum import Enum
    class Type(Enum):
        """Classification attached to a Value's text."""
        ANY     = 0        # wildcard in Value.__eq__; never assigned by FromText
        CONTENT = 1        # sha256 hash URIs and ".well-known/genid" URLs
        HASH    = CONTENT  # alias of CONTENT
        UUID    = 2        # text shaped like 8-4-4-4-12 word characters
        URL     = 3        # text starting with http:// or https://
        RAW     = 4        # anything else

    def __init__(self, text, valueType):
        assert isinstance(text, str)
        assert isinstance(valueType, Value.Type)
        self.text = text
        self.type = valueType

    def __lt__(self, other):
        """Order by string form so that values can be sorted."""
        return str(self) < str(other)

    # NOTE: defining __eq__ without __hash__ makes instances unhashable.
    def __eq__(self, other):
        """Compare against a string, a `Value.Type`, or another Value.

        - str: equal when the text matches.
        - Value.Type: equal when the type matches; `Type.ANY` matches everything.
        - Value: equal when both text and type match.
        - anything else: not equal.
        """
        if isinstance(other, str):
            return self.text == other
        if isinstance(other, Value.Type):
            return other == Value.Type.ANY or self.type == other
        if isinstance(other, Value):
            return self.text == other.text and self.type == other.type
        return False

    def __str__(self):
        return self.text

    def IsHash(self):
        """Return True when this value is classified as CONTENT (alias HASH)."""
        return self.type == Value.Type.CONTENT

    @staticmethod
    def FromText(text):
        """Classify *text* and return it wrapped in a Value.

        The patterns are checked in order: content hash / genid URL, bare
        UUID, http(s) URL; anything else is RAW.
        """
        # Raw strings keep the regex escapes (\/ and \w) away from Python's own
        # string-escape processing, which warns about unknown escapes.
        if (re.match(r'^hash:\/\/sha256\/.{64}$', text) or
            re.match(r'^https?:\/\/.*\.well-known\/genid\/\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$', text)):
            valueType = Value.Type.CONTENT
        elif re.match(r'^\w{8}-\w{4}-\w{4}-\w{4}-\w{12}$', text):
            valueType = Value.Type.UUID
        elif re.match(r'^https?://', text):
            valueType = Value.Type.URL
        else:
            valueType = Value.Type.RAW

        return Value(text, valueType)

class Verb:
    """The predicate of a triple, with the list of triples that use it.

    Attributes:
        value: the Value holding the predicate text.
        triples: triples attached to this verb (appended by Triple.FromNQuad).
    """

    def __init__(self, value):
        assert type(value) == Value
        self.value = value
        self.triples = []

    def __lt__(self, other):
        """Order by string form so that verbs can be sorted."""
        return str(self) < str(other)

    def __eq__(self, other):
        """Compare against a string or any object exposing a `.value`.

        Operands with no `.value` attribute (for example None) compare
        unequal instead of raising AttributeError.
        """
        if isinstance(other, str):
            return self.value == other
        if hasattr(other, "value"):
            return self.value == other.value
        return False

    def __str__(self):
        return self.value.text

    def Text(self):
        """Return the predicate text."""
        return self.value.text

    @staticmethod
    def FromText(text, index=None):
        """Return the verb associated with *text*.

        When *index* is given, an already-indexed verb for the stripped text
        is reused; otherwise the new verb is registered in `index.verbLookup`
        and `index.verbs`.
        """
        text = text.strip()

        # If the text is already indexed, use that
        if index is not None and text in index.verbLookup:
            return index.verbLookup[text]

        verb = Verb(Value.FromText(text))

        # Update the index
        if index is not None:
            index.verbLookup[text] = verb
            index.verbs.append(verb)

        return verb

class Node:
    """A subject or object of a triple, with its attached triples.

    Attributes:
        value: the Value holding the node text (None until assigned).
        inwardTriples: triples in which this node is the object.
        outwardTriples: triples in which this node is the subject.
    """

    def __init__(self, value=None):
        self.value = value
        self.inwardTriples = []
        self.outwardTriples = []

    def __lt__(self, other):
        """Order by string form so that nodes can be sorted."""
        return str(self) < str(other)

    def __eq__(self, other):
        """Compare against a string or any object exposing a `.value`.

        Operands with no `.value` attribute (for example None) compare
        unequal instead of raising AttributeError.
        """
        if isinstance(other, str):
            return self.value == other
        if hasattr(other, "value"):
            return self.value == other.value
        return False

    def __str__(self):
        return self.value.text

    def Text(self):
        """Return the node text."""
        return self.value.text

    def Type(self):
        """Return the `Value.Type` of the node's value."""
        return self.value.type

    def IsHash(self):
        """Return True when the node's value is classified as CONTENT."""
        return self.value.type == Value.Type.CONTENT

    @staticmethod
    def FromText(text, index=None):
        """Return the node associated with *text*.

        When *index* is given, an already-indexed node for the stripped text
        is reused; otherwise the new node is registered in `index.nodeLookup`
        and `index.nodes`.
        """
        text = text.strip()

        # If the text is already indexed, use that
        if index is not None and text in index.nodeLookup:
            return index.nodeLookup[text]

        node = Node(Value.FromText(text))

        # Update the index
        if index is not None:
            index.nodeLookup[text] = node
            index.nodes.append(node)

        return node

class Triple:
    """A (subject, verb, object) statement linking two nodes through a verb."""

    def __init__(self, subject, verb, object):
        assert type(subject) == Node
        assert type(verb) == Verb
        assert type(object) == Node
        self.subject = subject
        self.verb = verb
        self.object = object

    def __lt__(self, other):
        """Order by string form so that triples can be sorted."""
        return str(self) < str(other)

    def __eq__(self, other):
        """Two triples are equal when all three parts are equal."""
        if not isinstance(other, Triple):
            return False
        return (
            self.subject    == other.subject    and
            self.verb       == other.verb       and
            self.object     == other.object
        )

    def __str__(self):
        return str(self.subject) + '\t' + str(self.verb) + '\t' + str(self.object)

    def Subject(self):
        """Return the subject node."""
        return self.subject

    def Verb(self):
        """Return the verb."""
        return self.verb

    def Object(self):
        """Return the object node."""
        return self.object

    def Matches(self, subject=None, verb=None, object=None):
        """Return True when every given part equals the triple's part.

        Parts left as None act as wildcards. The None checks use identity so
        that the `__eq__` of the passed-in object is never called with None.
        """
        return (
            (subject    is None or self.subject == subject  ) and
            (verb       is None or self.verb    == verb     ) and
            (object     is None or self.object  == object   )
        )

    @staticmethod
    def FromNQuad(nQuad, index=None):
        """Return a triple extracted from *nQuad*.

        *nQuad* is indexed as `nQuad[i][0]` for i in 0..2: the first element
        of each of the first three groups is used as the subject, verb and
        object text. Further elements and groups are ignored.

        When *index* is given, a triple already stored under `str(nQuad)` is
        reused; otherwise the new triple is registered in `index.tripleLookup`
        and `index.triples`.
        """

        nQuadString = str(nQuad)

        # If the text is already indexed, use that
        if index is not None and nQuadString in index.tripleLookup:
            return index.tripleLookup[nQuadString]

        subject = Node.FromText(nQuad[0][0], index)
        verb = Verb.FromText(nQuad[1][0], index)
        object = Node.FromText(nQuad[2][0], index)

        triple = Triple(subject, verb, object)

        # Make connections
        subject.outwardTriples.append(triple)
        object.inwardTriples.append(triple)
        verb.triples.append(triple)

        # Update the index
        if index is not None:
            index.tripleLookup[nQuadString] = triple
            index.triples.append(triple)

        return triple

In [3]:
class Index:
    """Lookup tables and ordered lists for the nodes, verbs and triples of a set of n-quads."""

    def __init__(self, nQuads):
        """Build the index by turning every entry of *nQuads* into a Triple."""
        # Text-keyed caches, filled by the FromText / FromNQuad factories
        self.nodeLookup, self.verbLookup, self.tripleLookup = {}, {}, {}

        # The same objects in order of first appearance
        self.nodes, self.verbs, self.triples = [], [], []

        # Parse n-quads
        for parsed in nQuads:
            Triple.FromNQuad(parsed, index=self)

In [4]:
class NQuads:
    """Minimal n-quads reader that splits statements into delimited terms."""

    # Opening delimiter -> matching closing delimiter
    delimiters = {
        '<' : '>',
        '"' : '"'
    }

    # TODO: retain the "@en" flag on text values
    @staticmethod
    def Parse(text):
        """Split *text* into statements.

        Returns a list with one entry per '.'-terminated statement. Each
        statement is a list of groups, and each group is a tuple of the
        strings found between `<...>` or `"..."` delimiters. Delimited parts
        written back to back (a literal followed by `^^<datatype>`) land in
        the same tuple. Characters outside delimiters, such as `@en`, are
        dropped. A trailing statement with no '.' is discarded.
        """
        nquads = []
        groups = []
        subgroups = []
        inGroup = False
        subgroupStart = None
        delimiter = ''
        escaped = False
        for i, c in enumerate(text):
            if inGroup:
                if delimiter:
                    # Between a pair of delimiters
                    if escaped:
                        # Character following a backslash: never a terminator
                        escaped = False
                    elif c == '\\' and delimiter == '"':
                        # Escapes such as \" only occur inside string literals
                        escaped = True
                    elif c == delimiter:
                        subgroups.append(text[subgroupStart : i])
                        delimiter = ''
                # Treat back-to-back delimiters as one group
                elif c in NQuads.delimiters:
                    delimiter = NQuads.delimiters[c]
                    subgroupStart = i + 1
                # Spaces only end the group when outside a pair of delimiters;
                # a '.' right after a term ends both the group and the statement
                elif c.isspace() or c == '.':
                    groups.append(tuple(subgroups))
                    subgroups = []
                    inGroup = False
                    if c == '.':
                        nquads.append(groups)
                        groups = []
            elif c == '.':
                nquads.append(groups)
                groups = []
            elif c in NQuads.delimiters:
                delimiter = NQuads.delimiters[c]
                subgroupStart = i + 1
                inGroup = True
        return nquads

In [5]:
# Language-tagged literal: the parsed object keeps the quoted text only (the "@en" tag is dropped).
NQuads.Parse('<https://preston.guoda.bio> <http://purl.org/dc/terms/description> "Preston is a software program that finds, archives and provides access to biodiversity datasets."@en .')

[[('https://preston.guoda.bio',),
  ('http://purl.org/dc/terms/description',),
  ('Preston is a software program that finds, archives and provides access to biodiversity datasets.',)]]

In [6]:
# Statement made of three <...> terms: each term becomes a one-element tuple.
NQuads.Parse('<https://idigbio.org> <http://www.w3.org/ns/prov#wasAssociatedWith> <daf3ee3f-8f3e-495e-b57f-bc93c8fccb2c> .')

[[('https://idigbio.org',),
  ('http://www.w3.org/ns/prov#wasAssociatedWith',),
  ('daf3ee3f-8f3e-495e-b57f-bc93c8fccb2c',)]]

In [7]:
# Typed literal: the datatype IRI after "^^" becomes a second element of the object tuple.
NQuads.Parse('<hash://sha256/844e59241f5d0f44891ce46ea1816394baf49184698005c66de73e6163d49d3b> <http://www.w3.org/ns/prov#generatedAtTime> "2019-02-04T16:34:10.865Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .')

[[('hash://sha256/844e59241f5d0f44891ce46ea1816394baf49184698005c66de73e6163d49d3b',),
  ('http://www.w3.org/ns/prov#generatedAtTime',),
  ('2019-02-04T16:34:10.865Z', 'http://www.w3.org/2001/XMLSchema#dateTime')]]

In [8]:
# Provenance log to ingest. Another log that can be swapped in:
#   "../preston.acis.ufl.edu/data/05/a8/05a877bdb8617144fe166a13bf51828d4ad1bc11631c360b9e648a9f7df2bbcd"
path = "../preston.acis.ufl.edu/data/20/d3/20d36a6f879ba1dd797d4288a4f2e32719d3c674156194c2765a3ec6b43f5e17"

allNQuads = []
# Built-in open() replaces the legacy codecs.open(). errors='ignore' skips
# undecodable bytes; newline='' leaves line endings untranslated.
with open(path, 'r', encoding='utf-8', errors='ignore', newline='') as file:
    nQuads = NQuads.Parse(file.read())
    allNQuads += nQuads

In [9]:
# Build the lookup tables and the node/verb/triple lists for everything parsed from the log.
fullIndex = Index(allNQuads)

List verbs read from the ingested logs

In [10]:
# Print each distinct verb once, in alphabetical order of its text.
for verb in sorted(fullIndex.verbs):
    print(verb)

http://purl.org/dc/elements/1.1/format
http://purl.org/dc/terms/description
http://purl.org/pav/createdBy
http://purl.org/pav/hasVersion
http://purl.org/pav/previousVersion
http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://www.w3.org/ns/prov#generatedAtTime
http://www.w3.org/ns/prov#hadMember
http://www.w3.org/ns/prov#startedAtTime
http://www.w3.org/ns/prov#usedBy
http://www.w3.org/ns/prov#wasAssociatedWith
http://www.w3.org/ns/prov#wasGeneratedBy
http://www.w3.org/ns/prov#wasStartedBy


Find the UUID of the crawl activity

In [11]:
# The crawl activity is the subject of the first `rdf:type prov#Activity` triple.
crawlNode = next(
    (
        t.subject
        for t in fullIndex.verbLookup["http://www.w3.org/1999/02/22-rdf-syntax-ns#type"].triples
        if t.object == "http://www.w3.org/ns/prov#Activity"
    ),
    None,
)
# Fail loudly instead of carrying the string "None" around as a UUID.
if crawlNode is None:
    raise ValueError("no http://www.w3.org/ns/prov#Activity found in the ingested log")
crawlUUID = str(crawlNode)
print(crawlUUID)

e871efcd-c2f9-4e8e-ac3a-bc45943c3e65


In [12]:
def _isUrlHasVersionContent(candidate):
    """True for triples shaped like (URL, pav:hasVersion, CONTENT)."""
    if candidate.subject.Type() != Value.Type.URL:
        return False
    if not (candidate.verb == "http://purl.org/pav/hasVersion"):
        return False
    return candidate.object.Type() == Value.Type.CONTENT

exampleQuery = list(filter(_isUrlHasVersionContent, fullIndex.triples))

# Show only the first few matches
for triple in exampleQuery[:3]:
    print(triple)

https://search.idigbio.org/v2/search/publishers	http://purl.org/pav/hasVersion	hash://sha256/3eff98d4b66368fd8d1f8fa1af6a057774d8a407a4771490beeb9e7add76f362
https://api.gbif.org/v1/dataset	http://purl.org/pav/hasVersion	hash://sha256/184886cc6ae4490a49a70b6fd9a3e1dfafce433fc8e3d022c89e0b75ea3cda0b
https://bms.gfbio.org/services/data-sources/	http://purl.org/pav/hasVersion	hash://sha256/ba4f1de1f97ef57c90d321b7bf36426ac4031fa3a312af2c22a538d0f4387a4c


To construct qualified generations from past logs:
1. Start at (`URL hasVersion HASH`)
1. Collect (`HASH X Y`) triples that follow
1. Follow the `previousVersion` chain in reverse to find the actual content generated by the crawl (the "latest version")

NOTE: Recursively following the `previousVersion` chain doesn't always work because the chain is sometimes circular, which leads to infinite recursion.

In [13]:
# def GetLatestVersion(node):
#     previousVersions = [x for x in node.inwardTriples if x.verb == "http://purl.org/pav/previousVersion"]
#     if len(previousVersions) > 0:
#         return GetLatestVersion(previousVersions[0].subject)
#     else:
#         return node

# def MakeQualifiedGeneration(url, context, crawlUUID):
#     index = Index(context)

#     urlNode = index.nodeLookup[url]
#     firstVersion = [x for x in urlNode.outwardTriples if x.verb == "http://purl.org/pav/hasVersion"][0].object
#     latestVersion = GetLatestVersion(firstVersion)
    
#     qualGenUUID = UUID()

#     newLines = [
#         "<%s> <%s> <%s> ." % \
#             (str(latestVersion), "http://www.w3.org/ns/prov#qualifiedGeneration", str(qualGenUUID)),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/ns/prov#Generation"),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/ns/prov#activity", str(crawlUUID)),

#         "<%s> <%s> <%s> ." % \
#             (str(qualGenUUID), "http://www.w3.org/ns/prov#used", str(url)),
#     ]
    
#     return newLines

For now, let's assume the version furthest down the list of n-quads is the most recent. Is this a safe assumption?

In [14]:
def PrintQualifiedGeneration(url, latestVersion, crawlUUID):
    """Print the four n-quad lines describing one qualified generation.

    A fresh random UUID identifies the generation. The lines state that
    *latestVersion* has this qualified generation, that it is a
    prov#Generation, that its activity is *crawlUUID*, and that it used *url*.
    """
    generationId = str(UUID())

    statements = (
        (str(latestVersion), "http://www.w3.org/ns/prov#qualifiedGeneration", generationId),
        (generationId, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/ns/prov#Generation"),
        (generationId, "http://www.w3.org/ns/prov#activity", str(crawlUUID)),
        (generationId, "http://www.w3.org/ns/prov#used", str(url)),
    )

    for statement in statements:
        print("<%s> <%s> <%s> ." % statement)

In [15]:
import sys
import io

path = "../preston.acis.ufl.edu/data/20/d3/20d36a6f879ba1dd797d4288a4f2e32719d3c674156194c2765a3ec6b43f5e17"

# Simulate piping the log into the script: load it into an in-memory stream
# and install that as sys.stdin. The whole file is read up front, so the handle
# is closed as soon as the `with` block ends. The encoding settings match the
# earlier read of the same log.
with open(path, "r", encoding="utf-8", errors="ignore") as file:
    sys.stdin = io.StringIO(file.read())

In [16]:
# Scan the log line by line from sys.stdin and print a qualified generation for
# a URL once the next download event begins.
# State carried from one triple to the next:
url = None              # URL of the download event currently being read, if any
latestVersion = None    # subject of the latest `previousVersion` triple seen since that event began
crawlUUID = None        # subject of the most recent prov#Activity triple
didOne = False          # set after the first generation is printed; stops the scan after that line
for line in sys.stdin:
    nQuads = NQuads.Parse(line)
    for nQuad in nQuads:
        # No index is passed, so every triple gets fresh Node/Verb objects
        triple = Triple.FromNQuad(nQuad)

        # Watch for newer versions for the current URL
        if (
            triple.subject.Type() == Value.Type.CONTENT and
            triple.verb == "http://purl.org/pav/previousVersion" and
            triple.object.Type() == Value.Type.CONTENT
        ):
            # The subject is taken as the newer version; later triples overwrite earlier ones
            latestVersion = str(triple.subject)
        # Watch for existing qualified generations
        elif (
            triple.subject.Type() == Value.Type.CONTENT and
            triple.verb == "http://www.w3.org/ns/prov#qualifiedGeneration" and
            triple.object.Type() == Value.Type.UUID
        ):
            # No need to log a new download event
            url = None
        # Check for the start of a download event
        # (ignored until a crawl activity has been seen, because of the crawlUUID guard)
        elif (
            crawlUUID and
            triple.subject.Type() == Value.Type.URL and
            triple.verb == "http://purl.org/pav/hasVersion" and
            triple.object.Type() == Value.Type.CONTENT
        ):
            # Create a generation for the previous
            if url and latestVersion:
                PrintQualifiedGeneration(url, latestVersion, crawlUUID)
                didOne = True

            # Start reading triples for the next URL
            url = str(triple.subject)
            latestVersion = None
        # Check for a new crawl UUID
        # (matches on the object alone; the verb is not inspected)
        elif triple.object == "http://www.w3.org/ns/prov#Activity":
            crawlUUID = str(triple.subject)
            # NOTE(review): a URL still pending here is discarded without a generation - confirm this is intended
            url = None
    # Stop after the line that produced the first generation
    if didOne:
        break
# NOTE(review): nothing is printed for a URL still pending when the input ends - confirm this is intended

<hash://sha256/d8abe764baa8807af8c8f5034157945937fdaec9002a7b975df429d7538c4897> <http://www.w3.org/ns/prov#qualifiedGeneration> <b8933518-3351-4299-a713-7538606dd7e7> .
<b8933518-3351-4299-a713-7538606dd7e7> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/ns/prov#Generation> .
<b8933518-3351-4299-a713-7538606dd7e7> <http://www.w3.org/ns/prov#activity> <e871efcd-c2f9-4e8e-ac3a-bc45943c3e65> .
<b8933518-3351-4299-a713-7538606dd7e7> <http://www.w3.org/ns/prov#used> <https://api.gbif.org/v1/dataset> .


In [17]:
# Close the log file handle opened for the simulated stdin (closing an already-closed file is a no-op).
file.close()