In [2]:
# Next Steps
#
# Something is wrong with clustering. Either nodes exist in more than one cluster,
# or there's something wrong with the way colors are being applied to edges.

In [3]:
import pygraphviz as pgv
import xmltodict as xtd
import requests
from collections import OrderedDict
import random

In [4]:
# Read the RFC Index as XML and convert to a python Dict
#rfcIndexUrl = 'https://www.rfc-editor.org/in-notes/rfc-index.xml'
#xmlData = requests.get(rfcIndexUrl)
#dictData = xtd.parse(xmlData.text)
with open('rfc-index.xml','r') as xmlData:
    dictData = xtd.parse(xmlData.read())
rfcIndex = dictData['rfc-index']

In [5]:
# The RFC Index has the following elements of interest:
rfcIndex.keys()

odict_keys(['@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', 'bcp-entry', 'fyi-entry', 'rfc-entry', 'rfc-not-issued-entry', 'std-entry'])

In [6]:
# Initialize a new digraph, "G", to represent BCPs and RFCs as nodes;
# is-also's, obsolescence, and updates as edges.
G = pgv.AGraph(directed=True)
G.graph_attr['label']="RFC Directed Graph"
G.graph_attr['overlap']='false'

In [22]:
def graphNodes(graph, rfcIndexNode, attributes):
    '''graphNodes(): PyGraphVizGraph ListOfOrderedDicts ListOfStr--> PyGraphVizGraph
    Purpose: for each element of rfcIndexNode, add its 'doc-id' key value as a node
             to graph, and for each key value in attributes, add the key values for
             that node's attribute to graph as an edge, returning the updated graph.'''
    for node in rfcIndexNode:
        # Add the node to the graph
        try:
            graph.add_node(node['doc-id'])
        except:
            print("error adding node %s. No 'doc-id' string?" % node)
        
        # Now, add edges
        for attr in attributes:
            try:
                if isinstance(node[attr], OrderedDict):
                    if isinstance(node[attr]['doc-id'],str):
                        graph.add_edge(node['doc-id'],node[attr]['doc-id'],label=attr)
                    elif isinstance(node[attr]['doc-id'],list):
                        for standard in node[attr]['doc-id']:
                            graph.add_edge(node['doc-id'],standard,label=attr)
                    else:
                        print("error: %s is neither an string nor a list for node %s" % (node[attr], node['doc-id'] ))
                else:
                    print("error: %s isn't an OrderedDict for node %s" % (node[attr], node['doc-id']))
            except KeyError:
                pass
            
    return(graph)

In [23]:
def graphNodes(graph, rfcIndexNode, attributes):
    '''graphNodes(): PyGraphVizGraph ListOfOrderedDicts ListOfStr--> PyGraphVizGraph
    Purpose: for each element of rfcIndexNode, add its 'doc-id' key value as a node
             to graph, and for each key value in attributes, add the key values for
             that node's attribute to graph as an edge, returning the updated graph.'''
    for node in rfcIndexNode:
        # Add the node to the graph
        try:
            graph.add_node(node['doc-id'])
        except:
            print("error adding node %s. No 'doc-id' string?" % node)
        
        # Now, add edges
        for attr in attributes:
            try:
                if isinstance(node[attr], OrderedDict):
                    if isinstance(node[attr]['doc-id'],str):
                        graph.add_edge(node['doc-id'],node[attr]['doc-id'],label=attr)
                    elif isinstance(node[attr]['doc-id'],list):
                        for standard in node[attr]['doc-id']:
                            graph.add_edge(node['doc-id'],standard,label=attr)
                    else:
                        print("error: %s is neither a string nor a list for node %s" % (node[attr], node['doc-id'] ))
                else:
                    print("error: %s isn't an OrderedDict for node %s" % (node[attr], node['doc-id']))
            except KeyError:
                pass
            
    return(graph)
                        

In [8]:
# Build the Graph
#G = graphNodes(G, rfcIndex['bcp-entry'], ['is-also'])
#G = graphNodes(G, rfcIndex['fyi-entry'], ['is-also'])
G = graphNodes(G, rfcIndex['rfc-entry'], ['is-also','obsoleted-by','updates'])

In [9]:
# Identify clusters of nodes. Here, a cluster is a subgraph of G having at least two verticies
# and an edge, along with all other verticies to which a path can be traced. A cluster is recorded
# as a Python set object containing all verticies in the set.

# N.B. This must be where the cluster brokenness originates. I don't think I'm adequately pulling
# information about edge memberships out of the for loop(s).
clusters = list()
edgeSets = [set(edge) for edge in G.edges()]
clusters.append(edgeSets[0]) # You need a seed value in order to iterate over the list
for edge in edgeSets:
    membersOf = list()
    for cluster in clusters:
        if cluster.intersection(edge):
            cluster.update(edge)
            membersOf.append(cluster)
    if len(membersOf) is 0:
        clusters.append(edge)

In [11]:
# Attempt to work through the clustering bug.
edgeSets = [set(edge) for edge in G.edges()]
for edge in edgeSets:
    for otherEdge in edgeSets:
        if otherEdge != edge:
            if otherEdge.intersection(edge):
                edge.update(otherEdge)
clusters = edgeSets

In [12]:
def genColorSet(colorCount):
    '''genColorSet(): Int --> SetOfRGBCodes
    Purpose: to generate a SetOfRGBCodes of count Int. Graphviz permits naming
    RGB colors in six-digit, zero-padded values. See https://graphviz.org/doc/info/attrs.html#d:colors
    '''
    colors = set()
    
    for i in range(0,colorCount):
        r = str(hex(random.randint(1,255)))[2:]
        g = str(hex(random.randint(0,255)))[2:]
        b = str(hex(random.randint(0,255)))[2:]
        if len(r) is 1:
            r = "0"+r
        if len(g) is 1:
            g = "0"+g
        if len(b) is 1:
            b = "0"+b
        color = "#"+r+g+b
        colors.add(color)
        
    return colors

In [14]:
# Next, color-code each cluster in a new dict, "clusterColors". Here, the key will
# be a cluster's color, and the value will be the cluster.
colorSet = genColorSet(len(clusters))
clusterColors = dict()
for cluster in clusters:
    clusterColors[colorSet.pop()] = cluster

In [15]:
# Then, for each edge, lookup and apply its cluster color to the Graph's
# nodes and edges
for color,cluster in clusterColors.items():
    for edge in G.edges():
        if edge[0] in cluster or edge[1] in cluster:
            e = G.get_edge(edge[0],edge[1])
            e.attr['color'] = color

In [16]:
# N.B. As of this writing, this takes several minutes to run.

# Generate the graph.
G.layout()
G.write('rfc-visualization.dot')
G.draw('rfc-visualization.svg')

In [17]:
# Attempting to debug the incomplete compilation of clusters
uniqueNodes = set()
for cluster in clusters:
    if 'RFC7274' in cluster:
        for member in cluster:
            uniqueNodes.add(member)

In [18]:
len(clusters)

3324

In [20]:
clusters

[{'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0011', 'RFC0033', 'RFC0036', 'RFC0039', 'RFC0044', 'RFC0047'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0020', 'STD0080'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0003', 'RFC0010', 'RFC0016', 'RFC0024', 'RFC0027', 'RFC0030'},
 {'RFC0011', 'RFC0033', 'RFC0036', 'RFC0039', 'RFC0044', 'RFC0047'},
 {'RFC001