In [64]:
import networkx as nx
import re
import json
import os

In [7]:
# Function takes filename as input in text format and 
# returns a dictionary where the key is the subject of the triples and the values are all 
# the properties with objects


def getDic(filename):
    """Read a triples text file and group each line's remainder by its subject.

    Each line is split on single spaces. The first field (stripped) is used as
    the dictionary key; the remaining fields, re-joined with single spaces, are
    appended to that key's list.

    Duplicate lines are dropped, keeping the first occurrence, so both the key
    order and the order inside each value list follow the file and are
    reproducible between runs. Blank lines are ignored.

    Args:
        filename: Path to a UTF-8 encoded text file with one triple per line.

    Returns:
        dict mapping each subject (str) to a list of "property object ..."
        strings.
    """
    with open(filename, encoding="utf-8") as f:
        # Strip the newline *before* de-duplicating so that a final line
        # without a trailing newline still matches an earlier identical line.
        lines = [line.strip('\n') for line in f]

    ddic = {}
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set()
    # would make the iteration order depend on string hash randomisation.
    for line in dict.fromkeys(lines):
        if not line.strip():
            # A blank line carries no triple and would only create an '' key.
            continue
        fields = line.split(' ')
        sub = fields[0].strip()
        ddic.setdefault(sub, []).append(' '.join(fields[1:]))
    return ddic


In [43]:
# This function takes a list of tuples as input. The list of tuples are in the order of 
# (subj, subj class, obj, obj class, property connecting subj and obj). Using the input, a graph is generated with the 
# required labels (attributes) of nodes and edges


def addNodes(lst):
    """Build an undirected graph from (subj, subj class, obj, obj class, property) records.

    For every record the subject and object are added as nodes carrying a
    ``Class`` attribute, and an edge between them is added carrying the
    property as its ``Label`` attribute.

    Args:
        lst: Iterable of indexable records whose first five positions are
            subject, subject class, object, object class and property.

    Returns:
        The populated ``networkx.Graph``.
    """
    graph = nx.Graph()
    for record in lst:
        # Read the five positions by index so that any extra trailing fields
        # in a record are simply ignored.
        source, source_cls, target, target_cls, relation = (
            record[pos] for pos in range(5)
        )
        graph.add_node(source, Class=source_cls)
        graph.add_node(target, Class=target_cls)
        graph.add_edge(source, target, Label=relation)
    return graph

In [39]:
# This function is used to extract quotes within strings. This is helpful in case of extracting labels of entities 
# so that they are human readable 

def getMatch(text):
    """Return the contents of every double-quoted segment of ``text``, comma-joined.

    Args:
        text: String to scan for ``"..."`` segments (each must contain at
            least one character between the quotes).

    Returns:
        The quoted contents joined with ``","``; an empty string when nothing
        is quoted.
    """
    quoted_parts = (hit.group(1) for hit in re.finditer(r'"(.+?)"', text))
    return ",".join(quoted_parts)

In [40]:
# This function takes a dictionary and a property (given together with its subject and object classes) and extracts all the 
# relevant triples of that property. The instances are turned into tuples (so that repeated triples collapse into one) and 
# collected in a list. This list is then passed to the 'addNodes' function to generate a graph.

def genSubGraph(ddic,l):
    """Build the graph of all triples in ``ddic`` that use a single property.

    Args:
        ddic: dict mapping a subject to a list of "property object ..."
            strings (the structure returned by ``getDic``).
        l: sequence ``(subject class, object class, property)``, e.g.
            ``['Creative Work', 'Person', 'sdo:byArtist']``.

    Returns:
        The graph returned by ``addNodes`` for the de-duplicated
        ``(subject, subject class, object, object class, property)`` tuples.
        Tuples are handed over in first-seen order, so the graph is built the
        same way on every run for the same ``ddic``.
    """
    subj_class, obj_class, prop = l[0], l[1], l[2]

    # A dict is used as an insertion-ordered set: it removes repeated tuples
    # like set() did, but without making the order depend on hash randomisation.
    nodes = {}
    for key, value in ddic.items():
        for i in value:
            tokens = i.split(' ')
            if tokens[0] != prop:
                continue
            # Special condition for the location-created property. It exists
            # only to cater to the NISV dataset, which has yet to properly
            # define locations: the object is taken from the quoted label
            # (first word only) instead of a 'gtaa:' identifier.
            if prop == 'sdo:locationCreated':
                loc = getMatch(i).split(' ')[0]
                # Without a quoted label there is nothing to name the node
                # with; skip it rather than attach the subject to a shared ''
                # node.
                if loc:
                    nodes[(key, subj_class, loc, obj_class, prop)] = None
            else:
                for j in tokens:
                    if 'gtaa:' in j:
                        nodes[(key, subj_class, j, obj_class, prop)] = None

    return addNodes(list(nodes))




In [55]:
# This function takes a list of properties (with their classes defined) and returns a list of subgraphs, one for each property 
# in the list. Each item is a [subgraph, property name] pair, so we can keep track of which property each subgraph belongs to.

def genGraph(ddic,prop):
    """Build one subgraph per property specification.

    Args:
        ddic: Subject -> list of "property object ..." strings.
        prop: Iterable of ``[subject class, object class, property]`` specs.

    Returns:
        A list of ``[subgraph, property name]`` pairs, in the order of ``prop``.
    """
    return [[genSubGraph(ddic, spec), spec[2]] for spec in prop]

In [59]:
# Here is where we take the list of subgraph and generate the eigenvector centrality score (importance according to important links)
# for each of the nodes in each of the subgraphs iteratively. 

# The documentation for the function performing this computation can be found here:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html

# This function returns a list of lists. The first item for each list is the name of the property and the second item 
# is the dictionary of scores. The key are the nodes and the value are the score generated by the computation

def getScore(subg, max_iter=1500, tol=1.0e-5):
    """Compute eigenvector centrality for every subgraph.

    Args:
        subg: Iterable of ``[graph, property name]`` pairs (as returned by
            ``genGraph``).
        max_iter: Maximum number of power-iteration steps passed to
            ``nx.eigenvector_centrality``.
        tol: Convergence tolerance passed to ``nx.eigenvector_centrality``.

    Returns:
        A list of ``[property name, scores]`` pairs where ``scores`` maps each
        node to its centrality. A graph without nodes yields an empty dict.
    """
    score = []
    for g in subg:
        graph, prop_name = g[0], g[1]
        if len(graph) == 0:
            # networkx refuses to compute centrality on a graph with no nodes;
            # a property with no matching triples should not abort the scoring
            # of all the other properties.
            score.append([prop_name, {}])
            continue
        eigen = nx.eigenvector_centrality(graph, max_iter=max_iter, tol=tol)
        score.append([prop_name, eigen])

    return score

In [70]:
# Empties the score directory so that we can write new score values if needed
def emptyScoreDir(score_dir='./score'):
    """Remove every file in the score directory, creating the directory if needed.

    Args:
        score_dir: Directory to empty. Defaults to ``'./score'``.

    Sub-directories are left untouched; only files (and symlinks) directly
    inside ``score_dir`` are deleted.
    """
    # On a fresh checkout the directory does not exist yet; create it instead
    # of failing in os.listdir.
    os.makedirs(score_dir, exist_ok=True)
    for name in os.listdir(score_dir):
        path = os.path.join(score_dir, name)
        # os.remove cannot delete directories, so skip them rather than crash.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)

In [82]:
# Saves all the scores in the score directory (after emptying it) in the form of JSON dumps. The files are named
# after each of the properties we are using.

def saveScores(s):
    """Write each property's score dictionary to ``./score/<property>.json``.

    The score directory is emptied first. The file name is the part of the
    property name after its prefix (``'sdo:creator'`` -> ``creator.json``); a
    name without a ``:`` is used as-is.

    Args:
        s: Iterable of ``[property name, scores]`` pairs (as returned by
            ``getScore``).
    """
    # Make sure the target directory exists before it is emptied and written to.
    os.makedirs('./score', exist_ok=True)
    emptyScoreDir()
    for score in s:
        parts = score[0].split(':')
        stem = parts[1] if len(parts) > 1 else parts[0]
        # The context manager closes the file even if json.dump raises.
        with open("./score/"+stem+".json", "w") as out_file:
            json.dump(score[1], out_file)

In [None]:
# Here is when we start calling our functions:
    
    
# Build the subject -> list of "property object" strings lookup from the triples file.
ddic = getDic('items2.txt')

In [None]:
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Person','sdo:creator'],['Creative Work','Person','sdo:mentions'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany']]
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany'],['Creative Work','Person','sdo:creator']]
# Properties to score, each given as [subject class, object class, property].
ls = [
    ['Creative Work', 'Person', 'sdo:byArtist'],
    ['Creative Work', 'Person', 'sdo:creator'],
    ['Creative Work', 'Person', 'sdo:mentions'],
]
# One [subgraph, property name] pair per entry of `ls`.
G = genGraph(ddic, ls)

In [83]:
# Compute the eigenvector-centrality scores for every subgraph, then write them
# to the ./score directory as one JSON file per property.
s = getScore(G)
saveScores(s)