In [29]:
import networkx as nx
import re
import json
import os
from random import choice

In [4]:
# Takes the path of a text file of triples as input and
# returns a dictionary where each key is the subject of a triple and each value is the list of
# that subject's properties together with their objects.


def getDic(filename):
    """Parse a triples text file into a ``subject -> statements`` dictionary.

    Every non-blank line is split at its first space: the text before it
    (whitespace-stripped) becomes the subject key and the rest of the line
    ("property object ...") is appended to that subject's list.

    Duplicate lines are collapsed *after* the trailing newline is removed, so
    a last line that lacks a newline is still recognised as a duplicate.
    First-seen line order is preserved, which keeps the result reproducible
    between runs. Blank lines are skipped instead of producing an empty
    subject key.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    dict
        Maps each subject string to the list of its statement strings.
    """
    with open(filename, encoding="utf-8") as f:
        # dict.fromkeys de-duplicates while keeping the original line order.
        unique_lines = dict.fromkeys(line.rstrip('\n') for line in f)

    ddic = {}
    for line in unique_lines:
        if not line.strip():
            continue
        # partition(' ') yields the same pieces as split(' ')[0] and
        # ' '.join(split(' ')[1:]) with a single scan of the line.
        sub, _, items = line.partition(' ')
        ddic.setdefault(sub.strip(), []).append(items)
    return ddic


In [5]:
# This function takes a list of tuples as input. Each tuple is ordered as
# (subj, subj class, obj, obj class, property connecting subj and obj). From this input, a graph is generated with the
# required labels (attributes) on its nodes and edges.


def addNodes(lst):
    """Build an undirected graph from 5-element records.

    Each record is indexed as
    ``(subject, subject class, object, object class, property)``.
    The subject and the object become nodes carrying a ``Class`` attribute,
    and the edge between them carries the property as its ``Label``.

    Parameters
    ----------
    lst : iterable
        Records indexable at positions 0 to 4.

    Returns
    -------
    networkx.Graph
        The graph holding all nodes and edges described by ``lst``.
    """
    graph = nx.Graph()
    for record in lst:
        source, target = record[0], record[2]
        graph.add_node(source, Class=record[1])
        graph.add_node(target, Class=record[3])
        graph.add_edge(source, target, Label=record[4])
    return graph

In [6]:
# This function extracts the double-quoted substrings from a string. This is helpful when extracting labels of entities
# so that they are human readable.

def getMatch(text):
    """Return every double-quoted substring of ``text``, joined by commas.

    The quotes themselves are not part of the result; an empty string is
    returned when ``text`` contains no quoted substring.
    """
    quoted = [found.group(1) for found in re.finditer(r'"(.+?)"', text)]
    return ",".join(quoted)

In [7]:
# This function takes a dictionary and a property (with its subject and object classes) as input and extracts all the
# relevant triples of that property. The instances are transformed into tuples and collected in a set (to avoid any repetition), which is finally
# converted to a list. This list is then passed to the 'addNodes' function to generate a graph.

def genSubGraph(ddic,l):
    """Build the subgraph of all triples that use one property.

    Parameters
    ----------
    ddic : dict
        Maps a subject to its list of "property object ..." statement strings.
    l : sequence
        ``[subject class, object class, property]``.

    Returns
    -------
    The graph produced by ``addNodes`` from the unique
    ``(subject, subject class, object, object class, property)`` tuples.
    """
    triples = set()
    for subject, statements in ddic.items():
        for statement in statements:
            prop = l[2]
            tokens = statement.split(' ')
            if tokens[0] != prop:
                continue

            if prop == 'sdo:locationCreated':
                # Special case kept only for the NISV dataset, which has yet
                # to define locations properly: the object is the first word
                # of the quoted text in the statement.
                place = getMatch(statement).split(' ')[0]
                triples.add((subject, l[0], place, l[1], prop))
                continue

            # Otherwise every token containing 'gtaa:' is an object.
            triples.update(
                (subject, l[0], token, l[1], prop)
                for token in tokens
                if 'gtaa:' in token
            )

    return addNodes(list(triples))




In [8]:
# This function takes a list of properties (with their classes defined) and returns a dictionary holding one subgraph per property
# in the list. Each key is the local name of the property (the part after the ':' prefix), so we can keep track of the subgraph types.

def genGraph(ddic,prop):
    """Build one subgraph per property specification.

    Parameters
    ----------
    ddic : dict
        Subject -> statements dictionary, forwarded to ``genSubGraph``.
    prop : iterable
        Specifications of the form ``[subject class, object class, property]``.

    Returns
    -------
    dict
        Maps the part of each property after its first ':' (e.g.
        ``'byArtist'`` for ``'sdo:byArtist'``) to that property's subgraph.
    """
    subgraphs = {}
    for spec in prop:
        # The right-hand side runs first, so the subgraph is built before
        # the key is derived from the property name.
        subgraphs[spec[2].split(':')[1]] = genSubGraph(ddic, spec)
    return subgraphs

In [9]:
# Here is where we take the list of subgraph and generate the eigenvector centrality score (importance according to important links)
# for each of the nodes in each of the subgraphs iteratively. 

# The documentation for the function performing this computation can be found here:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html

# This function returns a dictionary of dictionaries. Each key is the name of a property and each value
# is the dictionary of scores for that property's subgraph: its keys are the nodes and its values are the scores generated by the computation.

def getScore(subg):
    """Compute eigenvector centrality for every subgraph.

    Parameters
    ----------
    subg : dict
        Maps a property name to its networkx graph.

    Returns
    -------
    dict
        Maps each property name to the ``{node: score}`` dictionary returned
        by ``nx.eigenvector_centrality`` (run with ``max_iter=1500`` and
        ``tol=1.0e-5``).
    """
    return {
        name: nx.eigenvector_centrality(graph, max_iter=1500, tol=1.0e-5)
        for name, graph in subg.items()
    }

In [10]:
# Empties the score directory so that new score values can be written if needed
def emptyScoreDir():
    """Remove every regular file from ``./score``.

    The directory is created when it does not exist yet, so a fresh checkout
    no longer fails with ``FileNotFoundError``. Entries that are not regular
    files (for example sub-directories) are left untouched, because
    ``os.remove`` cannot delete them.
    """
    score_dir = './score'
    os.makedirs(score_dir, exist_ok=True)
    for name in os.listdir(score_dir):
        path = os.path.join(score_dir, name)
        if os.path.isfile(path):
            os.remove(path)

In [11]:
# Saves all the scores in the score directory (after emptying it) in the form of JSON dumps. These files are named
# after each of the properties we are using.

def saveScores(s):
    """Write each property's scores to ``./score/<property>.json``.

    The score directory is emptied first via ``emptyScoreDir``.

    Parameters
    ----------
    s : dict
        Maps a property name to its JSON-serialisable ``{node: score}``
        dictionary.
    """
    emptyScoreDir()
    for prop_name, scores in s.items():
        # The context manager closes the file even if json.dump raises.
        with open("./score/" + prop_name + ".json", "w") as out_file:
            json.dump(scores, out_file)

In [120]:
def getAverage(filename):
    """Load a score file and normalise every score by the file's mean score.

    Parameters
    ----------
    filename : str
        Name of a JSON file inside ``./score`` holding a ``{node: score}``
        dictionary.

    Returns
    -------
    dict
        ``{node: score / mean_score}``. An empty score file yields an empty
        dictionary instead of raising ``ZeroDivisionError``.

    Raises
    ------
    ZeroDivisionError
        If the file is non-empty but its scores sum to zero.
    """
    # The context manager closes the file as soon as it has been parsed.
    with open('./score/' + filename) as f:
        data = json.load(f)

    if not data:
        return {}

    total = 0
    for value in data.values():
        total = total + value
    average = total / len(data)

    return {node: value / average for node, value in data.items()}

def joinGraph(G):
    """Merge the per-property subgraphs and attach a combined score to nodes.

    For every ``<name>.json`` score file in ``./score`` the scores are
    normalised with ``getAverage`` and the subgraph ``G[<name>]`` is
    collected. All collected subgraphs are composed into one graph. Each
    node's final score is the sum of its normalised scores over all score
    files (files that do not mention the node contribute nothing). The final
    scores are written to ``./score/final_score.json`` and stored on the
    nodes as the ``score`` attribute.

    Parameters
    ----------
    G : dict
        Maps a property name to its networkx subgraph.

    Returns
    -------
    The composed graph with a ``score`` attribute on every node.

    Raises
    ------
    KeyError
        If a score file has no subgraph of the same name in ``G``.
    """
    score_dir = './score'
    final_name = 'final_score.json'

    graphs = []
    normalised = []
    # Sorted so the composition order is the same on every run; listdir
    # returns entries in arbitrary order.
    for file in sorted(os.listdir(score_dir)):
        # final_score.json is this function's own output from an earlier run,
        # not a per-property score file, so it must not be read back in.
        if file == final_name or not file.endswith('.json'):
            continue
        normalised.append(getAverage(file))
        # splitext keeps property names that themselves contain a dot intact.
        graphs.append(G[os.path.splitext(file)[0]])

    final = nx.compose_all(graphs)
    print(len(final))

    final_score = {}
    for node in final.nodes():
        total = 0
        for scores in normalised:
            value = scores.get(node)
            if value is not None:
                total = total + value
        final_score[node] = total

    # Mode "w" truncates an existing file, so no prior os.remove is needed
    # (the removal used to fail when the file did not exist yet).
    with open(os.path.join(score_dir, final_name), "w") as out_file:
        json.dump(final_score, out_file)

    nx.set_node_attributes(final, final_score, name='score')
    return final

In [13]:
# Here is when we start calling our functions:
    
# Parse the triples file into a dictionary: subject -> list of "property object" strings.
ddic = getDic('./data/items2.txt')

In [14]:
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Person','sdo:creator'],['Creative Work','Person','sdo:mentions'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany']]
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany'],['Creative Work','Person','sdo:creator']]
# Each entry is [subject class, object class, property]; one subgraph is built per property.
ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Person','sdo:creator'],['Creative Work','Person','sdo:mentions']]
# Build the per-property subgraphs, then compute a centrality score for every node of each subgraph.
G=genGraph(ddic,ls)
s = getScore(G)

In [68]:
# Write one JSON score file per property into ./score (files already there are removed first).
saveScores(s)


In [24]:
# Compose the per-property subgraphs into one graph whose nodes carry a combined 'score' attribute.
F=joinGraph(G)
# When i am combining graphs i notice that if two nodes are the same and they are attached by different properties then the 
# weight gets overwritten by the label

266154


In [110]:
def getLabel(key,ddic):
    """Return the human-readable label of the entity ``key``.

    The statements of ``key`` are scanned in order, and the first one whose
    property is ``skos:prefLabel``, ``sdo:name``, ``sdo:alternativeHeadline``
    or ``rdfs:label`` supplies the label (its double-quoted text).

    All statements are inspected, not only the first one, and the string
    ``"Not found"`` is returned whenever no label statement exists.

    Parameters
    ----------
    key : str
        Subject whose label is wanted.
    ddic : dict
        Maps a subject to its list of "property object ..." statement strings.

    Returns
    -------
    str
        The quoted label text, or ``"Not found"``.

    Raises
    ------
    KeyError
        If ``key`` is not a subject in ``ddic``.
    """
    label_props = ('skos:prefLabel', 'sdo:name', 'sdo:alternativeHeadline', 'rdfs:label')
    for statement in ddic[key]:
        if statement.split(' ')[0] in label_props:
            return doit(statement)
    return "Not found"


def doit(text):
    """Return every double-quoted substring of ``text``, joined by commas."""
    matches = re.findall(r'"(.+?)"', text)
    # e.g. ['String 1', 'String 2'] -> 'String 1,String 2'
    return ",".join(matches)

In [118]:
def getData(cw,F,ddic):
    """Print the neighbours of ``cw`` grouped by edge label and pick the best.

    Parameters
    ----------
    cw : hashable
        Node of ``F`` whose neighbourhood is reported.
    F : graph
        Graph whose edges carry a ``Label`` attribute; node scores are read
        by ``getHighScore``.
    ddic : dict
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    The neighbour of ``cw`` selected by ``getHighScore``.
    """
    print('current node:',cw,'\n')

    nbors = F[cw]
    by_label = {}
    for nbor, attrs in nbors.items():
        by_label.setdefault(attrs['Label'], []).append(nbor)

    for label, members in by_label.items():
        print(label)
        for member in members:
            print(member)
        print('\n')

    # Computed once and reused for both the report and the return value.
    best = getHighScore(nbors, F)
    print('highest = ',best,'\n')
    return best
    
    
def getHighScore(nbors,F):
    """Return the neighbour with the highest ``score`` node attribute.

    Parameters
    ----------
    nbors : mapping
        Neighbour nodes as keys (the values are ignored).
    F : graph
        Provides ``F.nodes[node]['score']`` for every neighbour.

    Returns
    -------
    The best-scoring neighbour; when several share the highest score the one
    iterated last wins. ``None`` is returned when ``nbors`` is empty.
    """
    best_node = None
    best_score = None
    for node, _ in nbors.items():
        node_score = F.nodes[node]['score']
        # Starting from "no score yet" rather than 0 lets negative scores
        # win too; ">=" keeps the last of several equally scored neighbours.
        if best_score is None or node_score >= best_score:
            best_node, best_score = node, node_score
    return best_node
    

def getChains(l,F,ddic):
    """Print a chain of ``l`` steps through ``F`` from a random start node.

    At each step the current node is reported with ``getData``. If the
    current node's ``Class`` is ``'Creative Work'`` it is reported directly;
    otherwise its best-scoring neighbour (per ``getHighScore``) is reported
    instead. The node returned by ``getData`` becomes the next current node.

    Parameters
    ----------
    l : int
        Number of steps; values of zero or below produce no output.
    F : graph
        Graph whose nodes carry ``Class`` and ``score`` attributes.
    ddic : dict
        Forwarded to ``getData``.
    """
    start = None
    n = 0
    # "<" instead of "!=" so a negative or fractional ``l`` cannot loop forever.
    while n < l:
        if n == 0:
            # Only the first step draws the random starting node.
            start = choice(list(F.nodes()))

        if F.nodes[start]['Class'] == 'Creative Work':
            cw = start
        else:
            cw = getHighScore(F[start], F)

        start = getData(cw, F, ddic)
        n += 1

        

    

In [119]:
# Print a 2-step chain; the start node is drawn at random, so the output differs between runs.
getChains(2,F,ddic)

current node: <http://data.beeldengeluid.nl/id/program/2101608140127482531> 

sdo:byArtist
gtaa:114287


highest =  gtaa:114287 

current node: <http://data.beeldengeluid.nl/id/scene/2101702280762576124> 

sdo:byArtist
gtaa:114287
gtaa:101973


sdo:creator
gtaa:148182


highest =  gtaa:148182 



In [94]:
# for k, v in F['<http://data.beeldengeluid.nl/id/scene/2102203180968068824>'].items():
#     print(k,F.nodes[k]['score'])

# print(F.nodes['gtaa:1662509']['score'],F.nodes['gtaa:1671802']['score'],F.nodes['gtaa:261022']['score'])

gtaa:55158 305.7866904639572
gtaa:1712296 2.711359497601111
gtaa:261022 0.0016792905719149296
