In [1]:
import networkx as nx
import re
import json
import os
from random import choice
import rdflib
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl
import matplotlib.pyplot as plt

In [2]:
# Function takes filename as input in text format and 
# returns a dictionary where the key is the subject of the triples and the values are all 
# the properties with objects


def getDic(filename):
    print('Loading data')
    with open(filename, encoding="utf-8") as f:
        lines = f.readlines()

    lines=  list(set(lines))

    ddic={}
    for i in lines:
        i=i.strip('\n')
        sub=i.split(' ')[0].strip()
    #     print(sub)
        if sub not in ddic:
            ddic[sub]=[' '.join(i.split(' ')[1:])]
        else:
    #         print(i,ddic[sub])
            items=' '.join(i.split(' ')[1:])
    #         print(items,' '.join(ddic[sub]),'\n')
    #         if items not in ' '.join(ddic[sub]):
            ddic[sub].append(items)
    return ddic


In [3]:
# This function takes a list of tuples as input. The list of tuples are in the order of 
# (subj, subj class, obj, obj class, property connecting subj and obj). Using the input, a graph is generated with the 
# required labels (attributes) of nodes and edges


def addNodes(lst):
    G = nx.Graph()
    for i in lst:
        subj = i[0]
        subj_class = i[1]
        obj=i[2]
        obj_class=i[3]
        edge_label=i[4]
        G.add_node(subj, Class=subj_class)
        G.add_node(obj, Class=obj_class)
        G.add_edge(subj, obj, Label=edge_label)
    
    return G

In [4]:
# This function is used to extract quotes within strings. This is helpful in case of extracting labels of entities 
# so that they are human readable 

def getMatch(text):
    matches = re.findall(r'"(.+?)"',text)
    return ",".join(matches)

In [5]:
# This function takes a dic and a property (with their subj obj classes as input) and basically extracts all the 
# relevent tripes of the property. Transforms the instances in a tuple structure (to avoid repetition in case any) and finally 
# saves them all in a list. This list is then passed to the 'addNodes' function to generate a graph.

def genSubGraph(ddic,l):
    nodes=set()
    for key,value in ddic.items():
        for i in value:
            if l[2]==i.split(' ')[0]:
#                     Here there is a special condition here which is looking at location created property. 
#                     This is present here only to cater to the NISV dataset since it as yet to properly define locations in their 
#                     dataset.
                if l[2]=='sdo:locationCreated':
                    loc= getMatch(i)
                    loc= loc.split(' ')[0]
                    nodes.add((key,l[0],loc,l[1],l[2]))
                else:
                    for j in i.split(' '):
                        if 'gtaa:' in j:
                            nodes.add((key,l[0],j,l[1],l[2]))
                                

    nodes=list(nodes)
    return addNodes(nodes)




In [6]:
# This function takes a list of properties (with their classes defined) and returns a list of subgraphs for each of the property 
# in the list. At the first index of every item is the name of the property so we keep track of the subgraph types

def genGraph(ddic,prop):
    print('Generating sub-graphs')
    subg={}
    for p in prop:
        s=genSubGraph(ddic,p)
        filename=p[2].split(':')[1]
        subg[filename]=s
#         subg.append([s,p[2]])
    
    return subg

In [7]:
# Here is where we take the list of subgraph and generate the eigenvector centrality score (importance according to important links)
# for each of the nodes in each of the subgraphs iteratively. 

# The documentation for the function performing this computation can be found here:
#  https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.eigenvector_centrality.html

# This function returns a list of lists. The first item for each list is the name of the property and the second item 
# is the dictionary of scores. The key are the nodes and the value are the score generated by the computation

def getScore(subg):
    print('Calculating scores')
#     score=[]
    score={}
    for k,v in subg.items():
        eigen = nx.eigenvector_centrality(v,max_iter=1500,tol=1.0e-5)
        score[k]=eigen
        
    return score

In [8]:
# Enmpties the directory of score we can write new score values if needed
def emptyScoreDir():
    dr = os.listdir('./score')
    for file in dr:
        os.remove('./score/'+file)

In [9]:
# saves all the scores in the score directory (after emptying) in the form of json jumps. These are files are named
# after each of the property we are using.

def saveScores(s):
    print('Saving scores')
    emptyScoreDir()
    for k,v in s.items():
#         print(k)
        out_file = open("./score/"+k+".json", "w")
        json.dump(v, out_file)
        out_file.close()


In [10]:
def getAverage(filename):
    
    f = open('./score/'+filename) 

    data = json.load(f) 
    s=0
    for k,v in data.items():
        s=s+v
    #     print(k,v)
    average = s/len(data)
    print(average)
    newD = {}
    for k,v in data.items():
        newD[k]=v/average

    return newD

def joinGraph(G):
    print('Producing final graph')
    lst=[]
    av=[]
    
    if os.path.exists('./score/final_score.json'):
        os.remove('./score/final_score.json')
        
    dr = os.listdir('./score')
    for file in dr:
        avg=getAverage(file)
        filename = file.split('.')[0]
        graph = G[filename]
        lst.append(graph)
        av.append(avg)
        
#     I can do this in terateive fasion and combine graphs 2 at a time as well as their scores
# https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.binary.compose.html#networkx.algorithms.operators.binary.compose
    final= nx.compose_all(lst)
#     print(len(final))
    
    final_score={}
    for node in list(final.nodes(data=True)):
        sc=0
        nname=node[0]
        for d in av:
            if d.get(nname) is not None:
                sc = sc+ d[nname]
            else:
                sc= sc+0
                
        final_score[nname]=sc
    
    
    out_file = open("./score/final_score.json", "w")
    json.dump(final_score, out_file)
    out_file.close()
        

        
    nx.set_node_attributes(final,final_score,name='score')
    return final


In [None]:
def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

def getName(inp):
    if has_numbers(inp) == False:
        return inp
    
    inp = inp.strip()
    inp = inp.strip(',')
    inp = inp.strip('.')
    wikidata_endpoint = "https://cat.apis.beeldengeluid.nl/sparql"

    my_SPARQL_query = """
    PREFIX sdo: <https://schema.org/>
    PREFIX gtaa: <http://data.beeldengeluid.nl/gtaa/>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

    SELECT ?name  
    WHERE
    {
    OPTIONAL { """+inp+"""  sdo:name ?name}
    OPTIONAL { """+inp+"""  skos:prefLabel ?name}
    } 
    """
#     print(my_SPARQL_query)
    # set the endpoint 
    sparql_wd = SPARQLWrapper(wikidata_endpoint)
    # set the query
    sparql_wd.setQuery(my_SPARQL_query)
    # set the returned format
    sparql_wd.setReturnFormat(JSON)
#     print(sparql_wd)
#     get the results
    results = sparql_wd.query().convert()
#     print(results)

    for result in results["results"]["bindings"]:
        return result['name']['value']

def doit(text):      
  
  matches = re.findall(r'"(.+?)"',text)
  # matches is now ['String 1', 'String 2', 'String3']
  return ",".join(matches)


def getData(cw,F,ddic,visited,classname=None):
#     print('current node:',cw,getName(cw),'\n')
    print('current : \n',getName(cw),'\n')
    newdic={}
    for k,v in F[cw].items():
        v=v['Label']
        if v not in newdic:
            newdic[v]=[k]
        else:
            newdic[v].append(k)
    
    for key,value in newdic.items():
        print(key)
        for i in value:
#             print(i,'|',getName(i))
            print(getName(i))
        print('\n')
    
    high=getHighScore(F[cw],F)
    
    if classname==None:
        final = choice(high)[0]  
        for c in high:
            if c[0] not in visited:
                final = c[0]
                break
    else:
        classes =[]
        for cl in high:
            f_class = F.nodes[cl[0]]['Class']
            if f_class == classname:
                classes.append((cl[0],cl[1]))

        print("HIGH SCORES",classes)
    
        if len(classes)>0:
            final = choice(classes)[0]  
        #     final = choice(high)[0]
            for c in classes:
                if c[0] not in visited:
                    final = c[0]
                    break
    
        else:
            final = choice(high)[0]  
        #     final = choice(high)[0]
            for c in high:
                if c[0] not in visited:
                    final = c[0]
                    break
        
            
#     print('highest = ',high,'|',getName(high),'\n')
    print('highest = ',getName(final),'\n')
    return final
    
    
# we can get a list of all the scores sorted and we can return that list

# we can maintain 2 lists. One of the ones which had the highest score and the other with all the nodes we have encountered.
# So if we finish all the high score ones we can jump into other

def getHighScore(nbors,F):
    sdic={}
    for k,v in nbors.items():
        sdic[k.strip(',')]=F.nodes[k]['score']
    sort = sorted(sdic.items(), key=lambda x:x[1],reverse=True)
   
    return sort
    

def getChains(l,F,ddic,visited,classname=None,start_node=None):
#     print('Visualizing chains')
    n=0
    while(n!=l):
        if n==0:
            if start_node==None:
                high={}
                for node in F:
                    high[node]=F.nodes[node]['score']
                sort = sorted(high.items(), key=lambda x:x[1],reverse=True)
                top30 = sort[0:50]
                
                while True:
                    start=choice(top30)[0]
                    if start not in visited:
                        break
            else:
                start =  start_node
            
        if (F.nodes[start]['Class'] == 'Creative Work'):
            cw=start
            start =getData(cw,F,ddic,visited,classname)
            visited.append(start)

        else:
            nbors=F[start]
            high=getHighScore(nbors,F)
            cw = high[0][0]
            for c in high:
                if c[0] not in visited:
                    cw = c[0]
                    break

            start =getData(cw,F,ddic,visited,classname)
            visited.append(start)
            
        n=n+1
    return visited

In [74]:
def getNarratives():
    v=[]
    for i in range(10):
        print("---------------------- Narrative #",i+1,'----------------------------')
        res = getChains(4,F,ddic,v,'Thing')
        v.append(res)


In [12]:
# Here is when we start calling our functions:
    
ddic = getDic('./data/items2.txt')

Loading data


In [13]:
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Person','sdo:creator'],['Creative Work','Person','sdo:mentions'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany']]
# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Organization','sdo:productionCompany'],['Creative Work','Person','sdo:creator']]
ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Organization','sdo:provider'],['Creative Work','Thing','sdo:genre'],['Creative Work','Location','sdo:locationCreated'],['Creative Work','Organization','sdo:productionCompany'],['Creative Work','Person','sdo:creator']]

# ls = [['Creative Work','Person','sdo:byArtist'],['Creative Work','Person','sdo:creator'],['Creative Work','Person','sdo:mentions']]
G=genGraph(ddic,ls)


Generating sub-graphs


In [63]:
s = getScore(G)


Calculating scores


In [64]:
saveScores(s)

Saving scores


In [14]:
F=joinGraph(G)
# When i am combining graphs i notice that if two nodes are the same and they are attached by different properties then the 
# weight gets overwritten by the label

Producing final graph
0.0003819353476003525
0.0003527694662296174
0.0012729161186616402
0.0009429499654517726
0.006689258260430889
0.0009614123463170654


In [76]:
getNarratives()

---------------------- Narrative # 1 ----------------------------
current : 
 Programma Luganski, Nikolai - Radio Kamerorkest - Bruggen, Frans 

sdo:byArtist
Radio Kamerorkest
Brüggen, Frans
Luganski, Nikolai


sdo:creator
Schubert, Franz
Mozart, Wolfgang Amadeus
Bach, Carl Philipp Emanuel


sdo:locationCreated
Amsterdam


sdo:provider
VARA


HIGH SCORES []
highest =  Mozart, Wolfgang Amadeus 

current : 
 Programma Koten, Frank van - Jeurissen, Herman - Lier, Sonja van - Nyhuis, Annelies - Robert, Geraint - Groot Omroepkoor - Radio Kamerorkest - Montgomery, Kenneth 

sdo:byArtist
Radio Kamerorkest
Groot Omroepkoor
Montgomery, Kenneth
Jeurissen, Herman
Lier, Sonja van
Koten, Frank van


sdo:creator
Haydn, Joseph
Mozart, Wolfgang Amadeus
Bree, Johannes van


sdo:locationCreated
VARA


sdo:provider
NCRV


HIGH SCORES []
highest =  Radio Kamerorkest 

current : 
 Programma Koten, Frank van - Jeurissen, Herman - Lier, Sonja van - Nyhuis, Annelies - Robert, Geraint - Groot Omroepkoor - Radi

Radio Kamerorkest
Bour, Ernest
Klein Omroepkoor
Lloyd Webber, Julian


sdo:creator
Haydn, Joseph
Bach, Johann Sebastian
Mozart, Wolfgang Amadeus
Honegger, Arthur


sdo:locationCreated
Utrecht,


sdo:provider
VOO


HIGH SCORES []
highest =  Haydn, Joseph 

current : 
 Programma Koten, Frank van - Jeurissen, Herman - Lier, Sonja van - Nyhuis, Annelies - Robert, Geraint - Groot Omroepkoor - Radio Kamerorkest - Montgomery, Kenneth 

sdo:byArtist
Radio Kamerorkest
Groot Omroepkoor
Montgomery, Kenneth
Jeurissen, Herman
Lier, Sonja van
Koten, Frank van


sdo:creator
Haydn, Joseph
Mozart, Wolfgang Amadeus
Bree, Johannes van


sdo:locationCreated
VARA


sdo:provider
NCRV


HIGH SCORES []
highest =  Montgomery, Kenneth 

current : 
 Programma Koten, Frank van - Jeurissen, Herman - Lier, Sonja van - Nyhuis, Annelies - Robert, Geraint - Groot Omroepkoor - Radio Kamerorkest - Montgomery, Kenneth 

sdo:byArtist
Radio Kamerorkest
Groot Omroepkoor
Montgomery, Kenneth
Jeurissen, Herman
Lier, Sonja van


highest =  instrumentaal - orkest 

current : 
 Pianoconcert no.27 in Bes gr.t. (KV.595) 

sdo:byArtist
Radio Kamerorkest
Krol, Roelof
Klein, Jacques


sdo:creator
Mozart, Wolfgang Amadeus


sdo:genre
klassieke muziek
instrumentaal - orkest


sdo:locationCreated
Amsterdam


HIGH SCORES [('gtaa:234346', 509.2065924821052), ('gtaa:263284', 132.80256205702585)]
highest =  klassieke muziek 



In [None]:
high={}

for node in F:
    high[node]=F.nodes[node]['score']
    
sort = sorted(high.items(), key=lambda x:x[1],reverse=True)
top30 = sort[0:100]

t={}
for i in top30:
    t[getName(i[0])]=i[1]
    
# t