In [13]:
from graph_tool.all import *
import pandas as pd
import os
import numpy as np

In [14]:
import graph_tool as gt

In [11]:
def build_graph(citation_folderpath):
    """Build a weighted graph from a folder of tab-separated edge files.

    Every entry of ``citation_folderpath`` whose name ends in ``.csv`` is read
    in sorted filename order (each filename is printed as it is opened).  Every
    non-blank line must hold at least three tab-separated fields: source node
    id, target node id and edge weight.

    Parameters
    ----------
    citation_folderpath : str
        Directory that contains the ``*.csv`` edge files.

    Returns
    -------
    g : graph_tool.Graph
        Graph holding one edge per non-blank input line.
    node_mapping
        The value returned by ``g.add_edge_list(..., hashed=True)``; iterating
        it yields the original string id of each vertex.
    eweight
        ``"double"`` edge property map filled from the third column.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than three fields, or its weight is not
        a valid float.
    """
    g = gt.Graph()
    eweight = g.new_ep("double")

    edges = []
    for filename in sorted(os.listdir(citation_folderpath)):
        if not filename.endswith('.csv'):
            continue

        print(filename)
        with open(os.path.join(citation_folderpath, filename)) as file:
            for line_no, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank/trailing lines carry no edge; skipping them avoids
                    # an IndexError on the field accesses below.
                    continue

                fields = line.split("\t")
                if len(fields) < 3:
                    raise ValueError(
                        "%s line %d: expected 3 tab-separated fields, got %d"
                        % (filename, line_no, len(fields)))

                edges.append((fields[0], fields[1], float(fields[2])))

    # hashed=True lets graph-tool map the string ids to vertex indices and
    # return the vertex -> id property map.
    node_mapping = g.add_edge_list(edges, eprops=[eweight], hashed=True, hash_type='string')

    return g, node_mapping, eweight


def compute_centralities(graph, nodes, eweight, filename,
                         pr_damping=0.85, katz_alpha=0.01, katz_beta=None):
    """Compute PageRank, Katz and weighted in-degree and write them to a file.

    Parameters
    ----------
    graph : graph_tool.Graph
        Graph to analyse.
    nodes : iterable
        Node labels, one per vertex in vertex-index order (e.g. the second
        value returned by ``build_graph``).  Written to the ``node`` column.
    eweight
        Edge property map used as weight for every measure.
    filename : str
        Path of the tab-separated output file.
    pr_damping : float
        ``damping`` argument forwarded to PageRank.
    katz_alpha, katz_beta
        ``alpha`` / ``beta`` arguments forwarded to Katz.

    Returns
    -------
    None
        The result is written to ``filename`` with columns ``node``,
        ``pagerank``, ``katz`` and ``in_degree_strength``.  ``katz`` is empty
        for vertices outside the component flagged by
        ``gt.topology.label_largest_component``.
    """
    # Do not reuse the name `nodes` here: it holds the caller's labels, which
    # are needed for the output table below.
    vertices = list(graph.vertices())

    largest_comp = gt.topology.label_largest_component(graph)

    print("Initiating PageRank")
    pr = gt.centrality.pagerank(graph, weight=eweight, damping=pr_damping)

    print("Initiating Katz")
    # Keep the view referenced for as long as its result is being read.
    component_view = gt.GraphView(graph, vfilt=largest_comp)
    katz_centrality = gt.centrality.katz(component_view, weight=eweight,
                                         alpha=katz_alpha, beta=katz_beta)

    # `.a` is the unfiltered array, indexed like `largest_comp.a`, so masking it
    # aligns each score with its own vertex in one pass (the former
    # `list.pop(0)` loop was quadratic and relied on iteration order).
    in_component = np.asarray(largest_comp.a) == 1
    katz_scores = np.where(in_component,
                           np.asarray(katz_centrality.a, dtype=float),
                           np.nan)

    print("Initiating in-degree")
    in_degree = graph.get_in_degrees(vertices, eweight=eweight)

    print("Finished centrality computations")

    df = pd.DataFrame({
        'node': list(nodes),
        'pagerank': list(pr.get_array()),
        'katz': katz_scores,
        'in_degree_strength': list(in_degree),
    })
    df.to_csv(filename, index=False, sep="\t")

    return

In [25]:
# Load the Economics 2010 citation network.  Despite the ".txt" suffix the path
# is handed to build_graph as a folder, whose *.csv entries are read.
graph, nodes, eweight = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2010.txt')

part-00000-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00001-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00002-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00003-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00004-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00005-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00006-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00007-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00008-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00009-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00010-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00011-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00012-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00013-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00014-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00015-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00016-dfe3b6ca-d908-440a-97e4-903ced3a6b2a-c000.csv
part-00017-dfe3b6ca-d908-440a-9

In [26]:
# Compute PageRank, Katz and weighted in-degree for the loaded graph and write
# them as a tab-separated table (this run targets a "...REMOVE.csv" path).
compute_centralities(graph, nodes, eweight, '/home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2010REMOVE.csv', 
                     pr_damping=0.85, katz_alpha=0.01, katz_beta=None)

Initiating PageRank
Initiating Katz


  vprop.fa = vprop.fa / numpy.linalg.norm(vprop.fa)


Initiating in-degree
Finished centrality computations


In [10]:
graph

<Graph object, directed, with 586877 vertices and 16279775 edges, at 0x7f3f75755e20>

In [11]:
nodes[:10]

[(0, '2460612036'),
 (1, '2615404153'),
 (2, '2227642548'),
 (3, '2585687038'),
 (4, '2144596701'),
 (5, '2151591788'),
 (6, '2902347833'),
 (7, '2111466348'),
 (8, '299227070'),
 (9, '317508920')]

In [5]:
# Write the centrality table for the currently loaded graph to the Economics
# "...Centrality.csv" output path (tab-separated, see compute_centralities).
compute_centralities(graph, nodes, eweight, '/home/laal/MAG/DATA/NETWORKS/SimpleWeightEconomics2010Centrality.csv', 
                     pr_damping=0.85, katz_alpha=0.01, katz_beta=None)

Initiating PageRank
Initiating Katz


  vprop.fa = vprop.fa / numpy.linalg.norm(vprop.fa)


Initiating in-degree
Finished centrality computations


In [6]:
# Load the Mathematics 2010 citation network; this rebinds graph/nodes/eweight,
# replacing whatever graph was loaded before.
graph, nodes, eweight = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightMathematics2010.txt')

part-00000-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00001-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00002-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00003-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00004-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00005-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00006-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00007-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00008-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00009-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00010-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00011-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00012-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00013-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00014-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00015-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00016-891436b4-3c9d-41aa-a327-d5b880805374-c000.csv
part-00017-891436b4-3c9d-41aa-a

In [7]:
# Write the centrality table for the currently loaded graph to the Mathematics
# "...Centrality.csv" output path.
compute_centralities(graph, nodes, eweight, '/home/laal/MAG/DATA/NETWORKS/SimpleWeightMathematics2010Centrality.csv', 
                     pr_damping=0.85, katz_alpha=0.01, katz_beta=None)

Initiating PageRank
Initiating Katz
Initiating in-degree
Finished centrality computations


In [None]:
# Load the Psychology 2010 citation network; this rebinds graph/nodes/eweight.
# NOTE(review): this cell has no execution count, so the load may not have
# completed when the notebook was saved - confirm before relying on `graph`.
graph, nodes, eweight = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightPsychology2010.txt')

part-00000-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00001-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00002-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00003-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00004-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00005-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00006-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00007-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00008-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00009-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00010-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00011-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00012-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00013-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00014-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00015-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00016-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00017-600a6096-e2ad-415b-a

part-00144-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00145-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00146-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00147-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00148-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00149-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00150-600a6096-e2ad-415b-a299-248329e67606-c000.csv
part-00151-600a6096-e2ad-415b-a299-248329e67606-c000.csv


In [9]:
# Weighted PageRank on whichever graph is currently bound to `graph`.
# `pagerank` is presumably provided by the `from graph_tool.all import *` cell.
pr = pagerank(graph, weight=eweight)

In [None]:
# Weighted betweenness on the current graph; the call result is unpacked into a
# per-node and a per-edge value.
# NOTE(review): this cell has no execution count - it may never have finished.
between_nodes, between_edges = betweenness(graph, weight=eweight)

In [5]:
# build_graph returns three values (graph, node labels, edge weights); unpacking
# into only two names raises "ValueError: too many values to unpack".
# NOTE(review): `math` shadows the stdlib module name should it ever be imported.
math, math_nodes, math_eweight = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightMathematics2010.txt')

In [None]:
# build_graph returns three values (graph, node labels, edge weights); unpacking
# into only two names raises "ValueError: too many values to unpack".
psych, psych_nodes, psych_eweight = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightPsychology2010.txt')

In [14]:
# NOTE(review): `economics` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line would raise NameError - confirm origin.
economics.list_properties()

In [11]:
list(pr.get_array())

[4.909814149384813e-07,
 4.291593501695631e-06,
 4.798821184344864e-07,
 0.00317167857399106,
 1.3726257996183562e-06,
 4.553235877268791e-05,
 4.798821184344864e-07,
 0.0007932331557870794,
 7.066320674308627e-05,
 0.0005211603088568134,
 5.06611251690659e-07,
 0.00031065669494645414,
 7.157047362285487e-06,
 3.34394920689878e-05,
 5.102268157998512e-06,
 8.669881535713197e-07,
 4.226694088034049e-06,
 7.76499659562883e-07,
 4.036028955738189e-06,
 0.00010743858723148067,
 1.0302637284890987e-06,
 4.461814095684923e-06,
 4.798821184344864e-07,
 0.00133838163135433,
 2.6650591846026917e-05,
 7.66033007730338e-05,
 1.5943482807247395e-06,
 8.621618699435287e-07,
 4.125833865751337e-06,
 0.0001070661328431989,
 1.8831229512951058e-06,
 1.332854039023946e-06,
 1.7498589943545947e-05,
 0.00038887427742354273,
 1.4383625911455657e-05,
 1.2842544297629854e-06,
 1.1912473942357469e-05,
 4.798821184344864e-07,
 5.883426400093035e-07,
 1.003072607180026e-06,
 0.00010217526986772005,
 0.00027832

In [22]:
eweight.get_array()

PropertyArray([0.16666667, 0.5       , 0.25      , ..., 0.25      ,
               1.        , 0.5       ])

In [33]:
nodes = economics_nodes.get_2d_array([0])

In [35]:
nodes.shape

(1, 5982150)

In [20]:
nodelist = list(economics_nodes)

In [21]:
nodelist[:10]

['2460612036',
 '2615404153',
 '2227642548',
 '2585687038',
 '2144596701',
 '2151591788',
 '2902347833',
 '2111466348',
 '299227070',
 '317508920']

In [14]:
    g = gt.Graph()
    eweight = g.new_ep("double")

In [15]:
g.add_edge_list(economics, eprops=[eweight], hashed=True)

<VertexPropertyMap object with value type 'string', for Graph 0x7f3f987cd940, at 0x7f3e18386370>

In [11]:
# Minimal add_edge_list example: each tuple is (source, target, weight, layer);
# the two trailing values are stored in the property maps listed in `eprops`.
edge_list = [(10, 11, .3, 10), (22, 11, .1, 0), (10, 22, .4, 42)]
g = gt.Graph()
eweight = g.new_ep("double")
# `elayer` was previously used without being created in this cell, which only
# worked through leftover kernel state; define it so the cell runs on its own.
elayer = g.new_ep("int")
g.add_edge_list(edge_list, eprops=[eweight, elayer])
print(eweight.fa)

[0.3 0.1 0.4]


In [12]:
g.get_edges()

array([[10, 11],
       [10, 22],
       [22, 11]])

In [6]:
g = Graph()

In [24]:
def build_graph(citation_folderpath):
    """Build a weighted graph from a folder of tab-separated edge files.

    Every entry of ``citation_folderpath`` whose name ends in ``.csv`` is read
    in sorted filename order (each filename is printed as it is opened).  Every
    non-blank line must hold at least three tab-separated fields: source node
    id, target node id and edge weight.  A vertex is created the first time a
    node id is seen.

    Parameters
    ----------
    citation_folderpath : str
        Directory that contains the ``*.csv`` edge files.

    Returns
    -------
    g : graph_tool.Graph
        Graph holding one edge per non-blank input line.
    nodes : list of str
        Node ids in vertex-creation order, so ``nodes[i]`` labels the i-th
        vertex added to ``g``.
    eweight
        ``"double"`` edge property map; its array is set from the weights in
        edge-insertion order.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than three fields, or its weight is not
        a valid float.
    """
    g = gt.Graph()
    eweight = g.new_ep("double")

    vertex_of = {}   # node id -> vertex returned by g.add_vertex()
    nodes = []       # node ids, appended in the order vertices are created
    weights = []     # edge weights, appended in the order edges are added

    def vertex_for(label):
        """Return the vertex for `label`, creating it on first sight."""
        vertex = vertex_of.get(label)
        if vertex is None:
            vertex = g.add_vertex()
            vertex_of[label] = vertex
            nodes.append(label)
        return vertex

    for filename in sorted(os.listdir(citation_folderpath)):
        if not filename.endswith('.csv'):
            continue

        print(filename)
        with open(os.path.join(citation_folderpath, filename)) as file:
            for line_no, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank/trailing lines carry no edge; skipping them avoids
                    # an IndexError on the field accesses below.
                    continue

                fields = line.split("\t")
                if len(fields) < 3:
                    raise ValueError(
                        "%s line %d: expected 3 tab-separated fields, got %d"
                        % (filename, line_no, len(fields)))

                # Parse the weight first so a bad value leaves no stray vertex.
                weight = float(fields[2])

                from_vertex = vertex_for(fields[0])
                to_vertex = vertex_for(fields[1])

                weights.append(weight)
                g.add_edge(from_vertex, to_vertex)

    eweight.a = np.array(weights, dtype=float)

    return g, nodes, eweight


def compute_centralities(graph, node_mapping, eweight, filename,
                         pr_damping=0.85, katz_alpha=0.01, katz_beta=None):
    """Compute PageRank, Katz and weighted in-degree and write them to a file.

    Parameters
    ----------
    graph : graph_tool.Graph
        Graph to analyse.
    node_mapping : iterable
        Node labels, one per vertex in vertex-index order (e.g. the second
        value returned by ``build_graph``).  Written to the ``node`` column.
    eweight
        Edge property map used as weight for every measure.
    filename : str
        Path of the tab-separated output file.
    pr_damping : float
        ``damping`` argument forwarded to PageRank.
    katz_alpha, katz_beta
        ``alpha`` / ``beta`` arguments forwarded to Katz.

    Returns
    -------
    None
        The result is written to ``filename`` with columns ``node``,
        ``pagerank``, ``katz`` and ``in_degree_strength``.  ``katz`` is empty
        for vertices outside the component flagged by
        ``gt.topology.label_largest_component``.
    """
    vertices = list(graph.vertices())

    largest_comp = gt.topology.label_largest_component(graph)

    print("Initiating PageRank")
    pr = gt.centrality.pagerank(graph, weight=eweight, damping=pr_damping)

    print("Initiating Katz")
    # Keep the view referenced for as long as its result is being read.
    component_view = gt.GraphView(graph, vfilt=largest_comp)
    katz_centrality = gt.centrality.katz(component_view, weight=eweight,
                                         alpha=katz_alpha, beta=katz_beta)

    # `.a` is the unfiltered array, indexed like `largest_comp.a`, so masking it
    # aligns each score with its own vertex in one pass (the former
    # `list.pop(0)` loop was quadratic and relied on iteration order).
    in_component = np.asarray(largest_comp.a) == 1
    katz_scores = np.where(in_component,
                           np.asarray(katz_centrality.a, dtype=float),
                           np.nan)

    print("Initiating in-degree")
    in_degree = graph.get_in_degrees(vertices, eweight=eweight)

    print("Finished centrality computations")

    df = pd.DataFrame({
        'node': list(node_mapping),
        'pagerank': list(pr.get_array()),
        'katz': katz_scores,
        'in_degree_strength': list(in_degree),
    })
    df.to_csv(filename, index=False, sep="\t")

    return

In [5]:
bg = build_graph('/home/laal/MAG/DATA/NETWORKS/SimpleWeightPsychology2010.txt')

<EdgePropertyMap object with value type 'double', for a non-existent graph, at 0x7fd2dfb90f70>

In [15]:
g = Graph()

In [16]:
v1 = g.add_vertex()

In [17]:
dir(v1)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__in_degree',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__out_degree',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '__weighted_in_degree',
 '__weighted_out_degree',
 'all_edges',
 'all_neighbors',
 'all_neighbours',
 'graph_ptr',
 'graph_type',
 'in_degree',
 'in_edges',
 'in_neighbors',
 'in_neighbours',
 'is_valid',
 'out_degree',
 'out_edges',
 'out_neighbors',
 'out_neighbours']

In [18]:
v1.__int__()

0

In [19]:
int(v1)

0

In [23]:
eweight.a.shape

(16279775,)

In [12]:
spark

NameError: name 'spark' is not defined