In [18]:
import json
import networkx as nx

with open("hollywood_graph.json") as f:
    graph_dict = json.load(f)

graph = nx.node_link_graph(graph_dict)

In [19]:
# create custom projection
actor_nodes = [n for n, d in graph.nodes(data=True) if d.get("bipartite") == 1]
print(actor_nodes[0])


actor: Tom Hanks


In [20]:
actors_graph = nx.bipartite.weighted_projected_graph(graph, actor_nodes)
actors_nodes = actors_graph.nodes()
actors_edges = actors_graph.edges()
print("Number of edges in the actors projection graph:", len(actors_graph.edges()))
print("Density of the actors projection graph:",nx.density(actors_graph))


Number of edges in the actors projection graph: 4872779
Density of the actors projection graph: 0.0007844912706248758


In [6]:
print("Graph characteristics")
gender = nx.get_node_attributes(actors_graph, 'gender')
male = 0
for el in gender.values():
    if el== 'male':
        male += 1
print("Male actors:", male)
print("Female actors:", len(actor_nodes) - male)

period = nx.get_node_attributes(actors_graph, 'period')
old = 0
for el in period.values():
    if el == "old_hollywood":
        old +=1
print("Old hollywood actors:", old)
print("New hollywood actors:", len(actor_nodes) - old)

# vedere se attrici hanno lo stesso numero di film di uomini
# successo del film in base al genere del primo attore del cast

Graph characteristics
Male actors: 26676
Female actors: 84782
Old hollywood actors: 15054
New hollywood actors: 96404


In [7]:
print("ASSORTATIVITY BY GENRE")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "genre"),2), "%")

print("ASSORTATIVITY BY PERIOD")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "period"),2), "%")



ASSORTATIVITY BY GENRE

Caching the list of root modules, please wait!
(This will only be done once - type '%rehashx' to reset cache!)

0.31 %
ASSORTATIVITY BY PERIOD
0.8 %


In [21]:
import networkit as nk
nk.setNumberOfThreads(7)
names = list(actors_graph.nodes())
actors_graph_nk = nk.nxadapter.nx2nk(actors_graph)
print(actors_graph_nk.numberOfNodes())

111458


In [None]:
print("BETWEENNESS CENTRALITY - actors acting as bridges")
betweenness = nk.centrality.KadabraBetweenness(actors_graph_nk, err=0.01)
betweenness.run()
i = 1
for id, score in betweenness.ranking()[:10]:
    print(f"{i}°", names[id])
    i+=1

print("\n\nCLOSENESS CENTRALITY - actors best connected to other actors")
closeness = nk.centrality.TopCloseness(actors_graph_nk, k = 10)
closeness.run()
j = 1
for id in closeness.topkNodesList(includeTrail =True):
    print(f"{j}°", names[id])
    j+=1



BETWEENNESS CENTRALITY - actors acting as bridges
1° actor: Samuel L. Jackson
2° actor: Bess Flowers
3° actor: Danny Trejo
4° actor: Robert De Niro
5° actor: John Carradine
6° actor: James Franco
7° actor: Morgan Freeman
8° actor: Bruce Willis
9° actor: Christopher Walken
10° actor: Keith David


CLOSENESS CENTRALITY - actors best connected to other actors
1° actor: Samuel L. Jackson
2° actor: Christopher Walken
3° actor: Robert De Niro
4° actor: Bruce Willis
5° actor: John Goodman
6° actor: Whoopi Goldberg
7° actor: Susan Sarandon
8° actor: Morgan Freeman
9° actor: Stanley Tucci
10° actor: Donald Sutherland


In [7]:
import gc

# Elimina i vecchi grafi che non ti servono più
del graph
#del actors_graph
# del nome_della_variabile

# Forza la liberazione della memoria
gc.collect()

NameError: name 'graph' is not defined

In [22]:
%%time
import networkit as nk
print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
main_component = nk.graphtools.getCompactedGraph(main_component, nk.graphtools.getContinuousNodeIds(main_component))


#main_component_nodes = list(nx.connected_components(actors_graph))[0]
#main_component = actors_graph.subgraph(main_component_nodes)
#del actors_graph
"""print("Is it a small-world graph?")
omega = nx.omega(main_component, niter=5, nrand=10, seed=None)
print(omega)
if  omega in range(-0.1, 0.1):
    print("Yes")"""

Running a small world analysis on the biggest connected component
CPU times: user 1.41 s, sys: 24.2 ms, total: 1.43 s
Wall time: 1.5 s


'print("Is it a small-world graph?")\nomega = nx.omega(main_component, niter=5, nrand=10, seed=None)\nprint(omega)\nif  omega in range(-0.1, 0.1):\n    print("Yes")'

In [None]:

# print("Main component number of nodes:", main_component.numberOfNodes())
# print("Main component number of edges:", main_component.numberOfEdges())
# #print("Main component density:", nx.density(main_component))
# print("Analysing the diameter...")
# diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
# diameter.run()
# diameter_value = diameter.getDiameter()
# print("Diameter:", diameter_value[0])

# print("Computing the average path length...")
# total_distance_sum = 0
# total_valid_pairs = 0

# #for node in main_component.iterNodes():
# for i in range(500):
#     node = nk.random_node(main_component)
#     bfs = nk.distance.BFS(main_component, node)
#     bfs.run()
#     distances = bfs.getDistances()
#     total_distance_sum += sum(distances)
#     total_valid_pairs += len(distances)


# avg_path_length = total_distance_sum / total_valid_pairs
# print("Average path length:", avg_path_length)

Running a small world analysis on the biggest connected component


NameError: name 'nk' is not defined

In [24]:
import numpy as np
import random
import math

print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
main_component = nk.graphtools.getCompactedGraph(main_component, nk.graphtools.getContinuousNodeIds(main_component))
print("Main component number of nodes:", main_component.numberOfNodes())
print("Main component number of edges:", main_component.numberOfEdges())
#print("Main component density:", nx.density(main_component))
print("Analysing the diameter...")
diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
diameter.run()
diameter_value = diameter.getDiameter()
print("Diameter:", diameter_value[0])

print("Computing the average path length (sampled)...")
all_nodes = list(main_component.iterNodes())
sample_size = min(40000, main_component.numberOfNodes())
print(
    f"Sampling {sample_size} nodes representing {sample_size / main_component.numberOfNodes() * 100:.2f}% of all nodes")
sampled_nodes = random.sample(all_nodes, sample_size)

batch_size = 200
num_batches = math.ceil(sample_size / batch_size)
total_distance_sum = 0
total_valid_pairs = 0
print("Number of batches:", num_batches)
#for node in main_component.iterNodes():
for batch_iter in range(num_batches):
    start_idx = batch_iter * batch_size
    end_idx = min((batch_iter + 1) * batch_size, sample_size)
    batch_nodes = sampled_nodes[start_idx:end_idx]

    spsp = nk.distance.SPSP(main_component, batch_nodes)
    spsp.run()
    distances_from_source = spsp.getDistances()

    distances_array = np.array(distances_from_source)
    valid_mask = distances_array > 0
    total_distance_sum += distances_array[valid_mask].sum()
    total_valid_pairs += valid_mask.sum()

    del spsp
    del distances_array
    if batch_iter % 10 == 0 and batch_iter != 0:
        print("Finished batch:", batch_iter)

avg_path_length = total_distance_sum / total_valid_pairs
print("Average sampled path length:", avg_path_length)


Running a small world analysis on the biggest connected component
Main component number of nodes: 109623
Main component number of edges: 4866246
Analysing the diameter...
Diameter: 9
Computing the average path length (sampled)...
Sampling 40000 nodes representing 36.49% of all nodes
Number of batches: 200
Finished batch: 10
Finished batch: 20
Finished batch: 30
Finished batch: 40
Finished batch: 50
Finished batch: 60
Finished batch: 70
Finished batch: 80
Finished batch: 90
Finished batch: 100
Finished batch: 110
Finished batch: 120
Finished batch: 130
Finished batch: 140
Finished batch: 150
Finished batch: 160
Finished batch: 170
Finished batch: 180
Finished batch: 190
Average sampled path length: 3.1423946247103682


In [None]:
print("Computing the exact average path length...")
all_nodes = list(main_component.iterNodes())
n_nodes = main_component.numberOfNodes()
batch_size = 500
num_batches = math.ceil(n_nodes / batch_size)
total_distance_sum = 0
total_valid_pairs = n_nodes * (n_nodes - 1)
print("Number of batches:", num_batches)
#for node in main_component.iterNodes():
for batch_iter in range(num_batches):
    start_idx = batch_iter * batch_size
    end_idx = min((batch_iter + 1) * batch_size, n_nodes)
    batch_nodes = all_nodes[start_idx:end_idx]

    spsp = nk.distance.SPSP(main_component, batch_nodes)
    spsp.run()
    distances_from_source = spsp.getDistances()

    distances_array = np.array(distances_from_source)
    total_distance_sum += distances_array.sum()

    del spsp
    del distances_array
    if batch_iter % 10 == 0:
        print("Finished batch:", batch_iter)

exact_avg_path_length = total_distance_sum / total_valid_pairs
print("Exact average path length:", exact_avg_path_length)

In [25]:
clustering_coeff = nk.globals.clustering(main_component)
print("Clustering coefficient:", clustering_coeff)

Clustering coefficient: 0.7760792148006601
