In [1]:
import json
import networkx as nx
with open("hollywood_graph.json") as f:
    graph_dict = json.load(f)

graph = nx.node_link_graph(graph_dict)

In [2]:

print("GRAPH METRICS - HOLLYWOOD\n")
print("Number of nodes:", graph.number_of_nodes())
print("Number of edges:", graph.number_of_edges())

movie_nodes = {n for n, d in graph.nodes(data=True) if d.get("bipartite") == 0}
actor_nodes = {n for n, d in graph.nodes(data=True) if d.get("bipartite") == 1}

print(f"Movies: {len(movie_nodes)}")
print(f"Actors: {len(actor_nodes)}")
print("Graph density:", nx.bipartite.density(graph, list(actor_nodes)))

avg_actors_per_movie = graph.size() / len(movie_nodes)
avg_movies_per_actor = graph.size() / len(actor_nodes)

print(f"Average number of actors per movie: {round(avg_actors_per_movie,2)}")
print(f"Average number of movies per actor: {round(avg_movies_per_actor,2)}")
print("Is there any self loop?", len(list(nx.selfloop_edges(graph))))
if len(list(nx.selfloop_edges(graph))) > 0:
    print(list(nx.selfloop_edges(graph))) 

GRAPH METRICS - HOLLYWOOD

Number of nodes: 131642
Number of edges: 329137
Movies: 20184
Actors: 111458
Graph density: 0.0001463046814930588
Average number of actors per movie: 16.31
Average number of movies per actor: 2.95
Is there any self loop? 0


In [3]:
actors_graph = nx.projected_graph(graph, actor_nodes)
actors_nodes = actors_graph.nodes()
actors_edges = actors_graph.edges()
print(len(actors_graph.edges()))


4872779


In [4]:
print("ASSORTATIVITY BY GENRE - The similarity of genres among actors is: ")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "genre")*100,2), "%")

print("ASSORTATIVITY BY PERIOD - The period similarity among actors is: ")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "period")*100,2), "%")



ASSORTATIVITY BY GENRE - The similarity of genres among actors is: 
30.92 %
ASSORTATIVITY BY PERIOD - The period similarity among actors is: 
80.14 %


In [5]:
import networkit as nk
names = list(actors_graph.nodes())
actors_graph_nk = nk.nxadapter.nx2nk(actors_graph)
print(actors_graph_nk.numberOfNodes())


print("BETWEENNESS CENTRALITY - actors acting as bridges")
betweenness = nk.centrality.KadabraBetweenness(actors_graph_nk, err=0.01)
betweenness.run()
i = 1
for id, score in betweenness.ranking()[:10]:
    print(f"{i}°", names[id])
    i+=1

print("\n\nCLOSENESS CENTRALITY - actors best connected to other actors")
closeness = nk.centrality.TopCloseness(actors_graph_nk, k = 10)
closeness.run()
j = 1
for id in closeness.topkNodesList(includeTrail =True):
    print(f"{j}°", names[id])
    j+=1



111458
BETWEENNESS CENTRALITY - actors acting as bridges
1° actor: Bess Flowers
2° actor: Samuel L. Jackson
3° actor: Danny Trejo
4° actor: Christopher Walken
5° actor: James Franco
6° actor: Morgan Freeman
7° actor: Michael Caine
8° actor: John Carradine
9° actor: Robert De Niro
10° actor: Paul Giamatti


CLOSENESS CENTRALITY - actors best connected to other actors
1° actor: Samuel L. Jackson
2° actor: Christopher Walken
3° actor: Robert De Niro
4° actor: Bruce Willis
5° actor: John Goodman
6° actor: Whoopi Goldberg
7° actor: Susan Sarandon
8° actor: Morgan Freeman
9° actor: Stanley Tucci
10° actor: Donald Sutherland


In [9]:
print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
print("Main component number of nodes:", main_component.numberOfNodes())
print("Main component number of edges:", main_component.numberOfEdges())

print("Analysing the diameter...")
diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.ESTIMATED_PEDANTIC)
diameter.run()
diameter_value = diameter.getDiameter()
print("Lower bound of the diameter:", diameter_value[0])

effective_diameter = nk.distance.EffectiveDiameterApproximation(main_component)
effective_diameter.run()
effective_diameter_value = round(effective_diameter.getEffectiveDiameter(),2)
print("Effective diameter value:", effective_diameter_value)

cc = nk.components.ConnectedComponents(main_component)
cc.run()
print("Connected components of the main component:", cc.numberOfComponents())
print("Computing the average path length...")
distance_sum = 0
valid_couples = 0
for i in range(500):
    random_node = nk.graphtools.randomNode(main_component)
    bfs = nk.distance.BFS(main_component, random_node, storePaths=False)
    bfs.run()
    distances = bfs.getDistances()
    for d in distances:
        if d > 0 and d < 1e10:
            distance_sum += d
            valid_couples += 1
    if i % 50 == 0:
        print("\tDone with round", i)

avg_path_length = distance_sum / (1000*(main_component.numberOfNodes()-1))
print("Average path length:", avg_path_length)

Running a small world analysis on the biggest connected component
Main component number of nodes: 109623
Main component number of edges: 4866246
Analysing the diameter...
Lower bound of the diameter: 13
Effective diameter value: 3.48
Connected components of the main component: 1
Computing the average path length...
	Done with round 0
	Done with round 50
	Done with round 100
	Done with round 150
	Done with round 200
	Done with round 250
	Done with round 300
	Done with round 350
	Done with round 400
	Done with round 450
Average path length: 1.5754581288427505


In [10]:
print("Valid couples:", valid_couples)
print("Average path length:", avg_path_length)

Valid couples: 54811000
Average path length: 1.5754581288427505
