In [10]:
import json
import networkx as nx

# Load the Hollywood graph from its node-link JSON serialization.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters
# (e.g. accented names) on systems whose default is not UTF-8.
with open("hollywood_graph.json", encoding="utf-8") as f:
    graph_dict = json.load(f)

# Rebuild the networkx graph object from the node-link dictionary.
graph = nx.node_link_graph(graph_dict)

In [11]:

# --- Headline metrics of the bipartite movie/actor graph ---------------------
print("GRAPH METRICS - HOLLYWOOD\n")
print("Number of nodes:", graph.number_of_nodes())
print("Number of edges:", graph.number_of_edges())

# Partition the nodes by their "bipartite" attribute: 0 marks the movie side,
# 1 marks the actor side. Nodes without the attribute land in neither set.
bipartite_side = nx.get_node_attributes(graph, "bipartite")
movie_nodes = {node for node, side in bipartite_side.items() if side == 0}
actor_nodes = {node for node, side in bipartite_side.items() if side == 1}

print(f"Movies: {len(movie_nodes)}")
print(f"Actors: {len(actor_nodes)}")
print("Graph density:", nx.bipartite.density(graph, list(actor_nodes)))

# The edge count divided by the size of each node set gives the mean number
# of edges per node on that side.
total_edges = graph.size()
avg_actors_per_movie = total_edges / len(movie_nodes)
avg_movies_per_actor = total_edges / len(actor_nodes)

print(f"Average number of actors per movie: {round(avg_actors_per_movie, 2)}")
print(f"Average number of movies per actor: {round(avg_movies_per_actor, 2)}")

# Sanity check: report how many self loops exist and list them if there are any.
self_loops = list(nx.selfloop_edges(graph))
print("Is there any self loop?", len(self_loops))
if self_loops:
    print(self_loops)


GRAPH METRICS - HOLLYWOOD

Number of nodes: 131642
Number of edges: 329137
Movies: 20184
Actors: 111458
Graph density: 0.0001463046814930588
Average number of actors per movie: 16.31
Average number of movies per actor: 2.95
Is there any self loop? 0


In [12]:
# Project the bipartite graph onto the actor side with
# weighted_projected_graph, then expose its node and edge views.
actors_graph = nx.bipartite.weighted_projected_graph(graph, actor_nodes)
actors_nodes = actors_graph.nodes()
actors_edges = actors_graph.edges()

projection_edge_count = len(actors_graph.edges())
projection_density = nx.density(actors_graph)
print("Number of edges in the actors projection graph:", projection_edge_count)
print("Density of the actors projection graph:", projection_density)


Number of edges in the actors projection graph: 4872779
Density of the actors projection graph: 0.0007844912706248758


In [15]:
def _count_label(values_by_node, label):
    """Return how many nodes carry exactly `label` as their attribute value."""
    return sum(1 for value in values_by_node.values() if value == label)


print("Graph characteristics")
total_actors = actors_graph.number_of_nodes()

# Gender split. Each class is counted explicitly: deriving one class as
# "total minus the other" silently files every actor with a missing or
# unexpected label under that class.
gender = nx.get_node_attributes(actors_graph, 'gender')
male = _count_label(gender, 'male')
# NOTE(review): assumes the dataset labels actresses as 'female' -- confirm.
female = _count_label(gender, 'female')
gender_unclassified = total_actors - male - female
print("Male actors:", male)
print("Female actors:", female)
if gender_unclassified:
    # Anything left over has no 'gender' attribute or a different label.
    print("Actors with other or missing gender:", gender_unclassified)

# Period split, handled the same way as gender.
period = nx.get_node_attributes(actors_graph, 'period')
old = _count_label(period, "old_hollywood")
# NOTE(review): assumes the complementary label is 'new_hollywood' -- confirm.
new = _count_label(period, "new_hollywood")
period_unclassified = total_actors - old - new
print("Old hollywood actors:", old)
print("New hollywood actors:", new)
if period_unclassified:
    print("Actors with other or missing period:", period_unclassified)

# TODO: check whether actresses appear in as many movies as male actors.

Graph characteristics
Male actors: 26676
Female actors: 84782
Old hollywood actors: 15054
New hollywood actors: 96404


In [5]:
# Attribute assortativity on the actor projection, printed as a percentage
# rounded to two decimals. Each headline is printed before its coefficient is
# computed, so the label shows up even while the computation is running.
for headline, attribute in (
    ("ASSORTATIVITY BY GENRE - The similarity of genres among actors is: ", "genre"),
    ("ASSORTATIVITY BY PERIOD - The period similarity among actors is: ", "period"),
):
    print(headline)
    coefficient = nx.attribute_assortativity_coefficient(actors_graph, attribute)
    print(round(coefficient * 100, 2), "%")



ASSORTATIVITY BY GENRE - The similarity of genres among actors is: 
30.92 %
ASSORTATIVITY BY PERIOD - The period similarity among actors is: 
80.14 %


In [6]:
import networkit as nk

# Fix the number of threads networkit may use.
nk.setNumberOfThreads(7)

# Convert the actor projection into a networkit graph.
actors_graph_nk = nk.nxadapter.nx2nk(actors_graph)

# Lookup table from integer position to networkx node label.
# NOTE(review): later cells index this list with networkit node ids, which
# presumes nx2nk numbers nodes in actors_graph.nodes() order -- confirm.
names = [node for node in actors_graph.nodes()]

print(actors_graph_nk.numberOfNodes())

111458


In [7]:
# Top-10 actors by two centrality measures on the networkit actor graph.
# Loop variables are named node_id rather than id: at notebook top level a
# loop variable called `id` would overwrite the builtin id() for every cell
# executed afterwards.

print("BETWEENNESS CENTRALITY - actors acting as bridges")
# KadabraBetweenness is an approximation; err=0.01 is the error bound passed in.
# NOTE(review): presumably randomized, so the ranking may differ slightly
# between runs unless networkit's seed is fixed -- confirm.
betweenness = nk.centrality.KadabraBetweenness(actors_graph_nk, err=0.01)
betweenness.run()
for rank, (node_id, _score) in enumerate(betweenness.ranking()[:10], start=1):
    print(f"{rank}°", names[node_id])

print("\n\nCLOSENESS CENTRALITY - actors best connected to other actors")
closeness = nk.centrality.TopCloseness(actors_graph_nk, k=10)
closeness.run()
# NOTE(review): with includeTrail=True the list presumably can exceed k entries
# when nodes tie with the k-th one -- confirm against the networkit docs.
for rank, node_id in enumerate(closeness.topkNodesList(includeTrail=True), start=1):
    print(f"{rank}°", names[node_id])



BETWEENNESS CENTRALITY - actors acting as bridges
1° actor: Samuel L. Jackson
2° actor: Bess Flowers
3° actor: Danny Trejo
4° actor: Robert De Niro
5° actor: John Carradine
6° actor: James Franco
7° actor: Morgan Freeman
8° actor: Bruce Willis
9° actor: Christopher Walken
10° actor: Keith David


CLOSENESS CENTRALITY - actors best connected to other actors
1° actor: Samuel L. Jackson
2° actor: Christopher Walken
3° actor: Robert De Niro
4° actor: Bruce Willis
5° actor: John Goodman
6° actor: Whoopi Goldberg
7° actor: Susan Sarandon
8° actor: Morgan Freeman
9° actor: Stanley Tucci
10° actor: Donald Sutherland


In [1]:
# Small-world analysis (diameter and average shortest-path length) on the
# largest connected component of the actor graph.
# Requires `nk` and `actors_graph_nk` from the networkit conversion cell, so
# that cell must have been executed first in the current kernel.
print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
# Renumber node ids to a contiguous 0..n-1 range.
main_component = nk.graphtools.getCompactedGraph(main_component, nk.graphtools.getContinuousNodeIds(main_component))

component_nodes = main_component.numberOfNodes()
component_edges = main_component.numberOfEdges()
print("Main component number of nodes:", component_nodes)
print("Main component number of edges:", component_edges)
if component_nodes > 1:
    # Density of an undirected simple graph: 2m / (n * (n - 1)).
    # Computed by hand because nx.density() does not accept a networkit graph.
    print("Main component density:", 2 * component_edges / (component_nodes * (component_nodes - 1)))

print("Analysing the diameter...")
diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
diameter.run()
# getDiameter() returns a pair; its first element is the value reported here.
diameter_value = diameter.getDiameter()
print("Diameter:", diameter_value[0])

print("Computing the average path length...")
total_distance_sum = 0
total_valid_pairs = 0

# One BFS per source node. This is an all-pairs computation and can take a
# long time on a large component.
for node in main_component.iterNodes():
    bfs = nk.distance.BFS(main_component, node)
    bfs.run()
    distances = bfs.getDistances()
    total_distance_sum += sum(distances)
    # The distance list also holds the source itself (distance 0). That entry
    # is not a pair of distinct nodes, so it must not count in the denominator;
    # counting it averages over n*n instead of n*(n-1) and biases the mean low.
    total_valid_pairs += len(distances) - 1

if total_valid_pairs > 0:
    avg_path_length = total_distance_sum / total_valid_pairs
else:
    # Fewer than two nodes: no pairs exist, so the average is undefined.
    avg_path_length = float("nan")
print("Average path length:", avg_path_length)

Running a small world analysis on the biggest connected component


NameError: name 'nk' is not defined