In [1]:
import json
import networkx as nx

with open("hollywood_graph.json") as f:
    graph_dict = json.load(f)

graph = nx.node_link_graph(graph_dict)

In [2]:
print(list(graph.neighbors("movie: From the Earth to the Moon")))

['actor: Tom Hanks', 'actor: Nick Searcy', 'actor: Lane Smith', 'actor: David Andrews', 'actor: Daniel Hugh Kelly', 'actor: Stephen Root', 'actor: David Clyde Carr', 'actor: Tim Daly', 'actor: Steve Hofvendahl', "actor: Conor O'Farrell", 'actor: Brett Cullen', 'actor: Cary Elwes', 'actor: Ben Marley']


In [2]:
actors_nodes = [n for n, d in graph.nodes(data=True) if d.get("bipartite") == 1]
movie_nodes = [n for n, d in graph.nodes(data=True) if d.get("bipartite") == 0]



In [3]:
import networkx as nx
from networkx.algorithms import bipartite
import itertools

# weighted projection
actors_graph = bipartite.weighted_projected_graph(graph, actors_nodes)
actors_projection_edges = actors_graph.edges()
for edge in actors_projection_edges:
    attr = actors_graph.edges[edge]
    attr["movies"] = []


for movie in movie_nodes:
    attr = graph.nodes[movie]  
    cast = list(graph.neighbors(movie))
    for u, v in itertools.combinations(cast, 2):
        if actors_graph.has_edge(u, v):
            if not attr['release_date'] is None:
                if isinstance(attr['release_date'],int):
                    if 'earliest_contact' not in actors_graph[u][v]:
                            actors_graph[u][v]['earliest_contact'] = int(attr['release_date'])
                    else:
                        actors_graph[u][v]['earliest_contact'] = min(actors_graph[u][v]['earliest_contact'],int(attr['release_date']))
                    actors_graph[u][v]["movies"].append((movie, int(attr['release_date'])))
                else:
                    print(attr['release_date'])

In [4]:
actors_projection_nodes = actors_graph.nodes()
print("Number of edges in the actors projection graph:", len(actors_projection_edges))
print("Density of the actors projection graph:",nx.density(actors_graph))


Number of edges in the actors projection graph: 4350558
Density of the actors projection graph: 0.0007004164919745423


In [5]:
# analysis of actors success

# per ogni attore vedo i suoi neighbors
# vedo l'edge assegnato alla coppia
# vedo se il contatto è stato pre-breakthrough


for actor in list(actors_projection_nodes):
    bk = actors_graph.nodes[actor]["breakthrough"]
    if not bk is None:
        pre_br_contacts = dict()
        bk = int(bk)
        pre_br_contacts["contacts_before_breakthrough"] = []
        for neighbor in actors_graph.neighbors(actor):
            if actors_graph.has_edge(u, v):
                if "earliest_contact" in actors_graph[actor][neighbor]:
                    try:
                        if int(actors_graph[actor][neighbor]["earliest_contact"]) < bk:
                            pre_br_contacts["contacts_before_breakthrough"].append((neighbor, int(actors_graph[actor][neighbor]["earliest_contact"])))
                    except:
                        print(actors_graph[actor][neighbor].keys())
    actors_graph.nodes[actor].update(pre_br_contacts)

print(actors_graph.nodes["actor: Tom Hanks"])


to_dump = nx.readwrite.json_graph.node_link_data(actors_graph)
with open("actors_projection_graph.json", "w") as f:
    json.dump(to_dump, f, indent = 4)




    


{'gender': 'male', 'countries': ['Germany', 'Malta', 'India', 'Australia', 'United Kingdom', 'Italy', 'Hong Kong', 'United States of America', 'France', 'Mexico', 'Hungary', 'Singapore'], 'avg_movie_revenue_2020_$': 97984814, 'top_ten': [[1995, 'movie: Toy Story'], [1995, 'movie: Apollo 13'], [1994, 'movie: Forrest Gump'], [1993, 'movie: Philadelphia'], [1993, 'movie: Sleepless in Seattle'], [1996, 'movie: That Thing You Do!'], [1998, 'movie: Saving Private Ryan'], [1989, "movie: The 'Burbs"], [1984, 'movie: Splash'], [1986, 'movie: The Money Pit'], [1986, 'movie: Nothing in Common'], [1998, "movie: You've Got Mail"], [1988, 'movie: Big'], [1999, 'movie: Return with Honor'], [1999, 'movie: Toy Story 2'], [1990, 'movie: The Bonfire of the Vanities'], [1999, 'movie: The Green Mile'], [1992, 'movie: A League of Their Own'], [1985, 'movie: Volunteers'], [1984, 'movie: Bachelor Party'], [1988, 'movie: Punchline'], [2000, 'movie: Cast Away'], [1989, 'movie: Turner & Hooch'], [1980, "movie: H

In [25]:
print("Graph characteristics")
gender = nx.get_node_attributes(actors_graph, 'gender')
male = 0
for el in gender.values():
    if el== 'male':
        male += 1
print("Male actors:", male)
print("Female actors:", len(actors_projection_nodes) - male)

period = nx.get_node_attributes(actors_graph, 'period')
old = 0
for el in period.values():
    if el == "old_hollywood":
        old +=1
print("Old hollywood actors:", old)
print("New hollywood actors:", len(actors_projection_nodes) - old)

# vedere se attrici hanno lo stesso numero di film di uomini
# successo del film in base al genere del primo attore del cast

Graph characteristics
Male actors: 26676
Female actors: 84782
Old hollywood actors: 15057
New hollywood actors: 96401


In [26]:
print("ASSORTATIVITY BY GENRE")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "genre"),2), "%")

print("ASSORTATIVITY BY PERIOD")
print(round(nx.attribute_assortativity_coefficient(actors_graph, "period"),2), "%")



ASSORTATIVITY BY GENRE
0.33 %
ASSORTATIVITY BY PERIOD
0.9 %


In [21]:
import networkit as nk
nk.setNumberOfThreads(7)
names = list(actors_graph.nodes())
actors_graph_nk = nk.nxadapter.nx2nk(actors_graph)
print(actors_graph_nk.numberOfNodes())

111458


In [None]:
print("BETWEENNESS CENTRALITY - actors acting as bridges")
betweenness = nk.centrality.KadabraBetweenness(actors_graph_nk, err=0.01)
betweenness.run()
i = 1
for id, score in betweenness.ranking()[:10]:
    print(f"{i}°", names[id])
    i+=1

print("\n\nCLOSENESS CENTRALITY - actors best connected to other actors")
closeness = nk.centrality.TopCloseness(actors_graph_nk, k = 10)
closeness.run()
j = 1
for id in closeness.topkNodesList(includeTrail =True):
    print(f"{j}°", names[id])
    j+=1



BETWEENNESS CENTRALITY - actors acting as bridges
1° actor: Samuel L. Jackson
2° actor: Bess Flowers
3° actor: Danny Trejo
4° actor: Robert De Niro
5° actor: John Carradine
6° actor: James Franco
7° actor: Morgan Freeman
8° actor: Bruce Willis
9° actor: Christopher Walken
10° actor: Keith David


CLOSENESS CENTRALITY - actors best connected to other actors
1° actor: Samuel L. Jackson
2° actor: Christopher Walken
3° actor: Robert De Niro
4° actor: Bruce Willis
5° actor: John Goodman
6° actor: Whoopi Goldberg
7° actor: Susan Sarandon
8° actor: Morgan Freeman
9° actor: Stanley Tucci
10° actor: Donald Sutherland


In [22]:
%%time
import networkit as nk
print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
main_component = nk.graphtools.getCompactedGraph(main_component, nk.graphtools.getContinuousNodeIds(main_component))


#main_component_nodes = list(nx.connected_components(actors_graph))[0]
#main_component = actors_graph.subgraph(main_component_nodes)
#del actors_graph
"""print("Is it a small-world graph?")
omega = nx.omega(main_component, niter=5, nrand=10, seed=None)
print(omega)
if  omega in range(-0.1, 0.1):
    print("Yes")"""

Running a small world analysis on the biggest connected component
CPU times: user 1.41 s, sys: 24.2 ms, total: 1.43 s
Wall time: 1.5 s


'print("Is it a small-world graph?")\nomega = nx.omega(main_component, niter=5, nrand=10, seed=None)\nprint(omega)\nif  omega in range(-0.1, 0.1):\n    print("Yes")'

In [None]:

print("Main component number of nodes:", main_component.numberOfNodes())
print("Main component number of edges:", main_component.numberOfEdges())
#print("Main component density:", nx.density(main_component))
print("Analysing the diameter...")
diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
diameter.run()
diameter_value = diameter.getDiameter()
print("Diameter:", diameter_value[0])


Running a small world analysis on the biggest connected component


NameError: name 'nk' is not defined

In [24]:
import numpy as np
import random
import math

print("Running a small world analysis on the biggest connected component")
components = nk.components.ConnectedComponents(actors_graph_nk)
components.run()
main_component = components.extractLargestConnectedComponent(actors_graph_nk)
main_component = nk.graphtools.getCompactedGraph(main_component, nk.graphtools.getContinuousNodeIds(main_component))
print("Main component number of nodes:", main_component.numberOfNodes())
print("Main component number of edges:", main_component.numberOfEdges())
#print("Main component density:", nx.density(main_component))
print("Analysing the diameter...")
diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
diameter.run()
diameter_value = diameter.getDiameter()
print("Diameter:", diameter_value[0])

print("Computing the average path length (sampled)...")
all_nodes = list(main_component.iterNodes())
sample_size = min(40000, main_component.numberOfNodes())
print(
    f"Sampling {sample_size} nodes representing {sample_size / main_component.numberOfNodes() * 100:.2f}% of all nodes")
sampled_nodes = random.sample(all_nodes, sample_size)

batch_size = 200
num_batches = math.ceil(sample_size / batch_size)
total_distance_sum = 0
total_valid_pairs = 0
print("Number of batches:", num_batches)
#for node in main_component.iterNodes():
for batch_iter in range(num_batches):
    start_idx = batch_iter * batch_size
    end_idx = min((batch_iter + 1) * batch_size, sample_size)
    batch_nodes = sampled_nodes[start_idx:end_idx]

    spsp = nk.distance.SPSP(main_component, batch_nodes)
    spsp.run()
    distances_from_source = spsp.getDistances()

    distances_array = np.array(distances_from_source)
    valid_mask = distances_array > 0
    total_distance_sum += distances_array[valid_mask].sum()
    total_valid_pairs += valid_mask.sum()

    del spsp
    del distances_array
    if batch_iter % 10 == 0 and batch_iter != 0:
        print("Finished batch:", batch_iter)

avg_path_length = total_distance_sum / total_valid_pairs
print("Average sampled path length:", avg_path_length)


Running a small world analysis on the biggest connected component
Main component number of nodes: 109623
Main component number of edges: 4866246
Analysing the diameter...
Diameter: 9
Computing the average path length (sampled)...
Sampling 40000 nodes representing 36.49% of all nodes
Number of batches: 200
Finished batch: 10
Finished batch: 20
Finished batch: 30
Finished batch: 40
Finished batch: 50
Finished batch: 60
Finished batch: 70
Finished batch: 80
Finished batch: 90
Finished batch: 100
Finished batch: 110
Finished batch: 120
Finished batch: 130
Finished batch: 140
Finished batch: 150
Finished batch: 160
Finished batch: 170
Finished batch: 180
Finished batch: 190
Average sampled path length: 3.1423946247103682


In [None]:
print("Computing the exact average path length...")
all_nodes = list(main_component.iterNodes())
n_nodes = main_component.numberOfNodes()
batch_size = 500
num_batches = math.ceil(n_nodes / batch_size)
total_distance_sum = 0
total_valid_pairs = n_nodes * (n_nodes - 1)
print("Number of batches:", num_batches)
#for node in main_component.iterNodes():
for batch_iter in range(num_batches):
    start_idx = batch_iter * batch_size
    end_idx = min((batch_iter + 1) * batch_size, n_nodes)
    batch_nodes = all_nodes[start_idx:end_idx]

    spsp = nk.distance.SPSP(main_component, batch_nodes)
    spsp.run()
    distances_from_source = spsp.getDistances()

    distances_array = np.array(distances_from_source)
    total_distance_sum += distances_array.sum()

    del spsp
    del distances_array
    if batch_iter % 10 == 0:
        print("Finished batch:", batch_iter)

exact_avg_path_length = total_distance_sum / total_valid_pairs
print("Exact average path length:", exact_avg_path_length)

In [25]:
clustering_coeff = nk.globals.clustering(main_component)
print("Clustering coefficient:", clustering_coeff)

Clustering coefficient: 0.7760792148006601
