In [1]:
import json
import networkx as nx

with open("actors_projection_graph.json") as f:
    graph_dict = json.load(f)

graph = nx.node_link_graph(graph_dict)

In [2]:
with open("successful_actors.json") as f:
    successful = json.load(f)
with open("almost_successful_actors.json") as f:
    almost_successful = json.load(f)

In [3]:
print(successful["actor: Tom Hanks"])

{'gender': 'male', 'countries': ['Mexico', 'United States of America', 'Italy', 'Hong Kong', 'Malta', 'France', 'Germany', 'Australia', 'India', 'United Kingdom', 'Hungary', 'Singapore'], 'avg_movie_revenue_2020_$': 97984814, 'top_ten': [[1995, 'movie: Toy Story'], [1995, 'movie: Apollo 13'], [1994, 'movie: Forrest Gump'], [1993, 'movie: Philadelphia'], [1993, 'movie: Sleepless in Seattle'], [1996, 'movie: That Thing You Do!'], [1998, 'movie: Saving Private Ryan'], [1989, "movie: The 'Burbs"], [1984, 'movie: Splash'], [1986, 'movie: The Money Pit'], [1986, 'movie: Nothing in Common'], [1998, "movie: You've Got Mail"], [1988, 'movie: Big'], [1999, 'movie: Return with Honor'], [1999, 'movie: Toy Story 2'], [1990, 'movie: The Bonfire of the Vanities'], [1999, 'movie: The Green Mile'], [1992, 'movie: A League of Their Own'], [1985, 'movie: Volunteers'], [1984, 'movie: Bachelor Party'], [1988, 'movie: Punchline'], [2000, 'movie: Cast Away'], [1989, 'movie: Turner & Hooch'], [1980, "movie: H

In [4]:
nodes = [(n,d) for n, d in graph.nodes(data=True)]
edges = [(e1,e2,d) for e1,e2, d in graph.edges(data=True)]


In [5]:
print(edges[0])
print(nodes[0])

('actor: Tom Hanks', 'actor: Kevin Conway', {'weight': 1, 'movies': [['movie: Prohibition', 2011]], 'earliest_contact': 2011})
('actor: Tom Hanks', {'gender': 'male', 'countries': ['Mexico', 'United States of America', 'Italy', 'Hong Kong', 'Malta', 'France', 'Germany', 'Australia', 'India', 'United Kingdom', 'Hungary', 'Singapore'], 'avg_movie_revenue_2020_$': 97984814, 'top_ten': [[1995, 'movie: Toy Story'], [1995, 'movie: Apollo 13'], [1994, 'movie: Forrest Gump'], [1993, 'movie: Philadelphia'], [1993, 'movie: Sleepless in Seattle'], [1996, 'movie: That Thing You Do!'], [1998, 'movie: Saving Private Ryan'], [1989, "movie: The 'Burbs"], [1984, 'movie: Splash'], [1986, 'movie: The Money Pit'], [1986, 'movie: Nothing in Common'], [1998, "movie: You've Got Mail"], [1988, 'movie: Big'], [1999, 'movie: Return with Honor'], [1999, 'movie: Toy Story 2'], [1990, 'movie: The Bonfire of the Vanities'], [1999, 'movie: The Green Mile'], [1992, 'movie: A League of Their Own'], [1985, 'movie: Volu

In [39]:
# let's create a graph with only the most successful actors to see how they connect to each other

niche_graph = nx.DiGraph()
for actor,diz in successful.items():
    niche_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 and actor2 in successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    niche_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    niche_graph.add_edge(actor2, actor1, **{"year":colleague[1]})






In [40]:
# let's create a graph with only non-successful actors to see how they connect to each other

almost_niche_graph = nx.DiGraph()
for actor,diz in almost_successful.items():
    almost_niche_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 and actor2 in almost_successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    almost_niche_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    almost_niche_graph.add_edge(actor2, actor1, **{"year":colleague[1]})






In [41]:
# let's create a graph with the most successful actors and those who almost made it to see how they connect to each other

mixed_graph = nx.DiGraph()
for actor,diz in successful.items():
    diz["status"] = "successful"
    mixed_graph.add_node(actor, **diz)
for actor,diz in almost_successful.items():
    diz["status"] = "almost_successful"
    mixed_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 in almost_successful and actor2 in successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    mixed_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    mixed_graph.add_edge(actor2, actor1, **{"year":colleague[1]})
    if actor1 in successful and actor2 in almost_successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    mixed_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    mixed_graph.add_edge(actor2, actor1, **{"year": colleague[1]})






In [None]:
# find mentors, mentees and peers
# mentor: someone who has an edge incoming from someone, but not one outgoing to that someone
# mentor: someone who has an edge outgoing to someone, but not one incoming from that someone
# peers: actors who knew each other before either made it


In [12]:
niche_all_connections=list(niche_graph.edges())

In [13]:
niche_in_out = dict()
for tup in niche_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in niche_in_out:
        niche_in_out[start] = dict()
        niche_in_out[start]["incoming"] = list()
        niche_in_out[start]["outgoing"] = list()
        
    if end not in niche_in_out:
        niche_in_out[end] = dict()
        niche_in_out[end]["incoming"] = list()
        niche_in_out[end]["outgoing"] = list()        
    niche_in_out[start]["outgoing"].append(end)
    niche_in_out[end]["incoming"].append(start) 
    


In [14]:
niche_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in niche_in_out.items()}
niche_biggest_mentors = sorted(niche_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
niche_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in niche_in_out.items()}
niche_biggest_mentees = sorted(niche_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(niche_biggest_mentors[:5])
print(niche_biggest_mentees[:5])


[('actor: Donald Crisp', 358), ('actor: John Wayne', 356), ('actor: Lee Marvin', 278), ('actor: William Demarest', 241), ('actor: J. Carrol Naish', 231)]
[('actor: John Wayne', 230), ('actor: Donald Crisp', 195), ('actor: Brian Donlevy', 192), ('actor: Henry Fonda', 183), ('actor: James Cagney', 178)]


In [15]:
almost_niche_all_connections=list(almost_niche_graph.edges())
#almost_niche_all_connections = [(a1,a2) for a1, a2 in almost_niche_all_connections]

In [16]:
almost_niche_in_out = dict()

for tup in almost_niche_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in almost_niche_in_out:
        almost_niche_in_out[start] = dict()
        almost_niche_in_out[start]["incoming"] = list()
        almost_niche_in_out[start]["outgoing"] = list()       
    if end not in almost_niche_in_out:
        almost_niche_in_out[end] = dict()
        almost_niche_in_out[end]["incoming"] = list()
        almost_niche_in_out[end]["outgoing"] = list()        
    almost_niche_in_out[start]["outgoing"].append(end)
    almost_niche_in_out[end]["incoming"].append(start) 
    


In [17]:
almost_niche_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in almost_niche_in_out.items()}
almost_niche_biggest_mentors = sorted(almost_niche_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
almost_niche_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in almost_niche_in_out.items()}
almost_niche_biggest_mentees = sorted(almost_niche_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(almost_niche_biggest_mentors[:5])
print(almost_niche_biggest_mentees[:5])

[('actor: Charles Lane', 930), ('actor: Sam Harris', 815), ('actor: Bess Flowers', 740), ('actor: Danny Trejo', 724), ('actor: James Flavin', 719)]
[('actor: Bess Flowers', 670), ('actor: Russell Hicks', 443), ('actor: Charles Lane', 420), ('actor: Gino Corrado', 420), ('actor: Selmer Jackson', 417)]


In [20]:
mixed_all_connections=list(mixed_graph.edges())

mixed_in_out = dict()

for tup in mixed_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in mixed_in_out:
        mixed_in_out[start] = dict()
        mixed_in_out[start]["incoming"] = list()
        mixed_in_out[start]["outgoing"] = list()       
    if end not in mixed_in_out:
        mixed_in_out[end] = dict()
        mixed_in_out[end]["incoming"] = list()
        mixed_in_out[end]["outgoing"] = list()        
    mixed_in_out[start]["outgoing"].append(end)
    mixed_in_out[end]["incoming"].append(start) 
    


In [21]:
mixed_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in mixed_in_out.items()}
mixed_biggest_mentors = sorted(mixed_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
mixed_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in mixed_in_out.items()}
mixed_biggest_mentees = sorted(mixed_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(mixed_biggest_mentors[:5])
print(mixed_biggest_mentees[:5])

[('actor: John Wayne', 411), ('actor: Donald Crisp', 370), ('actor: William Demarest', 332), ('actor: George Sanders', 322), ('actor: Humphrey Bogart', 311)]
[('actor: Samuel L. Jackson', 323), ('actor: Christopher Walken', 306), ('actor: John Wayne', 302), ('actor: Robert De Niro', 277), ('actor: Dan Aykroyd', 266)]


In [None]:
assortativity_niche = nx.degree_assortativity_coefficient(niche_graph)
assortativity_almost_niche = nx.degree_assortativity_coefficient(almost_niche_graph)
print(assortativity_niche)
print(assortativity_almost_niche)


-0.4237188491816925
0.18102084750128028
-0.6037553977178279


# ANALYSIS

### CENTRALITIES

In [None]:
import statistics
import pandas as pd

def compute_centrality_nx(kind, function,df):
    niche_betweenness = function(niche_graph)
    almost_niche_betweenness = function(almost_niche_graph)
    mixed_betweenness = function(mixed_graph)

    items_niche_betweenness = niche_betweenness.items()
    top_10_niche_betweenness = sorted(items_niche_betweenness, key = lambda x: x[1])[:10]
    avg_niche_betweenness = statistics.mean([value for name, value in niche_betweenness.items()])
    max_niche_betweenness = max([value for name,value in top_10_niche_betweenness])

    items_almost_niche_betweenness = almost_niche_betweenness.items()
    top_10_almost_niche_betweenness  = sorted(items_almost_niche_betweenness, key = lambda x: x[1])[:10]
    avg_almost_niche_betweenness= statistics.mean([value for name, value in almost_niche_betweenness.items()])
    max_almost_niche_betweenness = max([value for name,value in top_10_almost_niche_betweenness])

    items_mixed_betweenness= mixed_betweenness.items()
    top_10_mixed_betweenness  = sorted(items_mixed_betweenness, key = lambda x: x[1])[:10]
    avg_mixed_betweenness = statistics.mean([value for name, value in mixed_betweenness.items()])
    max_mixed_betweenness = max([value for name,value in top_10_mixed_betweenness])

    df.loc[f"Average {kind} centrality"] = [avg_niche_betweenness, avg_almost_niche_betweenness, avg_mixed_betweenness]
    df.loc[f"Maximum {kind} centrality"] = [max_niche_betweenness, max_almost_niche_betweenness, max_mixed_betweenness]


    to_save = dict()
    to_save["niche_graph"] = top_10_niche_betweenness
    to_save["almost_niche_graph"] = top_10_almost_niche_betweenness
    to_save["mixed_graph"] = top_10_mixed_betweenness

    with open(f"analysis_5/{kind}_centralities.json", "w") as f:
        json.dump(to_save,f, indent=4)

In [None]:
import statistics
import pandas as pd

def compute_centrality(kind, function,df):
    niche_betweenness = function(niche_graph)
    almost_niche_betweenness = function(almost_niche_graph)
    mixed_betweenness = function(mixed_graph)

    items_niche_betweenness = niche_betweenness.items()
    top_10_niche_betweenness = sorted(items_niche_betweenness, key = lambda x: x[1])[:10]
    avg_niche_betweenness = statistics.mean([value for name, value in niche_betweenness.items()])
    max_niche_betweenness = max([value for name,value in top_10_niche_betweenness])

    items_almost_niche_betweenness = almost_niche_betweenness.items()
    top_10_almost_niche_betweenness  = sorted(items_almost_niche_betweenness, key = lambda x: x[1])[:10]
    avg_almost_niche_betweenness= statistics.mean([value for name, value in almost_niche_betweenness.items()])
    max_almost_niche_betweenness = max([value for name,value in top_10_almost_niche_betweenness])

    items_mixed_betweenness= mixed_betweenness.items()
    top_10_mixed_betweenness  = sorted(items_mixed_betweenness, key = lambda x: x[1])[:10]
    avg_mixed_betweenness = statistics.mean([value for name, value in mixed_betweenness.items()])
    max_mixed_betweenness = max([value for name,value in top_10_mixed_betweenness])

    df.loc[f"Average {kind} centrality"] = [avg_niche_betweenness, avg_almost_niche_betweenness, avg_mixed_betweenness]
    df.loc[f"Maximum {kind} centrality"] = [max_niche_betweenness, max_almost_niche_betweenness, max_mixed_betweenness]


    to_save = dict()
    to_save["niche_graph"] = top_10_niche_betweenness
    to_save["almost_niche_graph"] = top_10_almost_niche_betweenness
    to_save["mixed_graph"] = top_10_mixed_betweenness

    with open(f"analysis_5/{kind}_centralities.json", "w") as f:
        json.dump(to_save,f)

In [None]:
centrality_df = pd.DataFrame(columns=["Niche graph", "Almost niche graph", "Mixed graph"])
compute_centrality("IN-degree", nx.in_degree_centrality, centrality_df)
compute_centrality("OUT-degree", nx.in_degree_centrality, centrality_df)

In [56]:
compute_centrality("betweenness", nx.betweenness_centrality, centrality_df)

KeyboardInterrupt: 

In [None]:
print("Closeness centrality")
compute_centrality("closeness", nx.closeness_centrality, centrality_df)

In [None]:
compute_centrality("pagerank", nx.pagerank, centrality_df)
centrality_df.to_json("analysis_5/centrality.json")

### NETWORK STRUCTURE

In [71]:
import pandas as pd
print("Strongly-connected components - continuous collaboration")
niche_scc_sets = list(nx.strongly_connected_components(niche_graph))
niche_scc = len(max(niche_scc_sets, key=len))
almost_niche_scc_sets = list(nx.strongly_connected_components(almost_niche_graph))
almost_niche_scc = len(max(almost_niche_scc_sets, key=len))
mixed_scc_sets = list(nx.strongly_connected_components(mixed_graph))
mixed_scc = len(max(mixed_scc_sets, key=len))

niche_total_nodes = len(list(niche_graph.nodes()))
almost_niche_total_nodes = len(list(almost_niche_graph.nodes()))
mixed_total_nodes = len(list(mixed_graph.nodes()))

structure_df = pd.DataFrame({"Niche graph":niche_scc, "Almost niche graph":almost_niche_scc, "Mixed graph": mixed_scc}, index = ["Biggest SCC size"])
structure_df.loc["Biggest SCC size ratio"] = [niche_scc/niche_total_nodes, almost_niche_scc/almost_niche_total_nodes, mixed_scc/mixed_total_nodes]

print(structure_df)

Strongly-connected components - continuous collaboration
                        Niche graph  Almost niche graph  Mixed graph
Biggest SCC size        2699.000000        18539.000000  5507.000000
Biggest SCC size ratio     0.483259            0.480833     0.358856


In [None]:
print("Weakly-connected components - isolated 'families'")
niche_wcc_sets = list(nx.weakly_connected_components(niche_graph))
niche_wcc = len(max(niche_wcc_sets, key=len))
almost_niche_wcc_sets = list(nx.weakly_connected_components(almost_niche_graph))
almost_niche_wcc = len(max(almost_niche_wcc_sets, key=len))
mixed_wcc_sets = list(nx.weakly_connected_components(mixed_graph))
mixed_wcc = len(max(mixed_wcc_sets, key=len))

structure_df.loc["Biggest WCC size"] = [niche_wcc, almost_niche_wcc, mixed_wcc]
structure_df.loc["Biggest WCC size ratio"] = [niche_wcc/niche_total_nodes, almost_niche_wcc/almost_niche_total_nodes, mixed_wcc/mixed_total_nodes]
print(structure_df)


Weakly-connected components - isolated 'families'
                        Niche graph  Almost niche graph  Mixed graph
Biggest SCC size        2699.000000        18539.000000  5507.000000
Biggest SCC size ratio     0.483259            0.480833     0.358856
Biggest WCC size        5585.000000        38014.000000  9979.000000
Biggest WCC size ratio     1.000000            0.985943     0.650267


In [44]:
niche_density =nx.density(niche_graph)
almost_niche_density = nx.density(almost_niche_graph)
mixed_density = nx.density(mixed_graph)
structure_df.loc["Density"] = [niche_density, almost_niche_density, mixed_density]



In [45]:
print("Reciprocity - became famous together")
niche_reciprocity = nx.reciprocity(niche_graph)
almost_niche_reciprocity = nx.reciprocity(almost_niche_graph)
mixed_reciprocity = nx.reciprocity(mixed_graph)
structure_df.loc["Reciprocity"] = [niche_reciprocity, almost_niche_reciprocity, mixed_reciprocity]


Reciprocity - became famous together


In [47]:
print("Transitivity")
# Probabilità che se A ha lavorato con B e B con C, anche A abbia lavorato con C.
niche_transitivity = nx.transitivity(niche_graph)
almost_niche_transitivity = nx.transitivity(almost_niche_graph)
mixed_transitivity = nx.transitivity(mixed_graph)
structure_df.loc["Transitivity"] = [niche_transitivity, almost_niche_transitivity, mixed_transitivity]



Transitivity


In [48]:
print("Clustering")
niche_clustering = nx.average_clustering(niche_graph)
almost_niche_clustering = nx.average_clustering(almost_niche_graph)
mixed_clustering = nx.average_clustering(mixed_graph)
structure_df.loc["Clustering"] = [niche_clustering, almost_niche_clustering, mixed_clustering]


Clustering


In [None]:
structure_df.to_json("analysis_5/structure.json")

### ASSORTATIVITY

In [None]:
# degree assortativity
import pandas as pd
niche_degree_assort = nx.degree_assortativity_coefficient(niche_graph)
almost_niche_degree_assort = nx.degree_assortativity_coefficient(almost_niche_graph)
assortativity_df = pd.DataFrame({"Niche graph": niche_degree_assort, "Almost niche graph": almost_niche_degree_assort, "Mixed graph": None}, index = ["Degree assortativity"])


In [None]:
# attribute assortativity
import pandas as pd
mixed_attribute_assort = nx.attribute_assortativity_coefficient(mixed_graph, "status")
assortativity_df.loc["Attribute assortativity"] = [None, None, mixed_attribute_assort]
assortativity_df.to_json("analysis_5/assortativity.json")
mixed_mixing_matrix = nx.attribute_mixing_matrix(mixed_graph, "status")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(mixed_mixing_matrix, annot=True, cmap="YlGnBu")
plt.title("Mixing Matrix: Pre-Success Collaborations")
plt.ylabel("Head (E.g. emerging actor)")
plt.xlabel("Tail (E.g. who they collaborated with)")
plt.savefig("analysis_5/heatmap_mixing_matrix.png", dpi=300)
plt.show()