In [3]:
import json
import os

import networkx as nx

with open("actors_projection_graph.json") as f:
    graph_dict = json.load(f)

graph = nx.node_link_graph(graph_dict)

In [4]:
with open("successful_actors.json") as f:
    successful = json.load(f)
with open("almost_successful_actors.json") as f:
    almost_successful = json.load(f)

In [5]:
print(successful["actor: Tom Hanks"])

{'gender': 'male', 'countries': ['Hungary', 'Germany', 'India', 'Mexico', 'United States of America', 'France', 'United Kingdom', 'Singapore', 'Australia', 'Italy', 'Hong Kong', 'Malta'], 'avg_movie_revenue_2020_$': 97984814, 'top_ten': [[1995, 'movie: Toy Story'], [1995, 'movie: Apollo 13'], [1994, 'movie: Forrest Gump'], [1993, 'movie: Philadelphia'], [1993, 'movie: Sleepless in Seattle'], [1996, 'movie: That Thing You Do!'], [1998, 'movie: Saving Private Ryan'], [1989, "movie: The 'Burbs"], [1984, 'movie: Splash'], [1986, 'movie: The Money Pit'], [1986, 'movie: Nothing in Common'], [1998, "movie: You've Got Mail"], [1988, 'movie: Big'], [1999, 'movie: Return with Honor'], [1999, 'movie: Toy Story 2'], [1990, 'movie: The Bonfire of the Vanities'], [1999, 'movie: The Green Mile'], [1992, 'movie: A League of Their Own'], [1985, 'movie: Volunteers'], [1984, 'movie: Bachelor Party'], [1988, 'movie: Punchline'], [2000, 'movie: Cast Away'], [1989, 'movie: Turner & Hooch'], [1980, "movie: H

In [5]:
nodes = [(n,d) for n, d in graph.nodes(data=True)]
edges = [(e1,e2,d) for e1,e2, d in graph.edges(data=True)]


In [4]:
print(edges[0])
print(nodes[0])

('actor: Tom Hanks', 'actor: Kamron Leal', {'weight': 1, 'movies': [['movie: Sully', 2016]], 'earliest_contact': 2016})
('actor: Tom Hanks', {'gender': 'male', 'countries': ['Hungary', 'Germany', 'India', 'Mexico', 'United States of America', 'France', 'United Kingdom', 'Singapore', 'Australia', 'Italy', 'Hong Kong', 'Malta'], 'avg_movie_revenue_2020_$': 97984814, 'top_ten': [[1995, 'movie: Toy Story'], [1995, 'movie: Apollo 13'], [1994, 'movie: Forrest Gump'], [1993, 'movie: Philadelphia'], [1993, 'movie: Sleepless in Seattle'], [1996, 'movie: That Thing You Do!'], [1998, 'movie: Saving Private Ryan'], [1989, "movie: The 'Burbs"], [1984, 'movie: Splash'], [1986, 'movie: The Money Pit'], [1986, 'movie: Nothing in Common'], [1998, "movie: You've Got Mail"], [1988, 'movie: Big'], [1999, 'movie: Return with Honor'], [1999, 'movie: Toy Story 2'], [1990, 'movie: The Bonfire of the Vanities'], [1999, 'movie: The Green Mile'], [1992, 'movie: A League of Their Own'], [1985, 'movie: Volunteers'

In [6]:
# let's create a graph with only the most successful actors to see how they connect to each other

niche_graph = nx.DiGraph()
for actor,diz in successful.items():
    niche_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 and actor2 in successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    niche_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    niche_graph.add_edge(actor2, actor1, **{"year":colleague[1]})






In [7]:
# let's create a graph with only non-successful actors to see how they connect to each other

almost_niche_graph = nx.DiGraph()
for actor,diz in almost_successful.items():
    almost_niche_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 and actor2 in almost_successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    almost_niche_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    almost_niche_graph.add_edge(actor2, actor1, **{"year":colleague[1]})






In [10]:
# let's create a graph with the most successful actors and those who almost made it to see how they connect to each other

mixed_graph = nx.DiGraph()
for actor,diz in successful.items():
    diz["status"] = "successful"
    mixed_graph.add_node(actor, **diz)
for actor,diz in almost_successful.items():
    diz["status"] = "almost_successful"
    mixed_graph.add_node(actor, **diz)

for edge in edges:
    actor1 = edge[0]
    actor2 = edge[1]
    if actor1 in almost_successful and actor2 in successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    mixed_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    mixed_graph.add_edge(actor2, actor1, **{"year":colleague[1]})
    if actor1 in successful and actor2 in almost_successful:
        if "contacts_before_breakthrough" in graph.nodes[actor1]:
            for colleague in graph.nodes[actor1]["contacts_before_breakthrough"]:
                if actor2 == colleague[0]:
                    mixed_graph.add_edge(actor1, actor2, **{"year":colleague[1]})
        if "contacts_before_breakthrough" in graph.nodes[actor2]:
            for colleague in graph.nodes[actor2]["contacts_before_breakthrough"]:
                if actor1 == colleague[0]:
                    mixed_graph.add_edge(actor2, actor1, **{"year": colleague[1]})






In [None]:
# find mentors, mentees and peers
# mentor: someone who has an edge incoming from someone, but not one outgoing to that someone
# mentor: someone who has an edge outgoing to someone, but not one incoming from that someone
# peers: actors who knew each other before either made it


In [7]:
niche_all_connections=list(niche_graph.edges())

In [8]:
niche_in_out = dict()
for tup in niche_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in niche_in_out:
        niche_in_out[start] = dict()
        niche_in_out[start]["incoming"] = list()
        niche_in_out[start]["outgoing"] = list()
        
    if end not in niche_in_out:
        niche_in_out[end] = dict()
        niche_in_out[end]["incoming"] = list()
        niche_in_out[end]["outgoing"] = list()        
    niche_in_out[start]["outgoing"].append(end)
    niche_in_out[end]["incoming"].append(start) 
    


In [12]:
niche_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in niche_in_out.items()}
niche_biggest_mentors = sorted(niche_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
niche_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in niche_in_out.items()}
niche_biggest_mentees = sorted(niche_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(niche_biggest_mentors[:5])
print(niche_biggest_mentees[:5])


[('actor: Donald Crisp', 358), ('actor: John Wayne', 356), ('actor: Lee Marvin', 278), ('actor: William Demarest', 241), ('actor: J. Carrol Naish', 231)]
[('actor: John Wayne', 230), ('actor: Donald Crisp', 195), ('actor: Brian Donlevy', 192), ('actor: Henry Fonda', 183), ('actor: Walter Brennan', 178)]


In [13]:
almost_niche_all_connections=list(almost_niche_graph.edges())
#almost_niche_all_connections = [(a1,a2) for a1, a2 in almost_niche_all_connections]

In [14]:
almost_niche_in_out = dict()

for tup in almost_niche_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in almost_niche_in_out:
        almost_niche_in_out[start] = dict()
        almost_niche_in_out[start]["incoming"] = list()
        almost_niche_in_out[start]["outgoing"] = list()       
    if end not in almost_niche_in_out:
        almost_niche_in_out[end] = dict()
        almost_niche_in_out[end]["incoming"] = list()
        almost_niche_in_out[end]["outgoing"] = list()        
    almost_niche_in_out[start]["outgoing"].append(end)
    almost_niche_in_out[end]["incoming"].append(start) 
    


In [15]:
almost_niche_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in almost_niche_in_out.items()}
almost_niche_biggest_mentors = sorted(almost_niche_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
almost_niche_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in almost_niche_in_out.items()}
almost_niche_biggest_mentees = sorted(almost_niche_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(almost_niche_biggest_mentors[:5])
print(almost_niche_biggest_mentees[:5])

[('actor: Charles Lane', 930), ('actor: Sam Harris', 815), ('actor: Bess Flowers', 740), ('actor: Danny Trejo', 724), ('actor: James Flavin', 719)]
[('actor: Bess Flowers', 670), ('actor: Russell Hicks', 443), ('actor: Gino Corrado', 420), ('actor: Charles Lane', 420), ('actor: Selmer Jackson', 417)]


In [16]:
mixed_all_connections=list(mixed_graph.edges())

mixed_in_out = dict()

for tup in mixed_all_connections:
    start = tup[0]
    end = tup[1]
    if start not in mixed_in_out:
        mixed_in_out[start] = dict()
        mixed_in_out[start]["incoming"] = list()
        mixed_in_out[start]["outgoing"] = list()       
    if end not in mixed_in_out:
        mixed_in_out[end] = dict()
        mixed_in_out[end]["incoming"] = list()
        mixed_in_out[end]["outgoing"] = list()        
    mixed_in_out[start]["outgoing"].append(end)
    mixed_in_out[end]["incoming"].append(start) 
    


In [21]:
mixed_ordered_by_outgoing = {node: len(connected['outgoing']) for node, connected in mixed_in_out.items()}
mixed_biggest_mentors = sorted(mixed_ordered_by_outgoing.items(), key=lambda item: item[1], reverse=True)
mixed_ordered_by_incoming= {node: len(connected['incoming']) for node, connected in mixed_in_out.items()}
mixed_biggest_mentees = sorted(mixed_ordered_by_incoming.items(), key=lambda item: item[1], reverse=True)
print(mixed_biggest_mentors[:5])
print(mixed_biggest_mentees[:5])

[('actor: John Wayne', 411), ('actor: Donald Crisp', 370), ('actor: William Demarest', 332), ('actor: George Sanders', 322), ('actor: Humphrey Bogart', 311)]
[('actor: Samuel L. Jackson', 323), ('actor: Christopher Walken', 306), ('actor: John Wayne', 302), ('actor: Robert De Niro', 277), ('actor: Dan Aykroyd', 266)]


# ANALYSIS

### CENTRALITIES

In [13]:
import statistics
import pandas as pd

def compute_centrality_nx(kind, function,df):
    niche_betweenness = function(niche_graph)
    almost_niche_betweenness = function(almost_niche_graph)
    mixed_betweenness = function(mixed_graph)

    items_niche_betweenness = niche_betweenness.items()
    top_10_niche_betweenness = sorted(items_niche_betweenness, key = lambda x: x[1])[:10]
    avg_niche_betweenness = statistics.mean([value for name, value in niche_betweenness.items()])
    max_niche_betweenness = max([value for name,value in top_10_niche_betweenness])

    items_almost_niche_betweenness = almost_niche_betweenness.items()
    top_10_almost_niche_betweenness  = sorted(items_almost_niche_betweenness, key = lambda x: x[1])[:10]
    avg_almost_niche_betweenness= statistics.mean([value for name, value in almost_niche_betweenness.items()])
    max_almost_niche_betweenness = max([value for name,value in top_10_almost_niche_betweenness])

    items_mixed_betweenness= mixed_betweenness.items()
    top_10_mixed_betweenness  = sorted(items_mixed_betweenness, key = lambda x: x[1])[:10]
    avg_mixed_betweenness = statistics.mean([value for name, value in mixed_betweenness.items()])
    max_mixed_betweenness = max([value for name,value in top_10_mixed_betweenness])

    df.loc[f"Average {kind} centrality"] = [avg_niche_betweenness, avg_almost_niche_betweenness, avg_mixed_betweenness]
    df.loc[f"Maximum {kind} centrality"] = [max_niche_betweenness, max_almost_niche_betweenness, max_mixed_betweenness]


    to_save = dict()
    to_save["niche_graph"] = top_10_niche_betweenness
    to_save["almost_niche_graph"] = top_10_almost_niche_betweenness
    to_save["mixed_graph"] = top_10_mixed_betweenness

    with open(f"analysis_5/{kind}_centralities.json", "w") as f:
        json.dump(to_save,f, indent=4)

In [29]:
import pandas as pd
import networkit as nk
#centrality_df = pd.DataFrame(columns=["Niche graph", "Almost niche graph", "Mixed graph"])
centrality_df = pd.read_json("analysis_5/centrality.json")
print(centrality_df)
niche_nodes_list = list(niche_graph.nodes())
almost_niche_nodes_list = list(almost_niche_graph.nodes())
mixed_nodes_list = list(mixed_graph.nodes())
nk.setNumberOfThreads(7)
niche_graph_nk = nk.nxadapter.nx2nk(niche_graph)
almost_niche_graph_nk = nk.nxadapter.nx2nk(almost_niche_graph)
mixed_graph_nk = nk.nxadapter.nx2nk(mixed_graph)

                                        Niche graph  Almost niche graph  \
Average closeness centrality               0.135567            0.119990   
Maximum closeness centrality               0.297890            0.365055   
Average pagerank centrality                0.000179            0.000026   
Maximum pagerank centrality                0.000054            0.000011   
Average betweenness centrality             0.000293            0.000039   
Maximum betweenness centrality             0.048626            0.012395   
Maximum normalized pagerank centrality     0.302622            0.408903   

                                        Mixed graph  
Average closeness centrality               0.064286  
Maximum closeness centrality               0.186468  
Average pagerank centrality                0.000065  
Maximum pagerank centrality                0.000025  
Average betweenness centrality             0.000052  
Maximum betweenness centrality             0.008766  
Maximum normalized pa

In [10]:
import statistics
import pandas as pd
import networkit as nk
import os


top10_betweenness = dict()
avg_betweenness = dict()
max_betweenness = dict()

def compute_betweenness(graph, graph_name, nodes_list):
    betweenness = nk.centrality.Betweenness(graph, normalized=True)
    betweenness.run()
    scores = betweenness.scores()
    node_scores = list(enumerate(scores))
    #print(scores)
    id_sorted_scores = sorted(node_scores, key=lambda x: x[1], reverse=True)
    name_sorted_scores = list()
    for id, score in id_sorted_scores[:10]:
        name = nodes_list[id]
        name_sorted_scores.append((name, score))
    top_10_betweenness = name_sorted_scores
    avg_betweenness_value = statistics.mean(scores)
    max_betweenness_value = name_sorted_scores[0][1]
    top10_betweenness[graph_name] = top_10_betweenness
    avg_betweenness[graph_name] = avg_betweenness_value
    max_betweenness[graph_name] = max_betweenness_value

compute_betweenness(niche_graph_nk, "niche_graph", niche_nodes_list)
compute_betweenness(almost_niche_graph_nk, "almost_niche_graph", almost_niche_nodes_list)
compute_betweenness(mixed_graph_nk, "mixed_graph", mixed_nodes_list)

avg_betweenness_list = list(avg_betweenness.values())
max_betweenness_list = list(max_betweenness.values())

centrality_df.loc[f"Average betweenness centrality"] = avg_betweenness_list
centrality_df.loc[f"Maximum betweenness centrality"] = max_betweenness_list
to_save = dict()

to_save["niche_graph"] = top10_betweenness["niche_graph"]
to_save["almost_niche_graph"] = top10_betweenness["almost_niche_graph"]
to_save["mixed_graph"] = top10_betweenness["mixed_graph"]
folder_name = "analysis_5"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Created folder {folder_name}")
else:
    print(f"Folder {folder_name} already exists")
with open("analysis_5/betweenness_centralities.json", "w") as f:
    json.dump(to_save,f)

Folder analysis_5 already exists


In [None]:
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Created folder {folder_name}")
else:
    print(f"Folder {folder_name} already exists")
centrality_df = pd.DataFrame(columns=["Niche graph", "Almost niche graph", "Mixed graph"])
compute_centrality_nx("IN-degree", nx.in_degree_centrality, centrality_df)
compute_centrality_nx("OUT-degree", nx.in_degree_centrality, centrality_df)

In [12]:
import statistics
top10_closeness = dict()
avg_closeness = dict()
max_closeness = dict()
def compute_closeness(graph, graph_name, nodes_list):
    closeness= nk.centrality.HarmonicCloseness(graph)
    closeness.run()
    scores = closeness.scores()
    node_scores = list(enumerate(scores))
    id_sorted_scores = sorted(node_scores, key=lambda x: x[1], reverse=True)
    name_sorted_scores = list()
    for id, score in id_sorted_scores[:10]:
        name = nodes_list[id]
        name_sorted_scores.append((name, score))
    top_10= name_sorted_scores
    avg_value = statistics.mean(scores)
    max_value = name_sorted_scores[0][1]
    top10_closeness[graph_name] = top_10
    avg_closeness[graph_name] = avg_value
    max_closeness[graph_name] = max_value

compute_closeness(niche_graph_nk, "niche_graph", niche_nodes_list)
compute_closeness(almost_niche_graph_nk, "almost_niche_graph", almost_niche_nodes_list)
compute_closeness(mixed_graph_nk, "mixed_graph", mixed_nodes_list)

avg_closeness_list = list(avg_closeness.values())
max_closeness_list = list(max_closeness.values())

centrality_df.loc[f"Average closeness centrality"] = avg_closeness_list
centrality_df.loc[f"Maximum closeness centrality"] = max_closeness_list
to_save = dict()

to_save["niche_graph"] = top10_closeness["niche_graph"]
to_save["almost_niche_graph"] = top10_closeness["almost_niche_graph"]
to_save["mixed_graph"] = top10_closeness["mixed_graph"]
folder_name = "analysis_5"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Created folder {folder_name}")
else:
    print(f"Folder {folder_name} already exists")
with open("analysis_5/closeness_centralities.json", "w") as f:
    json.dump(to_save,f)

Folder analysis_5 already exists


In [11]:
print(centrality_df)

                                Niche graph  Almost niche graph  Mixed graph
Average closeness centrality       0.135567            0.119990     0.064286
Maximum closeness centrality       0.297890            0.365055     0.186468
Average pagerank centrality        0.000179            0.000026     0.000065
Maximum pagerank centrality        0.000054            0.000011     0.000025
Average betweenness centrality     0.000293            0.000039     0.000052
Maximum betweenness centrality     0.048626            0.012395     0.008766


In [23]:
import numpy as np
compute_centrality_nx("pagerank", nx.pagerank, centrality_df)
n_to_multiply = np.array([len(niche_nodes_list), len(almost_niche_nodes_list), len(mixed_nodes_list)])
max_pr = centrality_df.loc["Maximum pagerank centrality"].values
processed_max_pr = list(max_pr*n_to_multiply)
centrality_df.loc["Maximum normalized pagerank centrality"] = processed_max_pr

print(centrality_df)
centrality_df.to_json("analysis_5/centrality.json")


                                        Niche graph  Almost niche graph  \
Average closeness centrality               0.135567            0.119990   
Maximum closeness centrality               0.297890            0.365055   
Average pagerank centrality                0.000179            0.000026   
Maximum pagerank centrality                0.000054            0.000011   
Average betweenness centrality             0.000293            0.000039   
Maximum betweenness centrality             0.048626            0.012395   
Maximum normalized pagerank centrality     0.302622            0.408903   

                                        Mixed graph  
Average closeness centrality               0.064286  
Maximum closeness centrality               0.186468  
Average pagerank centrality                0.000065  
Maximum pagerank centrality                0.000025  
Average betweenness centrality             0.000052  
Maximum betweenness centrality             0.008766  
Maximum normalized pa

### NETWORK STRUCTURE

In [30]:
import pandas as pd
print("Strongly-connected components - continuous collaboration")
niche_scc_sets = list(nx.strongly_connected_components(niche_graph))
niche_scc = len(max(niche_scc_sets, key=len))
almost_niche_scc_sets = list(nx.strongly_connected_components(almost_niche_graph))
almost_niche_scc = len(max(almost_niche_scc_sets, key=len))
mixed_scc_sets = list(nx.strongly_connected_components(mixed_graph))
mixed_scc = len(max(mixed_scc_sets, key=len))


structure_df = pd.DataFrame({"Niche graph":niche_scc, "Almost niche graph":almost_niche_scc, "Mixed graph": mixed_scc}, index = ["Biggest SCC size"])
structure_df.loc["Biggest SCC size ratio"] = [niche_scc/len(niche_nodes_list), almost_niche_scc/len(almost_niche_nodes_list), mixed_scc/len(mixed_nodes_list)]

print(structure_df)

Strongly-connected components - continuous collaboration
                        Niche graph  Almost niche graph  Mixed graph
Biggest SCC size        2699.000000        18539.000000  5507.000000
Biggest SCC size ratio     0.483259            0.480833     0.358879


In [27]:
print("Weakly-connected components - isolated 'families'")
niche_wcc_sets = list(nx.weakly_connected_components(niche_graph))
niche_wcc = len(max(niche_wcc_sets, key=len))
almost_niche_wcc_sets = list(nx.weakly_connected_components(almost_niche_graph))
almost_niche_wcc = len(max(almost_niche_wcc_sets, key=len))
mixed_wcc_sets = list(nx.weakly_connected_components(mixed_graph))
mixed_wcc = len(max(mixed_wcc_sets, key=len))

structure_df.loc["Biggest WCC size"] = [niche_wcc, almost_niche_wcc, mixed_wcc]
structure_df.loc["Biggest WCC size ratio"] = [niche_wcc/len(niche_nodes_list), almost_niche_wcc/len(almost_niche_nodes_list), mixed_wcc/len(mixed_nodes_list)]
print(structure_df)


Weakly-connected components - isolated 'families'


NameError: name 'structure_df' is not defined

In [28]:
niche_density =nx.density(niche_graph)
almost_niche_density = nx.density(almost_niche_graph)
mixed_density = nx.density(mixed_graph)
structure_df.loc["Density"] = [niche_density, almost_niche_density, mixed_density]



In [29]:
print("Reciprocity - became famous together")
niche_reciprocity = nx.reciprocity(niche_graph)
almost_niche_reciprocity = nx.reciprocity(almost_niche_graph)
mixed_reciprocity = nx.reciprocity(mixed_graph)
structure_df.loc["Reciprocity"] = [niche_reciprocity, almost_niche_reciprocity, mixed_reciprocity]


Reciprocity - became famous together


In [30]:
print("Transitivity")
# Probabilità che se A ha lavorato con B e B con C, anche A abbia lavorato con C.
niche_transitivity = nx.transitivity(niche_graph)
almost_niche_transitivity = nx.transitivity(almost_niche_graph)
mixed_transitivity = nx.transitivity(mixed_graph)
structure_df.loc["Transitivity"] = [niche_transitivity, almost_niche_transitivity, mixed_transitivity]



Transitivity


In [31]:
print("Clustering")
niche_clustering = nx.average_clustering(niche_graph)
almost_niche_clustering = nx.average_clustering(almost_niche_graph)
mixed_clustering = nx.average_clustering(mixed_graph)
structure_df.loc["Clustering"] = [niche_clustering, almost_niche_clustering, mixed_clustering]


Clustering


In [32]:
structure_df.to_json("analysis_5/structure.json")

### ASSORTATIVITY

In [33]:
# degree assortativity
import pandas as pd
niche_degree_assort = nx.degree_assortativity_coefficient(niche_graph)
almost_niche_degree_assort = nx.degree_assortativity_coefficient(almost_niche_graph)
assortativity_df = pd.DataFrame({"Niche graph": niche_degree_assort, "Almost niche graph": almost_niche_degree_assort, "Mixed graph": None}, index = ["Degree assortativity"])


In [55]:
# attribute assortativity
import pandas as pd
mixed_attribute_assort = nx.attribute_assortativity_coefficient(mixed_graph, "status")
assortativity_df.loc["Attribute assortativity"] = [None, None, mixed_attribute_assort]
assortativity_df.to_json("analysis_5/assortativity.json")
mixed_mixing_matrix = nx.attribute_mixing_matrix(mixed_graph, "status")

In [68]:
mixed_matrix_df = pd.DataFrame({"From almost-famous to famous actor": mixed_mixing_matrix[0,1], "From famous to almost-famous actor": mixed_mixing_matrix[1,0]}, index = ["Collaborations"])
print(mixed_matrix_df)
mixed_matrix_df.to_json("analysis_5/mixed_matrix.json")

                From almost-famous to famous actor  \
Collaborations                            0.634302   

                From famous to almost-famous actor  
Collaborations                            0.365698  


### DISTANCE METRICS

In [22]:
import numpy as np
import random
import math
import pandas as pd
import networkit as nk
distance_df = pd.DataFrame(columns=["Niche graph", "Almost niche graph", "Mixed graph"])
diameter_list= list()
avg_sp = list()
def small_world_analysis(main_component, graph_name):
    main_component = nk.nxadapter.nx2nk(main_component)
    print("Main component number of nodes:", main_component.numberOfNodes())
    print("Main component number of edges:", main_component.numberOfEdges())
    print("Analysing the diameter...")
    diameter = nk.distance.Diameter(main_component, algo=nk.distance.DiameterAlgo.EXACT)
    diameter.run()
    diameter_value = diameter.getDiameter()
    print("Diameter:", diameter_value[0])
    diameter_list.append(diameter_value[0])
    print("Computing the average path length (sampled)...")
    all_nodes = list(main_component.iterNodes())
    sampled_nodes = all_nodes
    batch_size = 200
    num_batches = math.ceil(len(all_nodes) / batch_size)
    total_distance_sum = 0
    total_valid_pairs = 0
    print("Number of batches:", num_batches)
    #for node in main_component.iterNodes():
    for batch_iter in range(num_batches):
        start_idx = batch_iter * batch_size
        end_idx = min((batch_iter + 1) * batch_size, len(all_nodes))
        batch_nodes = sampled_nodes[start_idx:end_idx]

        spsp = nk.distance.SPSP(main_component, batch_nodes)
        spsp.run()
        distances_from_source = spsp.getDistances()

        distances_array = np.array(distances_from_source)
        valid_mask = distances_array > 0
        total_distance_sum += distances_array[valid_mask].sum()
        total_valid_pairs += valid_mask.sum()

        del spsp
        del distances_array
        if batch_iter % 10 == 0 and batch_iter != 0:
            print("Finished batch:", batch_iter)

    avg_path_length = total_distance_sum / total_valid_pairs
    print("Average sampled path length:", avg_path_length)
    avg_sp.append(avg_path_length)



In [11]:
import networkx as nx
niche_main_cc = max(nx.weakly_connected_components(niche_graph), key = len)
niche_main_cc = niche_graph.subgraph(niche_main_cc).copy()
niche_main_cc = niche_main_cc.to_undirected()
almost_niche_main_cc = max(nx.weakly_connected_components(almost_niche_graph), key = len)
almost_niche_main_cc = almost_niche_graph.subgraph(almost_niche_main_cc).copy()
almost_niche_main_cc = almost_niche_main_cc.to_undirected()
mixed_main_cc = max(nx.weakly_connected_components(mixed_graph), key = len)
mixed_main_cc = mixed_graph.subgraph(mixed_main_cc).copy()
mixed_main_cc = mixed_main_cc.to_undirected()



In [23]:
small_world_analysis(niche_main_cc, "niche main component")
small_world_analysis(almost_niche_main_cc, "almost_niche main component")
small_world_analysis(mixed_main_cc, "mixed main component")

Main component number of nodes: 5585
Main component number of edges: 28420
Analysing the diameter...
Diameter: 6
Computing the average path length (sampled)...
Number of batches: 28
Finished batch: 10
Finished batch: 20
Average sampled path length: 3.5392139069806814
Main component number of nodes: 38014
Main component number of edges: 558631
Analysing the diameter...
Diameter: 11
Computing the average path length (sampled)...
Number of batches: 191
Finished batch: 10
Finished batch: 20
Finished batch: 30
Finished batch: 40
Finished batch: 50
Finished batch: 60
Finished batch: 70
Finished batch: 80
Finished batch: 90
Finished batch: 100
Finished batch: 110
Finished batch: 120
Finished batch: 130
Finished batch: 140
Finished batch: 150
Finished batch: 160
Finished batch: 170
Finished batch: 180
Finished batch: 190
Average sampled path length: 3.910698588297274
Main component number of nodes: 9979
Main component number of edges: 69565
Analysing the diameter...
Diameter: 6
Computing the a

In [25]:
distance_df.loc["Diameter"] = diameter_list
distance_df.loc["Main connected component shortest path"] = avg_sp




In [33]:
distance_df.loc["Graph - number of nodes"] = [len(list(niche_graph.nodes())), len(list(almost_niche_graph.nodes())), len(list(mixed_graph.nodes()))]
distance_df.loc["Graph - number of edges"] = [len(list(niche_graph.edges())), len(list(almost_niche_graph.edges())), len(list(mixed_graph.edges()))]
distance_df.loc["Number of connected components"] = [len(niche_wcc_sets), len(almost_niche_wcc_sets), len(mixed_wcc_sets)]
distance_df.to_json("analysis_5/distance.json")