# Creating Networks from JSON Data

This notebook contains an example that reads data from a file of movies `imdb_recent_movies.json` and constructs a graph of actors. This dataset contains a sample of movies released betwen 2000-2020, their titles, genres, release years, and top-billed actors.

Using this dataset, we build a graph and perform some rudimentary graph analysis, extracting centrality metrics from it.

In [1]:
%matplotlib inline

In [11]:
import json
import random

import numpy as np
import pandas as pd
import networkx as nx


In [3]:
g = nx.Graph()

with open("../data/imdb_recent_movies.json", "r") as in_file:
    for line in in_file:
        
        this_movie = json.loads(line)
            
        for actor_id,actor_name in zip(this_movie['actor_ids'],this_movie['actor_names']):
            g.add_node(actor_id, name=actor_name)
            
        # Iterate through the list of actors, generating all pairs
        #. Starting with the first actor in the list, generate pairs with all subsequent actors
        #. then continue to second actor in the list and repeat
        for i,left_actor_id in enumerate(this_movie['actor_ids']):
            for j,right_actor_id in enumerate(this_movie['actor_ids'][i+1:]):
                g.add_edge(left_actor_id, right_actor_id)

In [4]:
print("Nodes:", len(g.nodes))

Nodes: 258059


In [5]:
top_k = 20 # how many of the most central nodes to print

In [6]:
# Calculate degree centrality for all nodes
centrality_degree = nx.degree_centrality(g)

# sort node-centrality dictionary by metric, and reverse to get top elements first
for u in sorted(centrality_degree, key=centrality_degree.get, reverse=True)[:top_k]:
    print(u, g.nodes[u]['name'], centrality_degree[u])

nm0103977 Brahmanandam 0.0015151632578722612
nm0000616 Eric Roberts 0.0014221609095629664
nm0430803 Mohan Joshi 0.0012322811150981563
nm0043199 Avinash 0.0009532740701702718
nm0621937 Nassar 0.0009493989723240512
nm0695177 Prakash Raj 0.0009028977981694038
nm0007106 Shakti Kapoor 0.0008990227003231831
nm0019382 Mohammad Ali 0.00087964721109208
nm2794335 Sadhu Kokila 0.0008370211347836533
nm0348004 Milind Gunaji 0.0008331460369374326
nm1428724 Indrans 0.0008098954498601089
nm0080238 Tanikella Bharani 0.0008060203520138883
nm0000514 Michael Madsen 0.0007982701563214471
nm0001744 Tom Sizemore 0.0007982701563214471
nm0149822 Mithun Chakraborty 0.0007711444713979028
nm2516110 Vijay Chavan 0.0007633942757054615
nm0613417 Raza Murad 0.000759519177859241
nm0457410 Ravi Kishan 0.0007207681993970348
nm0793851 Sayaji Shinde 0.0007013927101659317
nm0154164 Soumitra Chatterjee 0.000697517612319711


In [7]:
centrality_pagerank = nx.pagerank(g)
for u in sorted(centrality_pagerank, key=centrality_pagerank.get, reverse=True)[:top_k]:
    print(u, g.nodes[u]['name'], centrality_pagerank[u])

nm0000616 Eric Roberts 0.00020609746419771868
nm0103977 Brahmanandam 0.0001309154112569784
nm0000514 Michael Madsen 0.00010490813181478702
nm0001744 Tom Sizemore 0.00010419155466627456
nm0222881 Tony Devon 0.00010143582617672207
nm0043199 Avinash 9.816783252911795e-05
nm0430803 Mohan Joshi 9.696248339822569e-05
nm0442207 Lloyd Kaufman 9.351832464062323e-05
nm0001803 Danny Trejo 8.717062975430143e-05
nm0621937 Nassar 8.104883709149782e-05
nm1678610 Theodore Bouloukos 7.775747144743598e-05
nm2794335 Sadhu Kokila 7.773029843341316e-05
nm1693209 Ramesh Bhat 7.4643592320913e-05
nm0261724 Joe Estevez 7.368889228419007e-05
nm0348004 Milind Gunaji 7.346952063967529e-05
nm2278431 Joe Hammerstone 7.34334914900745e-05
nm0007106 Shakti Kapoor 7.265475165760194e-05
nm0154164 Soumitra Chatterjee 7.151535414798184e-05
nm0080238 Tanikella Bharani 7.099628184855312e-05
nm5724719 Achyuth Kumar 6.73349599471373e-05


In [8]:
centrality_btwn = nx.betweenness_centrality(g, k=20)
for u in sorted(centrality_btwn, key=centrality_btwn.get, reverse=True)[:top_k]:
    print(u, g.nodes[u]['name'], centrality_btwn[u])

nm0001744 Tom Sizemore 0.03129550923940693
nm0694361 Tyrone Power Jr. 0.026353682584371072
nm0097557 Umberto Bortolani 0.026330225501327822
nm0269442 Brett Favre 0.026315218401416953
nm0327342 Richard Gonlag 0.026315218401416953
nm3260926 Paulo Tiefenthaler 0.02631211829911367
nm0001298 Richard Grieco 0.026086338984792547
nm8172087 Nick Ford 0.025859464059391744
nm2707894 Bertie Higgins 0.025853042084638875
nm1418843 Bill Flynn 0.02439139396108042
nm1141101 Milhem Cortaz 0.02374734067174757
nm0001698 John Savage 0.023616287841910735
nm0396069 Hasan Husni 0.023046985665156918
nm0001384 Ice-T 0.02167118763964144
nm0541509 Ray 'Boom Boom' Mancini 0.020671862073852052
nm0559890 Ulrich Matthes 0.020617669291584383
nm0873365 Fabio Troiano 0.01993545057047806
nm0541908 Costas Mandylor 0.01977817341907528
nm0000514 Michael Madsen 0.019588480093793544
nm0179224 Barry Corbin 0.019540641159923242


In [9]:
# Simple function for choosing a random neigbor
#. from a node in a given graph
def walk(this_node, this_g):
    
    this_neighbors = list(this_g.neighbors(this_node))
    
    return random.choice(this_neighbors)

In [12]:
node_list = list(g.nodes) # get all nodes

endpoints = {}

sim_n = 1000 # How many simulations to run?
for run_count in range(sim_n):

    # Select a random node
    node = random.choice(node_list)
    
    # If this node is isolated, skip it
    if g.degree(node) == 0:
        continue
    
    # Random walk 
    next_node = node
    for i in range(10): # walk along 10 nodes in the graph
        next_node = walk(next_node, g)
    
    # What's the top endpoint at the end of this walk?
    endpoints[next_node] = endpoints.get(next_node, 0) + 1


In [13]:
# Sort the endpoints by their frequency and weight by simulation run
#. This ranking should be approximately following the PageRank centrality above
for node_name in sorted(endpoints, key=endpoints.get, reverse=True)[:10]:
    print(node_name, g.nodes[node_name]['name'], endpoints[node_name] / sim_n)
    

nm0000616 Eric Roberts 0.000238
nm0103977 Brahmanandam 0.000169
nm0430803 Mohan Joshi 0.000149
nm0000514 Michael Madsen 0.000148
nm0001744 Tom Sizemore 0.000143
nm0695177 Prakash Raj 0.000132
nm0001803 Danny Trejo 0.000129
nm0007106 Shakti Kapoor 0.000119
nm0222881 Tony Devon 0.000118
nm0442207 Lloyd Kaufman 0.000115
