In [1]:
import json
import pandas as pd
import networkx as nx
import collections

In [2]:
file_path = 'all_subgraphs/subgraph_two_hop_AAPL.json'

In [3]:
data = json.load(open(file_path))

In [4]:
len(data)

1

In [5]:
dd = data[0]

In [6]:
dd.keys()

dict_keys(['nodes', 'relationships'])

In [7]:
# node id to dict
crazy_dict = {}
# dict to node id
map_back = {}

In [8]:
def dict_hash(dd):
    return json.dumps(dd, sort_keys=True)

In [9]:
node_id = 0
for i, node_dict in enumerate(dd["nodes"]):
    crazy_dict[i]  = node_dict
    map_back[dict_hash(node_dict)] = i

In [10]:
dd["nodes"][0]

{'ticker': 'AAPL',
 'cik': '0000320193',
 'fiscalYearEnd': '0924',
 'businessSegments': '',
 'irsNumber': '942404110',
 'exchangeTicker': 'Nasdaq',
 'mailingAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA',
 'sicCode': '3571',
 'stateCodeOfIncorporation': 'CA',
 'name': 'Apple Inc.',
 'productServices': '',
 'stateCode': 'CA',
 'businessAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA (408) 996-1010'}

In [11]:
dd["relationships"][0]

[{'ticker': 'AAPL',
  'cik': '0000320193',
  'fiscalYearEnd': '0924',
  'businessSegments': '',
  'irsNumber': '942404110',
  'exchangeTicker': 'Nasdaq',
  'mailingAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA',
  'sicCode': '3571',
  'stateCodeOfIncorporation': 'CA',
  'name': 'Apple Inc.',
  'productServices': '',
  'stateCode': 'CA',
  'businessAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA (408) 996-1010'},
 'HAS_STATE_LOCATION',
 {'name': 'CALIFORNIA', 'code': 'CA'}]

In [18]:
relationships_remapped = []

for u, t, v in dd["relationships"]:
    temp = []
    temp.append(map_back[dict_hash(u)])
    temp.append(t)
    temp.append(map_back[dict_hash(v)])
    
    relationships_remapped.append(temp)
    
    

In [19]:
relationships_remapped[0]

[0, 'HAS_STATE_LOCATION', 1]

In [20]:
crazy_dict[1]

{'name': 'CALIFORNIA', 'code': 'CA'}

In [21]:
G = nx.Graph()

In [22]:
for k, v in crazy_dict.items():
    G.add_node(k, **v)

In [23]:
for u, t, v in relationships_remapped:
    G.add_edge(u, v, relationship_type=t)

In [24]:
nx.number_connected_components(G)

1

In [25]:
ticker_count = 0
tickers = set()
ticker_index = list()
for i in G.nodes:
    if "ticker" in G.nodes[i]:
        tickers.add(G.nodes[i]["ticker"])
        ticker_index.append(i)
    

In [26]:
if '' in tickers: tickers.remove('')

In [27]:
len(tickers)

1573

In [28]:
company_relationships = set(["IS_PARTIAL_OWNER_OF", "HAS_INSTRUMENTS_OF"])

In [29]:
G.edges[0,1]["relationship_type"]

'HAS_STATE_OF_INCORPORATION'

In [30]:
def get_all_non_company_neighbours(G, origin):
    res = []
    company_relationships = set(["IS_PARTIAL_OWNER_OF", "HAS_INSTRUMENTS_OF"])
    for n in G.neighbors(origin):
#         print(G.edges[origin,n]['relationship_type'])
        if G.edges[origin,n]['relationship_type'] not in company_relationships:
            res.append(n)
    return res

In [31]:
def create_new_edges(G, origin):
    non_company_neighbours = get_all_non_company_neighbours(G, origin)
    
    for n in non_company_neighbours:
        # get relation type
        r_type = G.edges[origin, n]["relationship_type"]
        if r_type[:7] == "COMMON_":
            continue
        # get all nodes that are connected to it with the same type
        for neighbour in G.neighbors(n):
            if neighbour == origin:
                continue
                
            # check their type
            if r_type == G.edges[n, neighbour]["relationship_type"] and ("ticker" in G.nodes[neighbour]):
                data = {
                    "relationship_type": "COMMON_" + r_type
                }
                G.add_edge(origin, neighbour, **data)

In [32]:
for node in G.nodes():
    create_new_edges(G, node)

In [33]:
type_rships_count = collections.defaultdict(int)
for n in G.edges():
    type_rships_count[G.edges[n[0], n[1]]["relationship_type"]] += 1

In [34]:
len_n = len(G.nodes())

In [35]:
len_e = len(G.edges())

In [36]:
len_e / (len_n**2)

0.36428274020741286

In [37]:
type_rships_count

defaultdict(int,
            {'HAS_STATE_OF_INCORPORATION': 79,
             'HAS_EXCHANGE_MARKET': 1389,
             'BELONGS_TO_INDUSTRY_OF': 10,
             'IS_DIRECTOR_OF': 141,
             'COMMON_IS_DIRECTOR_OF': 211,
             'COMMON_HAS_STATE_OF_INCORPORATION': 1781,
             'COMMON_HAS_EXCHANGE_MARKET': 959992,
             'COMMON_BELONGS_TO_INDUSTRY_OF': 26,
             'IS_STATE_OF': 1,
             'HAS_STATE_LOCATION': 417,
             'COMMON_HAS_STATE_LOCATION': 54282,
             'IS_PARTIAL_OWNER_OF': 17,
             'HAS_INSTRUMENTS_OF': 37})

In [None]:
# export the new relationships

In [38]:
H = G.subgraph(ticker_index)

In [39]:
nx.number_connected_components(H)

1

In [43]:
ticker_index

[0,
 2,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 202,
 203,
 204,
 205,
 206,
 207,
 208,
 209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223

In [40]:
node_properties = {}
for i in ticker_index:
    node_properties[i] = crazy_dict[i]

In [45]:
H.edges[0,2]

{'relationship_type': 'HAS_EXCHANGE_MARKET'}

In [46]:
ticker_set = set(ticker_index)

rship_tracker = []

# using the new relationships
for u, v in H.edges():
    rship_tracker.append([u, H.edges[u, v]["relationship_type"], v])

In [47]:
file = {
    "nodes": node_properties,
    "relationships": rship_tracker 
}

json_object = json.dumps(file, indent=4)

with open("AAPL_new_company_test.json", "w") as outfile:
    outfile.write(json_object)

In [38]:
nx.write_gml(H, "GOOG_subgraph_company_only.gml")

In [48]:
H.nodes[0]

{'ticker': 'AAPL',
 'cik': '0000320193',
 'fiscalYearEnd': '0924',
 'businessSegments': '',
 'irsNumber': '942404110',
 'exchangeTicker': 'Nasdaq',
 'mailingAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA',
 'sicCode': '3571',
 'stateCodeOfIncorporation': 'CA',
 'name': 'Apple Inc.',
 'productServices': '',
 'stateCode': 'CA',
 'businessAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA (408) 996-1010'}

In [39]:
new_H = nx.read_gml("GOOG_subgraph_company_only.gml")

TypeError: 'NodeView' object does not support item assignment

In [35]:
G.edges[0,1]

{'relationship_type': 'HAS_STATE_OF_INCORPORATION'}

In [32]:
crazy_dict

{0: {'ticker': 'AAPL',
  'cik': '0000320193',
  'fiscalYearEnd': '0924',
  'businessSegments': '',
  'irsNumber': '942404110',
  'exchangeTicker': 'Nasdaq',
  'mailingAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA',
  'sicCode': '3571',
  'stateCodeOfIncorporation': 'CA',
  'name': 'Apple Inc.',
  'productServices': '',
  'stateCode': 'CA',
  'businessAddress': 'ONE APPLE PARK WAY CUPERTINO CA 95014 CA (408) 996-1010'},
 1: {'name': 'CALIFORNIA', 'code': 'CA'},
 2: {'name': 'NASDAQ - ALL MARKETS',
  'country': 'UNITED STATES OF AMERICA',
  'ticker': 'NASDAQ',
  'city': 'NEW YORK',
  'isoAlpha2Code': 'US'},
 3: {'name': 'RUBINSTEIN JONATHAN',
  'cik': '0001209522',
  'wikipediaPage': 'https://www.google.com/search?q=RUBINSTEIN+JONATHAN',
  'mailingAddress': 'APPLE COMPUTER INC 1 INFINITE LOOP CUPERTINO CA 95014'},
 4: {'name': 'Papermaster Mark D',
  'cik': '0001449649',
  'wikipediaPage': 'https://www.google.com/search?q=Papermaster+Mark+D',
  'mailingAddress': 'ONE AMD PLACE SUNN