Indexing and Graph Construction

In [1]:
import pandas as pd
import networkx as nx

In [None]:
df = pd.read_csv(r"...path_to\RDsqr-KG\Knowledge_Graph\kg.csv", dtype = str)

Indexing Nodes

In [3]:
head_nodes = df[['head_id', 'head_name', 'head_type', 'head_ref']].rename(
    columns={
        'head_id': 'node_id',
        'head_name': 'node_name',
        'head_type': 'node_type',
        'head_ref': 'node_ref'
    }
)

tail_nodes = df[['tail_id', 'tail_name', 'tail_type', 'tail_ref']].rename(
    columns={
        'tail_id': 'node_id',
        'tail_name': 'node_name',
        'tail_type': 'node_type',
        'tail_ref': 'node_ref'
    }
)

In [4]:
nodes = pd.concat([head_nodes, tail_nodes], ignore_index=True).drop_duplicates()

In [5]:
nodes.shape

(105605, 4)

In [6]:
nodes.drop_duplicates(subset=['node_name', 'node_type', 'node_ref'], inplace=True)

In [7]:
nodes.shape

(105300, 4)

In [8]:
nodes = nodes.reset_index(drop=True)

In [9]:
nodes['node_index'] = nodes.index

In [10]:
head_rename = nodes.rename(columns={'node_index': 'head_index'})
tail_rename = nodes.rename(columns={'node_index': 'tail_index'})

In [11]:
df = df.merge(head_rename, how='left', left_on=['head_id', 'head_name', 'head_type', 'head_ref'], right_on=['node_id', 'node_name', 'node_type', 'node_ref'])

In [12]:
df = df.merge(tail_rename, how='left', left_on=['tail_id', 'tail_name', 'tail_type', 'tail_ref'], right_on=['node_id', 'node_name', 'node_type', 'node_ref'])

In [13]:
kg = df[['head_index', 'head_id', 'head_name', 'head_type', 'head_ref',
          'tail_index', 'tail_id', 'tail_name', 'tail_type', 'tail_ref', 'predicate']]

In [14]:
edges = kg[['head_index', 'tail_index','predicate']].copy()

In [15]:
edges.shape

(5915009, 3)

In [16]:
edges.head()

Unnamed: 0,head_index,tail_index,predicate
0,0.0,160.0,ddi
1,0.0,418.0,ddi
2,0.0,577.0,ddi
3,0.0,642.0,ddi
4,0.0,893.0,ddi


In [17]:
edges.tail()

Unnamed: 0,head_index,tail_index,predicate
5915004,25881.0,23572.0,ppi
5915005,25881.0,16828.0,ppi
5915006,25881.0,35856.0,ppi
5915007,25881.0,21933.0,ppi
5915008,25881.0,21199.0,ppi


In [18]:
edges.isna().sum()

head_index    1873
tail_index     752
predicate        0
dtype: int64

In [19]:
edges.dropna(inplace=True)

In [20]:
edges.drop_duplicates(inplace=True)

In [21]:
edges.shape

(5911694, 3)

In [22]:
edges.isna().sum()

head_index    0
tail_index    0
predicate     0
dtype: int64

In [23]:
edges[['head_index', 'tail_index']] = edges[['head_index', 'tail_index']].astype(int)

In [24]:
used_node_indices = set(edges['head_index']).union(set(edges['tail_index']))
nodes = nodes[nodes['node_index'].isin(used_node_indices)].reset_index(drop=True)

In [25]:
nodes.shape

(105241, 5)

In [26]:
G = nx.DiGraph()

for index, row in edges.iterrows():
    G.add_edge(row['head_index'], row['tail_index'], predicate=row['predicate'])

In [27]:
G.number_of_edges()

5902747

In [28]:
G.number_of_nodes()

105241

In [29]:
G2 = nx.Graph()

for index, row in edges.iterrows():
    G2.add_edge(row['head_index'], row['tail_index'], predicate=row['predicate'])

In [30]:
G2.number_of_edges()

4472526

In [31]:
G2.number_of_nodes()

105241

In [None]:
nodes.to_csv(r'...path_to\RDsqr-KG\Knowledge_Graph\nodes.csv', index=False)
edges.to_csv(r'...path_to\RDsqr-KG\Knowledge_Graph\edges.csv', index=False)