from ..imports import *  # expected to supply pd, warnings, sklearn, version, SG_ERRMSG
from .. import utils as U
from .preprocessor import NodePreprocessor, LinkPreprocessor
import networkx as nx


def graph_nodes_from_csv(nodes_filepath,
                         links_filepath,
                         use_lcc=True,
                         sample_size=10,
                         train_pct=0.1, sep=',',
                         holdout_pct=None,
                         holdout_for_inductive=False,
                         missing_label_value=None,
                         random_state=None,
                         verbose=1):
"""
Loads graph data from CSV files.
Returns generators for nodes in graph for use with GraphSAGE model.
Args:
nodes_filepath(str): file path to training CSV containing node attributes
links_filepath(str): file path to training CSV describing links among nodes
use_lcc(bool): If True, consider the largest connected component only.
sample_size(int): Number of nodes to sample at each neighborhood level
train_pct(float): Proportion of nodes to use for training.
Default is 0.1.
sep (str): delimiter for CSVs. Default is comma.
holdout_pct(float): Percentage of nodes to remove and return separately
for later transductive/inductive inference.
Example --> train_pct=0.1 and holdout_pct=0.2:
Out of 1000 nodes, 200 (holdout_pct*1000) will be held out.
Of the remaining 800, 80 (train_pct*800) will be used for training
and 720 ((1-train_pct)*800) will be used for validation.
200 nodes will be used for transductive or inductive inference.
Note that holdout_pct is ignored if at least one node has
a missing label in nodes_filepath, in which case
these nodes are assumed to be the holdout set.
holdout_for_inductive(bool): If True, the holdout nodes will be removed from
training graph and their features will not be visible
during training. Only features of training and
validation nodes will be visible.
If False, holdout nodes will be included in graph
and their features (but not labels) are accessible
during training.
random_state (int): random seed for train/test split
verbose (boolean): verbosity
Return:
tuple of NodeSequenceWrapper objects for train and validation sets and NodePreprocessor
If holdout_pct is not None or number of nodes with missing labels is non-zero,
fourth and fifth return values are pd.DataFrame and nx.Graph
comprising the held out nodes.
"""
    #----------------------------------------------------------------
    # read graph structure
    #----------------------------------------------------------------
    nx_sep = None if sep in [' ', '\t'] else sep
    g_nx = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)

    # read node attributes
    # node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)

    # store class labels within graph nodes
    # values = {str(row.tolist()[0]): row.tolist()[-1] for _, row in node_attr.iterrows()}
    # nx.set_node_attributes(g_nx, values, 'target')

    # select largest connected component
    if use_lcc:
        g_nx_ccs = (g_nx.subgraph(c).copy() for c in nx.connected_components(g_nx))
        g_nx = max(g_nx_ccs, key=len)
        if verbose:
            print("Largest subgraph statistics: {} nodes, {} edges".format(
                g_nx.number_of_nodes(), g_nx.number_of_edges()))

    #----------------------------------------------------------------
    # read node attributes and split into train/validation
    #----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = len(node_attr.columns.values) - 2  # subtract ID and target columns
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    column_names = feature_names + ["target"]
    node_data = pd.read_csv(nodes_filepath, header=None, names=column_names, sep=sep)
    node_data.index = node_data.index.map(str)  # node IDs (first column) become the index; cast to str to match graph node names
    node_data = node_data[node_data.index.isin(list(g_nx.nodes()))]

    #----------------------------------------------------------------
    # check for holdout nodes
    #----------------------------------------------------------------
    num_null = node_data[node_data.target.isnull()].shape[0]
    num_missing = 0
    if missing_label_value is not None:
        num_missing = node_data[node_data.target == missing_label_value].shape[0]
    if num_missing > 0 and num_null > 0:
        raise ValueError('Param missing_label_value is not None, but there are '
                         'NULLs in the last column. Replace these with missing_label_value.')
    if (num_null > 0 or num_missing > 0) and holdout_pct is not None:
        warnings.warn('Number of nodes having NULL or missing_label_value in the target '
                      'column is non-zero. Using these as holdout nodes and ignoring holdout_pct.')

    #----------------------------------------------------------------
    # set df and G and optionally holdout nodes
    #----------------------------------------------------------------
    if num_null > 0:
        df_annotated = node_data[~node_data.target.isnull()]
        df_holdout = node_data[node_data.target.isnull()]  # holdout set: nodes with NULL target
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
        U.vprint('using %s nodes with target=NULL as holdout set' % (num_null), verbose=verbose)
    elif num_missing > 0:
        df_annotated = node_data[node_data.target != missing_label_value]
        df_holdout = node_data[node_data.target == missing_label_value]
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
        U.vprint('using %s nodes with missing target as holdout set' % (num_missing), verbose=verbose)
    elif holdout_pct is not None:
        df_annotated = node_data.sample(frac=1 - holdout_pct, replace=False, random_state=random_state)
        df_holdout = node_data[~node_data.index.isin(df_annotated.index)]
        G_holdout = g_nx
        df_G = df_annotated if holdout_for_inductive else node_data
        G = g_nx.subgraph(df_annotated.index).copy() if holdout_for_inductive else g_nx
    else:
        if holdout_for_inductive:
            warnings.warn('holdout_for_inductive is True, but no nodes were held out '
                          'because holdout_pct is None and there are no missing targets')
        df_annotated = node_data
        df_holdout = None
        G_holdout = None
        df_G = node_data
        G = g_nx

    #----------------------------------------------------------------
    # split into train and validation
    #----------------------------------------------------------------
    tr_data, te_data = sklearn.model_selection.train_test_split(df_annotated,
                                                                train_size=train_pct,
                                                                test_size=None,
                                                                stratify=df_annotated['target'],
                                                                random_state=random_state)
    # te_data, test_data = sklearn.model_selection.train_test_split(test_data,
    #                                                               train_size=0.2,
    #                                                               test_size=None,
    #                                                               stratify=test_data["target"],
    #                                                               random_state=100)

    #----------------------------------------------------------------
    # print summary
    #----------------------------------------------------------------
    if verbose:
        print("Size of training graph: %s nodes" % (G.number_of_nodes()))
        print("Training nodes: %s" % (tr_data.shape[0]))
        print("Validation nodes: %s" % (te_data.shape[0]))
        if df_holdout is not None and G_holdout is not None:
            print("Nodes treated as unlabeled for testing/inference: %s" % (df_holdout.shape[0]))
            if holdout_for_inductive:
                print("Size of graph with added holdout nodes: %s" % (G_holdout.number_of_nodes()))
                print("Holdout node features are not visible during training (inductive inference)")
            else:
                print("Holdout node features are visible during training (transductive inference)")
        print()

    #----------------------------------------------------------------
    # Preprocess training and validation datasets using NodePreprocessor
    #----------------------------------------------------------------
    preproc = NodePreprocessor(G, df_G, sample_size=sample_size, missing_label_value=missing_label_value)
    trn = preproc.preprocess_train(list(tr_data.index))
    val = preproc.preprocess_valid(list(te_data.index))
    if df_holdout is not None and G_holdout is not None:
        return (trn, val, preproc, df_holdout, G_holdout)
    else:
        return (trn, val, preproc)
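
# A minimal usage sketch (hypothetical file paths; assumes a headerless node CSV
# whose first column is the node ID and whose last column is the label, per the
# parsing above):
#
#   trn, val, preproc = graph_nodes_from_csv('nodes.csv', 'links.csv', train_pct=0.1)
#
# With holdout_pct set, two extra values are returned for later inference:
#
#   trn, val, preproc, df_holdout, G_holdout = graph_nodes_from_csv(
#       'nodes.csv', 'links.csv', train_pct=0.1, holdout_pct=0.2)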


def graph_links_from_csv(nodes_filepath,
                         links_filepath,
                         sample_sizes=[10, 20],
                         train_pct=0.1, val_pct=0.1, sep=',',
                         holdout_pct=None,
                         holdout_for_inductive=False,
                         missing_label_value=None,
                         random_state=None,
                         verbose=1):
"""
Loads graph data from CSV files.
Returns generators for links in graph for use with GraphSAGE model.
Args:
nodes_filepath(str): file path to training CSV containing node attributes
links_filepath(str): file path to training CSV describing links among nodes
sample_sizes(int): Number of nodes to sample at each neighborhood level.
train_pct(float): Proportion of edges to use for training.
Default is 0.1.
Note that train_pct is applied after val_pct is applied.
val_pct(float): Proportion of edges to use for validation
sep (str): delimiter for CSVs. Default is comma.
random_state (int): random seed for train/test split
verbose (boolean): verbosity
Return:
tuple of EdgeSequenceWrapper objects for train and validation sets and LinkPreprocessor
"""
    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.data import EdgeSplitter
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    #----------------------------------------------------------------
    # read graph structure
    #----------------------------------------------------------------
    nx_sep = None if sep in [' ', '\t'] else sep
    G = nx.read_edgelist(path=links_filepath, delimiter=nx_sep)
    print(nx.info(G))

    #----------------------------------------------------------------
    # read node attributes
    #----------------------------------------------------------------
    node_attr = pd.read_csv(nodes_filepath, sep=sep, header=None)
    num_features = len(node_attr.columns.values) - 1  # subtract ID; treat all other columns as features
    feature_names = ["w_{}".format(ii) for ii in range(num_features)]
    node_data = pd.read_csv(nodes_filepath, header=None, names=feature_names, sep=sep)
    node_data.index = node_data.index.map(str)
    df = node_data[node_data.index.isin(list(G.nodes()))]

    # one-hot-encode any string-valued feature columns
    for col in feature_names:
        if not isinstance(node_data[col].values[0], str):
            continue
        df = pd.concat([df, df[col].astype('str').str.get_dummies().add_prefix(col + '_')],
                       axis=1, sort=False)
        df = df.drop([col], axis=1)
    feature_names = df.columns.values
    node_data = df

    # attach node type and feature vector to each graph node for stellargraph
    node_features = node_data[feature_names].values
    for nid, f in zip(node_data.index, node_features):
        G.nodes[nid][sg.globalvar.TYPE_ATTR_NAME] = "node"
        G.nodes[nid]["feature"] = f

    #----------------------------------------------------------------
    # train/validation sets
    #----------------------------------------------------------------
    edge_splitter_test = EdgeSplitter(G)
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=val_pct, method="global", keep_connected=True)
    edge_splitter_train = EdgeSplitter(G_test)
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=train_pct, method="global", keep_connected=True)
    epp = LinkPreprocessor(G, sample_sizes=sample_sizes)
    trn = epp.preprocess_train(G_train, edge_ids_train, edge_labels_train)
    val = epp.preprocess_valid(G_test, edge_ids_test, edge_labels_test)
    return (trn, val, epp)
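
# A minimal usage sketch for link prediction (hypothetical file paths; requires
# stellargraph>=0.8, as checked above):
#
#   trn, val, preproc = graph_links_from_csv('nodes.csv', 'links.csv',
#                                            sample_sizes=[10, 20],
#                                            train_pct=0.1, val_pct=0.1)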