/
tf.py
325 lines (283 loc) · 9.53 KB
/
tf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
"""
# Raw, unoptimised data from TF files
"""
import sys
from ..core.files import (
fileOpen,
expanduser as ex,
unexpanduser as ux,
normpath,
dirExists,
dirMake,
fileNm,
dirNm,
isDir,
isFile,
scanDir,
)
DATA_TYPES = ("str", "int")
DATA_TYPE_STR = ", ".join(DATA_TYPES)
def explode(inPath, outPath):
    """Explodes `.tf` files into non-optimised `.tf` files without metadata.

    An exploded `.tf` feature file is a TF file with explicit node specifiers,
    no optimizations.

    The format of each line is:

    **Node features**:

        node<tab>value

    If the value is None for a certain `node`, there will be no such line.

    **Edge features without values**:

        node<tab>node

    **Edge features with values**:

        node<tab>node<tab>value

    If the value is `None`, it will be left out, together with the preceding <tab>.
    This way, the empty string is distinguished from a `None` value.

    !!! caution "Ambiguity"
        In the resulting data file, all metadata is gone.
        It is not always possible to infer from the data alone what data type
        a feature has:

        `1<tab>2` could be a node feature assigning integer 2 to node 1,
        or string `2` to node 1.
        It could also be an edge feature assigning `None` to the node pair (1, 2).

    Parameters
    ----------
    inPath: string
        Source file(s).
        If pointing to a file, it should be a file containing TF feature data.
        If pointing to a directory, all `.tf` files in that directory will be
        exploded (non-recursively).
        The path may contain `~` which will be expanded to the user's home
        directory.
    outPath: string
        Destination of the exploded file(s).
        If pointing to a non-existing location, a file or directory will be
        created there, depending on whether `inPath` is a file or directory.
        If pointing to an existing directory, exploded file(s) will be put there.

    Returns
    -------
    boolean | string
        True if every file was exploded cleanly, False if any file produced
        a fatal message; an error message string if the operation could not
        start at all.
    """
    inPath = normpath(inPath)
    outPath = normpath(outPath)
    inLoc = ex(inPath)
    outLoc = ex(outPath)

    # NOTE(review): `dirExists` is also relied on here when `inLoc` is a plain
    # file (the else branch below) — confirm it tests mere existence.
    if not dirExists(inLoc):
        return f"No such directory: `{inPath}`"

    isInDir = isDir(inLoc)
    outExists = dirExists(outLoc)
    isOutDir = isDir(outLoc) if outExists else None

    # each task is a (source file, destination file) pair
    tasks = []

    if isInDir:
        with scanDir(inLoc) as sd:
            tasks = [
                (f"{inLoc}/{e.name}", f"{outLoc}/{e.name}")
                for e in sd
                if e.name.endswith(".tf") and e.is_file()
            ]
        if not tasks:
            return f"No .tf files in `{inPath}`"
        if outExists and not isOutDir:
            return f"Not a directory: `{outPath}`"
        if not outExists:
            dirMake(outLoc)
    else:
        if not isFile(inLoc):
            return f"Not a file: `{inPath}`"
        if outExists:
            # existing directory: put the file there under its own name;
            # existing file: overwrite it
            outFile = f"{outLoc}/{fileNm(inLoc)}" if isOutDir else outLoc
        else:
            dirMake(dirNm(outLoc))
            outFile = outLoc
        tasks = [(inLoc, outFile)]

    msgs = []

    for (inFile, outFile) in sorted(tasks):
        result = _readTf(inFile)
        if type(result) is str:
            # reading failed: record the message and skip writing this file
            msgs.append(f"{ux(inFile)} => {ux(outFile)}:\n\t{result}")
            continue
        _writeTf(outFile, *result)

    good = True

    for msg in msgs:
        # messages starting with "X" are fatal; others are informational
        thisGood = msg[0] != "X"
        (sys.stdout if thisGood else sys.stderr).write(f"{msg}\n")
        if not thisGood:
            good = False

    return good
def _readTf(path):
    """Reads a TF feature file: header metadata plus data lines.

    Parameters
    ----------
    path: string
        Path of the `.tf` file to read.

    Returns
    -------
    tuple | string
        `(data, valueType, isEdge)` on success (see `_readDataTf`);
        an error message string on failure. Messages starting with `X`
        are fatal, `!` marks a non-fatal condition (config feature).

    Notes
    -----
    The file is opened in a `with` block so it is closed even when parsing
    raises (the previous explicit `close()` calls leaked the handle on
    exceptions from `_readDataTf`, e.g. `int()` on malformed data).
    """
    with fileOpen(path) as fh:
        i = 0
        metaData = {}
        isEdge = False
        edgeValues = False

        for line in fh:
            i += 1
            if i == 1:
                # the first line must declare the feature kind
                text = line.rstrip()
                if text == "@edge":
                    isEdge = True
                elif text == "@node":
                    isEdge = False
                elif text == "@config":
                    return "! This is a config feature. It has no data."
                else:
                    return f"X Line {i}: missing @node/@edge/@config"
                continue
            text = line.rstrip("\n")
            if len(text) and text[0] == "@":
                # metadata line: either a flag or a key=value pair
                if text == "@edgeValues":
                    edgeValues = True
                    continue
                fields = text[1:].split("=", 1)
                metaData[fields[0]] = fields[1] if len(fields) == 2 else None
                continue
            else:
                # a blank line terminates the metadata; anything else is an error
                if text != "":
                    return f"X Line {i}: missing blank line after metadata"
                else:
                    break

        typeKey = "valueType"
        if typeKey in metaData:
            valueType = metaData[typeKey]
            if valueType not in DATA_TYPES:
                return (
                    f'X Unknown @valueType: "{valueType}". '
                    f"Expected one of {DATA_TYPE_STR}"
                )
        else:
            return f"X Missing @valueType. Should be one of {DATA_TYPE_STR}"

        # fh is positioned just after the blank line: parse the data part
        return _readDataTf(fh, i, valueType, isEdge, edgeValues)
def _readDataTf(fh, firstI, valueType, isEdge, edgeValues):
    """Reads and parses the data lines of a TF feature file.

    Parameters
    ----------
    fh: file handle
        Positioned just after the blank line that ends the metadata.
    firstI: int
        Number of lines already consumed, for error reporting.
    valueType: string
        `"int"` or `"str"`: how to interpret the value field.
    isEdge: boolean
        Whether this is an edge feature.
    edgeValues: boolean
        Whether the edge feature carries values.

    Returns
    -------
    tuple | string
        `(data, valueType, isEdge)` on success, where `data` maps nodes
        (node features) or `(node, node)` pairs (edge features) to values;
        an error message string on failure.
    """
    i = firstI
    # TF optimization: a line may leave out its first node; it then applies
    # to the implicit node, which starts at 1 and is advanced after each line
    implicit_node = 1
    data = {}
    # a fully explicit line has 3 fields for valued edges, otherwise 2
    normFields = 3 if isEdge and edgeValues else 2
    isNum = valueType == "int"
    for line in fh:
        i += 1
        fields = line.rstrip("\n").split("\t")
        lfields = len(fields)
        if lfields > normFields:
            return f"line {i}: {lfields} fields instead of {normFields}"
        if lfields == normFields:
            # fully explicit line: the first field is the node specifier
            nodes = _setFromSpec(fields[0])
            if isEdge:
                if fields[1] == "":
                    return f"line {i}: missing node for edge"
                nodes2 = _setFromSpec(fields[1])
            if not isEdge or edgeValues:
                valTf = fields[-1]
        else:
            # short line: the source node is implicit
            if isEdge:
                if edgeValues:
                    if lfields == normFields - 1:
                        # target node and value present
                        nodes = {implicit_node}
                        nodes2 = _setFromSpec(fields[0])
                        valTf = fields[-1]
                    elif lfields == normFields - 2:
                        # only the target node present; value is empty
                        nodes = {implicit_node}
                        if fields[0] == "":
                            return f"line {i}: missing node for edge"
                        nodes2 = _setFromSpec(fields[0])
                        valTf = ""
                    else:
                        # an edge needs at least a target node
                        # (effectively unreachable: split() yields >= 1 field)
                        nodes = {implicit_node}
                        valTf = ""
                        return f"line {i}: missing node for edge"
                else:
                    if lfields == normFields - 1:
                        # only the target node present
                        nodes = {implicit_node}
                        if fields[0] == "":
                            return f"line {i}: missing node for edge"
                        nodes2 = _setFromSpec(fields[0])
                    else:
                        return f"line {i}: missing node for edge"
            else:
                # node feature: a single field is the value for the implicit node
                nodes = {implicit_node}
                if lfields == 1:
                    valTf = fields[0]
                else:
                    valTf = ""
        # the next implicit node follows the highest node seen on this line
        implicit_node = max(nodes) + 1
        if not isEdge or edgeValues:
            # int features: empty means None, otherwise parse the number;
            # str features: empty stays "", otherwise undo the TF escapes
            value = (
                int(valTf)
                if isNum and valTf != ""
                else None
                if isNum
                else ""
                if valTf == ""
                else _valueFromTf(valTf)
            )
        if isEdge:
            if not edgeValues:
                # unvalued edges are stored with value None
                value = None
            for n in nodes:
                for m in nodes2:
                    data[(n, m)] = value
        else:
            # `value` is always bound here: `not isEdge` made the block above run;
            # nodes whose value is None get no entry at all
            for n in nodes:
                if value is not None:
                    data[n] = value
    return (data, valueType, isEdge)
def _writeTf(outFile, data, valueType, isEdge):
    """Writes exploded feature data to `outFile` as plain TF data lines.

    Parameters
    ----------
    outFile: string
        Path of the file to (over)write.
    data: dict
        Maps nodes (node features) or `(node, node)` pairs (edge features)
        to values, as produced by `_readDataTf`.
    valueType: string
        `"int"` or `"str"`.
    isEdge: boolean
        Whether `data` holds an edge feature.

    Notes
    -----
    String values are re-escaped with `_tfFromValue` (the inverse of
    `_valueFromTf`, which unescaped them at read time), so that tabs,
    newlines and backslashes inside values cannot break the tab-separated
    line format. For ints `_tfFromValue` is just `str()`.
    """
    isInt = valueType == "int"
    with fileOpen(outFile, mode="w") as fh:
        if isEdge:
            for ((n, m), v) in sorted(data.items()):
                # None means: unvalued edge; leave out the value field entirely
                vTf = "" if v is None else f"\t{_tfFromValue(v, isInt)}"
                fh.write(f"{n}\t{m}{vTf}\n")
        else:
            for (n, v) in sorted(data.items()):
                # nodes with value None get no line at all
                if v is not None:
                    fh.write(f"{n}\t{_tfFromValue(v, isInt)}\n")
def _valueFromTf(tf):
return "\\".join(
x.replace("\\t", "\t").replace("\\n", "\n") for x in tf.split("\\\\")
)
def _tfFromValue(val, isInt):
return (
str(val)
if isInt
else val.replace("\\", "\\\\").replace("\t", "\\t").replace("\n", "\\n")
)
def _setFromSpec(spec):
covered = set()
for r_str in spec.split(","):
bounds = r_str.split("-")
if len(bounds) == 1:
covered.add(int(r_str))
else:
b = int(bounds[0])
e = int(bounds[1])
if e < b:
(b, e) = (e, b)
for n in range(b, e + 1):
covered.add(n)
return covered