-
Notifications
You must be signed in to change notification settings - Fork 0
/
last-utils.rb
executable file
·451 lines (398 loc) · 15.2 KB
/
last-utils.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
#!/usr/bin/ruby1.8
require('openbabel')
#include OpenBabel <= may be used for direct access to OB namespace, i.e. w/o "OpenBabel::". Below, I use namespaces for clarity.
require('set')
%w[pp rubygems hpricot].each { |x| require x }
class LUGraph
attr_accessor :nodes, :edges
def initialize(nodes, edges)
@nodes = nodes
@edges = edges
end
# LAST-SMARTS
# to_smarts( nil, 0,0, 0, 1,<pa>) <== DEFAULT
def to_smarts(backw_e,backw_n,f,opt,init,mode)
opt = @edges[0][1].weight unless !init # initialize opt level to weight of first edge
mand_branches=0
opt_branches = @edges[f].size
different_opts = {}
mand_t = []
mand_e = []
opt_t = []
opt_e = []
@edges[f].each do |t,e|
if ((mode == 'ade' || mode == 'nde' || mode == 'ode') || e.del == 0)
if (mode == 'nop' || mode == 'ode' || e.weight >= opt)
mand_branches+=1
opt_branches-=1
mand_t << t
mand_e << e
else
opt_t << t
opt_e << e
end
else
opt_branches-=1
end
end
opt_e.each do |e|
if different_opts.has_key?(e.weight)
different_opts[e.weight]+=1
else
different_opts[e.weight]=1
end
end
nr_b = 0
if (mode == 'nls' || mode == 'nde')
nr_b = different_opts.max[1] unless different_opts.size == 0 # EITHER nls variant (DEFAULT) OR
elsif (mode == 'msa' || mode == 'ade')
nr_b = different_opts.values.max unless different_opts.size == 0 # msa variant
end
if opt_branches != opt_t.size
puts "Error! O: #{opt_branches} #{opt_t.size}"
exit
end
if mand_branches != mand_t.size
puts "Error! M: #{mand_branches} #{mand_t.size}"
exit
end
if (((nr_b == 0) && (opt_branches > 0)) || (nr_b > opt_branches))
puts "Error!"
exit
end
print "["
@nodes[f].to_smarts
do_branch=(opt_branches>1)
if do_branch
# 1) Uncomment next line to not force at least one branch (c.f. not. 17.11.09)
# print "$(" ; @nodes[f].to_smarts ; print "),"
# 1) end
print ";"
for i in 0..opt_branches-1
t = opt_t[i]
e = opt_e[i]
print "$("
print "[" # self
@nodes[f].to_smarts #
print "]" #
print "(" # branch
e.to_smarts #
to_smarts(e,f,t,e.weight,0,mode) # recursive: LAST-SMARTS
print ")" #
if backw_n!=f && !backw_e.nil? # backw
print "(" unless mand_branches==0 #
backw_e.to_smarts #
print "[" #
@nodes[backw_n].to_smarts #
print "]" #
print ")" unless mand_branches==0 #
end
for j in 0..mand_branches-1 # forw
print "(" unless j==mand_branches-1 #
mand_e[j].to_smarts #
print "[" #
@nodes[mand_t[j]].to_smarts #
print "]" #
print ")" unless j==mand_branches-1 #
end
print ")"
print "," unless i==opt_branches-1
end
end
print "]"
for i in 1..nr_b
print "(~*)" unless !do_branch
end
# 2) Uncomment next block to make single optional edges mandatory (c.f. not. 17.11.09)
if !do_branch && opt_branches==1
t = opt_t[0]
e = opt_e[0]
print "(" unless mand_branches==0
e.to_smarts
to_smarts(e,f,t,e.weight,0,mode)
print ")" unless mand_branches==0
end
# 2) end
for i in 0..mand_branches-1
t = mand_t[i]
e = mand_e[i]
print "(" unless i==mand_branches-1
e.to_smarts
to_smarts(e,f,t,e.weight,0,mode)
print ")" unless i==mand_branches-1
end
end
end
class LUNode
attr_accessor :id, :lab_n
def initialize(id)
@id = id
end
def to_smarts
l_s = @lab_n.split
if l_s.size > 1
for i in 0..l_s.size-1 do
if l_s[i].to_i > 150 # aromatic carbon support
print "##{l_s[i].to_i-150}"
else
print "##{l_s[i]}"
end
print "," unless i==l_s.size-1
end
else
if @lab_n.to_i > 150 # aromatic carbon support
print "##{@lab_n.to_i-150}"
else
print "##{@lab_n}"
end
end
end
end
class LUEdge
attr_accessor :source, :target, :lab_e, :weight, :del, :opt
def initialize(source, target)
@source = source
@target = target
end
def to_s
puts "'#{@source} #{@target} #{@lab_e} #{@weight} #{@del} #{@opt}'"
end
def to_smarts
l_e = @lab_e.split
if l_e.size > 1
aromatized = false
for i in 0..l_e.size-1
case l_e[i]
when '1' then
print "-"
when '2' then
print "="
when '3' then
print "#"
when '4' then
print ":"
aromatized = true
else
end
print "," unless i==l_e.size-1
end
print ",:" unless aromatized # always allow aromatic contexts
else
case @lab_e
# Uncomment the next line for explicit aliphatic bindings in notation
when '1' then print "-,:"
when '2' then print "=,:"
when '3' then print "#,:"
when '4' then print ":"
else
end
end
end
end
def read
# Store activites and hops seperately
activities = {}
hops = {}
xml = STDIN.read
doc, lu_graphs = Hpricot::XML(xml), []
# For each graph
graphs = {}
(doc/:graph).each do |g|
id=g.attributes['id'].to_i
# For each data tag
(g/:data).each do |d|
key = d.attributes['key']
assoc = case key
when 'act' then activities
when 'hops' then hops
else nil
end
assoc[id] = d.inner_html.to_i unless assoc.nil?
end
# For each node tag
graph_nodes = {}
(g/:node).each do |n|
node = LUNode.new(n.attributes['id'].to_i)
(n/:data).each do |data|
slot = data.inner_html
case data.attributes['key']
when 'lab_n' then node.lab_n = slot
else nil
end
end
graph_nodes[n.attributes['id'].to_i]=node
end
# For each edge tag
graph_edges = Hash.new{ |h,k| h[k]=Hash.new(&h.default_proc) }
(g/:edge).each do |e|
edge = LUEdge.new(e.attributes['source'].to_i, e.attributes['target'].to_i)
(e/:data).each do |data|
slot = data.inner_html
case data.attributes['key']
when 'lab_e' then edge.lab_e = slot
when 'weight' then edge.weight = slot.to_i
when 'del' then edge.del = slot.to_i
when 'opt' then edge.opt = slot.to_i
else nil
end
end
graph_edges[e.attributes['source'].to_i][e.attributes['target'].to_i] = edge
end
graphs[id] = LUGraph.new(graph_nodes, graph_edges)
#begin
# cnt=0
# graphs[id].edges.each do |f,_cmp_|
# cnt = cnt + graphs[id].edges[f].size
# end
# puts "Graph '#{id}' has '#{cnt}' edges and '#{graphs[id].nodes.size}' nodes."
#end
end
return {:grps => graphs, :acts => activities, :hops => hops}
end
def smarts(dom, mode)
dom[:grps].sort{|a,b| a[0]<=>b[0]}.each do |id, g|
#if g.edges[0][1].del == 0
print "#{id}\t"
print "#{dom[:acts][id]}\t"
g.to_smarts(nil,0,0,0,1,mode)
print "\n"
#end
end
end
def match (smiles, smarts, verbose=true)
c=OpenBabel::OBConversion.new
c.set_in_format 'smi'
m=OpenBabel::OBMol.new
c.read_string m, smiles
p=OpenBabel::OBSmartsPattern.new
if !p.init(smarts)
puts "Error! Smarts pattern invalid."
exit
end
if verbose
p.match(m)
hits = p.get_umap_list
print "Found #{hits.size} instances of the SMARTS pattern '#{smarts}' in the SMILES string '#{smiles}'."
if hits.size>0
puts " Here are the atom indices:"
else
print "\n"
end
hits.each_with_index do |hit, index|
print " Hit #{index}: [ "
hit.each do |atom_index|
print "#{atom_index} "
end
puts "]"
end
end
p.match(m,true)
end
def match_file (file)
smarts = STDIN.readlines
# build act hash
act_hash={}
$stderr.puts "Building activity database..."
File.open(file.sub(/.smi/, '.class')) do |cfile| # class
while (line = cfile.gets)
line_arr = line.split
act_hash[line_arr.first]=line_arr.last
end
end
smi_arr=[]
$stderr.puts "Reading instances..."
File.open(file, "r") do |infile| # smi
while (line = infile.gets)
smi_arr << line
end
end
$stderr.print "Processing smarts... "
smarts.each do |s|
result_hash={}
string_result=""
smi_arr.each do |smi|
result_hash[smi.split.first] = act_hash[smi.split.first] unless !match(smi.split.last,s.split.last,false)
#$stderr.print "#{result_hash.size} "
end
string_actives = " "
string_inactives = " "
result_hash.each do |id, act|
if act == "1"
string_actives << id << " "
elsif act == "0"
string_inactives << id << " "
end
end
string_result << "\"#{s.split.last}\"\t["
string_result << string_actives << "] ["
string_result << string_inactives << "]"
puts "#{string_result}"
nom = string_actives.split.size
den = (string_actives.split.size + string_inactives.split.size)
$stderr.print "#{nom}/#{den}(#{s.split[1]}) "
end
$stderr.puts
end
# Demonstrates different SMARTS patterns
def demo
# This pattern is equivalent to [#7][#7][#6] :(
puts
match("NNC", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("N(=O)NC", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("N(=O)NCN", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("N(=O)NC(N)N", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("N(=O)NC=O", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("N(=O)NC(N)=O", "[$([#7]),$([#7]=[#8])][$([#7]),$([#7][#6][#6]),$([#7][#6])][$([#6]),$([#6][#7]),$([#6]~[#7,#8])]") # yes (yes)
# This pattern is better (seems to demand a branch), but actually demands no branch since no explict degree is forced, which allows "folding" :(
puts
match("NNC", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("NN(C)C", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
match("NN(C)C(N)", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (yes)
match("NN(C)C(=O)", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (yes)
match("NN(CC)(C)C(=O)", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (yes)
match("NNC(=O)", "[#7][$([#7][#6][#6]),$([#7][#6])][$([#6][#7]),$([#6]~[#7,#8])]") # yes (no)
# This enhanced pattern enforces 1-step environments around the current node (f) and requires at least one branch :)
# See README for the recursive definition:
puts
match("NNC", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # no (no)
match("NN(C)C", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # no (no)
match("NNC(N)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # no (no)
match("NN(C)C(=N)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # yes (yes)
match("NN(C)C-N", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]")
match("NN(CC)(C)C(N)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # yes (yes)
match("NN(CC)(C)C(N)(=O)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # yes (yes)
match("NN(CC)C(=C)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # no (no)
match("NN(CN)C(=N)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # yes (yes) <- two embeddings is correct
match("NN(N)C(=N)", "[#7][#7;$([#7]([#6][#6])([#7])[#6]),$([#7]([#6])([#7])[#6])][#6;$([#6]([#7])[#7]),$([#6](-,=[#7,#8])[#7])]") # no (no)
end
# Main
STDOUT.sync = true
status=false
if $*.size==0 || $*.size>2
status=true
end
case $*[0]
when '1'
if !status
dom = read
smarts(dom, $*[1])
end
when '2'
if !status
match_file($*[1])
end
when '3'
demo
else
status=true
end
if status
puts "Usage: #{$0} 1 [ msa | nls | nop ] < /path/to/graphmlfile.graphml > /path/to/smartsfile.smarts"
puts " #{$0} 2 /path/to/smifile.smi < /path/to/smartsfile.smarts > /path/to/lastpmfile.lastpm"
puts " cmd=1 : convert GraphML to SMARTS."
puts " msa : All level max opt."
puts " nls : Next level opt."
puts " nop : No opt."
puts " cmd=2 : match SMARTS to SMILES file and create LASTPM file."
exit
end