# ai-se/where

Switch branches/tags
Nothing to show
Fetching contributors…
Cannot retrieve contributors at this time
397 lines (337 sloc) 9.68 KB
 """ # A Better Where WHERE2 is a near-linear time top-down clustering alogithm. WHERE2 updated an older where with new Python tricks. ## Standard Header Stuff """ from __future__ import division,print_function import sys sys.dont_write_bytecode = True from lib import * from nasa93 import * """ ## Dimensionality Reduction with Fastmp Project data in N dimensions down to a single dimension connecting twp distant points. Divide that data at the median of those projects. """ def fastmap(m,data): "Divide data into two using distance to two distant items." one = any(data) # 1) pick anything west = furthest(m,one,data) # 2) west is as far as you can go from anything east = furthest(m,west,data) # 3) east is as far as you can go from west c = dist(m,west,east) # now find everyone's distance lst = [] for one in data: a = dist(m,one,west) b = dist(m,one,east) x = (a*a + c*c - b*b)/(2*c) # cosine rule y = max(0, a**2 - x**2)**0.5 # not used, here for a demo lst += [(x,one)] lst = sorted(lst) mid = len(lst)//2 wests = map(second,lst[:mid]) easts = map(second,lst[mid:]) return wests,west, easts,east,c """ In the above: + _m_ is some model that generates candidate solutions that we wish to niche. + _(west,east)_ are not _the_ most distant points (that would require _N*N) distance calculations). But they are at least very distant to each other. This code needs some helper functions. _Dist_ uses the standard Euclidean measure. Note that you tune what it uses to define the niches (decisions or objectives) using the _what_ parameter: """ def dist(m,i,j, what = lambda m: m.decisions): "Euclidean distance 0 <= d <= 1 between decisions" n = len(i.cells) deltas = 0 for c in what(m): n1 = norm(m, c, i.cells[c]) n2 = norm(m, c, j.cells[c]) inc = (n1-n2)**2 deltas += inc n += abs(m.w[c]) return deltas**0.5 / n**0.5 """ The _Dist_ function normalizes all the raw values zero to one. """ def norm(m,c,val) : "Normalizes val in col c within model m 0..1" return (val- m.lo[c]) / (m.hi[c]- m.lo[c]+ 0.0001) """ Now we can define _furthest_: """ def furthest(m,i,all, init = 0, better = gt): "find which of all is furthest from 'i'" out,d= i,init for j in all: if i == j: continue tmp = dist(m,i,j) if better(tmp,d): out,d = j,tmp return out """ And of course, _closest_: """ def closest(m,i,all): return furthest(m,i,all,init=10**32,better=lt) """ ## WHERE2 = Recursive Fastmap WHERE2 finds everyone's else's distance from the poles and divide the data on the mean point of those distances. This all stops if: + Any division has _tooFew_ solutions (say, less than _sqrt_ of the total number of solutions). + Something has gone horribly wrong and you are recursing _tooDeep_ This code is controlled by the options in [_The_ settings](settingspy). For example, if _The.pruning_ is true, we may ignore some sub-tree (this process is discussed, later on). Also, if _The.verbose_ is true, the _show_ function prints out a little tree showing the progress (and to print indents in that tree, we use the string _The.b4_). For example, here's WHERE2 dividing 93 examples from NASA93. ---| _where |----------------- 93 |.. 46 |.. |.. 23 |.. |.. |.. 11 |.. |.. |.. |.. 5. |.. |.. |.. |.. 6. |.. |.. |.. 12 |.. |.. |.. |.. 6. |.. |.. |.. |.. 6. |.. |.. 23 |.. |.. |.. 11 |.. |.. |.. |.. 5. |.. |.. |.. |.. 6. |.. |.. |.. 12 |.. |.. |.. |.. 6. |.. |.. |.. |.. 6. |.. 47 |.. |.. 23 |.. |.. |.. 11 |.. |.. |.. |.. 5. |.. |.. |.. |.. 6. |.. |.. |.. 12 |.. |.. |.. |.. 6. |.. |.. |.. |.. 6. |.. |.. 24 |.. |.. |.. 12 |.. |.. |.. |.. 6. |.. |.. |.. |.. 6. |.. |.. |.. 12 |.. |.. |.. |.. 6. |.. |.. |.. |.. 6. WHERE2 returns clusters, where each cluster contains multiple solutions. """ def where2(m, data, lvl=0, up=None): node = o(val=None,_up=up,_kids=[]) def tooDeep(): return lvl > The.what.depthMax def tooFew() : return len(data) < The.what.minSize def show(suffix): if The.verbose: print(The.what.b4*lvl,len(data), suffix,' ; ',id(node) % 1000,sep='') if tooDeep() or tooFew(): show(".") node.val = data else: show("") wests,west, easts,east,c = fastmap(m,data) node.update(c=c,east=east,west=west) goLeft, goRight = maybePrune(m,lvl,west,east) if goLeft: node._kids += [where2(m, wests, lvl+1, node)] if goRight: node._kids += [where2(m, easts, lvl+1, node)] return node """ ## An Experimental Extensions Lately I've been experimenting with a system that prunes as it divides the data. GALE checks for domination between the poles and ignores data in halves with a dominated pole. This means that for _N_ solutions we only ever have to evaluate _2*log(N)_ of them- which is useful if each evaluation takes a long time. The niches found in this way contain non-dominated poles; i.e. they are approximations to the Pareto frontier. Preliminary results show that this is a useful approach but you should treat those results with a grain of salt. In any case, this code supports that pruning as an optional extra (and is enabled using the _slots.pruning_ flag). In summary, this code says if the scores for the poles are more different that _slots.wriggle_ and one pole has a better score than the other, then ignore the other pole. """ def maybePrune(m,lvl,west,east): "Usually, go left then right, unless dominated." goLeft, goRight = True,True # default if The.prune and lvl >= The.what.depthMin: sw = scores(m, west) se = scores(m, east) if abs(sw - se) > The.wriggle: # big enough to consider if se > sw: goLeft = False # no left if sw > se: goRight = False # no right return goLeft, goRight """ Note that I do not allow pruning until we have descended at least _slots.depthMin_ into the tree. ### Model-specific Stuff WHERE2 talks to models via the the following model-specific variables: + _m.cols_: list of indices in a list + _m.names_: a list of names for each column. + _m.decisions_: the subset of cols relating to decisions. + _m.obectives_: the subset of cols relating to objectives. + _m.eval(m,eg)_: function for computing variables from _eg_. + _m.lo[c]_ : the lowest value in column _c_. + _m.hi[c]_ : the highest value in column _c_. + _m.w[c]_: the weight for each column. Usually equal to one. If an objective and if we are minimizing that objective, then the weight is negative. ### Model-general stuff Using the model-specific stuff, WHERE2 defines some useful general functions. """ def some(m,x) : "with variable x of model m, pick one value at random" return m.lo[x] + by(m.hi[x] - m.lo[x]) def scores(m,it): "Score an individual." if not it.scored: m.eval(m,it) new, w = 0, 0 for c in m.objectives: val = it.cells[c] w += abs(m.w[c]) tmp = norm(m,c,val) if m.w[c] < 0: tmp = 1 - tmp new += (tmp**2) it.score = (new**0.5) / (w**0.5) it.scored = True return it.score """ ## Tree Code Tools for manipulating the tree returned by _where2_. ### Primitive: Walk the nodes """ def nodes(tree,seen=None,steps=0): if seen is None: seen=[] if tree: if not id(tree) in seen: seen.append(id(tree)) yield tree,steps for kid in tree._kids: for sub,steps1 in nodes(kid,seen,steps+1): yield sub,steps1 """ ### Return nodes that are leaves """ def leaves(tree,seen=None,steps=0): for node,steps1 in nodes(tree,seen,steps): if not node._kids: yield node,steps1 """ ### Return nodes nearest to furthest """ def neighbors(leaf,seen=None,steps=-1): """Walk the tree from 'leaf' increasingly distant leaves. """ if seen is None: seen=[] for down,steps1 in leaves(leaf,seen,steps+1): yield down,steps1 if leaf: for up,steps1 in neighbors(leaf._up, seen,steps+1): yield up,steps1 """ ### Return nodes in Groups, Closest to Furthest """ def around(leaf, f=lambda x: x): tmp,last = [], None for node,dist in neighbors(leaf): if dist > 0: if dist == last: tmp += [f(node)] else: if tmp: yield last,tmp tmp = [f(node)] last = dist if tmp: yield last,tmp """ ## Demo Code ### Code Showing the scores """ #@go def _scores(): m = nasa93() out = [] for row in m._rows: scores(m,row) out += [(row.score, [row.cells[c] for c in m.objectives])] for s,x in sorted(out): print(s,x) """ ### Code Showing the Distances """ #@go def _distances(m=nasa93): m=m() seed(The.seed) for i in m._rows: j = closest(m,i, m._rows) k = furthest(m,i, m._rows) idec = [i.cells[c] for c in m.decisions] jdec = [j.cells[c] for c in m.decisions] kdec = [k.cells[c] for c in m.decisions] print("\n", gs(idec), g(scores(m,i)),"\n", gs(jdec),"closest ", g(dist(m,i,j)),"\n", gs(kdec),"furthest", g(dist(m,i,k))) """ ### A Demo for Where2. """ @go def _where(m=nasa93): m= m() seed(1) told=N() for r in m._rows: s = scores(m,r) told += s global The The=defaults().update(verbose = True, minSize = len(m._rows)**0.5, prune = False, wriggle = 0.3*told.sd()) tree = where2(m, m._rows) n=0 for node,_ in leaves(tree): m = len(node.val) #print(m,' ',end="") n += m print(id(node) % 1000, ' ',end='') for near,dist in neighbors(node): print(dist,id(near) % 1000,' ',end='') print("") print(n) filter = lambda z: id(z) % 1000 for node,_ in leaves(tree): print(filter(node), [x for x in around(node,filter)])