Browse files

Add fragment testers.

These are a bit like the packet finders: they test for known
data, but they work on sub-regions of a single subpage.

For example, a fragment could test for the BBC logo.

The Fragment constructor takes an array containing the bytes
to look for, and an (x,y,w,h) tuple describing the rectangle
to search on pages.

fragment.py also contains FragmentBuilder which searches
in a large number of subpages to find common fragments,
and then dumps them. Then you can copy paste that output
back into the source code to define a new fragment.
  • Loading branch information...
1 parent 2c41531 commit fa5748b716e74d6a69e0ca13b48076e3fc845ebf @ali1234 committed Sep 14, 2011
Showing with 738 additions and 100 deletions.
  1. +634 −0 fragment.py
  2. +85 −0 page.py
  3. +11 −100 subpagesquash.py
  4. +8 −0 util.py
View
634 fragment.py
@@ -0,0 +1,634 @@
+#!/usr/bin/env python
+
+import sys, os
+import numpy as np
+
+from util import subcode_bcd, mrag, page, bitwise_mode
+from printer import Printer, do_print
+from page import Page
+
class Fragment(object):
    """A known rectangle of page bytes that can be recognised and restored.

    A fragment holds the expected byte values for a ``w`` x ``h`` cell
    rectangle whose top-left corner is at column ``x``, row ``y`` of a
    page's ``array``.  ``test`` scores how well a page matches the fragment
    and ``fix`` overwrites the page region with the known-good bytes.
    """

    def __init__(self, a, rect, name='Unnamed fragment'):
        """Store the expected bytes, their location and a display name.

        a    -- 2-D array of expected values, indexed [row][column].
        rect -- (x, y, w, h) tuple locating the rectangle on a page.
        name -- human readable label used by ``repr``.
        """
        # The rectangle is still passed as one positional tuple; it is
        # unpacked here because tuple parameters in a signature are
        # Python 2-only syntax.
        x, y, w, h = rect
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        # Not read by any code in this module; kept so the attribute
        # remains available to external users.
        self.t = (w*h)-5
        self.array = a
        self.name = name

    def __repr__(self):
        """Return source text that recreates this fragment.

        The array is formatted explicitly rather than by patching
        ``repr(array)``: for an array that is already uint8 the old approach
        produced two ``dtype=`` arguments, which is not valid Python.
        """
        aa = 'np.array(%s, dtype=np.uint8)' % np.array2string(
            np.asarray(self.array), separator=', ',
            prefix='Fragment(np.array(')
        return 'Fragment(%s, (%d,%d,%d,%d), %s)' % (
            aa, self.x, self.y, self.w, self.h, repr(self.name))

    def dump(self):
        """Print the constructor text followed by each row rendered via Printer."""
        print(repr(self))
        for i in range(self.h):
            print(Printer(self.array[i]).string_ansi())
        print('--')

    def test(self, p):
        """Score how well page ``p`` matches this fragment.

        Returns 0 when the page does not match, otherwise the number of
        cells in the rectangle that agree with the fragment (higher is a
        better match).
        """
        f = p.array[self.y:self.y+self.h, self.x:self.x+self.w]
        # A rectangle that runs off the edge of the page cannot match.
        if f.shape != np.shape(self.array):
            return 0
        rows = (f != self.array).sum(axis=1)
        # at most 2 errors per line - so as not to overwrite subpage indicator
        if (rows > 2).any():
            return 0
        # Score on the rectangle's area.  The position (x*y) was used here
        # before, which made valid matches near the top-left corner score
        # zero or less and therefore be treated as failures.
        return (self.w*self.h) - rows.sum()

    def fix(self, p):
        """Overwrite the fragment's rectangle on page ``p`` with the known bytes."""
        p.array[self.y:self.y+self.h, self.x:self.x+self.w] = self.array

    def test_and_fix(self, p):
        """Apply ``fix`` to ``p`` only if ``test`` reports a match."""
        # Compare against zero explicitly, matching how the squasher
        # interprets the score.
        if self.test(p) > 0:
            self.fix(p)
+
+
+fragments = [
+
+# only one fragment from each sublist can match
+# best will be picked
+
+[ # top left, 3 line
+Fragment(np.array([[ 23, 106, 35, 51, 107, 106, 35, 51, 107, 106, 35, 51, 107],
+ [ 23, 106, 32, 36, 107, 106, 32, 36, 107, 106, 32, 39, 107],
+ [ 23, 34, 35, 35, 35, 34, 35, 35, 35, 34, 35, 35, 35]],
+ dtype=np.uint8), (2,2,13,3), 'BBC logo, top aligned'),
+
+Fragment(np.array([[ 23, 104, 44, 44, 108, 104, 44, 44, 108, 104, 44, 44, 108],
+ [ 23, 106, 32, 49, 110, 106, 32, 49, 110, 106, 32, 61, 110],
+ [ 23, 42, 44, 44, 46, 42, 44, 44, 46, 42, 44, 44, 46]],
+ dtype=np.uint8), (2,2,13,3), 'BBC logo, middle aligned'),
+
+Fragment(np.array([[ 23, 96, 112, 112, 112, 96, 112, 112, 112, 96, 112, 112, 112],
+ [ 23, 106, 32, 36, 122, 106, 32, 36, 122, 106, 32, 116, 122],
+ [ 23, 106, 112, 113, 122, 106, 112, 113, 122, 106, 112, 113, 122]],
+ dtype=np.uint8), (2,2,13,3), 'BBC logo, bottom aligned'),
+
+Fragment(np.array([[ 23, 96, 112, 112, 112, 96, 112, 112, 112, 96, 112, 112, 112],
+ [ 23, 106, 32, 36, 122, 106, 32, 36, 122, 106, 32, 116, 122],
+ [ 23, 106, 32, 37, 106, 106, 32, 37, 106, 106, 32, 39, 107],
+ [ 23, 34, 35, 35, 35, 34, 35, 35, 35, 34, 35, 35, 35]],
+ dtype=np.uint8), (2,2,13,4), 'BBC logo, 4 line'),
+
+Fragment(np.array([[ 22, 104, 60, 44, 44, 44, 44, 44, 44, 124],
+ [ 22, 106, 53, 107, 55, 106, 53, 127, 32, 127],
+ [ 22, 106, 53, 106, 53, 34, 109, 39, 32, 127],
+ [ 22, 42, 45, 44, 44, 44, 44, 44, 44, 47]],
+ dtype=np.uint8), (2,2,10,4), 'TV logo'),
+
+Fragment(np.array([[ 23, 32, 32, 32, 32, 96, 104, 32],
+ [ 23, 60, 44, 44, 44, 44, 124, 52],
+ [ 22, 119, 103, 103, 103, 103, 127, 53],
+ [ 22, 45, 45, 45, 45, 45, 47, 37]],
+ dtype=np.uint8), (2,2,8,4), 'Radio graphic')
+
+],
+
+[ # header, 3 line
+Fragment(np.array([[ 20, 29, 18, 32, 104, 60, 104, 60, 124, 104, 60, 124, 40,
+ 124, 36, 124, 108, 52, 124, 108, 52, 124, 32, 124, 32, 32, 32],
+ [ 20, 29, 18, 32, 106, 55, 106, 117, 127, 106, 117, 127, 32,
+ 127, 32, 127, 123, 52, 127, 107, 53, 127, 48, 127, 48, 32, 32],
+ [ 20, 47, 47, 47, 45, 46, 45, 44, 44, 45, 44, 44, 47,
+ 44, 47, 44, 44, 46, 44, 45, 46, 44, 46, 44, 46, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'FOOTBALL header'),
+
+Fragment(np.array([[ 20, 124, 124, 124, 124, 124, 124, 60, 108, 60, 108, 44, 44,
+ 44, 60, 44, 44, 44, 60, 44, 108, 124, 124, 124, 124, 124, 124],
+ [ 20, 29, 32, 19, 32, 32, 32, 106, 117, 122, 53, 127, 35,
+ 127, 106, 55, 55, 127, 106, 119, 49, 32, 32, 32, 32, 32, 32],
+ [ 20, 29, 32, 19, 32, 32, 32, 42, 37, 42, 37, 47, 44,
+ 47, 42, 37, 37, 47, 42, 45, 36, 32, 32, 32, 32, 32, 32]],
+ dtype=np.uint8), (15,2,27,3), 'HOME header'),
+
+Fragment(np.array([[ 20, 124, 124, 124, 124, 124, 44, 60, 60, 108, 44, 44, 44,
+ 60, 44, 44, 108, 44, 124, 60, 44, 44, 124, 124, 124, 124, 124],
+ [ 20, 29, 32, 19, 32, 32, 127, 106, 106, 53, 127, 35, 127,
+ 106, 119, 123, 37, 127, 32, 106, 55, 107, 52, 32, 32, 32, 32],
+ [ 20, 29, 32, 19, 32, 32, 43, 46, 46, 33, 47, 44, 47,
+ 42, 37, 42, 37, 47, 44, 42, 45, 46, 33, 32, 32, 32, 32]],
+ dtype=np.uint8), (15,2,27,3), 'WORLD header'),
+
+Fragment(np.array([[ 20, 29, 20, 32, 32, 18, 104, 60, 44, 104, 60, 124, 104,
+ 60, 124, 104, 60, 124, 40, 124, 36, 32, 32, 32, 32, 32, 32],
+ [ 20, 29, 20, 32, 32, 18, 98, 115, 127, 106, 55, 35, 106,
+ 117, 127, 106, 55, 125, 32, 127, 32, 32, 32, 32, 32, 32, 32],
+ [ 20, 47, 47, 47, 47, 47, 45, 44, 44, 45, 46, 47, 45,
+ 44, 44, 45, 46, 44, 47, 44, 47, 47, 47, 47, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'SPORT header'),
+
+Fragment(np.array([[ 20, 29, 20, 32, 32, 18, 104, 60, 36, 124, 108, 52, 124,
+ 104, 60, 36, 124, 104, 52, 124, 36, 108, 60, 32, 32, 32, 32],
+ [ 20, 29, 20, 32, 32, 18, 106, 117, 48, 127, 107, 52, 127,
+ 106, 117, 48, 127, 107, 52, 127, 49, 106, 53, 32, 32, 32, 32],
+ [ 20, 47, 47, 47, 47, 47, 45, 44, 46, 44, 45, 46, 44,
+ 45, 44, 46, 44, 45, 46, 44, 46, 45, 46, 47, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'CRICKET header'),
+
+Fragment(np.array([[ 20, 29, 18, 32, 32, 32, 124, 108, 52, 124, 108, 52, 124,
+ 44, 104, 52, 124, 108, 52, 124, 44, 44, 32, 32, 32, 32, 32],
+ [ 20, 29, 18, 32, 32, 32, 127, 107, 52, 127, 107, 53, 127,
+ 112, 106, 53, 127, 106, 53, 127, 114, 127, 32, 32, 32, 32, 32],
+ [ 20, 47, 47, 47, 47, 47, 44, 45, 46, 44, 45, 46, 44,
+ 44, 45, 46, 44, 45, 46, 44, 44, 44, 47, 47, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'RACING header'),
+
+Fragment(np.array([[ 20, 29, 17, 19, 104, 52, 52, 124, 104, 60, 104, 60, 124,
+ 40, 124, 36, 124, 104, 52, 124, 36, 124, 108, 32, 32, 32, 32],
+ [ 20, 29, 17, 19, 42, 117, 117, 63, 106, 119, 106, 55, 127,
+ 32, 127, 32, 127, 107, 53, 127, 49, 127, 107, 52, 32, 32, 32],
+ [ 20, 47, 47, 47, 47, 44, 44, 46, 45, 44, 45, 46, 44,
+ 47, 44, 47, 44, 45, 46, 44, 46, 44, 45, 46, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'WEATHER header'),
+
+Fragment(np.array([[ 20, 29, 19, 60, 108, 104, 104, 44, 52, 32, 56, 44, 48,
+ 52, 104, 104, 44, 52, 52, 32, 52, 108, 36, 52, 104, 32, 32],
+ [ 20, 29, 19, 55, 107, 106, 106, 35, 52, 32, 101, 56, 49,
+ 117, 122, 106, 35, 53, 117, 48, 53, 106, 32, 115, 123, 32, 32],
+ [ 20, 47, 47, 46, 45, 45, 45, 47, 46, 47, 45, 46, 47,
+ 44, 44, 45, 47, 46, 44, 46, 46, 45, 47, 44, 44, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'AIR QUALITY header'),
+
+Fragment(np.array([[ 22, 29, 20, 32, 40, 108, 60, 36, 124, 44, 124, 104, 60,
+ 108, 52, 124, 32, 104, 52, 124, 44, 104, 52, 32, 32, 32, 32],
+ [ 22, 29, 20, 32, 32, 106, 53, 32, 127, 35, 125, 106, 55,
+ 107, 53, 34, 109, 39, 32, 127, 115, 106, 117, 48, 32, 32, 32],
+ [ 22, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]],
+ dtype=np.uint8), (15,2,27,3), 'TRAVEL header'),
+
+Fragment(np.array([[ 22, 124, 124, 60, 44, 44, 108, 44, 44, 44, 60, 44, 44,
+ 108, 44, 124, 60, 108, 44, 44, 60, 108, 124, 124, 124, 124,
+ 124],
+ [ 22, 29, 20, 34, 107, 55, 33, 127, 115, 63, 106, 119, 123,
+ 53, 111, 48, 122, 37, 127, 115, 106, 53, 32, 32, 32, 32,
+ 32],
+ [ 22, 29, 20, 32, 42, 37, 32, 47, 32, 47, 42, 37, 42,
+ 37, 32, 43, 33, 32, 47, 44, 42, 45, 36, 32, 32, 32,
+ 32]], dtype=np.uint8), (15,2,27,3), 'TRAVEL header, alt 1'),
+
+Fragment(np.array([[ 22, 124, 124, 124, 124, 60, 44, 44, 108, 44, 44, 44, 60,
+ 44, 44, 108, 44, 124, 60, 108, 44, 44, 60, 108, 124, 124,
+ 124],
+ [ 22, 29, 20, 32, 32, 34, 107, 55, 33, 127, 115, 63, 106,
+ 119, 123, 53, 111, 48, 122, 37, 127, 115, 106, 53, 32, 32,
+ 32],
+ [ 22, 29, 20, 32, 32, 32, 42, 37, 32, 47, 32, 47, 42,
+ 37, 42, 37, 32, 43, 33, 32, 47, 44, 42, 45, 36, 32,
+ 32]], dtype=np.uint8), (15,2,27,3), 'TRAVEL header, alt 2'),
+
+Fragment(np.array([[ 20, 124, 124, 124, 124, 124, 60, 44, 44, 60, 108, 44, 60,
+ 108, 44, 124, 44, 108, 60, 44, 44, 108, 124, 124, 124, 124,
+ 124],
+ [ 20, 29, 22, 32, 32, 32, 106, 119, 115, 106, 117, 63, 106,
+ 53, 127, 104, 55, 125, 106, 55, 115, 49, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 22, 32, 32, 32, 40, 44, 47, 42, 37, 47, 42,
+ 37, 47, 42, 37, 47, 42, 45, 46, 37, 32, 32, 32, 32,
+ 32]], dtype=np.uint8), (15,2,27,3), 'SKIING header'),
+
+
+Fragment(np.array([[ 20, 29, 18, 32, 32, 32, 32, 32, 32, 124, 108, 48, 124,
+ 44, 96, 60, 36, 124, 44, 36, 32, 32, 32, 32, 32, 32, 32],
+ [ 20, 29, 18, 32, 32, 32, 32, 32, 32, 127, 122, 37, 127,
+ 115, 42, 117, 48, 115, 123, 53, 32, 32, 32, 32, 32, 32, 32],
+ [ 20, 47, 47, 47, 47, 47, 47, 47, 47, 44, 44, 47, 44,
+ 44, 47, 44, 46, 44, 44, 46, 47, 47, 47, 47, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'DECS header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 52, 104, 104, 104, 44, 36, 60, 52, 104,
+ 104, 32, 104, 104, 104, 108, 104, 104, 108, 104, 108, 32, 32, 32],
+ [ 20, 29, 18, 55, 107, 106, 122, 106, 114, 53, 119, 123, 98,
+ 123, 32, 106, 122, 106, 106, 106, 106, 122, 106, 106, 32, 32, 32],
+ [ 20, 47, 47, 46, 45, 45, 44, 45, 44, 46, 44, 44, 45,
+ 44, 47, 45, 44, 45, 45, 45, 45, 44, 45, 45, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'RUGBY UNION header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 52, 104, 104, 104, 44, 36, 60, 52, 104,
+ 104, 32, 104, 32, 60, 104, 108, 104, 44, 36, 52, 52, 60, 32],
+ [ 20, 29, 18, 55, 107, 106, 122, 106, 114, 53, 119, 123, 98,
+ 123, 32, 106, 48, 119, 106, 107, 106, 114, 53, 117, 53, 119, 32],
+ [ 20, 47, 47, 46, 45, 45, 44, 45, 44, 46, 44, 44, 45,
+ 44, 47, 45, 46, 44, 45, 45, 45, 44, 46, 44, 46, 44, 47]],
+ dtype=np.uint8), (15,2,27,3), 'RUGBY LEAGUE header'),
+
+Fragment(np.array([[ 20, 29, 18, 32, 124, 44, 36, 120, 108, 48, 124, 108, 52,
+ 124, 108, 52, 124, 104, 52, 124, 36, 124, 108, 52, 32, 32, 32],
+ [ 20, 29, 18, 32, 115, 123, 53, 127, 106, 53, 127, 122, 53,
+ 127, 122, 53, 127, 107, 52, 127, 49, 127, 107, 52, 32, 32, 32],
+ [ 20, 47, 47, 47, 44, 44, 46, 44, 45, 46, 44, 44, 46,
+ 44, 44, 46, 44, 45, 46, 44, 46, 44, 45, 46, 47, 47, 47]],
+ dtype=np.uint8), (15,2,27,3), 'SNOOKER header'),
+
+Fragment(np.array([[ 20, 29, 18, 32, 32, 32, 40, 124, 36, 124, 36, 120, 108,
+ 48, 120, 108, 48, 124, 104, 60, 44, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 18, 32, 32, 32, 32, 127, 32, 127, 49, 127, 106,
+ 53, 127, 106, 53, 127, 98, 115, 127, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 47, 47, 47, 47, 44, 47, 44, 46, 44, 45,
+ 46, 44, 45, 46, 44, 45, 44, 44, 47, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'TENNIS header'),
+
+Fragment(np.array([[ 20, 29, 18, 104, 60, 44, 36, 124, 108, 52, 124, 104, 60,
+ 116, 104, 52, 124, 108, 52, 124, 108, 52, 120, 108, 48, 32,
+ 32],
+ [ 20, 29, 18, 106, 117, 123, 53, 127, 107, 52, 127, 106, 117,
+ 63, 106, 53, 127, 107, 52, 127, 122, 53, 127, 106, 53, 32,
+ 32],
+ [ 20, 47, 47, 45, 44, 44, 46, 44, 45, 46, 44, 45, 44,
+ 46, 45, 46, 44, 45, 46, 44, 44, 46, 44, 45, 46, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'GRIDIRON header'),
+
+Fragment(np.array([[ 20, 29, 18, 124, 44, 36, 124, 104, 104, 52, 124, 96, 60,
+ 60, 116, 96, 60, 60, 116, 104, 52, 120, 108, 48, 124, 44,
+ 36],
+ [ 20, 29, 18, 115, 123, 53, 111, 122, 122, 37, 127, 106, 53,
+ 53, 127, 106, 53, 53, 127, 106, 53, 127, 106, 53, 127, 114,
+ 53],
+ [ 20, 47, 47, 44, 44, 46, 45, 44, 44, 47, 44, 45, 46,
+ 46, 44, 45, 46, 46, 44, 45, 46, 44, 45, 46, 44, 44,
+ 46]], dtype=np.uint8), (15,2,27,3), 'SWIMMING header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 52, 104, 44, 52, 60, 44, 104, 104, 32,
+ 60, 36, 108, 36, 60, 52, 104, 44, 52, 52, 32, 52, 32,
+ 32],
+ [ 20, 29, 18, 119, 123, 106, 35, 53, 115, 123, 106, 35, 53,
+ 119, 49, 106, 32, 119, 123, 106, 35, 53, 117, 48, 117, 32,
+ 32],
+ [ 20, 47, 47, 44, 44, 45, 47, 46, 44, 44, 45, 47, 46,
+ 44, 46, 45, 47, 44, 44, 45, 47, 46, 44, 46, 44, 46,
+ 47]], dtype=np.uint8), (15,2,27,3), 'BASKETBALL header'),
+
+Fragment(np.array([[ 20, 29, 18, 124, 104, 52, 124, 108, 52, 124, 44, 104, 52,
+ 124, 40, 124, 36, 124, 104, 60, 124, 104, 60, 44, 32, 32,
+ 32],
+ [ 20, 29, 18, 115, 123, 53, 127, 107, 53, 127, 112, 106, 55,
+ 127, 32, 127, 32, 127, 106, 53, 127, 106, 117, 123, 32, 32,
+ 32],
+ [ 20, 47, 47, 44, 44, 46, 44, 45, 46, 44, 44, 45, 46,
+ 44, 46, 44, 47, 44, 45, 46, 44, 45, 44, 44, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'YACHTING header'),
+
+Fragment(np.array([[ 20, 29, 20, 32, 18, 104, 60, 124, 104, 60, 124, 104, 52,
+ 124, 104, 52, 120, 108, 48, 124, 44, 44, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 20, 32, 18, 106, 119, 125, 106, 117, 127, 104, 55,
+ 125, 106, 53, 127, 106, 53, 127, 114, 127, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 47, 47, 45, 44, 44, 45, 44, 44, 45, 46,
+ 44, 45, 46, 44, 45, 46, 44, 44, 44, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'BOXING header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 44, 104, 108, 104, 32, 60, 36, 32, 32,
+ 32, 40, 60, 104, 44, 104, 44, 52, 60, 108, 104, 104, 44,
+ 32],
+ [ 20, 29, 18, 117, 123, 106, 122, 106, 48, 55, 33, 65, 78,
+ 68, 32, 53, 106, 115, 106, 32, 53, 53, 106, 106, 98, 123,
+ 32],
+ [ 20, 47, 47, 44, 44, 45, 44, 45, 46, 46, 47, 47, 47,
+ 47, 47, 46, 45, 44, 45, 47, 46, 46, 45, 45, 45, 44,
+ 47]], dtype=np.uint8), (15,2,27,3), 'GOLF AND TENNIS header'),
+
+Fragment(np.array([[ 20, 29, 18, 104, 60, 124, 40, 124, 36, 124, 104, 52, 124,
+ 32, 124, 36, 108, 60, 104, 52, 124, 44, 104, 60, 44, 32,
+ 32],
+ [ 20, 29, 18, 106, 55, 127, 32, 127, 32, 127, 107, 53, 127,
+ 48, 127, 49, 106, 53, 106, 53, 127, 112, 98, 115, 127, 32,
+ 32],
+ [ 20, 47, 47, 45, 46, 44, 47, 44, 47, 44, 45, 46, 44,
+ 46, 44, 46, 45, 46, 45, 46, 44, 44, 45, 44, 44, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'ATHLETICS headers'),
+
+Fragment(np.array([[ 20, 29, 18, 104, 60, 124, 104, 52, 44, 124, 44, 104, 44,
+ 36, 124, 104, 52, 32, 104, 52, 120, 108, 48, 32, 32, 32,
+ 32],
+ [ 20, 29, 18, 106, 55, 35, 106, 53, 32, 127, 32, 106, 112,
+ 48, 127, 107, 53, 35, 106, 53, 127, 106, 53, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 45, 46, 47, 45, 46, 47, 44, 47, 45, 44,
+ 46, 44, 45, 46, 47, 45, 46, 44, 45, 46, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'PITCH-IN header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 36, 60, 108, 104, 108, 108, 104, 44, 52,
+ 60, 36, 108, 36, 52, 108, 36, 52, 60, 108, 104, 44, 52,
+ 32],
+ [ 20, 29, 18, 117, 48, 117, 122, 106, 106, 106, 106, 35, 33,
+ 119, 49, 106, 32, 53, 106, 32, 53, 117, 122, 106, 32, 53,
+ 32],
+ [ 20, 47, 47, 44, 46, 44, 44, 45, 45, 45, 45, 47, 47,
+ 44, 46, 45, 47, 46, 45, 47, 46, 44, 44, 45, 47, 46,
+ 47]], dtype=np.uint8), (15,2,27,3), 'COMPETITION header'),
+
+Fragment(np.array([[ 17, 29, 23, 32, 32, 124, 44, 104, 52, 120, 108, 48, 120,
+ 108, 48, 120, 108, 48, 124, 44, 104, 60, 32, 32, 32, 32,
+ 32],
+ [ 17, 29, 23, 32, 32, 127, 35, 106, 53, 127, 106, 53, 127,
+ 107, 53, 127, 106, 53, 127, 112, 106, 119, 32, 32, 32, 32,
+ 32],
+ [ 17, 47, 47, 47, 47, 44, 47, 45, 46, 44, 45, 46, 44,
+ 45, 46, 44, 45, 46, 44, 44, 45, 44, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'FINANCE header'),
+
+Fragment(np.array([[ 20, 29, 20, 32, 32, 32, 19, 124, 96, 104, 52, 124, 108,
+ 52, 124, 32, 124, 36, 124, 44, 36, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 20, 32, 32, 32, 19, 111, 122, 122, 37, 127, 107,
+ 53, 127, 48, 127, 49, 115, 123, 53, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 47, 47, 47, 47, 45, 44, 44, 47, 44, 45,
+ 46, 44, 46, 44, 46, 44, 44, 46, 47, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'WALES header'),
+
+Fragment(np.array([[ 20, 124, 124, 124, 60, 44, 44, 60, 44, 108, 44, 124, 44,
+ 60, 44, 44, 108, 44, 60, 44, 44, 60, 44, 44, 124, 124,
+ 124],
+ [ 20, 29, 20, 23, 106, 119, 63, 106, 119, 49, 127, 32, 127,
+ 106, 55, 115, 49, 127, 106, 55, 127, 106, 55, 127, 32, 32,
+ 32],
+ [ 20, 29, 20, 23, 42, 37, 47, 42, 45, 36, 47, 36, 47,
+ 42, 45, 46, 37, 47, 42, 45, 47, 42, 37, 47, 32, 32,
+ 32]], dtype=np.uint8), (15,2,27,3), 'RELIGION header'),
+
+Fragment(np.array([[ 20, 124, 124, 44, 44, 108, 44, 60, 108, 44, 44, 44, 108,
+ 44, 44, 44, 108, 44, 44, 108, 44, 44, 108, 44, 60, 108,
+ 124],
+ [ 20, 29, 19, 127, 115, 49, 127, 106, 53, 127, 107, 107, 53,
+ 127, 107, 107, 53, 127, 123, 53, 127, 123, 37, 127, 122, 53,
+ 32],
+ [ 20, 29, 19, 44, 46, 37, 47, 46, 37, 47, 42, 42, 37,
+ 47, 42, 42, 37, 47, 42, 37, 47, 42, 37, 44, 46, 37,
+ 32]], dtype=np.uint8), (15,2,27,3), 'SUMMARY header'),
+
+Fragment(np.array([[ 4, 29, 19, 108, 60, 104, 52, 124, 104, 60, 36, 32, 124,
+ 48, 124, 104, 60, 36, 124, 104, 104, 52, 124, 44, 36, 20,
+ 127],
+ [ 4, 29, 19, 106, 53, 106, 55, 127, 106, 119, 49, 32, 127,
+ 43, 127, 106, 119, 49, 111, 122, 122, 37, 115, 123, 53, 20,
+ 127],
+ [ 20, 47, 47, 45, 46, 45, 46, 44, 45, 44, 46, 47, 44,
+ 47, 44, 45, 44, 46, 45, 44, 44, 47, 44, 44, 46, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'THE NEWS header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 60, 52, 60, 108, 40, 60, 104, 44, 52,
+ 60, 52, 104, 44, 104, 108, 104, 44, 52, 60, 52, 108, 36,
+ 32],
+ [ 20, 29, 18, 53, 53, 53, 117, 122, 32, 53, 106, 112, 53,
+ 55, 107, 98, 123, 106, 35, 106, 112, 53, 55, 107, 106, 32,
+ 32],
+ [ 20, 47, 47, 46, 46, 46, 44, 44, 47, 46, 45, 44, 46,
+ 46, 45, 45, 44, 45, 47, 45, 44, 46, 46, 45, 45, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'MOTORSPORT header'),
+
+Fragment(np.array([[ 20, 29, 18, 32, 32, 32, 32, 32, 32, 124, 44, 44, 104,
+ 60, 124, 104, 52, 32, 124, 44, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 18, 32, 32, 32, 32, 32, 32, 127, 114, 127, 106,
+ 117, 127, 106, 117, 48, 127, 35, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 47, 47, 47, 47, 47, 47, 44, 44, 44, 45,
+ 44, 44, 45, 44, 46, 44, 47, 47, 47, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'GOLF header'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 52, 104, 104, 104, 44, 36, 60, 52, 104,
+ 104, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 18, 55, 107, 106, 122, 106, 114, 53, 119, 123, 98,
+ 123, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 46, 45, 45, 44, 45, 44, 46, 44, 44, 45,
+ 44, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'RUGBY header'),
+
+Fragment(np.array([[ 20, 29, 18, 124, 104, 60, 36, 124, 36, 32, 124, 104, 52,
+ 124, 108, 52, 124, 44, 104, 52, 124, 104, 60, 104, 52, 124,
+ 32],
+ [ 20, 29, 18, 127, 106, 117, 48, 127, 49, 32, 127, 107, 53,
+ 127, 122, 53, 127, 112, 106, 55, 125, 106, 119, 98, 115, 127,
+ 32],
+ [ 20, 47, 47, 44, 45, 44, 46, 44, 46, 47, 44, 45, 46,
+ 44, 44, 46, 44, 44, 45, 46, 44, 45, 44, 45, 44, 44,
+ 47]], dtype=np.uint8), (15,2,27,3), 'ICE HOCKEY header'),
+
+Fragment(np.array([[ 20, 29, 18, 104, 44, 104, 108, 104, 108, 32, 60, 60, 52,
+ 52, 52, 52, 104, 108, 32, 104, 108, 104, 108, 104, 36, 32,
+ 32],
+ [ 20, 29, 18, 106, 35, 106, 122, 106, 35, 53, 53, 53, 53,
+ 117, 53, 117, 106, 107, 32, 106, 122, 106, 106, 106, 49, 32,
+ 32],
+ [ 20, 47, 47, 45, 47, 45, 44, 45, 47, 46, 46, 46, 46,
+ 44, 46, 44, 45, 45, 47, 45, 44, 45, 45, 45, 46, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'FORMULA ONE header'),
+
+Fragment(np.array([[ 20, 29, 18, 124, 108, 52, 124, 108, 52, 108, 60, 104, 60,
+ 32, 104, 60, 124, 104, 60, 124, 104, 60, 44, 40, 124, 36,
+ 32],
+ [ 20, 29, 18, 127, 107, 53, 127, 106, 53, 106, 53, 106, 119,
+ 32, 106, 55, 35, 106, 117, 127, 98, 115, 127, 32, 127, 32,
+ 32],
+ [ 20, 47, 47, 44, 45, 46, 44, 45, 46, 45, 46, 45, 44,
+ 47, 45, 46, 47, 45, 44, 44, 45, 44, 44, 47, 44, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'ANTE POST headers'),
+
+Fragment(np.array([[ 20, 29, 18, 60, 60, 52, 60, 52, 60, 52, 60, 36, 52,
+ 104, 32, 40, 60, 104, 44, 52, 52, 104, 104, 32, 60, 44,
+ 32],
+ [ 20, 29, 18, 53, 53, 53, 117, 53, 53, 53, 119, 49, 115,
+ 123, 32, 32, 53, 106, 35, 53, 117, 106, 35, 53, 115, 123,
+ 32],
+ [ 20, 47, 47, 46, 46, 46, 44, 46, 46, 46, 44, 46, 44,
+ 44, 47, 47, 46, 45, 47, 46, 44, 45, 47, 46, 44, 44,
+ 47]], dtype=np.uint8), (15,2,27,3), 'MONEY TALKS header'),
+
+Fragment(np.array([[ 20, 29, 18, 108, 60, 104, 52, 120, 108, 108, 48, 124, 36,
+ 124, 36, 124, 108, 52, 124, 108, 48, 120, 108, 108, 48, 32,
+ 32],
+ [ 20, 29, 18, 106, 53, 106, 53, 127, 106, 106, 53, 127, 49,
+ 127, 33, 127, 122, 53, 127, 107, 52, 127, 106, 106, 53, 32,
+ 32],
+ [ 20, 47, 47, 45, 46, 45, 46, 44, 45, 45, 46, 44, 46,
+ 44, 47, 44, 44, 46, 44, 45, 46, 44, 45, 45, 46, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'TIMEFORM header'),
+
+Fragment(np.array([[ 20, 29, 19, 60, 36, 60, 108, 104, 44, 36, 52, 60, 108,
+ 104, 44, 104, 44, 104, 108, 32, 52, 60, 108, 104, 44, 36,
+ 32],
+ [ 20, 29, 19, 119, 49, 53, 106, 106, 114, 53, 53, 53, 106,
+ 106, 115, 106, 115, 106, 35, 53, 53, 53, 106, 106, 114, 53,
+ 32],
+ [ 20, 47, 47, 44, 46, 46, 45, 45, 44, 46, 46, 46, 45,
+ 45, 44, 45, 44, 45, 47, 46, 46, 46, 45, 45, 44, 46,
+ 47]], dtype=np.uint8), (15,2,27,3), 'ENGINEERING header'),
+
+Fragment(np.array([[ 20, 29, 19, 124, 44, 104, 60, 124, 96, 60, 116, 40, 124,
+ 36, 124, 108, 52, 124, 44, 40, 124, 36, 124, 44, 36, 32,
+ 32],
+ [ 20, 29, 19, 127, 112, 106, 117, 127, 106, 53, 127, 32, 127,
+ 32, 127, 107, 53, 127, 112, 32, 127, 32, 115, 123, 53, 32,
+ 32],
+ [ 20, 47, 47, 44, 44, 45, 44, 44, 45, 46, 44, 47, 44,
+ 47, 44, 45, 46, 44, 44, 47, 44, 47, 44, 44, 46, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'CONTACTS header'),
+
+Fragment(np.array([[ 20, 29, 22, 32, 32, 32, 32, 32, 124, 44, 124, 104, 60,
+ 108, 48, 124, 44, 36, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 22, 32, 32, 32, 32, 32, 127, 35, 35, 106, 117,
+ 122, 37, 127, 112, 48, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 47, 47, 47, 47, 47, 47, 47, 44, 47, 47, 45, 44,
+ 44, 47, 44, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,3), 'PDC header'),
+
+
+Fragment(np.array([[ 20, 124, 124, 44, 44, 44, 60, 44, 44, 60, 44, 44, 60,
+ 44, 44, 124, 44, 44, 44, 124, 44, 108, 60, 44, 124, 124, 124],
+ [ 20, 29, 19, 127, 55, 35, 106, 127, 115, 106, 127, 115, 106,
+ 127, 115, 104, 127, 115, 127, 52, 111, 117, 122, 63, 20, 127, 32],
+ [ 20, 29, 19, 127, 117, 112, 106, 127, 112, 106, 127, 112, 106,
+ 127, 32, 106, 127, 32, 127, 53, 127, 53, 106, 127, 20, 127, 32],
+ [ 20, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]],
+ dtype=np.uint8), (15,2,27,4), 'CEEFAX 4 line header, yellow'),
+
+Fragment(np.array([[ 20, 124, 124, 44, 44, 44, 60, 44, 44, 60, 44, 44, 60,
+ 44, 44, 124, 44, 44, 44, 124, 44, 108, 60, 44, 124, 124, 124],
+ [ 20, 29, 18, 127, 55, 35, 106, 127, 115, 106, 127, 115, 106,
+ 127, 115, 104, 127, 115, 127, 52, 111, 117, 122, 63, 20, 127, 32],
+ [ 20, 29, 18, 127, 117, 112, 106, 127, 112, 106, 127, 112, 106,
+ 127, 32, 106, 127, 32, 127, 53, 127, 53, 106, 127, 20, 127, 32],
+ [ 20, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]],
+ dtype=np.uint8), (15,2,27,4), 'CEEFAX 4 line header, green'),
+
+Fragment(np.array([[ 20, 124, 124, 44, 124, 60, 108, 44, 44, 124, 44, 44, 124,
+ 44, 44, 60, 108, 60, 108, 44, 44, 60, 44, 44, 108, 124,
+ 124],
+ [ 4, 29, 19, 127, 104, 106, 53, 127, 115, 104, 119, 123, 52,
+ 107, 55, 106, 117, 122, 53, 127, 115, 106, 119, 123, 37, 22,
+ 32],
+ [ 4, 29, 19, 111, 122, 122, 37, 127, 112, 106, 53, 106, 53,
+ 106, 53, 106, 53, 106, 53, 127, 112, 106, 53, 106, 53, 22,
+ 32],
+ [ 20, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35,
+ 35]], dtype=np.uint8), (15,2,27,4), 'WEATHER index, 4 line'),
+
+Fragment(np.array([[ 20, 124, 124, 124, 44, 44, 44, 60, 44, 108, 44, 60, 60,
+ 108, 44, 44, 108, 124, 124, 124, 124, 124, 124, 124, 124, 124,
+ 124],
+ [ 4, 29, 32, 19, 127, 35, 127, 106, 119, 49, 127, 106, 106,
+ 53, 127, 115, 49, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 4, 29, 32, 19, 47, 32, 47, 42, 45, 36, 43, 46, 46,
+ 33, 44, 46, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32],
+ [ 20, 29, 19, 32, 32, 32, 32, 32, 32, 106, 119, 49, 111,
+ 112, 63, 34, 127, 33, 127, 115, 63, 106, 119, 123, 53, 32,
+ 32],
+ [ 32, 32, 32, 32, 32, 32, 32, 32, 32, 42, 45, 36, 47,
+ 32, 47, 32, 47, 32, 47, 32, 47, 42, 37, 42, 37, 32,
+ 32]], dtype=np.uint8), (15,2,27,5), 'NEWS EXTRA, 5 line'),
+
+Fragment(np.array([[ 4, 29, 18, 32, 32, 112, 112, 112, 96, 112, 112, 48, 112,
+ 112, 96, 48, 112, 112, 112, 96, 112, 112, 48, 32, 32, 32,
+ 32],
+ [ 20, 29, 18, 32, 32, 127, 112, 63, 106, 117, 122, 53, 127,
+ 32, 106, 53, 127, 32, 127, 106, 53, 108, 52, 32, 32, 32,
+ 20],
+ [ 20, 29, 18, 32, 32, 127, 32, 127, 106, 53, 106, 53, 127,
+ 112, 106, 53, 127, 32, 127, 106, 117, 122, 53, 32, 32, 32,
+ 20],
+ [ 20, 47, 47, 47, 47, 44, 47, 44, 45, 46, 45, 46, 44,
+ 44, 45, 46, 44, 47, 44, 45, 44, 44, 46, 47, 47, 47,
+ 47]], dtype=np.uint8), (15,2,27,4), 'RACING index, 4 line'),
+
+],
+]
+
+
+
+
class FragmentBuilder(object):
    """Collect similar page regions so common fragments can be discovered.

    Every page fed to ``get_fragment`` contributes the contents of one fixed
    rectangle.  Regions that agree in more than 85% of their cells are
    grouped together; the largest groups are then reduced to one
    representative each and printed as ``Fragment`` source text.
    """

    def __init__(self, rect):
        """rect -- (x, y, w, h) tuple: the page rectangle to sample."""
        # Unpacked in the body because tuple parameters in a signature are
        # Python 2-only syntax; callers still pass a single tuple.
        x, y, w, h = rect
        self.fragments = []
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        # Number of equal cells a region must exceed to join a group.
        self.t = w*h*0.85

    def get_fragment(self, p):
        """Add page ``p``'s region to the first similar group, or start a new one."""
        f = p.array[self.y:self.y+self.h, self.x:self.x+self.w]
        for of in self.fragments:
            # Each group is compared through its first member only.
            if (f == of[0]).sum() > self.t:
                of.append(f)
                break
        else:
            self.fragments.append([f])

    def top_fragments(self):
        """Return (group size, group index, bitwise mode) tuples, largest first."""
        sorttmp = [(len(u), n, bitwise_mode(u))
                   for n, u in enumerate(self.fragments)]
        # The group index is unique, so tuple comparison never reaches the
        # array element.
        sorttmp.sort(reverse=True)
        return sorttmp

    def build_fraglist(self, outdir):
        """Print the 50 largest groups as Fragment definitions.

        ``outdir`` is accepted but not used by this method.
        """
        top = self.top_fragments()
        for c, n, f in top[:50]:
            print(c)
            # dump() prints by itself and returns None; printing its result
            # added a stray "None" line to the output.
            Fragment(f, (self.x, self.y, self.w, self.h)).dump()
            print('----------')
+ print '----------'
+
+
def main_work_subdirs(gl):
    """Print, in sorted order, the directories directly under ``gl['pwd']``.

    The whole tree is still walked (in sorted directory order), but names
    are only printed while visiting the top-level directory itself.
    """
    for current, subdirs, _files in os.walk(gl['pwd']):
        subdirs.sort()
        if current != gl['pwd']:
            continue
        for name in subdirs:
            print(name)
+
def loadpages(filename):
    """Yield a Page for every complete 42*26-byte record in ``filename``.

    The file name is printed once reading starts.  A trailing record that is
    shorter than a full page is discarded.
    """
    page_bytes = 42*26
    # The context manager closes the file when the generator finishes or is
    # discarded; previously the handle was never closed.
    with open(filename, 'rb') as data:
        print(filename)
        while True:
            p = data.read(page_bytes)
            if len(p) < page_bytes:
                break
            # frombuffer returns a read-only view of the bytes; copy it
            # because Page modifies its array in place.
            a = np.frombuffer(p, dtype=np.uint8).copy()
            yield Page(a)
+
def build():
    """Scan every file under argv[1] and print the most common fragments.

    argv[2] names an output directory, which is created under the current
    directory if it does not exist and then handed to each builder.
    """
    indir, outdir = sys.argv[1], sys.argv[2]

    outpath = os.path.join('.', outdir)
    if not os.path.isdir(outpath):
        os.makedirs(outpath)

    # One builder per page rectangle of interest.
    builders = [FragmentBuilder((15,2,27,4))]

    for root, dirs, files in os.walk(indir):
        # Sorting in place makes the traversal order deterministic.
        dirs.sort()
        files.sort()
        for filename in files:
            for subpage in loadpages(os.path.join('.', root, filename)):
                for builder in builders:
                    builder.get_fragment(subpage)

    for builder in builders:
        builder.build_fraglist(outdir)
+
def dump():
    """Print every fragment defined in the module-level ``fragments`` table."""
    for fl in fragments:
        for f in fl:
            # Fragment.dump() prints by itself and returns None, so its
            # result must not be printed as well.
            f.dump()
+
if __name__ == '__main__':
    # Running the module lists the fragments defined above.  Swap the calls
    # below to search captured pages for new candidate fragments instead.
    #build()
    dump()
View
85 page.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import sys, os
+import numpy as np
+
+from util import subcode_bcd, mrag, page
+from printer import Printer, do_print
+
class Page(object):
    """One stored subpage: 26 packets of 42 bytes held as a (26, 42) array.

    The header fields are decoded with the ``mrag``, ``page`` and
    ``subcode_bcd`` helpers from util (presumably magazine/row, page number
    and subcode/control bits - confirm against util).  Per-row thresholds
    used by ``hamming`` are derived from the amount of non-blank content.
    """

    # Row number expected from mrag() at each of the 26 storage positions.
    rows = np.array([0, 27, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])

    def __init__(self, a):
        """Wrap the flat 26*42 element array ``a``; the data is modified in place."""
        self.array = a.reshape((26,42))
        #do_print(self.array[0])
        (self.m, self.r), _err = mrag(self.array[0][:2])
        self.p, _err = page(self.array[0][2:4])
        (self.s, self.c), self.e = subcode_bcd(self.array[0][4:10])

        # remove parity
        self.array[2:,2:] &= 0x7f

        self._compute_thresholds()

        # Read the subcode's hex digits as a decimal number; a subcode
        # containing the digits a-f has no decimal reading and maps to 1000.
        try:
            self.ds = int("%x" % self.s, 10)
        except ValueError:
            self.ds = 1000
        rows = np.array([mrag(self.array[n][:2])[0][1] for n in range(26)])
        # True where a packet carries the row number expected at its position.
        self.goodrows = (rows == Page.rows)

    def _compute_thresholds(self):
        """Derive no_double_on_prev, threshold and threshold_sum from self.array."""
        # check if row n-1 is double height
        self.no_double_on_prev = ((self.array[1:-1,2:]) != 0x0d).all(axis=1)
        # row 1 can't contain double but might due to not being printable
        self.no_double_on_prev[0] = True

        # calculate a target threshold for each line
        # based on the number of non-blank characters

        # first count non-blanks
        counts = (self.array[2:,2:] != ord(' ')).sum(axis=1)

        # if non-blanks <= 5, don't require a match (set threshold to 0)
        # also ignore rows following a double height row
        counts = counts * ((counts > 5) & (self.no_double_on_prev))

        # Half of the non-blanks must match, rounded down.  Written as
        # integer division because current numpy refuses an in-place
        # "*= 0.5" on an integer array; older numpy truncated to the same
        # values.
        self.threshold = counts // 2

        # sum required threshold for each line to get total threshold
        self.threshold_sum = self.threshold.sum() * 1.5

    def hamming(self, other):
        """Return true if ``other`` is similar enough to be squashed with this page.

        For every row from index 2 onwards, count the cells that are
        non-blank here and identical in ``other``.  Each row must reach its
        threshold and the total must reach ``threshold_sum``.
        """
        # note: no point checking rows 0 and 1. they will always match for
        # all subpages.
        h = ((self.array[2:] != ord(' ')) & (self.array[2:] == other.array[2:])).sum(axis=1)
        return ((h >= self.threshold)).all() and h.sum() >= self.threshold_sum

    def to_html(self, anchor):
        """Render the page as an HTML ``div``; ``anchor`` is set on the header Printer."""
        body = []

        p = Printer(self.array[0][10:])
        p.anchor = anchor
        line = ' <span class="pgnum">P%d%02x</span> ' % (self.m,self.p) + p.string_html()
        body.append(line)

        for i in range(2,26):
            # skip a line if the previous packet contained double height chars
            if self.no_double_on_prev[i-2]:
                p = Printer(self.array[i][2:])
                if i == 25 and self.rows[1] == 27:
                    p.set_fasttext(self.array[1], self.m)
                body.append(p.string_html())

        head = '<div class="subpage" id="%d">' % self.s

        return head + "".join(body) + '</div>'

    def to_str(self):
        """Return the page as a string of 42*26 characters, one per stored byte."""
        return "".join([chr(x) for x in self.array.reshape((42*26))])
+
+
+
View
111 subpagesquash.py
@@ -3,84 +3,10 @@
import sys, os
import numpy as np
-from util import subcode_bcd, mrag, page
+from util import subcode_bcd, mrag, page, bitwise_mode
from printer import Printer, do_print
-
-class Page(object):
- rows = np.array([0, 27, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])
- def __init__(self, a):
-
- self.array = a.reshape((26,42))
- #do_print(self.array[0])
- ((self.m,self.r),e) = mrag(self.array[0][:2])
- (self.p,e) = page(self.array[0][2:4])
- (self.s,self.c),self.e = subcode_bcd(self.array[0][4:10])
-
- # remove parity
- self.array[2:,2:] &= 0x7f
-
- # check if row n-1 is double height
- self.no_double_on_prev = ((self.array[1:-1,2:]) != 0x0d).all(axis=1)
- # row 1 can't contain double but might due to not being printable
- self.no_double_on_prev[0] = True
-
- # calculate a target threshold for each line
- # based on the number of non-blank characters
-
- # first count non-blanks
- self.threshold = ((self.array[2:,2:] != ord(' ')).sum(axis=1))
-
- # if non-blanks <= 5, don't require a match (set threshold to 0)
- # also ignore rows following a double height row
- self.threshold *= ((self.threshold > 5) & (self.no_double_on_prev))
-
- # some proportion of non-blanks must match in the rest of the lines
- self.threshold *= 0.5
-
- # sum required threshold for each line to get total threshold
- self.threshold_sum = self.threshold.sum() * 1.5
-
- try:
- self.ds = int("%x" % self.s, 10)
- except ValueError:
- self.ds = 1000
- rows = np.array([mrag(self.array[n][:2])[0][1] for n in range(26)])
- self.goodrows = (rows == Page.rows)
-
- def hamming(self, other):
- # compute the similarity/difference between two subpages
- # if similar enough to squash them into a single subpage, return true
- # note: no point checking rows 0 and 1. they will always match for
- # all subpages.
-
- h = ((self.array[2:] != ord(' ')) & (self.array[2:] == other.array[2:])).sum(axis=1)
- return ((h >= self.threshold)).all() and h.sum() >= self.threshold_sum
- #return h.sum() < 200
-
- def to_html(self, anchor):
- body = []
-
- p = Printer(self.array[0][10:])
- p.anchor = anchor
- line = ' <span class="pgnum">P%d%02x</span> ' % (self.m,self.p) + p.string_html()
- body.append(line)
-
- i = 2
- for i in range(2,26):
- if self.no_double_on_prev[i-2]:
- p = Printer(self.array[i][2:])
- if i == 25 and self.rows[1] == 27:
- p.set_fasttext(self.array[1], self.m)
- body.append(p.string_html())
- # skip a line if this packet contained double height chars
-
- head = '<div class="subpage" id="%d">' % self.s
-
- return head + "".join(body) + '</div>'
-
- def to_str(self):
- return "".join([chr(x) for x in self.array.reshape((42*26))])
-
+from page import Page
+from fragment import fragments
class Squasher(object):
def __init__(self, filename):
@@ -94,8 +20,13 @@ def __init__(self, filename):
if len(p) < (42*26):
done = True
else:
- a = np.fromstring(p, dtype=np.uint8)
- self.pages.append(Page(a))
+ p = Page(np.fromstring(p, dtype=np.uint8))
+ for flist in fragments:
+ tmp = [(f.test(p),n,f) for n,f in enumerate(flist)]
+ ans = max(tmp)
+ if ans[0] > 0:
+ ans[2].fix(p)
+ self.pages.append(p)
self.page_count += 1
print "%5d" % self.page_count,
@@ -174,28 +105,8 @@ def hamming(self):
return unique_pages
- def squash1(self, pages):
- ans = np.array([x.array for x in pages])
- s = pages[0].array.shape
-
- auni = np.unique(ans)
- mode = np.zeros(s, dtype=np.uint8)
- counts = np.zeros(s)
- for k in auni:
- count = (ans==k).sum(0)
- mode[count>counts] = k
- counts[count>counts] = count[count>counts]
-
- return mode
-
def squash(self, pages):
- _le = np.arange(0, 8, 1)
- ans = np.array([x.array for x in pages])
- a = ans.transpose()[np.newaxis].transpose()
- b = (1&(a>>_le)).sum(0) > (len(pages)/2)
- c = (b<<_le).sum(-1)
- #exit(0)
- return c
+ return bitwise_mode([x.array for x in pages])
def to_str(self):
return "".join([p.to_str() for p in self.squashed_pages])
View
8 util.py
@@ -159,3 +159,11 @@ def sethalfbyte(a, n, v):
def normalise(a):
return (a-a.mean())/a.std()
+
+
def bitwise_mode(fragments):
    """Combine equally shaped integer arrays by a per-bit majority vote.

    For each element position and each of the low 8 bits, the output bit is
    set only when it is set in more than half of the inputs (a tie leaves
    the bit clear).  Returns an integer array with the shape of one input.
    """
    # Bit positions 0..7.  Defined locally so the function does not depend
    # on a module-level name.
    bit_index = np.arange(0, 8, 1)
    ans = np.array(fragments)
    # Add a trailing axis so each value broadcasts against the 8 bit positions.
    bits = 1 & (ans[..., np.newaxis] >> bit_index)
    # Comparing against a float half keeps the result the same under both
    # integer and true division.
    majority = bits.sum(0) > (len(fragments) / 2.0)
    # Reassemble the winning bits into values.
    return (majority << bit_index).sum(-1)

0 comments on commit fa5748b

Please sign in to comment.