In [1]:
%load_ext Cython

In [2]:
%%cython

from libc.stdint cimport *

# C-level declarations used by murmur_hash below.
cdef extern from "Python.h":
    # NOTE(review): PyString_FromStringAndSize is never referenced in this cell
    # and looks like the Python 2 spelling of the API; presumably removable, confirm.
    object PyString_FromStringAndSize(char *, Py_ssize_t)
    # Used below to obtain a raw char pointer to the contents of a bytes object.
    char *PyBytes_AsString(object)

# NOTE(review): absolute, machine-specific header path; this cell will only
# compile on a machine that has this exact directory layout.
cdef extern from "/Users/bob/code/podcast/rss_extract/lib/histogram/murmur3.h":
    # The length parameter is a C int; the caller below passes a 2 x uint64_t
    # buffer as `out` and reads both words back afterwards.
    cdef void MurmurHash3_x64_128(const void * key, const int _len, uint32_t seed, void * out)
    
def murmur_hash(bytes s):
    """Hash a bytes object down to one unsigned 64-bit integer.

    Runs ``MurmurHash3_x64_128`` over the raw bytes of ``s`` with the fixed
    seed 42 and XORs the two 64-bit output words together.

    Parameters
    ----------
    s : bytes
        The data to hash.

    Returns
    -------
    int
        ``res[0] ^ res[1]`` of the two 64-bit words written by the C routine.

    Raises
    ------
    OverflowError
        If ``s`` is too long for the C routine's ``int`` length argument.
    """
    cdef Py_ssize_t length = len(s)
    cdef char *sb
    cdef uint64_t[2] res

    # The C routine takes its length as a plain `int`, while len() yields a
    # Py_ssize_t. Without this guard a larger input would be silently
    # truncated and only part of the data hashed.
    # (assumes a 32-bit C int, which holds on the usual platforms)
    if length > 2147483647:
        raise OverflowError(
            "murmur_hash: input of %d bytes is too long for MurmurHash3_x64_128" % length)

    sb = PyBytes_AsString(s)
    MurmurHash3_x64_128(sb, length, 42, res)

    return res[0] ^ res[1]
    
    

In [3]:
# Smoke test: hash a known token to confirm the compiled cell loads and runs.
murmur_hash("test".encode('utf-8'))

14466693972460839356

In [38]:
import re
# A literal backslash-u escape followed by exactly four lowercase hex digits.
entity_re = re.compile(r'\\u([0-9a-f]{4})')
# A high-surrogate escape (d800-dbff) immediately followed by a low-surrogate
# escape (dc00-dfff): together they encode one code point above U+FFFF.
_surrogate_pair_re = re.compile(r'\\u(d[89ab][0-9a-f]{2})\\u(d[c-f][0-9a-f]{2})')


def _join_surrogate_pair(match):
    """Return the single character encoded by a matched surrogate-pair escape."""
    high = int(match.group(1), 16)
    low = int(match.group(2), 16)
    return chr(0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))


def decode_entities(v):
    """Replace literal ``\\uXXXX`` escape sequences in ``v`` with their characters.

    Parameters
    ----------
    v : str
        Text that may contain backslash-u escapes with four lowercase hex digits.

    Returns
    -------
    str
        ``v`` with every escape replaced by the character it names. A
        high/low surrogate pair is merged into the one code point it encodes,
        so the result can be encoded as UTF-8. Unpaired surrogate escapes are
        still converted one-to-one, as before.
    """
    # Pairs must be merged first; decoding the halves separately would leave
    # two lone surrogates that str.encode('utf-8') refuses to encode.
    v = _surrogate_pair_re.sub(_join_surrogate_pair, v)
    return entity_re.sub(lambda match: chr(int(match.group(1), 16)), v)

In [27]:
# Leftover interactive help lookup for the built-in chr; it only displays
# documentation and nothing later depends on it.
? chr

In [4]:
import pickle

In [5]:
import numpy as np

In [42]:
# Load the pickled token distributions from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code, so only run this
# against a file you produced yourself.
# Presumably `dists` is a list of {token: count} dicts, judging by how later
# cells call len() and .items() on its elements; confirm against the producer.
with open('./token_dists.pickle', 'rb') as inf:
    dists = pickle.load(inf)

In [41]:
def fix_keys(dist):
    """Return a copy of ``dist`` whose keys have been passed through decode_entities.

    Values are carried over unchanged. If two keys decode to the same string,
    the one that comes later in iteration order wins.
    """
    return {decode_entities(key): value for key, value in dist.items()}

In [43]:
# Decode the escaped keys of every distribution and drop the empty ones.
# NOTE: this rebinds `dists`; re-running the cell without reloading the pickle
# runs decode_entities over already-decoded keys.
dists = [fix_keys(v) for v in dists if len(v) > 0]

In [122]:
# Inspect how many distributions `dists` currently holds.
len(dists)

1392

In [62]:
# Fold keys that differ only by letter case into one entry per distribution,
# adding their counts together.
dists_lowercase = []
for token_counts in dists:
    folded = {}
    for token, count in token_counts.items():
        lowered = token.lower()
        if lowered not in folded:
            # First time this lowercase form is seen: store the count as-is.
            folded[lowered] = count
        else:
            folded[lowered] += count
    dists_lowercase.append(folded)

In [63]:
# From here on `dists` refers to the case-folded distributions; the mixed-case
# originals are no longer reachable through this name.
dists = dists_lowercase

In [64]:
# For every distribution build two parallel uint64 arrays: the murmur hash of
# each UTF-8 encoded token, and that token's count at the same position.
dist_hashes = []
for token_counts in dists:
    n_tokens = len(token_counts)
    hashes = np.empty((n_tokens,), dtype=np.uint64)
    counts = np.zeros((n_tokens,), dtype=np.uint64)
    for pos, token in enumerate(token_counts):
        hashes[pos] = murmur_hash(token.encode('utf-8'))
        counts[pos] = token_counts[token]
    dist_hashes.append((hashes, counts))

In [65]:
# Peek at the (hash array, count array) pair built for the first distribution.
dist_hashes[0]

(array([ 6542722928290085166, 11955699455548345443,  6921773163938638153,
        ...,  9039886265152516753,   561218049625248724,
        12685494650715651576], dtype=uint64),
 array([1800, 9761, 4822, ...,    1,    1,    1], dtype=uint64))

In [66]:
def _reorder_by_hash(hashes, counts):
    """Return both arrays reordered so that the hashes are ascending."""
    order = np.argsort(hashes)
    return hashes[order], counts[order]


# Sort every distribution by hash value, keeping counts aligned with hashes.
sorted_dist_hashes = [_reorder_by_hash(h, c) for h, c in dist_hashes]

In [67]:
# Peek at the second distribution to check that its hashes are now ascending.
sorted_dist_hashes[1]

(array([   72070893150096027,   139428885052208584,   140570187466950067,
          147530717713822329,   159234690102437018,   163954680460945382,
          177046348584101622,   227446426996025206,   237188749443315877,
          276287089699395670,   325354432707452096,   386361036130087665,
          428938599612749114,   451521113658592875,   465474334209975165,
          507229187383669329,   545121197812336463,   550120577644887867,
          551114696100349979,   561977573161684396,   562056088709496250,
          568786754235137239,   667474728443831911,   693911418889159014,
          706066105445191250,   707984191652811888,   812049558892549454,
          854498940094406552,   867617679497147089,   978838221826738503,
          983711964283762621,   994827356953513381,  1023335688996444883,
         1041471357125997668,  1043919380356768157,  1066840804797798782,
         1070461328702153130,  1073030030509327534,  1078386024212994099,
         1106858698018407619,  1228670

In [123]:
# Total token count of each distribution, in the same order as sorted_dist_hashes.
sums = [np.sum(counts) for _hashes, counts in sorted_dist_hashes]

In [129]:
# Keep only distributions whose total token count exceeds the threshold.
# The total is computed from each entry's own count array rather than looked
# up by position in the separately built `sums` list: once this cell has
# shrunk sorted_dist_hashes, positions no longer line up with `sums`, so a
# positional lookup would filter the wrong entries on a re-run.
MIN_TOTAL_COUNT = 3000
sorted_dist_hashes = [
    pair for pair in sorted_dist_hashes
    if np.sum(pair[1]) > MIN_TOTAL_COUNT
]

In [130]:
# Count the distributions left in sorted_dist_hashes.
len(sorted_dist_hashes)

647

In [131]:
# Number of (hash, count) entries in each remaining distribution.
lengths = np.array([hashes.shape[0] for hashes, _counts in sorted_dist_hashes])

In [132]:
# Display the per-distribution entry counts.
lengths

array([ 65763,  80456, 209060,   7181,   1003,  21924,   4678,   5931,
        17797,  95146,   6536, 107891,   3743,   5480,   1996,    564,
         1679,   2960,    483,   2582,    176,  79239,  26751,  71454,
         8942,   1213,  67526,   3330, 194490, 347654,    841,  49352,
         3051,   3292,  30897,  49413,   3338,  41682,   2335,    442,
         2387,   1095,    421,   9125,   2335,   1533,   5280,  12564,
          656,    593,  17610,  14084,   1470,  55933,   1964,   1130,
          567,   4310,   9174,  88444,  40780,   8045,   3980,   5922,
        13916,   1667,   9160,   2183, 140202,  11876,   1524,  19660,
         8279,   2585,   3440, 239608,  67486,   1622,    809,   6844,
        33794,   2965,  86935,  19985,   1366,  20877,   5600,    394,
         6034,   8267,   7229,   1395,  45430,   8983,  55090,  24555,
          803,  56510,   4125,   1218,   2573,  14700,   3054,    402,
         2143,    951,   4348, 113345, 187833,  34612,   1090,   3221,
      

In [133]:
# Allocate one flat uint64 buffer for the whole export. Its size is
# 1 + len(lengths) + 2 * sum(lengths): the following cells store the number of
# distributions in slot 0, the per-distribution lengths next, and then each
# distribution's hashes followed by its counts.
# np.empty leaves the contents uninitialised until those cells fill every slot.
res = np.empty((1 + lengths.shape[0] + np.sum(lengths)*2,), dtype=np.uint64)

In [134]:
# Slot 0: how many length entries (i.e. distributions) follow.
res[0] = lengths.shape[0]

In [135]:
# Slots 1 .. len(lengths): the entry count of each distribution, in order.
res[1:(1 + lengths.shape[0])] = lengths

In [136]:
# Payload starts right after the header (1 count slot + one slot per length).
# Each distribution is written as its hash array followed by its count array.
offset = 1 + lengths.shape[0]
for hashes, counts in sorted_dist_hashes:
    for chunk in (hashes, counts):
        end = offset + chunk.shape[0]
        res[offset:end] = chunk
        offset = end

In [137]:
# Spot-check ten values near the end of the buffer.
res[-1000:-990]

array([1, 2, 1, 1, 1, 2, 3, 2, 1, 1], dtype=uint64)

In [138]:
# Dump the buffer to token_dists.bin in the current working directory.
# NOTE(review): ndarray.tofile presumably writes the raw uint64 values with no
# header and in this machine's byte order, so the reader must agree on both;
# confirm against the consumer.
res.tofile('token_dists.bin')

In [139]:
# Shell check of the size of the file just written (needs a Unix-like `ls`).
!ls -lh token_dists.bin

-rw-r--r--  1 bob  staff   245M Apr 17 09:07 token_dists.bin


In [106]:
# Read back the header slot holding the number of distributions.
# NOTE(review): this cell's execution count (106) is lower than those of the
# cells that build `res` (133-136), so the recorded output is from an earlier
# state; re-run to refresh it.
res[0]

1392

In [79]:
# Spot-check a slice near the end of the buffer.
# NOTE(review): executed as cell 79, before the current `res` was built
# (cells 133-136); the recorded output may not match the exported file.
res[-712:-700]

array([17099400986748975806, 17126617102872912932, 17205451217589082872,
       17217264110123707600, 17239752794746872376, 17258088682278238853,
       17293087393332838770, 17310626912190217444, 17316995750203367552,
       17332563090086013606, 17336924912161307323, 17374192831438801270],
      dtype=uint64)

In [82]:
# For every distribution collect sample (token, count) items whose count is
# above 100, stopping once more than 20 have been gathered (at most 21 each).
test_values = []
for dist in dists:
    picked = []
    for item in dist.items():
        if item[1] > 100:
            picked.append(item)
            # The sample can only grow on an append, so the limit is checked
            # here rather than on every item.
            if len(picked) > 20:
                break
    test_values.append(picked)

In [87]:
# Pair every sample list with its position in `dists`.
# Run once per rebuild of test_values: re-running wraps the pairs again.
test_values = list(enumerate(test_values))

In [121]:
def _hex_escape(text):
    """Return ``text`` as a run of ``\\xNN`` escapes, one per UTF-8 byte.

    Every byte is written with exactly two hex digits. Without the zero
    padding a byte below 0x10 would come out as a one-digit escape such as
    ``\\xa``, which is malformed or ambiguous inside a string literal.
    """
    return ''.join('\\x%02x' % byte for byte in text.encode('utf-8'))


# Print the last ten non-empty samples as `(index, [("escaped token", count);...]);`.
for c, x in test_values[-10:]:
    if len(x) < 1: continue
    print("(%d, [%s]);" % (c, ';'.join('("%s", %d)' % (_hex_escape(token), count) for token, count in x)))

(1382, [("\x61\x6d\x61\x74\x65\x75\x72", 125);("\xe4\xb8\x8a\xe4\xb8\x80\xe4\xb8\x96\xef\xbc\x8c\xe8\x8b\x8d\xe5\x9b\xbd\xe4\xb8\x9e\xe7\x9b\xb8\xe4\xb9\x8b\xe5\xa5\xb3\xe4\xb8\x8a\xe5\xae\x98\xe6\xb2\xab\xef\xbc\x8c\xe7\xbb\x9d\xe7\xbe\x8e\xe5\x80\xbe\xe5\x9f\x8e\xef\xbc\x8c\xe6\x80\xa7\xe5\xad\x90\xe5\x8d\xb4\xe5\xa4\xaa\xe8\xbf\x87\xe8\xbd\xaf\xe5\xbc\xb1\xef\xbc\x8c\xe8\xa2\xab\xe5\xad\xaa\xe7\x94\x9f\xe5\xa6\xb9\xe5\xa6\xb9\xe6\x8a\xa2\xe5\xb0\xbd\xe4\xba\x86\xe6\x89\x80\xe6\x9c\x89\xe9\xa3\x8e\xe5\xa4\xb4\xef\xbc\x8c\xe5\x8d\xb4\xe5\x9c\xa8\xe9\x93\xb6\xe6\x9c\x88\xe5\x9b\xbd\xe8\xa6\x81\xe6\xb1\x82\xe8\x81\x94\xe5\xa7\xbb\xe4\xb9\x8b\xe6\x97\xb6\xef\xbc\x8c\xe8\xa2\xab\xe4\xba\xb2\xe4\xba\xba\xe6\xaf\xab\xe4\xb8\x8d\xe7\x95\x99\xe6\x83\x85\xe5\x9c\xb0\xe6\x8e\xa8\xe4\xba\x86\xe5\x87\xba\xe5\x8e\xbb\xef\xbc\x8c\xe5\x8f\xaa\xe4\xb8\xba\xe4\xbf\x9d\xe4\xbd\x8f\xe5\xa5\xb9\xe9\x82\xa3\xe4\xb8\xaa\xe5\xa6\x82\xe7\x8f\xa0\xe5\xa6\x82\xe5\xae\x9d\xe7\x9a\x84\xe5\xa6\xb9\xe5\xa6\xb9\xe3

In [111]:
# Show (position, total count) for the first 100 distributions.
list(enumerate(
    np.sum(counts) for _hashes, counts in sorted_dist_hashes[:100]
))

[(0, 705623),
 (1, 1321),
 (2, 129561),
 (3, 392),
 (4, 47159),
 (5, 926708),
 (6, 73177),
 (7, 2202332),
 (8, 33018),
 (9, 2312),
 (10, 89386),
 (11, 21),
 (12, 108),
 (13, 15006),
 (14, 612),
 (15, 326067),
 (16, 10711),
 (17, 7680),
 (18, 12617),
 (19, 24666),
 (20, 91488),
 (21, 67381),
 (22, 87777),
 (23, 529),
 (24, 39184),
 (25, 889444),
 (26, 23638),
 (27, 1021532),
 (28, 1510),
 (29, 12101),
 (30, 20961),
 (31, 8578),
 (32, 1509),
 (33, 629),
 (34, 38939),
 (35, 127),
 (36, 5850),
 (37, 3125),
 (38, 12910),
 (39, 17945),
 (40, 45189),
 (41, 1959),
 (42, 6721),
 (43, 7723),
 (44, 2489),
 (45, 11609),
 (46, 883431),
 (47, 116486),
 (48, 3951997),
 (49, 568776),
 (50, 4884),
 (51, 1606),
 (52, 37133),
 (53, 17438),
 (54, 5812),
 (55, 751943),
 (56, 9716),
 (57, 2762318),
 (58, 184930),
 (59, 3849845),
 (60, 15),
 (61, 8309),
 (62, 2421),
 (63, 392),
 (64, 313417),
 (65, 35599),
 (66, 9093),
 (67, 10024),
 (68, 128859),
 (69, 1463),
 (70, 402937),
 (71, 9402),
 (72, 8789),
 (73, 2

In [109]:
# NOTE(review): hard-coded values with no visible origin in this notebook;
# presumably sizes copied from another run. Document or derive them.
l = 28644063
r = 458137848

In [110]:
# Ratio of the two hard-coded values r and l.
r / l

15.994164235709158

In [113]:
# Check that hashing the token "die" gives this hard-coded expected value.
# NOTE(review): where the expected value comes from is not shown here.
murmur_hash("die".encode('utf-8')) == 3485591795621796808

True

In [114]:
# NOTE(review): ad-hoc arithmetic on literals. 4125 matches the entry count
# printed for sorted_dist_hashes[18] in a nearby cell; the meaning of 740 is
# not shown.
4125 / 740

5.574324324324325

In [116]:
# Shape of the hash array of the distribution at position 18.
sorted_dist_hashes[18][0].shape

(4125,)

In [117]:
# Number of distributions in sorted_dist_hashes.
# NOTE(review): executed as cell 117, before the total-count filter (cell 129);
# the recorded output predates that filtering.
len(sorted_dist_hashes)

1392

In [140]:
v = '0.00154463577359,0.00154463368465,0.00154462982294,0.00154481413587,0.00154729793775,0.00154469613819,0.00154503871905,0.00154487749164,0.00154469740519,0.00154463396398,0.001544888384,0.00154463306577,0.00154513754307,0.00154492176201,0.00154534718628,0.00154871806457,0.00154568294705,0.00154510555428,0.00154777891423,0.00154542690201,0.00154710809631,0.00154463401126,0.00154468005771,0.00154463788029,0.00154479340051,0.00154568984944,0.00154463523421,0.00154526284636,0.00154462925421,0.00154462862237,0.00154717774778,0.0015446467308,0.0015453064037,0.00154524331216,0.00154467496486,0.00154464235114,0.00154528407841,0.00154465197127,0.00154575053576,0.00154906161509,0.00154544425343,0.00154655644435,0.00154919607114,0.00154479007647,0.00154593153218,0.00154583605966,0.00154489633622,0.00154475957151,0.00154792529873,0.00154705352966,0.00154469182912,0.00154471868506,0.00154622963923,0.00154464038004,0.00154567311406,0.00154576586046,0.00154880097708,0.00154506071114,0.00154472117588,0.00154463509591,0.00154464433714,0.00154482031183,0.0015449469941,0.00154488454923,0.00154474606562,0.00154586117634,0.00154479992875,0.00154602163577,0.00154463230756,0.00154475173947,0.00154598694874,0.00154466658521,0.00154480590636,0.00154514196827,0.00154506424054,0.00154462886578,0.00154463992985,0.00154511446414,0.00154693307581,0.00154479844154,0.00154465534418,0.00154523873582,0.00154464190077,0.00154467316115,0.00154594236321,0.00154466974127,0.00154495443807,0.00154698500115,0.00154486785511,0.00154477502256,0.00154484637606,0.00154612004173,0.00154464515596,0.00154478233347,0.00154464311583,0.00154468482148,0.00154701139054,0.0015446415679,0.00154492643273,0.00154587639977,0.00154523066744,0.00154471237222,0.00154512351251,0.00154915256439,0.00154548332988,0.00154581467817,0.00154494902863,0.00154463273544,0.00154463158146,0.00154466855083,0.00154700863324,0.00154544914689,0.00154466677464,0.00154661351604,0.00154468746651,0.0015448931461,0.00154464950515,0.00154509582
324,0.00154474091493,0.00154463996934,0.00154465994492,0.00154463062004,0.00154924042253,0.00154515083073,0.00154471987678,0.00154580945044,0.00154468679395,0.00154530209827,0.0015454502423,0.00154679320451,0.00154510056618,0.00154879534516,0.00154546463932,0.00154478429848,0.00155023237029,0.00154492296204,0.00154463597452,0.00154463464495,0.0015450925032,0.00154497828365,0.00154528275344,0.00154486458618,0.00154469714474,0.00154640283335,0.0015446391246,0.00154476660301,0.00154463398705,0.00154490184689,0.00154477569927,0.00154554517448,0.00154462919468,0.00154512555566,0.00154480535387,0.00154475548684,0.00154476701735,0.00154513186975,0.0015455917687,0.00154471202929,0.00154484411005,0.00154925078001,0.00154609800316,0.0015465679606,0.00154494679537,0.00154880097708,0.00154687681515,0.00154463037164,0.00154516888494,0.00154480331837,0.00154476634804,0.00154560304654,0.00154484223466,0.00154464903923,0.00154719898395,0.00154517290649,0.00154571742258,0.00154779345211,0.00154555329581,0.00154570168077,0.00154531304198,0.00154463786355,0.00154547837413,0.00154464109116,0.00154479132532,0.00154495183885,0.00154529821144,0.00154469536137,0.00154998448622,0.00154467832283,0.00154537166277,0.00154469928139,0.00154576355606,0.00154474893387,0.00154612948148,0.0015446871301,0.00154475755235,0.00154463122205,0.00154506644838,0.00154801760889,0.00154630175013,0.00154526633333,0.00154476151344,0.0015453851864,0.00154610257008,0.00155000313301,0.00154856676189,0.0015496986379,0.00154523452548,0.00154463442042,0.00154518650026,0.00154466665121,0.00154510235134,0.00154503915845,0.00154464099985,0.00154521762646,0.00154969447829,0.00154479798616,0.00154562673561,0.00154474099267,0.0015447216576,0.00154644998932,0.00154491151522,0.00154500634051,0.00154912947911,0.00154485515616,0.00154537067687,0.00154831512652,0.00154476732892,0.00154482510851,0.00154598366406,0.0015446683052,0.00155011292245,0.00154463090513,0.0015480705204,0.00154488364003,0.00154611896015,0.00155042706401,0
.00154476187158,0.00154995216371,0.00154481759309,0.00154519125193,0.00154471652964,0.00154585626549,0.00154462874273,0.00154568547958,0.0015448795597,0.00154492114463,0.00154467926163,0.00154468806076,0.00154549317359,0.00154522861026,0.00154468254904,0.00154578225664,0.00154472149119,0.00154532474009,0.00154579074265,0.00154578899178,0.00154570845237,0.00154567046404,0.00154501484068,0.00154573165394,0.00154480844884,0.00154503022883,0.00154605582193,0.00154484001626,0.00154924732235,0.00154508366154,0.00154590034589,0.00154463287591,0.0015447127786,0.00154607456803,0.00154661096454,0.00154463447154,0.00154463885821,0.00154790430452,0.00154464957337,0.00154482328991,0.00154491494909,0.00154547335946,0.00154524989806,0.00154869382934,0.0015448621541,0.00154721189623,0.00154462881739,0.00154671841361,0.00154479247374,0.00154490606715,0.00154886685522,0.00154492727667,0.00154802320008,0.00154668223813,0.00154486666872,0.00154479145651,0.00154473590476,0.00154563851962,0.00154763615218,0.00154966555095,0.00154530940511,0.00154549805798,0.0015466774637,0.00154462966209,0.00154474732668,0.00154471751957,0.00154530224583,0.00154820409368,0.00154497060677,0.00154462847952,0.0015460346629,0.00154481885344,0.00154543824454,0.00154469785875,0.00154483026291,0.00154518038693,0.00154483835512,0.00154545676508,0.00154465884034,0.00154681004263,0.00154494334092,0.00154500527228,0.00154465755848,0.00154462803329,0.00154713937146,0.00154463078973,0.00154864098794,0.00154520796369,0.00154479347223,0.0015474224054,0.00154532766763,0.00154744150609,0.00154556411345,0.00154566328868,0.00154549414608,0.00154463067254,0.00154465353411,0.00155002662483,0.00154463960289,0.00154479510701,0.00154492212804,0.00154559312639,0.00154473810129,0.00154570902053,0.00154596255009,0.00154521655573,0.00154647341488,0.00154476973047,0.00154744278875,0.0015474376651,0.00154757724115,0.00154685571506,0.00154858442606,0.00154596110822,0.00154462964872,0.00154589798848,0.00154784122245,0.00154476426202,0.
00154463163327,0.0015447287762,0.00154686216512,0.00154487493878,0.00154534417695,0.00154512097213,0.00154463035066,0.00154463424851,0.00154462985,0.00154464320109,0.00154530446691,0.00154693393684,0.00154474286688,0.00154558104303,0.00154485772887,0.00154576001301,0.00154598904729,0.00154469749592,0.00154871806457,0.00154464588633,0.00154618912039,0.00154464966796,0.00154521375647,0.0015447618863,0.00154693825166,0.00154465498173,0.00154508383035,0.00154463221592,0.00154546851812,0.00154510319392,0.00154914594447,0.00154493958036,0.00154484829294,0.0015447063153,0.00154680159097,0.00154495129331,0.00154555664065,0.00154512351251,0.0015447091422,0.00154486376689,0.00154462931871,0.00154466500514,0.00154496761413,0.00154934253167,0.00154641620371,0.00154483120327,0.00154827161547,0.00154480209442,0.00154498813842,0.0015498398428,0.00154469368803,0.00154463497064,0.00154660019342,0.00154644302374,0.00154596660396,0.00154601070433,0.00154499273412,0.001544634458,0.00154474921115,0.00154477792169,0.00154904891989,0.00154504444901,0.00154463156519,0.00154588274794,0.00154479741092,0.00154467851998,0.00154464886101,0.001547920025,0.00154552961534,0.00154492783219,0.00154464631218,0.00154476840779,0.00154463010614,0.00154582388083,0.00154594488827,0.00154834173542,0.00154577028581,0.00154463958346,0.0015454087855,0.00154652561385,0.00154463578976,0.00154796629881,0.00154473213663,0.00154586019103,0.00154477255083,0.00154474329202,0.00154465905818,0.00154475743387,0.00154600300049,0.00154506741938,0.00154511678251,0.0015456770252,0.00154515499358,0.00154711709569,0.00154612148627,0.00154488170419,0.00154465681667,0.00154527207354,0.00154512652297,0.00154493576923,0.00154481237321,0.00154492202941,0.00154475178481,0.00154886976669,0.00154463166921,0.00154545520797,0.00154532639866,0.0015486991901,0.00154463984276,0.00154509883041,0.00154464152901,0.00154583653303,0.00154822073849,0.0015449153522,0.00154518207718,0.00154500194403,0.00154695567357,0.0015442178662,0.00154484597
959,0.00154542856233,0.00154576481185,0.00154675808827,0.00154473307129,0.00154469405915,0.0015453012143,0.00154478729736,0.00154473873803,0.00154506874179,0.0015454450108,0.00154466918924,0.00154915920372,0.00154634267382,0.00154558089574,0.00154464072613,0.00154651228935,0.00154464497404,0.00154539805396,0.00154483575617,0.00154485102867,0.00154507632269,0.00154515762342,0.00154471797121,0.00155027851531,0.00154734135301,0.00154606748088,0.00154571377847,0.00155004081872,0.0015452000965,0.00154489383566,0.00154519290562,0.00154530827648,0.00154607084704,0.00154546113379,0.0015449400713,0.0015450858318,0.00154524927074,0.00154480823598,0.00154471908941,0.00154763908613,0.00154557635225,0.00154481913977,0.00154463036675,0.00154467220375,0.00154573741113,0.00154504999294,0.00154540085448,0.00154493958036,0.00154600055334,0.00154780321877,0.00154578594041,0.00154845686679,0.00154473987676,0.00154463343568,0.00154470255839,0.00154502268892,0.00154569907021,0.0015448877223,0.00154473179614,0.00154464046181,0.00154510823782,0.00154496169192,0.00154463252258,0.00154466808183,0.00154468438087,0.00154599145357,0.00154489548196,0.00154462829438,0.00154468253357,0.00154465382019,0.00154552281044,0.00154559707015,0.00154463206003,0.00154836646645,0.001544762205,0.00154512563613,0.00154808981982,0.00154582643662,0.00154466470759,0.00154467142272,0.0015505664543,0.00154605880188,0.00154503534517,0.00154518579188,0.00154466474234,0.00154473538719,0.00154520616666,0.00154497590343,0.00154485201575,0.00154856927568,0.00154693738741,0.00154475866489,0.00154766575142,0.00154463142711,0.00154469102608,0.00154476886874,0.0015449559196,0.00154719470847,0.00154464405066,0.00154466576972,0.00154973216228,0.00154522040587,0.00154881796451,0.00154472294478,0.0015448018863,0.00154703931876,0.00154463142377,0.00154504817652,0.00154577198072,0.00154927512929,0.00154490517519,0.0015468373768,0.00154512555566,0.00154503526423,0.00154462954146,0.00154497796441,0.00154467131352,0.00154521208964,0.
0015446290693,0.00154462934109,0.00154654031228,0.00154481434578,0.00154548714459,0.00154474934179,0.00154546384517,0.00154463326037,0.0015451079756,0.00154462874177,0.00154673626285,0.00155042706401,0.0015450116298,0.00154467484571,0.00154700222428,0.00154720542392,0.00154476550172,0.00154464381623,0.00154467036747,0.00154476734804,0.00154481046898,0.00154468645581,0.00154566242038,0.00154523238264,0.00154687599622,0.00154523309524,0.00154488109673,0.00154464781186,0.00154472977471,0.00154467449181,0.00154531472187,0.00154463798255,0.00154513978867,0.00154462871493,0.00154658146815,0.00154687191052,0.00154467857241,0.00154749907142,0.00154465627495,0.0015453421019,0.00154736056741,0.00154512547522,0.00154462946306,0.00154465159124,0.00154857683634,0.00154774708093,0.00154465660306,0.001544731206,0.00154618047968,0.00154466092357,0.00154712414047,0.00154572478462,0.00154516097088,0.0015448479448,0.001544830732,0.00154665126422'.split(',')

In [141]:
# Number of comma-separated values parsed into `v` by the previous cell.
len(v)

647

In [None]:
# Memory-map the word-vector file and view it as rows of 647 columns.
# NOTE(review): per the NumPy docs np.memmap presumably defaults to
# dtype=uint8 and maps the file's raw bytes without skipping any .npy header.
# If this is a real .npy file, np.load(path, mmap_mode='r') is probably what
# is wanted; confirm the file format and element type.
# NOTE(review): absolute, machine-specific path; this cell has no recorded
# execution count.
topic_vecs = np.memmap('/mnt/lappy2/word_vectors.npy').reshape((-1, 647))