In [36]:
import os, sys
cwd = os.getcwd()
# The project root is the working directory cut right after the first
# occurrence of 'pygents'.
root_marker = 'pygents'
marker_at = cwd.find(root_marker)
if marker_at < 0:
    # str.find returns -1 when the marker is absent; slicing with it would
    # silently produce a meaningless 6-character prefix to chdir into.
    raise RuntimeError(
        f"'{root_marker}' not found in the current working directory {cwd!r}; "
        "start the notebook from inside the project tree"
    )
project_path = cwd[:marker_at + len(root_marker)]
if project_path not in sys.path:
    sys.path.append(project_path)
# Relative paths used later in the notebook are resolved from the project root.
os.chdir(project_path)

#https://stackoverflow.com/questions/41827983/right-way-to-calculate-the-cosine-similarity-of-two-word-frequency-dictionaries
#https://realpython.com/python-counter/

#from scipy.spatial.distance import cosine
#from sklearn.metrics.pairwise import cosine_similarity

from treelib import Node, Tree

from copy import deepcopy as dcp
import pandas as pd

# Force re-import: forget any cached pygents submodules so that the imports
# that follow load them again.
sys.modules.pop('pygents.util', None)
sys.modules.pop('pygents.text', None)
sys.modules.pop('pygents.plot', None)
sys.modules.pop('pygents.token', None)

from pygents.token import *
from pygents.text import *
from pygents.util import *
from pygents.plot import plot_bars, plot_dict, matrix_plot


In [2]:
import math

def cosine_dic_parts(dic1,dic2):
    """Return the three sums a cosine similarity of two sparse vectors needs.

    Both arguments are dicts mapping a key to a numeric weight; a key missing
    from ``dic2`` counts as 0.0.

    Returns a tuple ``(dot, sq1, sq2)``: the dot product over the keys of
    ``dic1``, the sum of squared weights of ``dic1`` and the sum of squared
    weights of ``dic2``.
    """
    def squared_norm(vec):
        # Plain running total, accumulated in dict order.
        total = 0
        for weight in vec.values():
            total += weight*weight
        return total

    dot = 0
    for key in dic1:
        dot += dic1[key]*dic2.get(key,0.0)
    return dot, squared_norm(dic1), squared_norm(dic2)

def cosine_dic(dic1,dic2):
    """Cosine similarity of two sparse vectors given as key -> weight dicts.

    Returns 0.0 when the dot product is zero (this also covers empty dicts,
    so the division is never attempted with a zero denominator in that case).
    """
    dot, sq1, sq2 = cosine_dic_parts(dic1,dic2)
    if dot == 0:
        return 0.0
    return dot/math.sqrt(sq1*sq2)

# Toy vectors for the self-checks; x0 and x1 hold the same entries in a
# different key order.
x0 = dict(c=3, a=1, b=2)
x1 = dict(a=1, b=2, c=3)
x2 = dict(a=1, b=2, c=0)
x3 = dict(a=1, b=2, d=3)
x4 = dict(a=1, e=2, d=3)
x5 = dict(a=999, e=2, d=3)
x6 = dict(a=0.1, e=2, d=3)
x7 = dict(f=1, e=2, d=3)
# Results are compared through str(), i.e. on the exact float repr.
assert [str(cosine_dic(x0,other)) for other in (x1,x2)] == [
    "1.0",
    "0.5976143046671968",
]
assert [str(cosine_dic(x1,other)) for other in (x2,x3,x4,x5,x6,x7)] == [
    "0.5976143046671968",
    "0.35714285714285715",
    "0.07142857142857142",
    "0.26725950125174264",
    "0.007409643851431125",
    "0.0",
]

def cosine_dic2(dica1,dicb1,dica2,dicb2):
    """Cosine similarity of two vectors that each span two segments.

    Vector a is given as ``dica1`` (first segment) and ``dica2`` (second
    segment), vector b as ``dicb1`` and ``dicb2``; every argument is a
    key -> weight dict. Dot products and squared norms are computed per
    segment and added up before the final division.

    Returns 0.0 when both per-segment dot products are zero.
    """
    dot1, a_sq1, b_sq1 = cosine_dic_parts(dica1,dicb1)
    dot2, a_sq2, b_sq2 = cosine_dic_parts(dica2,dicb2)
    if dot1 == 0 and dot2 == 0:
        return 0.0
    return (dot1+dot2)/math.sqrt((a_sq1+a_sq2)*(b_sq1+b_sq2))
# With an empty second segment the expected values are the same strings as in
# the cosine_dic checks.
assert [str(cosine_dic2(x0,other,{},{})) for other in (x1,x2)] == [
    "1.0",
    "0.5976143046671968",
]
assert [str(cosine_dic2(x1,other,{},{})) for other in (x2,x3,x4,x5,x6,x7)] == [
    "0.5976143046671968",
    "0.35714285714285715",
    "0.07142857142857142",
    "0.26725950125174264",
    "0.007409643851431125",
    "0.0",
]
# The same pair used for both segments.
assert str(cosine_dic2(x0,x1,x0,x1)) == "1.0"
assert str(cosine_dic2(x1,x7,x1,x7)) == "0.0"

def compute_similiarities(model,arity=1,debug=False):
    """Pairwise two-segment cosine similarities between keys of ``model``.

    Parameters:
        model: indexable with ``model[0]`` iterable over keys and ``model[1]``,
            ``model[2]`` mapping a key to its key -> weight dict for the first
            and second segment.
        arity: only keys whose ``len`` equals this value take part.
        debug: when true, each key is printed once its pairs are done.

    Returns:
        A list of ``(a, b, similarity)`` tuples, one per unordered pair, with
        the two keys arranged so that ``a <= b``.

    The work is delegated to ``model_to_dict`` and
    ``compute_similiarities_from_dict`` so the pairing logic lives in one
    place. As a consequence a key missing from ``model[1]`` or ``model[2]``
    is treated as an empty vector instead of raising ``KeyError``.
    """
    return compute_similiarities_from_dict(model_to_dict(model,arity=arity),debug=debug)


def compute_similiarities_from_dict(dic,debug=False):
    """Pairwise two-segment cosine similarities between the entries of ``dic``.

    Parameters:
        dic: maps a key to an indexable pair ``(segment1, segment2)`` of
            key -> weight dicts.
        debug: when true, each key is printed once its pairs are done.

    Returns:
        A list of ``(a, b, similarity)`` tuples, one per unordered pair of
        keys, with the two keys arranged so that ``a <= b``. Pairs appear in
        the iteration order of ``dic``: the first key against all later keys,
        then the second key against all later keys, and so on.
    """
    keys = list(dic)
    lst = []
    for i, a in enumerate(keys):
        a1 = dic[a][0]
        a2 = dic[a][1]
        # Pairing each key only with the keys after it visits every unordered
        # pair exactly once, so no "already done" bookkeeping is needed.
        for b in keys[i+1:]:
            s = cosine_dic2(a1,dic[b][0],a2,dic[b][1])
            lst.append( (a,b,s) if a <= b else (b,a,s) )
        if debug:
            print(a)
    return lst

def model_to_dict(model,arity=1,debug=False):
    """Collect the two vectors of every key of the given length into one dict.

    For each key ``a`` of ``model[0]`` with ``len(a) == arity`` the result maps
    ``a`` to the tuple ``(model[1][a], model[2][a])``; a key missing from
    ``model[1]`` or ``model[2]`` contributes a new empty dict in that slot.
    The vectors themselves are not copied. ``debug`` is accepted but unused.
    """
    return {
        a: (model[1].get(a, {}), model[2].get(a, {}))
        for a in model[0]
        if len(a) == arity
    }

def dict_merge(a,b):
    """Return a new dict holding the key-wise sum of ``a`` and ``b``.

    The result starts as a deep copy of ``a``. A key present in both gets
    ``a``'s value ``+`` ``b``'s value; a key only in ``b`` receives ``b``'s
    value as is. Neither argument is modified.
    """
    merged = dcp(a)
    for key, value in b.items():
        merged[key] = merged[key] + value if key in merged else value
    return merged
assert str(dict_merge({'a':0.2,'b':0.1},{'c':0.2,'b':0.1})) == "{'a': 0.2, 'b': 0.2, 'c': 0.2}"

def join_letters(a,b):
    """Return the characters of ``a`` and ``b`` combined and sorted, as one string."""
    letters = [*a, *b]
    letters.sort()
    return "".join(letters)
assert str(join_letters("1.2","zba")) == ".12abz"
    
def do_cluster(model,dic4tree=None,debug = False,max_merges=102):
    """Greedily merge the most similar entries of ``model`` into clusters.

    Starts from ``model_to_dict(model)`` and repeatedly replaces the most
    similar pair of entries by one entry named ``join_letters(a, b)`` whose
    two vectors are the ``dict_merge`` of the pair's vectors. All pairwise
    similarities are recomputed before every merge. The loop ends when fewer
    than two entries remain or when ``max_merges`` merges have been done.

    Parameters:
        model: passed unchanged to ``model_to_dict``.
        dic4tree: optional dict, filled in place with
            ``child name -> merged name`` for both members of every merge.
        debug: when true, prints the entry count before and after, plus one
            line per merge (step, entry count, pair count, the two names and
            their similarity).
        max_merges: upper bound on the number of merges. The default of 102
            is exactly what the former hard-coded guard allowed; pass ``None``
            to remove the limit.

    Returns:
        None. The outcome is only observable through ``dic4tree`` and the
        debug output.
    """
    clusters = model_to_dict(model)
    if debug:
        print(len(clusters))
    merges = 0
    while max_merges is None or merges < max_merges:
        simlst = compute_similiarities_from_dict(clusters)
        if not simlst:
            break # root
        # Scanning in reverse makes max() return the LAST of several equally
        # similar pairs, the same pick as taking the tail of a stable
        # ascending sort.
        left, right, similarity = max(reversed(simlst), key=lambda tup: tup[2])
        merged_name = join_letters(left,right)
        if dic4tree is not None:
            dic4tree[left] = merged_name
            dic4tree[right] = merged_name
        if debug:
            print(merges,len(clusters),len(simlst),left,'+',right,'=>',similarity)
        left_vectors = clusters.pop(left)
        right_vectors = clusters.pop(right)
        clusters[ merged_name ] = (
            dict_merge(left_vectors[0],right_vectors[0]),
            dict_merge(left_vectors[1],right_vectors[1]),
        )
        merges += 1
    if debug:
        print(len(clusters))



In [3]:
# Print the similarity of each toy-vector pair, one value per line.
print(*(
    cosine_dic(left,right)
    for left,right in (
        (x0,x1),(x0,x2),(x1,x2),(x1,x3),(x1,x4),(x1,x5),(x1,x6),(x1,x7),
    )
), sep="\n")


1.0
0.5976143046671968
0.5976143046671968
0.35714285714285715
0.07142857142857142
0.26725950125174264
0.007409643851431125
0.0


In [4]:
# Build the tokenizer for the 'brown_nolines_chars_7a' model in character mode with max_n=7.
# NOTE(review): presumably this loads a previously stored model from data/models — confirm.
brown_chars = FreedomTokenizer(name='data/models/brown_nolines_chars_7a',max_n=7,mode='chars',debug=False)
# The return value is discarded, so the call presumably modifies brown_chars.model in place;
# the meaning of the 0.0001 threshold is defined by model_compress_with_loss — TODO confirm.
model_compress_with_loss(brown_chars.model,0.0001)


In [5]:
# For each character pair print the similarity of their model[1] vectors,
# then of their model[2] vectors.
print(*(
    cosine_dic(brown_chars.model[segment][first],brown_chars.model[segment][second])
    for first,second in (('.', ','),('.', ' '),('.', 'a'),('o', 'a'))
    for segment in (1,2)
), sep="\n")

print(cosine_dic(brown_chars.model[1]['o'],brown_chars.model[1]['a']))
print(cosine_dic(brown_chars.model[2]['o'],brown_chars.model[2]['a']))


0.9999222003602027
0.9907565768001892
0.12763480631061797
0.8811401825208185
0.22319366970832127
0.31534753618812983
0.7720141235753476
0.8849807028305376


In [6]:
# Rank all single-character pairs from most to least similar and show them as a table.
simlst = sorted(
    compute_similiarities(brown_chars.model,debug=False),
    key=lambda tup: tup[2],
    reverse=True,
)
sim_df = pd.DataFrame(simlst,columns=['a','b','sim'])
sim_df


Unnamed: 0,a,b,sim
0,",",;,0.999476
1,",",.,0.999165
2,.,;,0.998428
3,!,?,0.995960
4,[,{,0.987710
...,...,...,...
2011,$,],0.000000
2012,$,;,0.000000
2013,!,$,0.000000
2014,$,>,0.000000


In [7]:
# Display the whole ranked list of (a, b, similarity) tuples; slice it
# (e.g. simlst[:30]) for a shorter view.
simlst

[(',', ';', 0.9994755482324188),
 (',', '.', 0.9991648680218301),
 ('.', ';', 0.9984282455363871),
 ('!', '?', 0.9959599780890476),
 ('[', '{', 0.9877099439908715),
 ('>', '}', 0.9868727646622297),
 ('(', '[', 0.9846575192359338),
 (',', ':', 0.9822583608820747),
 (':', ';', 0.9821348526719393),
 ('(', '{', 0.9821066733551082),
 ('.', ':', 0.9813953297158914),
 ('<', '{', 0.9802205004458286),
 ('<', '[', 0.9795336530687687),
 ('2', '3', 0.9758629641063381),
 ('>', ']', 0.9707658079946206),
 (']', '}', 0.9687710608748165),
 (',', ']', 0.9679828586549064),
 ('(', '<', 0.9674732796083245),
 (';', ']', 0.9670737009481303),
 ('.', ']', 0.9662268539079678),
 ('[', '~', 0.9657787215738826),
 (')', '>', 0.9632849057637669),
 (')', '}', 0.9630093779230737),
 (',', '}', 0.962001043899077),
 (';', '}', 0.9617516314370288),
 ('(', '~', 0.9617033405790373),
 (';', '>', 0.9605738517119934),
 (',', '>', 0.9605233684439405),
 ('.', '}', 0.9600885933968563),
 ('{', '~', 0.9596649647199278),
 (':', ']',

In [8]:
def build_tree(lst):
    """Unfinished placeholder: walks ``lst`` once and returns ``None``.

    Nothing is built yet. TODO: implement or remove.
    """
    parents = {}  # never filled; presumably meant for parent links — confirm
    for _ in lst:
        continue
        

In [9]:
# Same ranking as before, this time computed through the dict form of the model.
simlst = sorted(
    compute_similiarities_from_dict(model_to_dict(brown_chars.model)),
    key=lambda tup: tup[2],
    reverse=True,
)
sim_df = pd.DataFrame(simlst,columns=['a','b','sim'])
sim_df


Unnamed: 0,a,b,sim
0,",",;,0.999476
1,",",.,0.999165
2,.,;,0.998428
3,!,?,0.995960
4,[,{,0.987710
...,...,...,...
2011,$,],0.000000
2012,$,;,0.000000
2013,!,$,0.000000
2014,$,>,0.000000


In [45]:
# Rows whose 'b' column is the character 'x'.
sim_df.loc[sim_df['b'] == 'x']

Unnamed: 0,a,b,sim
360,r,x,0.674923
449,,x,0.62322
699,-,x,0.517199
754,d,x,0.495862
755,s,x,0.495649
758,n,x,0.494063
924,l,x,0.423802
987,v,x,0.402245
999,m,x,0.397495
1092,c,x,0.358333


In [10]:
# Run the clustering with debug output: the initial entry count, one line per
# merge (step, entries, pairs, left + right => similarity), then the final count.
do_cluster(brown_chars.model,debug=True)


64
0 64 2016 , + ; => 0.9994755482324188
1 63 1953 ,; + . => 0.9991541566507661
2 62 1891 ! + ? => 0.9959599780890476
3 61 1830 [ + { => 0.9877099439908715
4 60 1770 > + } => 0.9868727646622297
5 59 1711 ( + [{ => 0.9849595859613685
6 58 1653 ,.; + : => 0.9820893944359783
7 57 1596 2 + 3 => 0.9758629641063381
8 56 1540 ([{ + < => 0.9718134389427836
9 55 1485 >} + ] => 0.9717276422077935
10 54 1431 ) + >]} => 0.9651380673786327
11 53 1378 (<[{ + ~ => 0.9619265058839013
12 52 1326 # + + => 0.9589497629016187
13 51 1275 & + ,.:; => 0.9549524401317208
14 50 1225 4 + 7 => 0.9545051585990892
15 49 1176 &,.:; + )>]} => 0.9531714890687416
16 48 1128 #+ + @ => 0.950085582077181
17 47 1081 23 + 47 => 0.9444027022724498
18 46 1035 5 + 6 => 0.9400552781903716
19 45 990 !? + &),.:;>]} => 0.9324717587913735
20 44 946 #+@ + _ => 0.9192246817696368
21 43 903 b + p => 0.9148296693120033
22 42 861 $ + (<[{~ => 0.9012754833110997
23 41 820 !&),.:;>?]} + y => 0.8955212902858396
24 40 780 bp + w => 0.88882

In [11]:
# Repeat the clustering, this time recording every child -> merged-name link
# in dictree so the merge hierarchy can be rebuilt afterwards.
dictree = {}
do_cluster(brown_chars.model,dic4tree=dictree,debug=True)


64
0 64 2016 , + ; => 0.9994755482324188
1 63 1953 ,; + . => 0.9991541566507661
2 62 1891 ! + ? => 0.9959599780890476
3 61 1830 [ + { => 0.9877099439908715
4 60 1770 > + } => 0.9868727646622297
5 59 1711 ( + [{ => 0.9849595859613685
6 58 1653 ,.; + : => 0.9820893944359783
7 57 1596 2 + 3 => 0.9758629641063381
8 56 1540 ([{ + < => 0.9718134389427836
9 55 1485 >} + ] => 0.9717276422077935
10 54 1431 ) + >]} => 0.9651380673786327
11 53 1378 (<[{ + ~ => 0.9619265058839013
12 52 1326 # + + => 0.9589497629016187
13 51 1275 & + ,.:; => 0.9549524401317208
14 50 1225 4 + 7 => 0.9545051585990892
15 49 1176 &,.:; + )>]} => 0.9531714890687416
16 48 1128 #+ + @ => 0.950085582077181
17 47 1081 23 + 47 => 0.9444027022724498
18 46 1035 5 + 6 => 0.9400552781903716
19 45 990 !? + &),.:;>]} => 0.9324717587913735
20 44 946 #+@ + _ => 0.9192246817696368
21 43 903 b + p => 0.9148296693120033
22 42 861 $ + (<[{~ => 0.9012754833110997
23 41 820 !&),.:;>?]} + y => 0.8955212902858396
24 40 780 bp + w => 0.88882

In [37]:
# Register every name that occurs as a merge target (a parent) in dictree.
# Only the keys of parents_children_dict are used below; dictcount presumably
# keeps a per-key counter — confirm against pygents.util.
parents_children_dict = {}
for child in dictree:
    dictcount(parents_children_dict,dictree[child])
# A merged name is the joined letters of its two children, so it is longer than
# either child: going from longest to shortest creates parents before children.
parents_children_list = sorted(parents_children_dict,key=len,reverse=True)

tree = Tree()
# Inner nodes first; a name with no entry in dictree has no parent and becomes the root.
for child in parents_children_list:
    tree.create_node(child,child,parent=dictree.get(child))
# Then the leaves: names that were merged into something but are nobody's parent.
for child in dictree:
    if child not in parents_children_dict:
        tree.create_node(child,child,parent=dictree[child])

tree.show()


 !"#$%&'()*+,-./0123456789:;<>?@[]_abcdefghijklmnopqrstuvwxyz{}~
├──  !"#$%&'()+,-./0123456789:;<>?@[]_abcdefghijklmnopqrstuvwxyz{}~
│   ├──  !"#$%&'()+,-./0123456789:;<>?@[]_abcdefghijklmnoprstuvwxyz{}~
│   │   ├──  !"#$%&'()+,-.1:;<>?@[]_abcdefghijklmnoprstuvwxyz{}~
│   │   │   ├──  !"#$%&()+,-.1:;<>?@[]_abcdefghijklmnoprstuvwxyz{}~
│   │   │   │   ├──  !"#$%&()+,-.1:;<>?@[]_abcdefgijklmnoprstuvwxyz{}~
│   │   │   │   │   ├──  x
│   │   │   │   │   │   ├──  
│   │   │   │   │   │   └── x
│   │   │   │   │   └── !"#$%&()+,-.1:;<>?@[]_abcdefgijklmnoprstuvwyz{}~
│   │   │   │   │       ├── !"#$%&()+,-.1:;<>?@[]_bcdfgjklmnprstvwyz{}~
│   │   │   │   │       │   ├── !#%&)+,-.:;>?@]_dgklnrsvyz}
│   │   │   │   │       │   │   ├── !#%&)+,-.:;>?@]_dgsy}
│   │   │   │   │       │   │   │   ├── !#%&)+,-.:;>?@]_dsy}
│   │   │   │   │       │   │   │   │   ├── !%&),-.:;>?]dsy}
│   │   │   │   │       │   │   │   │   │   ├── !%&),-.:;>?]y}
│   │   │   │   │       │   │   │   │   │   │   ├── !&),-