In [92]:
from linkers_and_taggers import wikifier_tag, stanford_tag, stanford_tagger
from sample_data import n_samples
from json_extractor import from_file_get_n_docs
from similarity import similar, find_similar_to_a_in_dict_b, a_is_not_in_dict_b, compare_linkers_parsed_docs

In [93]:
# Load 10 items from each source; the loaders are project helpers not shown here.
# NOTE(review): presumably position i refers to the same document in all three
# lists (later cells index stan_docs and wiki_docs with the same index) — confirm.
raw_docs = n_samples(10)
stan_docs = from_file_get_n_docs('50_tagged_by_stanford.jsonl', 10)
wiki_docs = from_file_get_n_docs('100_tagged_by_wikifier.jsonl', 10)
# Last expression of the cell: the three lengths are displayed as its output.
len(raw_docs), len(stan_docs), len(wiki_docs)

(10, 10, 10)

In [94]:
# Position of the document inspected below; used to index both wiki_docs and stan_docs.
test_doc_index = 3

In [95]:
def describe_token_array(toks, name=None):
    """Summarize a token list for display.

    Returns a 2-tuple: a header of the form u'<count> <name> tokens' and all
    tokens joined into a single string with u'|| ' as the separator.
    When ``name`` is left as None the header contains the text 'None'.
    """
    header = u'{0} {1} tokens'.format(len(toks), name)
    joined = u'|| '.join(toks)
    return header, joined

# Wikifier tokens observations
    1. No punctuation as word tokens: the surface forms include only words, never punctuation. It makes sense for a surface_form not to be split by punctuation. When I want to index Stanford's words, I have to skip punctuation; however, I have to keep the punctuation for Stanford to tag. So maybe I can build a dictionary that maps each token index in Stanford's output to the corresponding word index.
    2. Apostrophes (') are kept as part of the token, as in It's; I don't know yet how possessives are handled.

In [96]:
# Token list of the selected Wikifier document, read from its 'words' entry.
wiki_tokens = wiki_docs[test_doc_index]['words']
# Displayed as the cell output: (count header, tokens joined with '|| ').
describe_token_array(wiki_tokens, 'wiki')

(u'876 wiki tokens',
 u"NYMag.com|| Daily|| Intelligencer|| Vulture|| The|| Cut|| Science|| of|| Us|| Grub|| Street|| Bedford|| Bowery|| FOLLOW|| Facebook|| Twitter|| UserName|| LOG|| IN|| REGISTER|| Fashions|| Runway|| Street|| Style|| Designers|| Fame|| Beauty|| Goods|| Love|| War|| search|| Sections|| Fashions|| Fame|| Beauty|| Goods|| Love|| War|| Plus|| Runway|| Street|| Style|| Designers|| Sites|| NYMag.com|| Daily|| Intelligencer|| Vulture|| Science|| of|| Us|| Grub|| Street|| Bedford|| Bowery|| Like|| UsFollow|| Us|| Popular|| on|| The|| Cut|| Ask|| Polly|| Should|| I|| Just|| Give|| Up|| on|| My|| Writing|| Top|| Shows|| Oscar|| de|| la|| Renta|| See|| it|| Michael|| Kors|| See|| it|| Suno|| See|| it|| Coach|| See|| it|| Narciso|| Rodriguez|| See|| it|| Tory|| Burch|| See|| it|| Carolina|| Herrera|| See|| it|| Rodarte|| See|| it|| Diesel|| Black|| Gold|| See|| it|| Jeremy|| Scott|| See|| it|| Thom|| Browne|| See|| it|| rag|| bone|| See|| it|| Tommy|| Hilfiger|| See|| it|| Prab

# Current Stanford tokenization scheme I use
    1. In doc index no. 2 it keeps the >> (\xbb) special characters (properly encoded, of course).
    2. In doc index no. 2 it keeps the ... (\u2026) ellipsis characters.

In [97]:
# Keep only element 0 of every entry of the selected Stanford document.
# NOTE(review): presumably each entry is a (token, tag) pair — confirm against the tagger output.
stan_tokens = [t[0] for t in stan_docs[test_doc_index]]
# Displayed as the cell output: (count header, tokens joined with '|| ').
describe_token_array(stan_tokens, 'stanford')
# print describe_token_array(stan_tokens, 'stanford')[1]

(u'884 stanford tokens',
 u'NYMag.com|| Daily|| Intelligencer|| Vulture|| The|| Cut|| Science|| of|| Us|| Grub|| Street|| Bedford|| &|| Bowery|| FOLLOW:|| Facebook|| Twitter|| UserName|| LOG|| IN|| REGISTER|| Fashions|| Runway|| Street|| Style|| Designers|| Fame|| Beauty|| Goods|| Love|| &|| War|| search|| Sections|| Fashions|| Fame|| Beauty|| Goods|| Love|| &|| War|| Plus|| Runway|| Street|| Style|| Designers|| Sites|| NYMag.com|| Daily|| Intelligencer|| Vulture|| Science|| of|| Us|| Grub|| Street|| Bedford|| &|| Bowery|| Like|| UsFollow|| Us|| Popular|| on|| The|| Cut|| Ask|| Polly:|| Should|| I|| Just|| Give|| Up|| on|| My|| Writing?|| \xbb|| Top|| Shows|| Oscar|| de|| la|| Renta|| See|| it|| \xbb|| Michael|| Kors|| See|| it|| \xbb|| Suno|| See|| it|| \xbb|| Coach|| See|| it|| \xbb|| Narciso|| Rodriguez|| See|| it|| \xbb|| Tory|| Burch|| See|| it|| \xbb|| Carolina|| Herrera|| See|| it|| \xbb|| Rodarte|| See|| it|| \xbb|| Diesel|| Black|| Gold|| See|| it|| \xbb|| Jeremy|| Scott|| See

# I have two options for dealing with differences due to special characters in text.
    1. Preprocess the text given to Stanford to remove all special characters.
    2. Keep the special characters and build a function that looks for similar words in a token range.

# I need to map array a to array b, where the two arrays have different lengths but share a subset of the values in the set a + b

This way, I can find which window of tokens the linker or the taggers found.  

In [100]:
# Toy inputs: the lists share words that differ only by attached punctuation,
# and `b` additionally holds punctuation-only tokens.
a = [u'aaaa', u'a.', u'hola', u'perro']
b = [u"aaaa's", u'!!!', u'a', u'hola!!', u'...', u',', u'perro']

# NOTE(review): `expected` is never compared against the result. The recorded
# output of this cell pairs `a` with `b` offsets 0, 2, 3 and 6, so this list
# looks stale — confirm.
expected = [0, 2, 3, 4]

# Incremented by every call of map_token_list_a_to_b (it declares `global calls`).
calls = 0
def map_token_list_a_to_b(a, b, a_shifts = 0, b_shifts = 0):
    """Recursively pair tokens of ``a`` with similar tokens of ``b``.

    ``a_shifts`` / ``b_shifts`` are the offsets of ``a[0]`` / ``b[0]`` relative
    to the lists given to the outermost call; they are stored in every result.

    Returns a list of ``(a_token, b_token, a_offset, b_offset)`` tuples. When
    ``b`` runs out before ``a`` the result holds one ``None`` per remaining
    token of ``a``. When the two head tokens are not similar, the longer list
    (``a`` on a tie) is advanced by one token; the search gives up and returns
    ``[]`` once the lengths differ by more than 8.

    Side effect: every call increments the module-level ``calls`` counter.

    Note: a later cell of this notebook defines an iterative function with the
    same name, which replaces this one once that cell has run.
    """
    global calls
    calls += 1

    if not a:
        return []

    if not b:
        # len(a) - len(b) equals len(a) here: one placeholder per unmatched token.
        return [None] * len(a)

    if similar(a[0], b[0], 0.6):
        pair = (a[0], b[0], a_shifts, b_shifts)
        return [pair] + map_token_list_a_to_b(a[1:], b[1:], a_shifts + 1, b_shifts + 1)

    difference = len(a) - len(b)
    if abs(difference) > 8:
        return []

    # Exactly one side can be advanced: `a` when it is at least as long as `b`,
    # otherwise `b`. The two cases are mutually exclusive, so there is never a
    # pair of candidate results to choose between.
    shifted = None
    if difference >= 0:
        if len(a) > 1:
            shifted = map_token_list_a_to_b(a[1:], b, a_shifts + 1, b_shifts)
    elif len(b) > 1:
        shifted = map_token_list_a_to_b(a, b[1:], a_shifts, b_shifts + 1)

    return shifted if shifted else []

# Python 2 print statement: shows the pairing found for the toy lists above.
print map_token_list_a_to_b(a, b)

[(u'aaaa', u"aaaa's", 0, 0), (u'a.', u'a', 1, 2), (u'hola', u'hola!!', 2, 3), (u'perro', u'perro', 3, 6)]


In [132]:
def valid_index(i, arr):
    """Return True when ``i`` is below ``len(arr)``.

    Only the upper bound is checked, so negative values also pass.
    """
    return len(arr) > i

def should_shift_list (arr, i, shifts, max_shifts, previous_similar):
    """Decide whether the search may try the element at ``i + shifts``.

    The answer is False when ``i + shifts`` is past the end of ``arr``, when
    ``shifts`` exceeds ``max_shifts``, or when the previous comparison already
    found a similar token.
    """
    if not valid_index(i + shifts, arr):
        return False
    if shifts > max_shifts:
        return False
    return not previous_similar

def compare_a_to_b_shifting_max_from_indexes (a, b, max_shifts, a_i, b_i):
    """Find how far ``a`` must be shifted from ``a_i`` to match ``b[b_i]``.

    Offsets 0, 1, 2, ... are tried for as long as should_shift_list allows
    (inside ``a`` and not above ``max_shifts``); ``a[a_i + offset]`` is compared
    with ``b[b_i]`` using ``similar`` at a 0.6 threshold.

    Returns the first offset that matches, or -1 when none does.
    """
    offset = 0
    while should_shift_list(a, a_i, offset, max_shifts, False):
        if similar(a[a_i + offset], b[b_i], 0.6):
            return offset
        offset += 1

    return -1

def shift_indexes(a_i, b_i, a_shifts, b_shifts):
    """Advance one of the two indexes by its shift and return ``(a_i, b_i)``.

    ``a_shifts`` / ``b_shifts`` are results of
    compare_a_to_b_shifting_max_from_indexes: a non-negative offset, or -1
    when no match was found.

    * both positive: the smaller shift is applied (``a`` wins a tie);
    * only one positive: that one is applied;
    * neither positive: both indexes are returned unchanged.

    A shift is applied only when it is positive, so the -1 "not found"
    sentinel is never added to an index (which would move it backwards).
    """
    a_found = a_shifts > 0
    b_found = b_shifts > 0

    if a_found and (not b_found or a_shifts <= b_shifts):
        a_i += a_shifts
    elif b_found:
        b_i += b_shifts

    return a_i, b_i

def update_max_shifts(max_shifts, a_shifts, b_shifts):
    """Return the shift budget left after subtracting the larger of the two shifts."""
    consumed = a_shifts if a_shifts >= b_shifts else b_shifts
    return max_shifts - consumed

def map_token_list_a_to_b(a, b):
    """Align two token lists iteratively and return the matched pairs.

    Both lists are walked in parallel. At every step each list is searched for
    a token similar to the other list's current token; the indexes are shifted
    accordingly and the remaining shift budget (initially the length difference
    plus one) is reduced.

    Returns a list of ``(a_token, a_index, b_token, b_index)`` tuples; empty
    when either input is empty.

    Note: this definition replaces the recursive function of the same name
    defined in an earlier cell of this notebook.
    """
    # Nothing can be aligned against an empty list. Without this guard the
    # clamping below turns an index into -1 and the comparison raises IndexError.
    if not a or not b:
        return []

    max_shifts = abs(len(a) - len(b)) + 1
    a_index = 0
    b_index = 0

    mapping = list()
    still_similar = True
    # `and` binds tighter than `or`; the grouping is spelled out here. The loop
    # keeps going while `a` has unvisited tokens even after a failed alignment;
    # `still_similar` only ends the tail phase in which `a` is exhausted.
    while a_index < len(a) or (b_index < len(b) and still_similar):

        # Clamp an exhausted index to the last element so it can still be compared.
        a_index = min(a_index, len(a) - 1)
        b_index = min(b_index, len(b) - 1)

        a_shifts = compare_a_to_b_shifting_max_from_indexes(a, b, max_shifts, a_index, b_index)
        b_shifts = compare_a_to_b_shifting_max_from_indexes(b, a, max_shifts, b_index, a_index)

        if a_shifts == -1 and b_shifts == -1:
            # No alignment from this position in either direction.
            still_similar = False
        else:
            a_index, b_index = shift_indexes(a_index, b_index, a_shifts, b_shifts)
            max_shifts = update_max_shifts(max_shifts, a_shifts, b_shifts)
            a_to_b = (a[a_index], a_index, b[b_index], b_index)
            mapping.append(a_to_b)

        a_index += 1
        b_index += 1

    return mapping

# Second toy case: here `a` is the longer list and both lists contain
# punctuation-only tokens.
a = [u'aaaa', u'a.', u'hola', u'..', u',', u'perro']
b = [u"aaaa's", u'!!!', u'a', u'hola!!', u'perro']
# NOTE(review): presumably the `b` positions the mapping should hit; it agrees
# with the b indexes in the recorded output but is never asserted — confirm.
expected = [0, 2, 3, 4]
 
# Python 2 print statement: shows the (a_token, a_index, b_token, b_index) pairs for the toy lists.
print map_token_list_a_to_b(a, b)

[(u'aaaa', 0, u"aaaa's", 0), (u'a.', 1, u'a', 2), (u'hola', 2, u'hola!!', 3), (u'perro', 5, u'perro', 4)]


In [122]:
# Compare the two token counts of the selected document; shown as the cell output.
len(stan_tokens), len(wiki_tokens)

(884, 876)

In [128]:
# Align the Stanford tokens (as `a`) with the Wikifier tokens (as `b`).
stan_to_wiki = map_token_list_a_to_b(stan_tokens, wiki_tokens)

In [129]:
# Sanity check: print every mapped pair whose two tokens are NOT similar at the
# 0.6 threshold (m[1] is the Stanford index, m[3] the Wikifier index).
# The recorded output of this cell is empty.
print u"\n".join([u"Stan={0}; Wiki={1}".format(stan_tokens[m[1]], wiki_tokens[m[3]]) for m in stan_to_wiki if not similar(stan_tokens[m[1]], wiki_tokens[m[3]], 0.6)])




In [130]:
# Display the whole mapping: (stanford_token, stanford_index, wikifier_token, wikifier_index) per entry.
stan_to_wiki

[(u'NYMag.com', 0, u'NYMag.com', 0),
 (u'Daily', 1, u'Daily', 1),
 (u'Intelligencer', 2, u'Intelligencer', 2),
 (u'Vulture', 3, u'Vulture', 3),
 (u'The', 4, u'The', 4),
 (u'Cut', 5, u'Cut', 5),
 (u'Science', 6, u'Science', 6),
 (u'of', 7, u'of', 7),
 (u'Us', 8, u'Us', 8),
 (u'Grub', 9, u'Grub', 9),
 (u'Street', 10, u'Street', 10),
 (u'Bedford', 11, u'Bedford', 11),
 (u'Bowery', 13, u'Bowery', 12),
 (u'FOLLOW:', 14, u'FOLLOW', 13),
 (u'Facebook', 15, u'Facebook', 14),
 (u'Twitter', 16, u'Twitter', 15),
 (u'UserName', 17, u'UserName', 16),
 (u'LOG', 18, u'LOG', 17),
 (u'IN', 19, u'IN', 18),
 (u'REGISTER', 20, u'REGISTER', 19),
 (u'Fashions', 21, u'Fashions', 20),
 (u'Runway', 22, u'Runway', 21),
 (u'Street', 23, u'Street', 22),
 (u'Style', 24, u'Style', 23),
 (u'Designers', 25, u'Designers', 24),
 (u'Fame', 26, u'Fame', 25),
 (u'Beauty', 27, u'Beauty', 26),
 (u'Goods', 28, u'Goods', 27),
 (u'Love', 29, u'Love', 28),
 (u'War', 31, u'War', 29),
 (u'search', 32, u'search', 30),
 (u'Sections