[Documentation page](https://github.com/cephcyn/alignpaper/blob/master/documentation/multiple_alignment.md)

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np

import pandas as pd
pd.set_option("display.max_rows", None)
pd.set_option('display.max_columns', None)

In [None]:
import gensim

# Load fasttext-wiki-news-subwords-300 pretrained model
fasttext = gensim.models.keyedvectors.FastTextKeyedVectors.load('model/fasttext-wiki-news-subwords-300.model', mmap='r')

# # Load word2vec-google-news-300 pretrained model
# word2vec = gensim.models.KeyedVectors.load('model/word2vec-google-news-300.model', mmap='r')

import spacy

sp = spacy.load('en_core_web_sm')

import scispacy
from scispacy.linking import EntityLinker

scisp = spacy.load('en_core_sci_sm')
linker = scisp.add_pipe('scispacy_linker', config={'resolve_abbreviations': True, 'linker_name': 'umls'})

In [None]:
import warnings

# Get the embedding of a phrase
def get_phrase_embed(embed_model, phrase, remove_label=False, norm_zero_threshold=0.000000001):
    """Return the mean of the unit-normalised token embeddings of ``phrase``.

    Parameters
    ----------
    embed_model : mapping-like
        Object indexed as ``embed_model[token]``; a lookup failure
        (``KeyError`` / ``IndexError``) means the token has no embedding.
    phrase : str
        Phrase to embed; it is tokenised with ``str.split()``.
    remove_label : bool
        When False (default) the original phrase is stored in an extra
        ``'word'`` entry of the returned row.
    norm_zero_threshold : float
        A token vector whose norm is below this value triggers a warning
        (the vector is still normalised and used).

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the averaged embedding (plus ``'word'`` unless
        ``remove_label`` is set), or an empty frame when ``phrase`` cannot be
        split or none of its tokens has an embedding.
    """
    # split the phrase into tokens to pass into the embed model
    try:
        tokens = phrase.split()
    except (AttributeError, TypeError):
        # non-string input (e.g. NaN / None) is treated as "no phrase"
        return pd.DataFrame()
    # TODO remove stopwords?
    # retrieve the embedding of each token, keeping the token next to its
    # vector so later messages refer to the right word even when earlier
    # tokens were skipped
    known = []
    for token in tokens:
        try:
            known.append((token, embed_model[token]))
        except LookupError:
            # token is unknown to the model; ignore it
            continue
    # if there are no recognized tokens in the phrase, return empty (same as non-splittable phrase)
    if not known:
        return pd.DataFrame()
    # normalize each embed so that it has a norm of 1
    normalized = []
    for token, vector in known:
        vector_norm = np.linalg.norm(vector)
        if vector_norm < norm_zero_threshold:
            warnings.warn(f'embed vector for word \'{token}\' with extremely low norm value')
        normalized.append(vector / vector_norm)
    # Average the embeds for tokens which have embeds
    emb_avg = pd.DataFrame(normalized).sum() / len(normalized)
    if not remove_label:
        emb_avg['word'] = phrase
    return pd.DataFrame([emb_avg])

# get_phrase_embed(
#     word2vec, 
#     'test sentence')
# get_phrase_embed(
#     word2vec, 
#     'This is a test sentence !')

# Import sample dataset
(The code to construct the file `temp/ebm-pio_consegments.hdf` is in analyze.ipynb)

In [None]:
# Import the data we've already constructed out of constituency parse of specific phrases in specific sentences
# Read the cached table stored under the HDF key 'mydata', then display it as the cell output
con_segments = pd.read_hdf(f'temp/ebm-pio_consegments.hdf','mydata')
con_segments

In [None]:
# Transform that data into the format that is more readable for alignment
# (sorry, this is sort of an abuse of DataFrame datatypes)

def transformTuples(row):
    """Convert one segment record into a single-row alignment frame.

    ``row`` must provide the parallel sequences ``'alignsegments'``,
    ``'aligntypes'`` and ``'alignctypes'``. Position ``i`` of the three
    sequences is packed into one ``(segment, type, ctype)`` tuple stored in
    column ``txt{i}``. The returned frame has exactly one row, labelled with
    ``row.name``.
    """
    segment_count = len(row['alignsegments'])
    cells = {
        f'txt{pos}': [(row['alignsegments'][pos], row['aligntypes'][pos], row['alignctypes'][pos])]
        for pos in range(segment_count)
    }
    return pd.DataFrame(cells).set_index(pd.Series([row.name]))

# Demo: convert the record labelled 7298 into a one-row alignment frame
transformTuples(con_segments.loc[7298])

In [None]:
# alignment_df = con_segments.groupby(con_segments.index, group_keys=False).apply(
#     lambda group: transformTuples(group.iloc[0]))
# alignment_df = alignment_df.applymap(lambda x: ('', '', []) if x is np.nan else x)
# alignment_df.to_hdf(f'temp/ebm-pio_consegments-alignformat.hdf', 'mydata', mode='w')
# Load the cached alignment-format table (HDF key 'mydata') instead of rebuilding it, then display it
alignment_df = pd.read_hdf(f'temp/ebm-pio_consegments-alignformat.hdf','mydata')
alignment_df

In [None]:
def splitAlignTuples(row, split_token=' '):
    """Re-segment one alignment row so that every token gets its own column.

    The text parts of all cells are joined with spaces and split on
    whitespace; the per-token POS lists of all cells are flattened in order.
    Token ``k`` becomes the tuple ``(token, pos, [pos])`` in column ``txt{k}``
    of a single-row frame labelled with ``row.name``.

    ``split_token`` is accepted but not used: splitting is always done with
    ``str.split()``.
    """
    words = ' '.join(cell[0] for cell in row).split()
    token_pos = [tag for cell in row for tag in cell[2]]
    per_token = list(zip(words, token_pos, ([tag] for tag in token_pos)))
    cells = {f'txt{k}': [per_token[k]] for k in range(len(words))}
    return pd.DataFrame(cells).set_index(pd.Series([row.name]))

# Demo: token-split the alignment row labelled 1
splitAlignTuples(alignment_df.loc[1])

In [None]:
# alignment_df_tseg = alignment_df.groupby(alignment_df.index, group_keys=False).apply(
#     lambda group: splitAlignTuples(group.iloc[0]))
# alignment_df_tseg = alignment_df_tseg.applymap(lambda x: ('', '', []) if x is np.nan else x)
# alignment_df_tseg.to_hdf(f'temp/ebm-pio_tseg-alignformat.hdf', 'mydata', mode='w')
# Load the cached token-segmented alignment table (HDF key 'mydata') instead of rebuilding it, then display it
alignment_df_tseg = pd.read_hdf(f'temp/ebm-pio_tseg-alignformat.hdf','mydata')
alignment_df_tseg

### Parse file or string into alignment DF

In [None]:
import ast

def sheetstring_to_alignment(alignment_text):
    """Parse a tab-separated "sheet string" into an alignment DataFrame.

    Every line is split on tabs. Cell 0 is the row id, cell 2 is the record
    kind and the data cells start at index 4. Three record kinds are read;
    any other line (including blank or short lines) is ignored:

    * ``s-txt``  - segment text (stripped), stored in tuple slot 0
    * ``s-ppos`` - phrase POS (stripped), stored in tuple slot 1
    * ``s-pos``  - token POS list, parsed with ``ast.literal_eval`` with
      empty strings removed, stored in tuple slot 2

    The first record seen for a row id fixes that row's width; data cells of
    later records beyond that width are ignored. Rows are finally padded with
    empty ``('', '', [])`` cells to the width of the widest row.

    Returns
    -------
    pandas.DataFrame
        One row per row id (in first-seen order), columns ``txt0..txtN``,
        each cell a ``(text, phrase_pos, token_pos_list)`` tuple. An empty
        frame is returned when the text contains no recognised record.
    """
    data_start = 4

    def _parse_pos(cell):
        # an empty cell means "no tags"; otherwise the cell is a Python list literal
        tags = ast.literal_eval(cell) if len(cell) > 0 else []
        return [tag for tag in tags if tag != '']

    # record kind -> (tuple slot it fills, decoder for one raw cell)
    decoders = {
        's-txt': (0, str.strip),
        's-ppos': (1, str.strip),
        's-pos': (2, _parse_pos),
    }

    rows = {}
    # fill in actual data
    for line in alignment_text.split('\n'):
        cells = line.split('\t')
        # blank or truncated lines have no record-kind cell; skip them
        if len(cells) <= 2 or cells[2] not in decoders:
            continue
        slot, decode = decoders[cells[2]]
        row_id = cells[0]
        data = cells[data_start:]
        if row_id not in rows:
            # build independent empty cells so no list object is shared between cells
            rows[row_id] = [('', '', []) for _ in data]
        row = rows[row_id]
        for i in range(min(len(data), len(row))):
            updated = list(row[i])
            updated[slot] = decode(data[i])
            row[i] = tuple(updated)

    if not rows:
        return pd.DataFrame()

    # fill in the blank cells
    output_width = max(len(row) for row in rows.values())
    for row in rows.values():
        row.extend(('', '', []) for _ in range(output_width - len(row)))
    output_df = pd.DataFrame(list(rows.values()), index=list(rows.keys()))
    output_df.columns = [f'txt{i}' for i in range(len(output_df.columns))]
    return output_df

In [None]:
def read_alignment_from_sheetstring_file(filename):
    """Read a sheet-string file and parse it into an alignment DataFrame.

    The file content is split with ``str.splitlines()`` and re-joined with
    ``'\\n'`` before being handed to ``sheetstring_to_alignment``.
    """
    with open(filename, 'r') as handle:
        lines = handle.read().splitlines()
    return sheetstring_to_alignment('\n'.join(lines))

# read_alignment_from_sheetstring_file('testcases/basic-colcreat-005-relation/a.alignment')

### Parse JSONable dict format into alignment DF

In [None]:
def jsondict_to_alignment(alignment_dict):
    """Build an alignment DataFrame from the JSON-able dict format.

    ``alignment_dict['alignment']`` is a list of row objects. Each row has an
    ``'id'`` and optional parallel lists ``'txt'`` (list of token lists),
    ``'ppos'`` and ``'pos'`` (list of token-POS lists). Cell ``i`` becomes the
    tuple ``(' '.join(txt[i]), ' '.join(ppos[i]), pos[i])``; a missing key or
    a list that is too short yields ``''`` / ``''`` / ``[]`` for that slot.

    The row width is ``len(row['txt'])`` when ``'txt'`` is present, otherwise
    the longest of the other lists. Shorter rows are padded with empty
    ``('', '', [])`` cells so every cell of the result is a tuple.

    Returns
    -------
    pandas.DataFrame
        One row per ``'id'``, columns ``txt0..txtN``.
    """
    def _width(row):
        if 'txt' in row:
            return len(row['txt'])
        return max((len(row[key]) for key in ('ppos', 'pos') if key in row), default=0)

    def _has(row, key, i):
        return key in row and i < len(row[key])

    rows = {}
    for row in alignment_dict['alignment']:
        rows[row['id']] = [
            (
                ' '.join(row['txt'][i]) if _has(row, 'txt', i) else '',
                # TODO haven't nailed down format... phrasePOS is hardly used
                ' '.join(row['ppos'][i]) if _has(row, 'ppos', i) else '',
                row['pos'][i] if _has(row, 'pos', i) else [],
            )
            for i in range(_width(row))
        ]
    # pad ragged rows so that no cell is left as None
    output_width = max((len(cells) for cells in rows.values()), default=0)
    for cells in rows.values():
        cells.extend(('', '', []) for _ in range(output_width - len(cells)))
    output_df = pd.DataFrame(list(rows.values()), index=list(rows.keys()))
    output_df.columns = [f'txt{i}' for i in range(len(output_df.columns))]
    return output_df

In [None]:
import json

def read_alignment_from_jsondict_file(filename):
    """Load a JSON file and convert its content with ``jsondict_to_alignment``."""
    with open(filename, 'r') as handle:
        return jsondict_to_alignment(json.load(handle))

# read_alignment_from_jsondict_file('testcases/basic-colcreat-005-relation/a.json')

### Parse alignment DF into JSONable dict format

In [None]:
def alignment_to_jsondict(alignment_df):
    """Serialise an alignment DataFrame into the JSON-able dict format.

    Each frame row becomes ``{'id': <index label>, 'pos': [...], 'txt': [...]}``
    where ``pos`` collects the third element of every cell tuple and ``txt``
    collects the cell text split on single spaces with empty pieces dropped.
    The rows are returned under the ``'alignment'`` key.
    """
    def _row_to_obj(row_id, cells):
        return {
            'id': row_id,
            'pos': [cell[2] for cell in cells],
            'txt': [[word for word in cell[0].split(' ') if word != ''] for cell in cells],
        }

    return {'alignment': [_row_to_obj(row_id, cells) for row_id, cells in alignment_df.iterrows()]}

# alignment_to_jsondict(read_alignment_from_jsondict_file('testcases/basic-colcreat-005-relation/a.json'))

# Alignment operations

In [None]:
# def mergeAdjacentNP(row):
#     # merge adjacent noun phrases
#     # TODO adapt this for alignment df format!!!
#     # TODO fix the deep copy bug
#     output = row.copy()
#     for i in reversed(range(len(output['alignsegments'])-1)):
#         if output['aligntypes'][i]=='NP' and output['aligntypes'][i+1]=='NP':
#             output['alignsegments'][i] += ' ' + output['alignsegments'][i+1]
#             output['alignsegments'][i+1] = []
#             output['aligntypes'][i+1] = []
#             output['alignctypes'][i] += output['alignctypes'][i+1]
#             output['alignctypes'][i+1] = []
#     output = output.drop('aligntup')
#     output['alignsegments'] = [e for e in output['alignsegments'] if e != []]
#     output['aligntypes'] = [e for e in output['aligntypes'] if e != []]
#     output['alignctypes'] = [[e for i in e] for e in output['alignctypes'] if e != []]
#     return output

# temp_df = con_segments
# temp_df = temp_df.apply(
#     lambda row: mergeAdjacentNP(row), 
#     axis=1, result_type='expand')
# temp_df

In [None]:
# # TODO this is still buggy, don't use it
# def mergeParentheses(row):
#     # merge parenthetical clauses
#     numOpenParens = 0
#     lastStart = -1
#     mergeSegments = []
#     for i in range(len(row['alignsegments'])):
#         for c in [c for c in row['alignsegments'][i] if c in ['(', ')']]:
#             if c == '(':
#                 numOpenParens += 1
#                 if numOpenParens == 1:
#                     lastStart = i
#             else:
#                 numOpenParens -= 1
#                 if numOpenParens == 0 and lastStart != i:
#                     # close the parentheses
#                     mergeSegments.append((lastStart, i))
#     if numOpenParens > 0:
#         mergeSegments.append((lastStart, len(row['alignsegments'])))
#     mergeSegments = list(set(mergeSegments))
#     if mergeSegments != []:
#         for t in reversed(mergeSegments):
#             print(row['aligntypes'][t[0]:t[1]+1])
#             print(row['alignsegments'][t[0]:t[1]+1])
#         print()
#     return row

# temp_df = con_segments
# temp_df.apply(
#     lambda row: mergeParentheses(row), 
#     axis=1, result_type='expand')

In [None]:
def extractTup(data, tup_i=0, is_frame=True):
    """Project every cell tuple of an alignment frame/series onto one element.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Container whose cells are ``(segment, pos, cpos)`` tuples.
    tup_i : str or int
        Which element to extract: ``'segment'`` (0), ``'pos'`` (1) or
        ``'cpos'`` (2). The numeric positions are accepted as well, so the
        default of ``0`` selects the segment text.
    is_frame : bool
        True when ``data`` is a DataFrame, False when it is a Series.

    Returns
    -------
    Same container type as ``data`` with each cell replaced by the selected
    tuple element.

    Raises
    ------
    ValueError
        If ``tup_i`` is neither a known name nor a known position.
    """
    types = {
        'segment': 0,
        'pos': 1,
        'cpos': 2
    }
    if tup_i in types:
        slot = types[tup_i]
    elif tup_i in types.values():
        # numeric position given directly (this is what the default uses)
        slot = tup_i
    else:
        raise ValueError(f'tup_i not in types: {types.keys()}')

    def pick(cell):
        return cell[slot]

    if is_frame:
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1;
        # look the method up on the type so a column called 'map' cannot shadow it
        elementwise = data.map if hasattr(type(data), 'map') else data.applymap
        return elementwise(pick)
    return data.map(pick)

# extractTup(transformTuples(temp_df.loc[7298]), tup_i='segment', is_frame=True)
# Demo: show only the segment text of the alignment row labelled 7298
extractTup(alignment_df.loc[[7298]], tup_i='segment', is_frame=True)

In [None]:
def removeEmptyColumns(align_df):
    """Drop the columns whose segment text is blank in every row.

    A column is kept when at least one of its cells has non-whitespace
    segment text. The surviving columns keep their order and are renamed
    ``txt0..txtN``. The input frame is not modified.
    """
    def _has_text(col):
        segments = extractTup(align_df.loc[:, col], tup_i='segment', is_frame=False)
        return any(text.strip() != '' for text in segments)

    kept = [col for col in align_df.columns if _has_text(col)]
    output_df = align_df[kept]
    output_df.columns = [f'txt{i}' for i in range(len(output_df.columns))]
    return output_df
# Demo: drop the columns that are blank in both of the rows labelled 7298 and 7321
removeEmptyColumns(alignment_df.loc[[7298, 7321]])

### Create alignment
[alignRowMajorLocal documentation / pseudocode](https://github.com/cephcyn/alignpaper/blob/master/documentation/multiple_alignment.md#alignRowMajorLocal)

In [None]:
import math
from nltk.metrics import edit_distance

def alignRowMajorLocal(align_a, align_b, use_types=False, remove_empty_cols=True, embed_model=fasttext, debug_print=False):
    """Locally align the columns of two alignment frames (Smith-Waterman style).

    Parameters
    ----------
    align_a, align_b : pandas.DataFrame
        Alignment frames whose cells are ``(segment, pos, cpos)`` tuples; the
        three elements are read via ``extractTup``.
    use_types : bool
        When True, a column pair whose non-empty phrase-type sets are both
        non-empty and different is scored ``-inf``.
    remove_empty_cols : bool
        When True, ``removeEmptyColumns`` is applied to both inputs first.
    embed_model :
        Embedding model passed to ``get_phrase_embed`` for column scoring.
    debug_print : bool
        When True, intermediate indices, matrices and pair scores are printed.

    Returns
    -------
    (pandas.DataFrame, float)
        The combined alignment (rows of ``align_a`` followed by rows of
        ``align_b``, gaps filled with ``('', '', [])``) and the maximum value
        of the score matrix.
    """
    # An implementation of Smith-Waterman alignment
    # RETURNS:
    #  1. The alignment DataFrame
    #  2. The score associated with this alignment
    if remove_empty_cols:
        align_a = removeEmptyColumns(align_a)
        align_b = removeEmptyColumns(align_b)
    align_a_segment = extractTup(align_a, tup_i='segment')
    align_b_segment = extractTup(align_b, tup_i='segment')
    align_a_type = extractTup(align_a, tup_i='pos')
    align_b_type = extractTup(align_b, tup_i='pos')
    align_a_ctype = extractTup(align_a, tup_i='cpos')
    align_b_ctype = extractTup(align_b, tup_i='cpos')
    # Doing a general alignment
    # the alignable elements are simply the column positions of each input
    align_a_elems = [i for i in range(len(align_a.columns))]
    align_b_elems = [i for i in range(len(align_b.columns))]
    if debug_print:
        print(align_a_elems)
        print(align_b_elems)
        print()
    def getScoreAligningIndices(index_a, index_b, embed_model):
        # Score for placing column index_a of align_a against column index_b of align_b.
        # A higher score is better / more match!
        # make sure all the segment texts are precomputed lol
        text_a = list(align_a_segment[align_a.columns[index_a]])
        text_b = list(align_b_segment[align_b.columns[index_b]])
        # TODO clean up this embed checking thing and remove the need for cached_phrase_embeds
        # TODO also unify this with the col embed variation measure somehow?
        cached_phrase_embeds = {}
        for text in text_a+text_b:
            if text not in cached_phrase_embeds:
                try:
                    # NOTE(review): drop('word', 1) passes axis positionally; pandas 2.0+ requires
                    # drop(columns='word') — confirm the pandas version this notebook is pinned to
                    cached_phrase_embeds[text] = get_phrase_embed(embed_model, text).drop('word', 1)
                except KeyError:
                    # get_phrase_embed returns an empty frame when no token could be embedded,
                    # so there is no 'word' column to drop; such phrases are left out of the cache
                    pass
        # start off with phrase embedding distance (current max is 60 for perfect match)
        # if we have embeds for any word in each set, ignore others and just use words we have embeds for
        if any(s in cached_phrase_embeds for s in text_a)\
                and any(s in cached_phrase_embeds for s in text_b):
            # calculate overall embeds
            embed_a = pd.concat([cached_phrase_embeds[text] for text
                                 in text_a if text in cached_phrase_embeds]).apply(lambda x: x.mean())
            embed_b = pd.concat([cached_phrase_embeds[text] for text
                                 in text_b if text in cached_phrase_embeds]).apply(lambda x: x.mean())
            # TODO can tweak this scoring calculation a little for performance
            # score falls linearly with the Euclidean distance between the two mean embeddings
            score = 10 * (6 - np.linalg.norm(embed_a-embed_b))
        else:
            # use levenshtein dist as fallback... if either set has NO words with embeds available
            scaled_edits_sum = 0
            for phrase_a in [p for p in text_a if len(p) != 0]:
                for phrase_b in [p for p in text_b if len(p) != 0]:
                    scaled_edits_sum += edit_distance(phrase_a,phrase_b) / max(len(phrase_a), len(phrase_b))
            # NOTE(review): the denominator counts every phrase, including the empty ones
            # skipped by the loops above — confirm this is the intended average
            score = 60 * (1 - (scaled_edits_sum / (len(text_a) * len(text_b))))
        # add a component based on phrase type if that flag is set
        # TODO improve this?; this currently just returns -inf if mismatch of type sets
        # Might want to add support for aligning different types of phrase together...
        if use_types:
            # reduce to set
            types_a = set([t for t in align_a_type[align_a.columns[index_a]] if t.strip() != ''])
            types_b = set([t for t in align_b_type[align_b.columns[index_b]] if t.strip() != ''])
#             # check if we are handling a hard pos match
#             if any([((p in types_a) or (p in types_b)) for p in pos_must_match]):
            if len(types_a) != 0 and len(types_b) != 0 and types_a != types_b:
                score = -1 * math.inf
        # TODO: add a component based on phrase ctype (phrase POS breakdown) (?)
        if debug_print:
            print(f'scoring between '
                  +f'"{list(align_a_segment[align_a.columns[index_a]])}" and '
                  +f'"{list(align_b_segment[align_b.columns[index_b]])}": {score}')
        return score
    def getGapPenalty(length, size=1):
        # -1 for the first gap position plus -0.1 for each further position; `size` is currently unused
        return -1 * (1 * min(length,1) + 0.1 * max(length-1,0)) #* (1 + math.log(size))
    # Build score matrix of size (a-alignables + 1)x(b-alignables + 1)
    scores = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Build traceback matrix
    # traceback = 0 for end, 4 for W, 7 for NW, 9 for N (to calculate traceback, t%2 is N-ness, t%3 is W-ness)
    traceback = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Iterate through all of the cells to populate both the score and traceback matrices
    for i in range(1, scores.shape[0]):
        for j in range(1, scores.shape[1]):
            # maps candidate score -> traceback code
            # NOTE(review): because the dict is keyed by score, candidates with equal scores
            # overwrite each other and the last one written decides the move — confirm acceptable
            score_map = {}
            # calculate score for aligning nouns a[i] and b[j]
            score_map[
                scores[i-1,j-1] + getScoreAligningIndices(align_a_elems[i-1], align_b_elems[j-1], embed_model)
            ] = 7
            # calculate score for gap in i
            # NOTE(review): gaps longer than one cell are scored from scores[i-i_gap, j], but the
            # traceback below only ever steps back one cell for code 9 (and 4) — verify consistency
            for i_gap in range(1, i):
                igap_score = scores[i-i_gap,j] + getGapPenalty(i_gap, size=len(align_a_elems))
                score_map[igap_score] = 9
            # calculate score for gap in j
            for j_gap in range(1, j):
                jgap_score = scores[i,j-j_gap] + getGapPenalty(j_gap, size=len(align_b_elems))
                score_map[jgap_score] = 4
            # add the possibility for unrelatedness
            score_map[0] = 0
            scores[i,j] = max(score_map.keys())
            traceback[i,j] = score_map[max(score_map.keys())]
    if debug_print:
        print()
        print(scores)
        print(traceback)
        print()
    # Do traceback to build our final alignment
    # start from the cell holding the maximum score
    tracepoint = np.unravel_index(np.argmax(scores, axis=None), scores.shape)
    points_a = []
    points_b = []
    while traceback[tracepoint] != 0:
        # contribute to the align information
        if traceback[tracepoint] == 7:
            # this is a point where two elements were aligned
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 4:
            # this is a point where there was a gap inserted for row_a
            points_a.append(-1)
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 9:
            # this is a point where there was a gap inserted for row_b
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(-1)
        # step backwards
        # code % 2 is the row step and code % 3 the column step (7 -> 1,1; 4 -> 0,1; 9 -> 1,0)
        tracepoint = (
            tracepoint[0] - int(traceback[tracepoint] % 2),
            tracepoint[1] - int(traceback[tracepoint] % 3))
    points_a = list(reversed(points_a))
    points_b = list(reversed(points_b))
    if len(points_a) != len(points_b):
        # enforce that align_a and align_b are the same length (they should be)
        raise ValueError('should not occur; bug in S-W local alignment?')
    if debug_print:
        print(points_a)
        print(points_b)
        print()
    # Create a nice neat form of this alignment
    # TODO add support for NP-only alignment gaps?
    # first and last column position of each input that take part in the local alignment
    # NOTE(review): if the traceback produced no points (e.g. every score is 0) the
    # range_a[0] / range_b[0] lookups below raise IndexError — confirm callers never hit this
    range_a = [i for i in points_a if i >= 0]
    range_b = [i for i in points_b if i >= 0]
    range_a = (range_a[0], range_a[-1])
    range_b = (range_b[0], range_b[-1])
    # output width = unaligned prefixes of both inputs + aligned region + unaligned suffixes of both inputs
    output = pd.DataFrame(columns=[f'txt{i}' for i in range(
        (range_a[0] + range_b[0]) + len(points_a)
        + max(0, (len(align_a.columns) - range_a[1]) - 1)
        + max(0, (len(align_b.columns) - range_b[1]) - 1)
    )])
    # build the segment from align_a
    # layout: a-prefix, NaN placeholders for b-prefix, aligned region, a-suffix, NaN placeholders for b-suffix
    # NOTE(review): the f'txt{i}' labels assume align_a's columns are named txt0..txtN
    # (true after removeEmptyColumns) — confirm when remove_empty_cols=False
    realign_a = align_a.loc[:, [f'txt{i}' for i in range(range_a[0])]]
    for i in range(range_b[0]):
        realign_a.insert(len(realign_a.columns), f'insx{i}', np.nan, True)
    for i in points_a:
        if i >= 0:
            realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
        else:
            realign_a.insert(len(realign_a.columns), f'ins{len(realign_a.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_a.insert(len(realign_a.columns), f'insx{i+range_b[0]}', np.nan, True)
    # build the segment from align_b
    # layout: NaN placeholders for a-prefix (inserted at the front), b-prefix, aligned region,
    # NaN placeholders for a-suffix, b-suffix
    realign_b = align_b.loc[:, [f'txt{i}' for i in range(range_b[0])]]
    for i in range(range_a[0]):
        realign_b.insert(0, f'insx{i}', np.nan, True)
    for i in points_b:
        if i >= 0:
            realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
        else:
            realign_b.insert(len(realign_b.columns), f'ins{len(realign_b.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_b.insert(len(realign_b.columns), f'insx{i+range_a[0]}', np.nan, True)
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
    # build final output
    # both halves are relabelled positionally so that they stack column-for-column
    realign_a.columns = output.columns
    realign_b.columns = output.columns
    # NOTE(review): DataFrame.append was removed in pandas 2.0 (pd.concat is the replacement) —
    # confirm the pandas version this notebook is pinned to
    output = output.append(realign_a)
    output = output.append(realign_b)
    # NOTE(review): `x is np.nan` is an identity test; it presumably relies on the gap cells
    # still being the np.nan singleton after the append — verify
    return output.applymap(lambda x: ('', '', []) if x is np.nan else x), np.amax(scores, axis=None)

# toy_align, toy_align_score = alignRowMajorLocal(
#     alignment_df.loc[[7298]],
#     alignment_df.loc[[7321]],
#     remove_empty_cols=True)
# print(toy_align_score)
# toy_align

# toy_align, toy_align_score = alignRowMajorLocal(
#     alignment_df_tseg.loc[[7298]],
#     alignment_df_tseg.loc[[7321]],
#     remove_empty_cols=True, use_types=True)
# print(toy_align_score)
# toy_align

In [None]:
# alignment_df_tseg

In [None]:
# Demo: locally align the token-split rows labelled 1021 and 1048 without the phrase-type constraint,
# then print the best score and display the combined alignment
toy_align, toy_align_score = alignRowMajorLocal(
    alignment_df_tseg.loc[[1021]],
    alignment_df_tseg.loc[[1048]],
    remove_empty_cols=True, use_types=False)
print(toy_align_score)
toy_align

In [None]:
# Create the toy data
# Eleven short phrases (default integer index 0..10) used as input for the word-tree cells below
toy_tiny_data = pd.Series(
    ['Asperger syndrome', 
     'high - functioning ASD', 
     'unrecognized and untreated anxiety', 
     'generalized anxiety disorders', 
     'anxiety', 
     'high - functioning autism spectrum disorders and anxiety', 
     'high - functioning ASD and anxiety', 
     'high - functioning ASD', 
     'high - functioning autism spectrum disorders', 
     'previously undetected anxiety', 
     'untreated anxiety']
)
# toy_tiny_data

In [None]:
# Create the toy data alignment
# Wrap every phrase as an alignment cell (text, '', [placeholder tags]) with one 'POS<k>' tag
# per space-separated word (no tags for an empty phrase)
toy_tiny_align = pd.DataFrame(toy_tiny_data.map(lambda x: (x, '', [] if len(x)==0 else [f'POS{e}' for e in range(len(x.split(' ')))])))
# name the single column 'txt0' to follow the txt<i> column convention
toy_tiny_align = toy_tiny_align.rename(columns={0:'txt0'})
# convert the index labels to strings
toy_tiny_align = toy_tiny_align.set_index(toy_tiny_align.index.map(lambda x: str(x)))
# toy_tiny_align

In [None]:
# Column split step 1: Build word tree with node = word units running right->left

# add text to the given trienode (edits the given tree_node)
def wordTreeHelper(tree_node, text, id_data=None, right_align=False):
    """Insert ``text`` word by word into the nested-dict trie ``tree_node``.

    The text is consumed one space-separated word at a time, from the left
    (default) or from the right (``right_align=True``); each word becomes a
    dict key one level deeper. When no text is left the entry is terminated
    with the leaf marker ``{id_data: id_data}``.

    ``tree_node`` is modified in place and also returned.
    """
    remaining = text.strip()
    # nothing left to insert: record the terminal marker for this entry
    if not remaining:
        tree_node[id_data] = id_data
        return tree_node
    words = remaining.split(' ')
    # peel one word off the chosen end; the other words go one level down
    if right_align:
        head, rest = words[-1], words[:-1]
    else:
        head, rest = words[0], words[1:]
    if head not in tree_node:
        tree_node[head] = {}
    tree_node[head] = wordTreeHelper(
        tree_node[head], ' '.join(rest), id_data=id_data, right_align=right_align)
    return tree_node

def wordTree(df, right_align=False):
    """Build a word trie from every entry of ``df``.

    Each index label of ``df`` is looked up with ``df.loc[label]`` and the
    resulting text is inserted with ``wordTreeHelper``, using the label as
    the leaf id. ``right_align`` selects right-to-left insertion.
    """
    root = {}
    for row_id in df.index:
        phrase = df.loc[row_id]
        root = wordTreeHelper(root, phrase, id_data=row_id, right_align=right_align)
    return root

# wordTree's second parameter is `right_align`; pass it by keyword so the tree direction
# is explicit and matches the right_align=False used by the collapse/split cells that follow
st = wordTree(toy_tiny_data, right_align=False)
st

In [None]:
# Column split step 2a: Collapse the suffix trie (merge nodes with only one child)

# edits the input trie
def wordTreeCollapse(tree, right_align=False):
    """Merge single-child chains of the trie into multi-word keys.

    Children are collapsed first (recursively). A child whose subtree then
    holds exactly one entry, and that entry is not a leaf marker, is replaced
    by a single key that joins the two words with a space (``'child
    grandchild'``, or ``'grandchild child'`` when ``right_align`` is set).

    ``tree`` is modified in place and also returned.
    """
    # (key to remove, key to add, subtree to hang under the new key)
    pending = []
    for word in tree:
        # leaf markers map a key onto itself; nothing to collapse there
        if tree[word] == word:
            continue
        tree[word] = wordTreeCollapse(tree[word], right_align=right_align)
        if len(tree[word]) != 1:
            continue
        only_key = list(tree[word])[0]
        only_value = tree[word][only_key]
        if only_value == only_key:
            # the single entry is a leaf marker, keep this level as it is
            continue
        if right_align:
            merged_key = only_key + ' ' + word
        else:
            merged_key = word + ' ' + only_key
        pending.append((word, merged_key, only_value))
    # apply the edits only after the iteration over `tree` has finished
    for word, _, _ in pending:
        tree.pop(word)
    for _, merged_key, subtree in pending:
        tree[merged_key] = subtree
    return tree

# Demo: collapse single-child chains of the toy tree (default right_align=False) and display the result
st = wordTreeCollapse(st)
st

In [None]:
# Column split step 2b: Compress the suffix trie to only two levels of depth.

def wordTreeCompressHelper(tree_node, col_stack=[]):
    """Flatten a subtree into ``{joined path text: {leaf id: leaf id}}``.

    ``col_stack`` holds the keys walked so far. Every leaf marker found is
    filed under the space-joined path that leads to it (the empty string for
    leaves sitting directly in ``tree_node`` at the top call). ``col_stack``
    is never mutated; a new list is passed to each recursive call.
    """
    flattened = {}
    for key, value in tree_node.items():
        if key == value:
            # leaf marker: file it under the text accumulated so far
            flattened.setdefault(' '.join(col_stack), {})[key] = key
            continue
        # inner node: flatten below it and merge the buckets into ours
        nested = wordTreeCompressHelper(value, col_stack + [key])
        for path_text, leaves in nested.items():
            flattened.setdefault(path_text, {}).update(leaves)
    return flattened

# edits the input trie
def wordTreeCompress(tree, right_align=False):
    """Squash everything below each root key into a single level.

    For every non-leaf root, the subtree is flattened with
    ``wordTreeCompressHelper``; leaves that hang directly under the root
    (filed under the empty-string bucket by the helper) are re-attached as
    plain leaf markers. ``tree`` is modified in place and returned.

    ``right_align`` is accepted but not used by this function.
    """
    for root_key in tree:
        subtree = tree[root_key]
        if root_key == subtree:
            # top-level leaf marker, nothing below it
            continue
        # compress the 2nd level onwards into a single level!
        flattened = wordTreeCompressHelper(subtree)
        # remove the empty string artifact from compresshelper
        if '' in flattened:
            flattened.update(flattened[''])
            flattened.pop('')
        tree[root_key] = flattened
    return tree

# Demo: compress the toy tree below each root key and display the result
st = wordTreeCompress(st)
st

In [None]:
# Column split step 3: Output the suffix trie to multiple columns

# Calculate how many output columns we'll need
# Get the depth of the trie (a trie with one terminal node {0:0} has depth 0)
def wordTreeDepth(tree):
    """Return the number of word levels on the longest path of the trie.

    Leaf markers (entries that map a key onto itself) do not add depth, so a
    trie holding only leaf markers, or an empty trie, has depth 0.
    """
    child_depths = [
        1 + wordTreeDepth(subtree)
        for key, subtree in tree.items()
        if subtree != key
    ]
    return max(child_depths, default=0)

def wordTreeSplit(tree, max_depth, output, so_far=[], right_align=False):
    """Write each leaf's path of chunks into ``output`` as a fixed-width list.

    For every leaf id found in ``tree``, ``output[leaf id]`` is set to the
    list of keys on the path to that leaf, padded with empty strings to
    ``max_depth`` entries. With ``right_align`` the path is built in reverse
    (each new key is prepended) and the padding goes in front; otherwise the
    padding is appended. ``so_far`` is never mutated. ``output`` is filled
    in place and returned.
    """
    for key, subtree in tree.items():
        if subtree == key:
            # we have hit a base, put in an entry
            padding = [''] * (max_depth - len(so_far))
            output[key] = (padding + so_far) if right_align else (so_far + padding)
            continue
        # this node has further children!
        path = ([key] + so_far) if right_align else (so_far + [key])
        output = wordTreeSplit(subtree, max_depth, output, path, right_align=right_align)
    return output

# Demo: turn the toy tree into left-aligned, fixed-width chunk lists keyed by entry id
wordTreeSplit(st, wordTreeDepth(st), {}, right_align=False)

### split columns (word trie, unpolished)
[splitCol documentation](https://github.com/cephcyn/alignpaper/blob/master/documentation/multiple_alignment.md#splitCol)

In [None]:
def splitCol(src_alignment, split_col, right_align=False):
    """Split one alignment column into several columns using a word trie.

    The segment texts of ``split_col`` are inserted into a word trie
    (``wordTree``), which is collapsed, compressed and written out as
    fixed-width chunk lists (``wordTreeSplit``). Each chunk becomes a cell
    ``(chunk text, '', token POS slice)`` where the POS slice is taken, left
    to right, from the original cell's token POS list according to the
    number of space-separated words in the chunk.

    Parameters
    ----------
    src_alignment : pandas.DataFrame
        Alignment frame with ``(segment, pos, cpos)`` tuple cells; it is not
        modified.
    split_col : str
        Label of the column to split.
    right_align : bool
        Passed to the trie helpers to build and pad from the right.

    Returns
    -------
    pandas.DataFrame
        Copy of ``src_alignment`` with ``split_col`` replaced by the new
        columns, all columns renamed ``txt0..txtN``.
    """
    tree = wordTree(src_alignment[split_col].map(lambda x: x[0]), right_align=right_align)
    tree = wordTreeCollapse(tree, right_align=right_align)
    # squish that tree into only 2 levels because we only want to generate ONE new column max
    tree = wordTreeCompress(tree)
    tree_depth = wordTreeDepth(tree)
    split_data = wordTreeSplit(tree, tree_depth, {}, right_align=right_align)
    # TODO ... if we actually use this later, use the parse tree to determine merged phrase POS
    pos_info = src_alignment[split_col].map(lambda x: x[2])
    for row_id in split_data:
        # walk the chunks left to right, handing each one its share of the token POS list
        pos_i = 0
        for i in range(len(split_data[row_id])):
            chunk_text = split_data[row_id][i]
            chunk_len = 0 if len(chunk_text)==0 else len(chunk_text.split(' '))
            split_data[row_id][i] = (chunk_text, '', pos_info[row_id][pos_i:pos_i+chunk_len])
            pos_i += chunk_len
    split = pd.DataFrame(columns=[f'{split_col}-{i}' for i in range(tree_depth)])
    for row_id in split_data:
        split.loc[row_id] = split_data[row_id]
    # combine the split results back into the rest of our original input data!
    output = src_alignment.copy()
    for i in range(len(split.columns)):
        # split_col moves one position right after each insert, so the new columns keep their order
        output.insert(output.columns.get_loc(split_col), f'{split_col}-{i}', split[split.columns[i]])
    # use the keyword form: a positional axis argument is rejected by pandas 2.0+
    output = output.drop(columns=split_col)
    output.columns = [f'txt{i}' for i in range(len(output.columns))]
    return output
    
# Demo: split the single column of the toy alignment, building the word tree from the left
splitCol(toy_tiny_align, 'txt0', right_align=False)

In [None]:
# # Read in our manually created start alignment
# align_df = read_alignment_from_sheetstring_file('interactive_input/alignment')

# align_df
# splitCol(align_df, 'txt20', right_align=True)

### merge columns
[mergeCol documentation](https://github.com/cephcyn/alignpaper/blob/master/documentation/multiple_alignment.md#mergeCol)

In [None]:
def mergeCol(src_alignment, merge_col):
    """Merge ``merge_col`` with the column immediately to its right.

    For every row the two cells are combined into one tuple:

    * text: both texts joined with a space, then stripped
    * phrase POS: the first whitespace-separated tag of the two phrase POS
      strings joined together (``''`` when both are empty)
    * token POS: the two token POS lists concatenated

    The merged cell is stored under ``merge_col``, the right-hand column is
    removed and all columns are renamed ``txt0..txtN``. The input frame is
    not modified; a new frame is returned.
    """
    column_names = list(src_alignment.columns)
    neighbour = src_alignment.columns[column_names.index(merge_col) + 1]

    merged_cells = []
    for left, right in zip(src_alignment[merge_col], src_alignment[neighbour]):
        # combine the segment (text)
        text = (left[0] + ' ' + right[0]).strip()
        # combine the pos (phrase pos); only the first tag is kept
        # TODO ... if we actually use this later, use the parse tree to determine merged phrase POS
        phrase_tags = (left[1] + ' ' + right[1]).strip().split()
        phrase_pos = '' if len(phrase_tags) == 0 else phrase_tags[0]
        # combine the cpos (token / word pos)
        merged_cells.append((text, phrase_pos, left[2] + right[2]))

    # put the result column into our result
    result = src_alignment.copy()
    result[merge_col] = merged_cells
    del result[neighbour]
    result.columns = [f'txt{i}' for i in range(len(result.columns))]
    return result

# Demo: merge column 'txt0' of the toy alignment with the column to its right
mergeCol(toy_align, 'txt0')

In [None]:
def canShiftCells(src_alignment, shift_rows, shift_col, shift_distance, shift_size):
    """Report whether a horizontal run of cells can be shifted inside the alignment.

    Parameters:
        src_alignment: alignment DataFrame whose cells are tuples with the
            segment text at index 0.
        shift_rows: index labels of the rows to shift (duplicates are ignored).
        shift_col: label of the left-most column of the run being shifted.
        shift_distance: number of columns to move; positive is to the right,
            negative to the left, 0 is a no-op.
        shift_size: number of consecutive cells in the run.

    Returns:
        True only if the rows and column exist, the run and its destination
        lie inside the table, every cell of the run holds non-blank text, and
        every cell the run would newly cover is blank.
    """
    # remove duplicates
    shift_rows = list(set(shift_rows))
    # check that the selected segment starting point(s) exist
    if not all([(e in src_alignment.index) for e in shift_rows]):
        return False
    if shift_col not in src_alignment.columns:
        return False
    # get the index numbers we are working with
    num_columns = len(src_alignment.columns)
    colindex_start = list(src_alignment.columns).index(shift_col)
    colindex_end = colindex_start + shift_size  # exclusive
    # check that the entire selected segment is contained within the alignment
    if colindex_end > num_columns:
        return False
    # check that the first and the last cell of the run both land inside the alignment
    for colindex_edge in (colindex_start, colindex_end - 1):
        if not (0 <= colindex_edge + shift_distance < num_columns):
            return False
    # check that the alignment segment(s) is entirely text and does not contain whitespace
    for shift_row in shift_rows:
        # .iloc makes the positional slice explicit; the row Series is indexed by column label
        selected_cells = src_alignment.loc[shift_row].iloc[colindex_start:colindex_end]
        if any([len(cell[0].strip()) == 0 for cell in selected_cells]):
            return False
    # if the shift distance is 0, it always works (although it's a very useless shift)
    if shift_distance == 0:
        return True
    # column positions the run would newly cover; they are the same for every row
    if shift_distance > 0:
        landing_cols = range(colindex_end, min(num_columns, colindex_end + shift_distance))
    else:
        landing_cols = range(max(0, colindex_start + shift_distance), colindex_start)
    # figure out if the shift collides with any other text for each of the rows we want to shift
    for shift_row in shift_rows:
        row_cells = src_alignment.loc[shift_row]
        # positional access must go through .iloc: plain row_cells[i] is a label lookup
        # on a label-indexed Series and its positional fallback is deprecated
        if not all([row_cells.iloc[i][0].strip() == '' for i in landing_cols]):
            return False
    return True

# canShiftCells(toy_align, [7321], 'txt1', 1, 3)
# canShiftCells(toy_align, [7321], 'txt1', 2, 3)
# canShiftCells(toy_align, [7321], 'txt1', 3, 3)
# canShiftCells(toy_align, [7321], 'txt1', 4, 3)
# canShiftCells(toy_align, [7321], 'txt1', 5, 3)
# canShiftCells(toy_align, [7321], 'txt1', 6, 3)

### shift cells
[shiftCell documentation](https://github.com/cephcyn/alignpaper/blob/master/documentation/multiple_alignment.md#shiftCell)

In [None]:
# If it is impossible to shift the cells as specified, throws a ValueError
def shiftCells(src_alignment, shift_rows, shift_col, shift_distance, shift_size=1, emptycell=('','',[]), debug_print=False):
    """Shift a horizontal run of cells left or right within the given rows.

    Parameters:
        src_alignment: alignment DataFrame; it is copied, not modified.
        shift_rows: index labels of the rows to shift. A label listed more
            than once is shifted only once.
        shift_col: label of the left-most column of the run.
        shift_distance: columns to move (positive = right, negative = left).
        shift_size: number of consecutive cells in the run.
        emptycell: value written into the cells the run leaves behind.
        debug_print: if True, print the requested operation first.

    Returns:
        A copy of `src_alignment` with the cells moved. Columns that end up
        empty are not removed.

    Raises:
        ValueError: if canShiftCells rejects the requested shift.
    """
    if debug_print:
        print(f'shift rows {shift_rows}, {shift_size} cells starting from {shift_col}, {shift_distance} cells over')
    # check if it's possible to shift
    if not canShiftCells(src_alignment, shift_rows, shift_col, shift_distance, shift_size):
        raise ValueError('impossible to shift with given parameters: ' 
                         + f'(shift row {shift_rows}, {shift_size} cells starting from {shift_col}, {shift_distance} cells over)')
    # initialize the alignment table copy we'll be working with
    result = src_alignment.copy()
    # get the index numbers we are working with
    colindex_start = list(result.columns).index(shift_col)
    # shift each row once only: a second pass over the same row would pick up the
    # freshly emptied cells and overwrite the moved text with them
    for shift_row in dict.fromkeys(shift_rows):
        rowindex = result.index.get_loc(shift_row)
        # grab the old contents
        clipboard = [result.iat[rowindex, colindex_start + offset] for offset in range(shift_size)]
        # replace old contents with empty tuples
        # (.iat writes straight into `result`; chained `result.loc[row][i] = ...` may only
        # modify a temporary row copy and never does anything under Copy-on-Write)
        for offset in range(shift_size):
            result.iat[rowindex, colindex_start + offset] = emptycell
        # put old content in its destination location
        for offset, cell in enumerate(clipboard):
            result.iat[rowindex, colindex_start + shift_distance + offset] = cell
    return result # removeEmptyColumns(result)

# shiftCells(toy_align, [7321], 'txt1', 1, shift_size=3)
# shiftCells(toy_align, [7298], 'txt2', -1, shift_size=2)

In [None]:
# for i in [7298, 7321, 5126, 5134, 4594, 4618, 6507, 6474, 7308, 5130, 2552]:
#     alignment_df.loc[[i]]

In [None]:
# # Alignment in manually selected "nice" order without types enforced
# temp_align = []
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[5126]], alignment_df.loc[[5134]], 
#                                      remove_empty_cols=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[7298]], alignment_df.loc[[7321]], 
#                                      remove_empty_cols=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[4594]], alignment_df.loc[[4618]], 
#                                      remove_empty_cols=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[5130]], alignment_df.loc[[2552]], 
#                                      remove_empty_cols=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[6474]], alignment_df.loc[[7308]], 
#                                      remove_empty_cols=True)[0])
# update_temp_align = []
# update_temp_align.append(alignRowMajorLocal(temp_align[2], temp_align[3])[0])
# update_temp_align.append(alignRowMajorLocal(temp_align[1], temp_align[4])[0])
# update_temp_align.append(alignRowMajorLocal(temp_align[0], alignment_df.loc[[6507]], 
#                                             remove_empty_cols=True)[0])
# temp_align = update_temp_align
# update_temp_align = []
# update_temp_align.append(alignRowMajorLocal(temp_align[0], temp_align[2])[0])
# manually_aligned_group, manually_aligned_group_score = alignRowMajorLocal(update_temp_align[0], temp_align[1])
# print(manually_aligned_group_score)
# extractTup(manually_aligned_group, tup_i='segment').sort_index()

In [None]:
# # Demonstrate a merge operation
# manually_aligned_group_merge = mergeCol(manually_aligned_group, 'txt4')
# manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt4')
# manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt1')
# manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt4')
# manually_aligned_group_merge.sort_index()

In [None]:
# # Demonstrate a split operation
# manually_aligned_group_split = splitCol(manually_aligned_group_merge, 'txt0', right_align=True)
# manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt4', right_align=False)
# manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt5', right_align=True)
# manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt7', right_align=False)
# manually_aligned_group_split.sort_index()

In [None]:
# #          [5130, 5126, 5134, 4618, 6507, 4594, 7321, 7298, 2552, 6474, 7308] # experiment 1, topic "autism"
# #          [2380, 2711, 2437, 6915, 4887, 2078, 2030, 2849, 3194, 3285, 5437] # experiment 2, topic "cancer"
# #          [1248, 1275, 1381, 1387, 3871, 4039, 5202, 5204, 6563, 6569]       # experiment 3, topic "diabetes", untuned order
# id_order = [1248, 1275, 1381, 1387, 3871, 4039, 5202, 5204, 6563, 6569]

# alignment_wordsonly, alignment_score = alignRowMajorLocal(
#     alignment_df_tseg.loc[[id_order[0]]],
#     alignment_df_tseg.loc[[id_order[1]]],
#     remove_empty_cols=True, 
#     use_types=True
# )
# for i in range(2, len(id_order)):
#     alignment_wordsonly, alignment_score = alignRowMajorLocal(
#         alignment_wordsonly,
#         alignment_df_tseg.loc[[id_order[i]]],
#         remove_empty_cols=True, 
#         use_types=True
#     )
# extractTup(alignment_wordsonly, tup_i='segment').sort_index()
# alignment_wordsonly.sort_index()

### Parse alignment DF into copy-pastable spreadsheet format
(This relies on the pandas DataFrame preview that Jupyter Notebook renders for a cell's output.)

In [None]:
# note that this will output double quotes in POS rows oddly - need to manually merge them back due to a quirk in how pandas formats data
# TODO have this export directly to file if possible?
def spreadsheetFormat(alignment_df):
    """Lay an alignment out as a flat table that can be pasted into a spreadsheet.

    Rows are processed in sorted index order. Each alignment row produces two
    output rows sharing the same id and full text: an 's-pos' row with the
    quoted token POS tags of every cell, and an 's-txt' row with the cell texts.
    """
    sorted_df = alignment_df.sort_index()
    token_pos = extractTup(sorted_df, tup_i='cpos') # the token part of speech
    segments = extractTup(sorted_df, tup_i='segment')
    phrase_pos = extractTup(sorted_df, tup_i='pos') # the phrase part of speech (only used by the disabled rows below)
    header = ['id', 'fulltext', 'datatype', 'empty']
    header.extend(str(n) for n in range(len(sorted_df.columns)))
    width = len(header)

    def padded(cells):
        # right-pad a record with empty strings up to the header width
        return cells + [''] * (width - len(cells))

    records = []
    for row_id in sorted_df.index:
        row_segments = segments.loc[row_id].tolist()
        fulltext = ' '.join(seg for seg in row_segments if len(seg.strip()) > 0)
        # disabled: phrase-POS ('s-ppos') row
        # ppos_cells = [p for p in phrase_pos.loc[row_id].tolist() if len(p.strip()) > 0]
        # records.append(padded([row_id, fulltext, 's-ppos', ''] + ppos_cells))
        quoted_pos = []
        for tags in token_pos.loc[row_id].tolist():
            if len(tags) > 0:
                quoted_pos.append([f'\'{tag}\'' for tag in tags])
            else:
                quoted_pos.append('')
        records.append(padded([row_id, fulltext, 's-pos', ''] + quoted_pos))
        records.append(padded([row_id, fulltext, 's-txt', ''] + row_segments))
    return pd.DataFrame(records, columns=header)

# spreadsheetFormat(alignment_wordsonly)

# Alignment scoring

In [None]:
import os.path
import pickle

# Build two reference alignments (one decent, one poor) once and cache them on disk;
# later runs just load the cached result.
reference_cache_path = 'temp/align_temp_qualitycompare.pkl'

if not os.path.isfile(reference_cache_path):
    # create some reference alignments
    reference_alignment = []
    reference_alignment_orderings = []
    reference_alignment_scores = []
    temp_ids_list = [
    #     [5130, 5126, 6507, 6474, 7308, 5134, 2552, 4618, 7298, 4594, 7321], # decent
        [7321, 5134, 4594, 6507, 2552, 5130, 7298, 7308, 4618, 6474, 5126], # decent
    #     [5134, 7298, 4618, 6507, 7321, 5126, 6474, 5130, 4594, 2552, 7308], # bad
    #     [5126, 5134, 4618, 7298, 6507, 5130, 6474, 4594, 7308, 2552, 7321], # quite bad (chaotic)
        [5126, 7321, 7298, 5130, 6474, 4618, 4594, 7308, 2552, 6507, 5134]  # quite bad (sharply split)
    ]
    for temp_ids in temp_ids_list:
        # align the first two rows, then fold the remaining rows in one at a time
        alignment, alignment_score = alignRowMajorLocal(
            alignment_df.loc[[temp_ids[0]]], 
            alignment_df.loc[[temp_ids[1]]], 
            remove_empty_cols=True
        )
        temp_scores = [0, alignment_score]
        for next_id in temp_ids[2:]:
            alignment, alignment_score = alignRowMajorLocal(
                alignment,
                alignment_df.loc[[next_id]], 
                remove_empty_cols=True
            )
            temp_scores.append(alignment_score)
        reference_alignment.append(alignment)
        reference_alignment_orderings.append(temp_ids)
        reference_alignment_scores.append(temp_scores)
    # dump to a file (create the cache directory on a fresh checkout; the context
    # manager guarantees the file is flushed and closed before it is read back)
    os.makedirs(os.path.dirname(reference_cache_path), exist_ok=True)
    with open(reference_cache_path, 'wb') as cache_file:
        pickle.dump(
            (reference_alignment,reference_alignment_orderings,reference_alignment_scores), 
            cache_file
        )
# retrieve the alignments from file
# NOTE: pickle.load executes arbitrary code from the file; only load a cache this notebook wrote
with open(reference_cache_path, 'rb') as cache_file:
    temp = pickle.load(cache_file)
reference_alignment,reference_alignment_orderings,reference_alignment_scores = temp

# Here's an alignment that's pretty bad:
toy_alignment_poor = reference_alignment[1]
extractTup(toy_alignment_poor, tup_i='segment').sort_index()
# Here's an alignment that's okay:
toy_alignment_good = reference_alignment[0]
extractTup(toy_alignment_good, tup_i='segment').sort_index()

### total col count

In [None]:
def scoreNumColumns(align_df):
    """Return the total number of columns in the alignment, empty ones included."""
    column_labels = align_df.columns
    return len(column_labels)

# # Lower is better
# print(' poor', scoreNumColumns(toy_alignment_poor))
# print(' good', scoreNumColumns(toy_alignment_good))

### content col count

In [None]:
def scoreNumFilledColumns(align_df):
    """Count the columns that hold non-blank segment text in at least one row.

    Cells are tuples with the segment text at index 0; a column whose cells
    are all empty or whitespace-only does not count.
    """
    filled_columns = 0
    for colname in align_df.columns:
        text_cells = [cell[0] for cell in align_df[colname] if cell[0].strip() != '']
        if len(text_cells) > 0:
            filled_columns += 1
    return filled_columns

# # Lower is better
# print(' poor', scoreNumFilledColumns(toy_alignment_poor))
# print(' good', scoreNumFilledColumns(toy_alignment_good))

### col embed variation

In [None]:
import math

def scoreColumnPhraseEmbedVariance(align_df, colname, embed_model):
    """Measure how much the phrase embeddings of one column's distinct texts vary.

    Parameters:
        align_df: alignment DataFrame.
        colname: label of the column to score.
        embed_model: embedding model handed through to get_phrase_embed.

    Returns:
        The trace of the covariance matrix of the phrase embeddings (lower
        means more similar texts), or 0 when fewer than two of the distinct
        texts could be embedded.
    """
    # Compute embeddings variance of all the phrases for a single column
    # take the set of row texts
    unique_texts = list(set(extractTup(align_df[colname], tup_i='segment', is_frame=False)))
    # build up the list of text embeds for the texts that we *can* compute an embed for
    text_embeds = []
    for phrase in unique_texts:
        try:
            phrase_embed = get_phrase_embed(embed_model, phrase)
        except Exception:
            # best effort: a phrase that cannot be embedded is left out of the score
            continue
        # get_phrase_embed hands back an empty frame (no 'word' column) for text it cannot embed
        if 'word' not in phrase_embed.columns:
            continue
        # drop the label column by keyword: the positional form drop('word', 1) raises
        # TypeError on pandas >= 2.0, which the old bare `except` swallowed so that
        # every column silently scored 0
        text_embeds.append(phrase_embed.drop(columns='word'))
    if len(text_embeds) > 1:
        output = pd.concat(text_embeds)
        # Reasoning for this operation (calculating variance as trace(covariance matrix) ...):
        # https://stats.stackexchange.com/questions/225434/a-measure-of-variance-from-the-covariance-matrix
        # This is equivalent to calculating the expected Euclidean distance of each element from the mean
        result = np.trace(output.cov())
    else:
        # one of two scenarios:
        # 1. all of the contents of this column aren't considered words, so, pretend they're all the same
        # 2. there is only one row in this column that contains text, so it has no variation
        # TODO is there a theoretically better way to handle them?
        result = 0
    return result

# # Lower is better
# print(' poor', scoreColumnPhraseEmbedVariance(toy_alignment_poor, 'txt0', word2vec))
# print(' good', scoreColumnPhraseEmbedVariance(toy_alignment_good, 'txt0', word2vec))

In [None]:
def scoreColumnTextCount(align_df, colname):
    """Count the distinct non-blank cell texts of one column.

    Texts are compared after collapsing every run of whitespace to a single
    space, so cells that differ only in spacing count once.
    """
    distinct_texts = set()
    for text in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        if text.strip() == '':
            continue
        # split/join normalizes the whitespace
        distinct_texts.add(' '.join(text.split()))
    return len(distinct_texts)

# # Lower is better
# print(' poor', scoreColumnTextCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnTextCount(toy_alignment_good, 'txt0'))

In [None]:
def variationCount(tokens):
    """Count the distinct items that are missing from at least one non-empty row.

    `tokens` is a list of rows, each row a list of items. Empty rows are
    ignored; an item present in every remaining row does not count.
    """
    # remove empty rows entirely
    rows = [row for row in tokens if len(row) != 0]
    total_rows = len(rows)
    # tally, per distinct item, the number of rows it occurs in
    rows_containing = {}
    for row in rows:
        for item in set(row):
            rows_containing[item] = rows_containing.get(item, 0) + 1
    # keep only the items that are not shared by all rows
    varying_items = 0
    for occurrences in rows_containing.values():
        if occurrences != total_rows:
            varying_items += 1
    return varying_items

variationCount(
    [
        ['children'], 
        ['Fifty', 'children'], 
        ['children'], 
        ['45', 'children'], 
#         ['patients'], 
#         ['573', 'patients'], 
#         ['primary', 'care', 'patients'], 
    ]
)

### col token count

In [None]:
def scoreColumnTokenCount(align_df, colname):
    """Count the distinct whitespace-separated tokens across all cells of one column."""
    # TODO normalize this
    distinct_tokens = set()
    for text in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        # blank cells contribute nothing
        if text.strip() != '':
            distinct_tokens.update(text.split())
    return len(distinct_tokens)
#     # the implementation for variation count...
#     # Count the number of unique tokens in a single column that AREN'T IN ALL OF THE ROWS
#     # capture each cell text
#     tokens = [text for text in extractTup(align_df[colname], tup_i='segment', is_frame=False)
#               if text.strip() != '']
#     # split it into tokens
#     tokens = [text.split() for text in tokens]
#     return variationCount(tokens)

# # Lower is better
# print(' poor', scoreColumnTokenCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnTokenCount(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnTokenEntityCount(align_df, colname, scisp, scisp_linker):
    """Count the distinct linked entities and distinct entity types in one column.

    Parameters:
        align_df: alignment DataFrame.
        colname: label of the column to score.
        scisp: scispacy model (default should be scisp = spacy.load('en_core_sci_sm')).
        scisp_linker: scispacy model linker.

    Returns:
        A pair (number of distinct entity ids, number of distinct TUIs).

    Raises:
        ValueError: if a linked entity has no type at all.
    """
    cell_texts = [
        text for text in extractTup(align_df[colname], tup_i='segment', is_frame=False)
        if text.strip() != ''
    ]
    # process the texts through spacy and pick out entities
    row_entities = [list(scisp(text).ents) for text in cell_texts]
    # get the UMLS mapping for each entity: the id of its first linked candidate
    row_ids = []
    for entities in row_entities:
        ids = []
        for ent in entities:
            if (len(ent) > 0) and (len(ent._.umls_ents) > 0):
                ids.append(ent._.umls_ents[0][0])
        row_ids.append(ids)
    # get the TUI for each of these UMLS mappings
    # An informal guide to all of the TUIs: https://gist.github.com/joelkuiper/4869d148333f279c2b2e
    row_tui_groups = [
        [scisp_linker.umls.cui_to_entity[entity_id].types for entity_id in ids]
        for ids in row_ids
    ]
    # check for edge case where there's 0 UMLS tuis for something?
    for tui_groups in row_tui_groups:
        if any([len(group) < 1 for group in tui_groups]):
            raise ValueError('<1 tui for a UMLS entity')
    # TODO implement larger groupings of types / more general type groups...
    # https://semanticnetwork.nlm.nih.gov/download/SemGroups.txt
    # from https://semanticnetwork.nlm.nih.gov/
    distinct_ids = set()
    for ids in row_ids:
        distinct_ids.update(ids)
    distinct_tuis = set()
    for tui_groups in row_tui_groups:
        for group in tui_groups:
            distinct_tuis.update(group)
    return len(distinct_ids), len(distinct_tuis)

# # Lower is better
# print(' poor', scoreColumnTokenEntityCount(toy_alignment_poor, 'txt0', scisp=scisp, scisp_linker=linker))
# print(' good', scoreColumnTokenEntityCount(toy_alignment_good, 'txt0', scisp=scisp, scisp_linker=linker))

### col token entity variation count

In [None]:
def scoreColumnTokenEntityVariationCount(align_df, colname, scisp_model=None, scisp_linker=None):
    """Count the entities and entity types of one column that are not shared by every row.

    Parameters:
        align_df: alignment DataFrame.
        colname: label of the column to score.
        scisp_model: scispacy model used to find entities; when omitted, the
            notebook-level `scisp` model is used (the previous behaviour).
        scisp_linker: scispacy entity linker; when omitted, the notebook-level
            `linker` is used (the previous behaviour).

    Returns:
        A pair (variationCount over entity ids, variationCount over TUIs).

    Raises:
        ValueError: if a linked entity has no type at all.
    """
    # fall back to the notebook globals so existing two-argument calls keep working
    if scisp_model is None:
        scisp_model = scisp
    if scisp_linker is None:
        scisp_linker = linker
    # Count the number of unique entity types in a single column
    # while accounting for varying entity frequencies across rows
    types = [text for text in extractTup(align_df[colname], tup_i='segment', is_frame=False)
             if text.strip() != '']
    # process the texts through spacy and pick out entities
    types = [list(scisp_model(text).ents) for text in types]
    # get the UMLS mapping for each entity: the id of its first linked candidate
    types = [[ent._.umls_ents[0][0] for ent in sl if (len(ent)>0) and (len(ent._.umls_ents)>0)] for sl in types]
    # get the TUI for each of these UMLS mappings
    # An informal guide to all of the TUIs: https://gist.github.com/joelkuiper/4869d148333f279c2b2e
    types_tui = [[scisp_linker.umls.cui_to_entity[ent].types for ent in sl] for sl in types]
    # check for edge case where there's 0 UMLS tuis for something?
    for sl in types_tui:
        if any([len(e)<1 for e in sl]):
            raise ValueError('<1 tui for a UMLS entity')
    # flatten the per-entity TUI groups into one TUI list per row
    types_tui = [[e for sl in row for e in sl] for row in types_tui]
    # TODO implement larger groupings of types / more general type groups...
    # https://semanticnetwork.nlm.nih.gov/download/SemGroups.txt
    # from https://semanticnetwork.nlm.nih.gov/
    return variationCount(types), variationCount(types_tui)

# # # Lower is better
# print(' poor', scoreColumnTokenEntityVariationCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnTokenEntityVariationCount(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnPhrasePOSCount(align_df, colname):
    """Count the distinct non-blank phrase-level POS values found in one column."""
    distinct_phrase_pos = set()
    for phrasepos in extractTup(align_df[colname], tup_i='pos', is_frame=False):
        if phrasepos.strip() != '':
            distinct_phrase_pos.add(phrasepos)
    return len(distinct_phrase_pos)

# # Lower is better
# print(' poor', scoreColumnPhrasePOSCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnPhrasePOSCount(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnPOSCount(align_df, colname):
    """Count the distinct non-blank token-level POS tags found in one column."""
    per_row_pos = list(extractTup(align_df[colname], tup_i='cpos', is_frame=False))
    distinct_pos = set()
    for pos_list in per_row_pos:
        for pos in pos_list:
            if pos.strip() != '':
                distinct_pos.add(pos)
    return len(distinct_pos)

# # Lower is better
# print(' poor', scoreColumnPOSCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnPOSCount(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnPOSVariationCount(align_df, colname):
    """Count the token-level POS tags of one column that are not shared by every row.

    The per-row tag lists are handed to variationCount, which ignores empty rows.
    """
    per_row_pos = list(extractTup(align_df[colname], tup_i='cpos', is_frame=False))
    return variationCount(per_row_pos)

# # Lower is better
# print(' poor', scoreColumnPOSVariationCount(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnPOSVariationCount(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnRepresentation(align_df, colname):
    """Return the fraction of rows that have non-blank text in this column.

    Gaps lower the score; a column filled in every row scores 1.0.
    """
    cell_texts = list(extractTup(align_df[colname], tup_i='segment', is_frame=False))
    filled_rows = 0
    for text in cell_texts:
        if text.strip() != '':
            filled_rows += 1
    return filled_rows / len(cell_texts)

# # Higher is better
# print(' poor', scoreColumnRepresentation(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnRepresentation(toy_alignment_good, 'txt0'))

In [None]:
def scoreColumnTotalTokens(align_df, colname):
    """Count all space-separated words in one column, repeats included."""
    total_words = 0
    for text in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        # splitting on a single space leaves '' pieces for blank cells and doubled spaces; skip them
        total_words += len([piece for piece in text.split(' ') if piece != ''])
    return total_words

# # Higher is better
# print(' poor', scoreColumnTotalTokens(toy_alignment_poor, 'txt0'))
# print(' good', scoreColumnTotalTokens(toy_alignment_good, 'txt0'))

In [None]:
def scoreRowAlignment(align_df, focus_row):
    """Score how well `focus_row` aligns against `align_df`, normalized by size.

    The raw score is the second item returned by alignRowMajorLocal; it is
    divided by the combined column count of the two inputs.
    """
    # TODO there should be a way to re-derive this based on the direct alignment?
    alignment_result = alignRowMajorLocal(align_df, focus_row, remove_empty_cols=True)
    raw_score = alignment_result[1]
    combined_width = len(align_df.columns) + len(focus_row.columns)
    return raw_score / combined_width

# # Higher is better
# print(' poor', scoreRowAlignment(toy_alignment_poor, toy_alignment_poor.loc[[5130]]))
# print(' good', scoreRowAlignment(toy_alignment_good, toy_alignment_good.loc[[5130]]))

### term column count

In [None]:
def scoreTermColumnCount(align_df, term):
    """Count the columns in which `term` occurs, with a floor of 1.

    A column counts when any of its cell texts contains the term as a
    case-insensitive substring. If the term appears nowhere the result is
    still 1.
    """
    # TODO should this be a fraction instead? what would that imply?
    # TODO the floor of 1 for an absent term might not be ideal?
    # TODO add support for regex patterns (eg numbers?)
    column_texts = [
        list(extractTup(align_df[colname], tup_i='segment', is_frame=False))
        for colname in align_df.columns
    ]
    matching_columns = 0
    for texts in column_texts:
        hits = [text for text in texts if (term.lower() in text.lower())]
        if len(hits) != 0:
            matching_columns += 1
    return max(1, matching_columns)

def scoreTermListColumnCount(align_df, term_list, term_weights=None):
    """Return the weighted average of scoreTermColumnCount over several terms.

    Parameters:
        align_df: alignment DataFrame.
        term_list: terms to look up; an empty list yields the default of 1.
        term_weights: one weight per term (equal weights when omitted). They
            are normalized here to sum to 1.
    """
    # if we don't have any terms to investigate, return 1 (default col count)
    if len(term_list) == 0:
        return 1
    # by default, weight each term equally
    raw_weights = [1] * len(term_list) if term_weights is None else term_weights
    # And normalize the weights (assume that hasn't been done already)
    weight_total = sum(raw_weights)
    normalized_weights = [(weight / weight_total) for weight in raw_weights]
    column_counts = [scoreTermColumnCount(align_df, term) for term in term_list]
    return np.dot(column_counts, normalized_weights)

# # Lower is better
# print(' poor', scoreTermColumnCount(toy_alignment_poor, 'anxiety'))
# print(' good', scoreTermColumnCount(toy_alignment_good, 'anxiety'))
# print()

# # Test of using scoreTermListColumnCount with multiple terms (weighted equally)
# # Lower is better
# temp_list = ['anxiety', 'patient', 'children', 'child']
# scores = scoreTermListColumnCount(toy_alignment_poor, temp_list)
# print('poor', scores)
# scores = scoreTermListColumnCount(toy_alignment_good, temp_list)
# print('good', scores)

In [None]:
def scoreRowLayoutCount(align_df):
    """Count the distinct filled/gap patterns that occur among the alignment's rows.

    Each row is reduced to a string with '.' for a cell holding text and ' '
    for a blank cell; the number of different strings is returned.
    """
    layouts = set()
    for rownum in range(len(align_df)):
        row_texts = list(extractTup(align_df.iloc[rownum], tup_i='segment', is_frame=False))
        layouts.add(''.join(('.' if (text.strip() != '') else ' ') for text in row_texts))
    return len(layouts)

# # Lower is better
# print(' poor', scoreRowLayoutCount(toy_alignment_poor))
# print(' good', scoreRowLayoutCount(toy_alignment_good))

In [None]:
def alignmentTermWeights(align_df, sp, all_stopwords=None, priority_pos=['NN', 'NNS', 'NNP', 'JJ', 'RB']):
    """Derive a weight for every recurring, non-stopword token of an alignment.

    Parameters:
        align_df: alignment DataFrame.
        sp: spacy model (default should be sp = spacy.load('en_core_web_sm'));
            only consulted for its stop-word list when `all_stopwords` is None.
        all_stopwords: collection of words to ignore. The membership test is
            case-sensitive.
        priority_pos: POS tags whose words get their weight squared.

    Returns:
        dict mapping each word that occurs in more than one row to the number
        of rows containing it, squared when the word's most frequent POS tag
        matches one of `priority_pos`.
    """
    # input argument sp: spacy model (default should be sp = spacy.load('en_core_web_sm'))
    if all_stopwords is None:
        all_stopwords = sp.Defaults.stop_words
    # collect list forms of words and cPOS
    # (at this point both are nested: one inner list per alignment row)
    all_text = [
        [text for text in extractTup(align_df.iloc[rownum], tup_i='segment', is_frame=False)]
        for rownum in range(len(align_df))
    ]
    # join each row's cell texts and re-split on whitespace to get one token list per row
    all_text = [' '.join(sublist).split() for sublist in all_text]
    all_cpos = [
        [text for sublist
         in extractTup(align_df.iloc[rownum], tup_i='cpos', is_frame=False)
         for text in sublist]
        for rownum in range(len(align_df))
    ]
    # get count of how many rows each word is present in
    tokens_df = dict([
        (word, sum([(word in row) for row in all_text]))
        for word
        in set([item for sublist in all_text for item in sublist])
        if word not in all_stopwords
    ])
    # remove the words that show up in at most one row
    # (iterate over a separate list of keys so the dict can be changed while looping)
    for word in [word for word in tokens_df if tokens_df[word] <= 1]:
        discard = tokens_df.pop(word, None)
    # flatten the word and cPOS lists
    all_text = [e for sublist in all_text for e in sublist]
    all_cpos = [e for sublist in all_cpos for e in sublist if e != '']
    # count up how many POS is assigned to each word
    # NOTE(review): this pairs all_text[i] with all_cpos[i], so it assumes exactly one non-empty
    # POS tag per whitespace token, in the same order; fewer tags than tokens raises IndexError
    pos_mapping = {}
    for i in range(len(all_text)):
        if all_text[i] not in pos_mapping:
            pos_mapping[all_text[i]] = {}
        if all_cpos[i] not in pos_mapping[all_text[i]]:
            pos_mapping[all_text[i]][all_cpos[i]] = 0
        pos_mapping[all_text[i]][all_cpos[i]] += 1
    # pick the single POS that each word is tagged as most often
    # (each word's {pos: count} dict is replaced by the winning POS string; ties keep the first seen)
    for word in pos_mapping:
        max_pos = None
        max_count = 0
        for pos in pos_mapping[word]:
            if pos_mapping[word][pos] > max_count:
                max_pos = pos
                max_count = pos_mapping[word][pos]
        pos_mapping[word] = max_pos
    # exponentiate the count of all of the words in the dict that are in POS classes we care about
    # (pos_mapping[word] is a string here, so `pos in ...` is a substring test: 'NN' also matches e.g. 'NNPS')
    for word in tokens_df:
        if any([(pos in pos_mapping[word]) for pos in priority_pos]):
            tokens_df[word] = pow(tokens_df[word], 2)
    return tokens_df

In [None]:
# # test how much sense this weighting works for scoreTermListColumnCount
# tokens_df = alignmentTermWeights(toy_alignment_bad, sp=sp)
# temp_list = list(tokens_df)
# temp_weights = list(tokens_df.values())
# scores = scoreTermListColumnCount(toy_alignment_poor, temp_list, term_weights=temp_weights)
# print('poor', scores)
# scores = scoreTermListColumnCount(toy_alignment_good, temp_list, term_weights=temp_weights)
# print('good', scores)

### alignment score function

In [None]:
import traceback

# TODO how do we design this score function that it may be comparable with other alignments?
def scoreAlignment(align_df, spacy_model, scispacy_model, scispacy_linker, embed_model, max_row_length=None, term_weight_func=None, weight_components=None):
    """Score an alignment as one number plus its components (higher is better).

    Parameters:
        align_df: alignment DataFrame to score.
        spacy_model: spacy model, used by the default term weighting.
        scispacy_model: scispacy model used for entity detection.
        scispacy_linker: scispacy entity linker.
        embed_model: word embedding model for the phrase-embedding variance.
        max_row_length: largest number of non-empty cells in any row; computed
            from `align_df` (with a printed notice) when omitted.
        term_weight_func: optional replacement for alignmentTermWeights.
            NOTE(review): its expected signature is not defined anywhere
            visible; this assumes it takes the alignment and returns a
            {term: weight} mapping, so confirm against intended callers.
        weight_components: weights of the six score components; defaults to
            [0.2, 0.2, 1, 0, 0, 0].

    Returns:
        (singlescore, components, rawscores): the biased weighted sum, the
        array of six negated component scores, and a DataFrame of per-column
        raw scores for debugging.
    """
    # set default score weights...
    if weight_components is None:
        weight_components = np.array([0.2, 0.2, 1, 0, 0, 0])

    # ideally, only calculate the max row length once for each optimization search, but we can do that per-alignment if it's not provided
    if max_row_length is None:
        print('scoreAlignment: prefer having max_row_length input')
        traceback.print_stack(limit=5)
        max_row_length = max([len([e[0] for e in align_df.loc[i] if len(e[0])!=0]) for i in align_df.index])

    # get term weights
    # (previously a custom term_weight_func left term_list / weight_terms unbound
    # and the function crashed with UnboundLocalError)
    if term_weight_func is None:
        alignment_terms = alignmentTermWeights(align_df, sp=spacy_model)
    else:
        alignment_terms = term_weight_func(align_df)
    term_list = list(alignment_terms)
    weight_terms = list(alignment_terms.values())
    weight_terms_total = sum(weight_terms)
    weight_terms = [r/weight_terms_total for r in weight_terms] # normalize the weights

    # get column weights
    score_colalltokens = [scoreColumnRepresentation(align_df, colname) for colname in align_df.columns]
#     score_colalltokens = [scoreColumnTotalTokens(align_df, colname) for colname in align_df.columns]
#     score_colalltokens = [1 for colname in align_df.columns]
    score_colalltokens_total = sum(score_colalltokens)
    weight_columns = [r/score_colalltokens_total for r in score_colalltokens] # normalize the column weights

    # get score components
    score_numcolumns = scoreNumColumns(align_df)
    score_numfilledcolumns = scoreNumFilledColumns(align_df)
    score_colptxtembed = [scoreColumnPhraseEmbedVariance(align_df, colname, embed_model) for colname in align_df.columns]
    score_coltokncount = [scoreColumnTokenCount(align_df, colname) for colname in align_df.columns]
    raw_colentityscores = [
        scoreColumnTokenEntityCount(align_df, colname, scisp=scispacy_model, scisp_linker=scispacy_linker) 
        for colname in align_df.columns
    ]
#     raw_colentityscores = [(0,0) for colname in align_df.columns] # cheap filler score - use if scispacy not imported properly
    # only the TUI count (second item of each pair) feeds the score
    score_colttuicount = [s[1] for s in raw_colentityscores]
    score_termcolcount = scoreTermListColumnCount(align_df, term_list, weight_terms)

    # put score components into a df of their own that is neatly readable for debug purposes
    rawscores = pd.DataFrame([
        # text embed vector variance; lower is better
        score_colptxtembed,
        # distinct tokens; lower is better
        score_coltokncount,
        # varying distinct entity TUIs; lower is better
        score_colttuicount,
        # the weighting we are giving each column; higher means more attention
        weight_columns,
    ], index=[
        'embed variance',
        'unique tokens (var)',
        'unique entity TUI (var)',
        'relevance (numtokens)',
    ])
    rawscores = rawscores.rename(columns=dict(zip(rawscores.columns, align_df.columns)))

    # apply column weights to the score components
    # (every component is negated so that a higher value is better)
    components = np.array([
        # number of columns; lower is better
        -1 * math.pow((score_numcolumns / max_row_length), 1),
        # number of filled columns; lower is better
        -1 * math.pow((score_numfilledcolumns / max_row_length), 1),
        # text embed vector variance; lower is better
        -1 * np.dot([math.pow(s, 2) for s in score_colptxtembed], weight_columns),
        # varying distinct tokens; lower is better
        -1 * np.dot(score_coltokncount, weight_columns),
        # varying distinct entity TUIs; lower is better
        -1 * np.dot(score_colttuicount, weight_columns),
        # column count of terms used; lower is better
        -1 * score_termcolcount,
    ])
    # apply score component weights (higher total score is better)
    bias = 5
    singlescore = bias + np.dot(weight_components, components)
    return singlescore, components, rawscores

# singlescore, components, rawscores = scoreAlignment(toy_alignment_poor, spacy_model=sp, scispacy_model=scisp, scispacy_linker=linker, embed_model=fasttext, max_row_length=13)
# singlescore
# components
# rawscores

In [None]:
# import random

# # TODO remove?
# # Experiment to see how much random ordering impacts alignment readability
# temp_ids = [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507] # experiment 1
# # temp_ids = [2030, 2078, 2380, 2437, 2711, 2849, 3194, 3285, 4887, 5437, 6915] # experiment 2
# # temp_ids = [1248, 1275, 1381, 1387, 3871, 4039, 5202, 5204, 6563, 6569] # experiment 3
# temp_alignment_orderings = []
# temp_alignment_outputs = []
# temp_alignment_score_progressions = []
# for i in range(20):
#     temp_ids = random.sample(temp_ids, len(temp_ids))
#     alignment, alignment_score = alignRowMajorLocal(
#         alignment_df.loc[[temp_ids[0]]], 
#         alignment_df.loc[[temp_ids[1]]], 
#         remove_empty_cols=True
#     )
#     temp_alignment_orderings.append(temp_ids)
#     temp_alignment_outputs.append(alignment)
#     temp_alignment_score_progressions.append([alignment_score])
#     for j in range(2, len(temp_ids)):
#         alignment, alignment_score = alignRowMajorLocal(
#             temp_alignment_outputs[i],
#             alignment_df.loc[[temp_ids[j]]], 
#             remove_empty_cols=True
#         )
#         temp_alignment_outputs[i] = alignment
#         temp_alignment_score_progressions[i].append(alignment_score)
# # extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

# Scoring function evaluation

In [None]:
def tempScoreVectorDetail(align_df, embed_model=None):
    """Score an alignment table and expose the per-column statistics behind the score.

    Experimental scorer: it evaluates every scoreColumn* helper on every column
    of `align_df`, averages most of them over the columns, and combines the
    averages with hard-coded weights/directions.

    Parameters
    ----------
    align_df : pandas.DataFrame
        Alignment table to score; each of its columns is scored separately.
    embed_model : optional
        Embedding model forwarded to scoreColumnPhraseEmbedVariance. When
        omitted, the `word2vec` global is used if the session defines it
        (the historical behaviour); otherwise `fasttext` is used, because the
        word2vec load at the top of the notebook is commented out.

    Returns
    -------
    singlescore
        Weighted, direction-corrected sum of `scores` (higher is better).
    scores : numpy.ndarray
        The 12 aggregate components, in the order they are listed below.
    rawscores : pandas.DataFrame
        One labelled row per raw statistic, one positional column per
        alignment column.
    """
    if embed_model is None:
        # Referencing `word2vec` directly raises NameError whenever that model
        # has not been loaded, so resolve it lazily and fall back to fasttext.
        embed_model = globals().get('word2vec', fasttext)

    columns = align_df.columns

    def column_mean(values):
        # plain arithmetic mean over the per-column values
        return sum(values) / len(values)

    # get term weights
    alignment_terms = alignmentTermWeights(align_df, sp=sp)
    term_list = list(alignment_terms)
    term_weights = list(alignment_terms.values())

    # TODO make this an actual nice function later
    # per-column raw statistics (evaluated in the same order as before)
    score_colptxtembed = [scoreColumnPhraseEmbedVariance(align_df, colname, embed_model) for colname in columns]
    score_coltextcount = [scoreColumnTextCount(align_df, colname) for colname in columns]
    score_coltokncount = [scoreColumnTokenCount(align_df, colname) for colname in columns]
    # the entity scorers return (entity count, TUI count) pairs
    raw_colentityscores = [scoreColumnTokenEntityCount(align_df, colname) for colname in columns]
    score_coltentcount = [s[0] for s in raw_colentityscores]
    score_colttuicount = [s[1] for s in raw_colentityscores]
    raw_colentityvarscores = [scoreColumnTokenEntityVariationCount(align_df, colname) for colname in columns]
    score_coltentvarcount = [s[0] for s in raw_colentityvarscores]
    score_colttuivarcount = [s[1] for s in raw_colentityvarscores]
    score_colpposcount = [scoreColumnPhrasePOSCount(align_df, colname) for colname in columns]
    score_coltposcount = [scoreColumnPOSCount(align_df, colname) for colname in columns]
    score_coltposvarcount = [scoreColumnPOSVariationCount(align_df, colname) for colname in columns]
    score_colrepresent = [scoreColumnRepresentation(align_df, colname) for colname in columns]
    score_colalltokens = [scoreColumnTotalTokens(align_df, colname) for colname in columns]
    score_termcolcount = scoreTermListColumnCount(align_df, term_list, term_weights)

    # aggregate components; positions must stay aligned with score_direction / score_weights
    scores = np.array([
        scoreNumColumns(align_df), # lower is better
        column_mean(score_coltextcount), # lower is better
        column_mean(score_colptxtembed), # lower is better
        column_mean(score_coltokncount), # lower is better
        column_mean(score_coltentcount), # lower is better
        # haven't added score_coltentvarcount
        column_mean(score_colttuicount), # lower is better
        # haven't added score_colttuivarcount
        column_mean(score_colpposcount), # lower is better
        column_mean(score_coltposcount), # lower is better
        # haven't added score_coltposvarcount
        column_mean(score_colrepresent), # higher is better
        # haven't added score_colalltokens
        0,#scoreRowAlignment(align_df, align_df.loc[[5130]]), # higher is better
        score_termcolcount, #  lower is better
        scoreRowLayoutCount(align_df), # lower is better
    ])
    # weight and sum up the score (higher total score is better)
    score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
    score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
    singlescore = np.dot(np.multiply(score_weights, score_direction), scores)

    # each column's share of the total filled-row fraction / total token count
    colrelevance_represent = [r/sum(score_colrepresent) for r in score_colrepresent]
    colrelevance_numtokens = [r/sum(score_colalltokens) for r in score_colalltokens]

    # collect the raw stats as a displayable df
    rawscores = pd.DataFrame([
        score_colptxtembed,
        score_coltextcount,
        score_coltokncount,
        score_coltentcount,
        score_coltentvarcount,
        score_colttuicount,
        score_colttuivarcount,
        score_colpposcount,
        score_coltposcount,
        score_coltposvarcount,
        score_colrepresent,
        colrelevance_represent,
        score_colalltokens,
        colrelevance_numtokens,
    ], index=[
        'embed variance',
        'unique texts',
        'unique tokens',
        'unique entity',
        'unique entity (var)',
        'unique entity TUI',
        'unique entity TUI (var)',
        'unique phrase pos', 
        'unique token pos', 
        'unique token pos (var)', 
        'fract rows filled',
        'relevance1 (rowsfilled)',
        'num tokens',
        'relevance2 (numtokens)',
    ])
    return singlescore, scores, rawscores

In [None]:
# alignment_being_scored = alignment_wordsonly
# singlescore, scores, rawscores = tempScoreVectorDetail(
#     alignment_being_scored
# )
# score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
# score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
# singlescore = np.dot(np.multiply(score_weights, score_direction), scores)
# singlescore
# scores
# pd.DataFrame([str(s) for s in scores]).loc[[0, 8, 9, 10]]
# alignment_detail = extractTup(alignment_being_scored, tup_i='segment').sort_index()
# alignment_detail.append(rawscores.rename(columns=dict(zip(rawscores.columns, alignment_detail.columns))))

### Live update score and check how score components respond to changes in an alignment

In [None]:
# alignment_being_scored = read_alignment_from_sheetstring_file('interactive_input/alignment')
# singlescore, scores, rawscores = tempScoreVectorDetail(
#     alignment_being_scored
# )
# score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
# score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
# singlescore = np.dot(np.multiply(score_weights, score_direction), scores)
# singlescore
# # scores
# rawspreadscores = pd.DataFrame([str(s) for s in scores]).iloc[[0, -3, -2]]
# rawspreadscores.index = ['colcount', 'row-alignment', 'termcolcount']
# rawspreadscores.rename(columns={0:'aggregate'}, inplace=True)
# # rawspreadscores
# alignment_detail = extractTup(alignment_being_scored, tup_i='segment').sort_index()
# rawscores = rawscores.rename(columns=dict(zip(rawscores.columns, alignment_detail.columns)))
# rawscores = rawscores.loc[[
#     'embed variance', 'unique texts', 'unique tokens', 'unique tokens (var)',
#     'unique entity', 'unique entity (var)', 'unique entity TUI', 'unique entity TUI (var)', 
#     'unique token pos', 'unique token pos (var)', 
#     'fract rows filled', 'relevance1 (rowsfilled)', 
#     'num tokens', 'relevance2 (numtokens)']]
# rawscores.insert(0, column='aggregate', value=['']*(len(rawscores.index)))
# scoretable = rawscores.append(rawspreadscores).replace(np.nan, '', regex=True)
# scoretable
# alignment_detail.append(scoretable)

### Test Cases

In [None]:
import json

def evaluate_test_case(test_case, weights=[0.2, 0.2, 1, 0, 0, 0], debug_print=False):
    """Score the A and B alignments of one test case and report how much B beats A.

    Reads `a.json`, `b.json` and `params.yml` from `testcases/<test_case>/`.
    The parameter file is parsed with `json.load`, so it must contain JSON, and
    it must provide a `max_row_length` entry.

    NOTE(review): `weights` is accepted but not forwarded to scoreAlignment
    (the `weight_components=` argument is commented out), so both alignments
    are scored with scoreAlignment's own default weighting.

    Returns the 8-tuple
    (score_b - score_a, test_case, score_a, score_b,
     components_a, components_b, rawscores_a, rawscores_b).
    """
    # Load both alignments first, then the shared parameters
    align_df_a = read_alignment_from_jsondict_file(f'testcases/{test_case}/a.json')
    align_df_b = read_alignment_from_jsondict_file(f'testcases/{test_case}/b.json')
    with open(f'testcases/{test_case}/params.yml', 'r') as params_file:
        params = json.load(params_file)

    # Maximum column count we're working with (one of the score inputs)
    max_row_length = params['max_row_length']

    def score_one(align_df):
        # identical scoring configuration for both sides of the comparison
        return scoreAlignment(
            align_df,
            spacy_model=sp,
            scispacy_model=scisp, scispacy_linker=linker,
            embed_model=fasttext,
            max_row_length=max_row_length,
#             weight_components=np.array(weights)
        )

    singlescore_a, components_a, rawscores_a = score_one(align_df_a)
    singlescore_b, components_b, rawscores_b = score_one(align_df_b)

    if debug_print:
        print('=== TEST CASE:', test_case, '===')
        print(f'Result: {singlescore_b - singlescore_a:.5f}')
        print(f'(Raw scores: {singlescore_a:.5f} , {singlescore_b:.5f} )')

    score_delta = singlescore_b - singlescore_a
    return (
        score_delta, test_case,
        singlescore_a, singlescore_b,
        components_a, components_b,
        rawscores_a, rawscores_b,
    )

In [None]:
# import os

# results = []
# for test_case in sorted(os.listdir(f'testcases/')):
# #     results.append(evaluate_test_case(test_case, weights=[0.2, 0.2, 1, 0, 0, 0], debug_print=False))
#     results.append(evaluate_test_case(test_case, weights=[0.2, 0.2, 1, 0, 0, 0], debug_print=False))

# # # check again what the layout of each test result is
# # print(results[0])

# # define the test case groups and how to order them
# categories = ['basic-txtshift', 'basic-colcreat', 'basic-colmerge', 'basic-holistic']

# def alias_testname(testname):
#     testname = testname.split('-')
#     return testname[0][0]+'-'+testname[1]+'-'+testname[2]
# results_mapping = dict([(r[1], (r[0], r[2], r[3], r[4], r[5], r[6], r[7])) for r in results])

# # define templating / "ideal metric" scores
# category_counts = [len([r for r in results_mapping if (cname in r)]) for cname in categories]
# testnames = [[r[1] for r in results if (category_name in r[1])] for category_name in categories]
# testnames = [b for a in testnames for b in sorted(a)]
# # pd.DataFrame([[alias_testname(n) for n in testnames]])
# # pd.DataFrame([category_counts + [''] + ['specific scores go here']])

# # extract metric-specific performance
# category_counts = [len([r for r in results_mapping if (cname in r) and (results_mapping[r][0]>0)]) for cname in categories]
# scores = [results_mapping[tname][0] for tname in testnames]
# pd.DataFrame([category_counts + [''] + scores])

In [None]:
# for tname in testnames:
#     print('====================')
#     print(tname + ' : ' + str(results_mapping[tname][0]))
#     print(results_mapping[tname][1])
#     print(results_mapping[tname][3])
#     extractTup(read_alignment_from_jsondict_file(f'testcases/{tname}/a.json'), tup_i='segment').append(results_mapping[tname][5])
#     print(results_mapping[tname][2])
#     print(results_mapping[tname][4])
#     extractTup(read_alignment_from_jsondict_file(f'testcases/{tname}/b.json'), tup_i='segment').append(results_mapping[tname][6])

# Alignment state exploration

### alignment state space search

In [None]:
### RANDOM WALK ALIGNMENT SPACE SEARCH
# At each step, either:
# 1. Greedy step
# 2. Random step
# 3. Random restart? - currently not included.

import random

def _searchShiftOperations(align_df):
    # List every ('shift', rows, column, distance, 1) move that canShiftCells accepts.
    operations = []
    num_columns = len(align_df.columns)
    for column in align_df.columns:
        # Group the rows of this column by identical (non-empty) cell text.
        # A dict keeps first-occurrence order, so the operation list is built in
        # the same order on every run (iterating a set of strings is not stable
        # across interpreter sessions) and the grouping is a single pass.
        row_clumps = {}
        for cell, row_id in zip(align_df[column], align_df.index):
            text = cell[0]
            if len(text) != 0:
                row_clumps.setdefault(text, []).append(row_id)
        # be overeager with what is 'valid' for shift; canShiftCells does the filtering
        for rows in row_clumps.values():
            for distance in range(-1 * num_columns, num_columns):
                if distance != 0 and canShiftCells(align_df, rows, column, distance, 1):
                    operations.append(('shift', rows, column, distance, 1))
    return operations


def _applySearchOperation(align_df, operation):
    # Return the alignment produced by applying one operation tuple to align_df.
    kind = operation[0]
    if kind == 'split':
        return splitCol(align_df, operation[1], right_align=operation[2])
    if kind == 'merge':
        return mergeCol(align_df, operation[1])
    if kind == 'shift':
        return shiftCells(
            align_df,
            operation[1],
            operation[2],
            operation[3],
            shift_size=operation[4],
        )
    if kind == 'none':
        return align_df
    raise ValueError('uh oh, undefined operation')


def alignmentStateSearch(
        alignment_file='interactive_input/alignment', 
        default_score_weighting=[0.2, 0.2, 1, 0, 0, 0], 
        debug_print=False,
        default_step_moves=[('greedy', 1), ('randomwalk', 1)], 
        default_num_steps=1000,
        default_greedy_cutoff=2,
    ):
    """Random-walk / greedy search over alignment states, starting from a file.

    At each step every currently valid operation (the no-op plus all shifts
    accepted by canShiftCells) is applied to a candidate and scored with
    scoreAlignment. A step mode is then drawn from `default_step_moves`:
    'greedy' takes the best-scoring candidate, 'randomwalk' takes a uniformly
    random one. Ties go to the earliest operation in the list, and the no-op
    is always listed first.

    The search ends after `default_num_steps` steps, or as soon as the no-op
    has been the best candidate for `default_greedy_cutoff` consecutive steps
    (regardless of which mode was drawn on those steps).

    Parameters
    ----------
    alignment_file : str
        Start alignment; read as a JSON dict first, then as a sheetstring file.
    default_score_weighting : sequence of numbers
        Passed to scoreAlignment as `weight_components`.
    debug_print : bool
        Print progress for every step.
    default_step_moves : list of (mode, weight)
        Relative weights of the step modes; they are normalised here.
    default_num_steps : int
        Maximum number of search steps.
    default_greedy_cutoff : int
        Consecutive "no-op is best" steps after which the search stops.

    Returns
    -------
    An 8-tuple whose first seven lists are index-aligned by step (index 0 is
    the state read from the file):
    (states, operation performed, operation set offered, step mode, score,
     score components, raw scores, hyperparameter dict).

    Raises
    ------
    ValueError
        If the step-move weights do not sum to a positive value, or an
        undefined operation or step mode is encountered.
    """
    # the ith element of each list: in step i we performed output_alignmentop[i],
    # chosen by mode output_alignmentopmode[i] out of output_alignmentopset[i],
    # to reach output_alignmentstate[i], which scores output_alignmentscore[i].
    # NOTE: ('beginning') is a plain string, not a 1-tuple; kept for output compatibility.
    output_alignmentstate = []
    output_alignmentop = [('beginning')]
    output_alignmentopset = [[('beginning')]]
    output_alignmentopmode = ['beginning']
    output_alignmentscore = []
    output_alignmentscorecomponents = []
    output_alignmentscoreraw = []
    output_hyperparams = {}

    # Read in our manually created start alignment
    # TODO this could improve... for now just attempt loading in the JSON format, then fallback to sheetstring
    try:
        align_df = read_alignment_from_jsondict_file(alignment_file)
    except Exception:
        # narrow enough to let KeyboardInterrupt / SystemExit propagate
        align_df = read_alignment_from_sheetstring_file(alignment_file)

    # Get the maximum column count that we're working with (this is used for a score)
    max_row_length = max([len([e[0] for e in align_df.loc[i] if len(e[0])!=0]) for i in align_df.index])

    # define hyperparameters of the random walk and search process
    NUM_STEPS = default_num_steps
    STEP_MOVES = default_step_moves
    GREEDY_CUTOFF = default_greedy_cutoff
    SCORE_WEIGHTING = default_score_weighting
    # and save the hyperparameters (as copies, so that a caller editing the
    # returned dict cannot modify the shared default argument lists)
    output_hyperparams['NUM_STEPS'] = NUM_STEPS
    output_hyperparams['STEP_MOVES'] = list(STEP_MOVES)
    output_hyperparams['GREEDY_CUTOFF'] = GREEDY_CUTOFF
    output_hyperparams['SCORE_WEIGHTING'] = list(SCORE_WEIGHTING)

    # turn the relative step-move weights into a cumulative distribution
    sum_prob = sum([e[1] for e in STEP_MOVES])
    if not sum_prob > 0:
        raise ValueError('step move weights must sum to a positive value')
    STEP_MOVES_DIST = []
    temp_runningsum = 0
    for mode_name, mode_weight in STEP_MOVES:
        temp_runningsum += mode_weight / sum_prob
        STEP_MOVES_DIST.append((mode_name, temp_runningsum))

    def score_state(state_df):
        # one scoring configuration for the start state, every candidate and every step
        return scoreAlignment(
            state_df,
            spacy_model=sp,
            scispacy_model=scisp, scispacy_linker=linker,
            embed_model=fasttext,
            max_row_length=max_row_length,
            weight_components=np.array(SCORE_WEIGHTING)
        )

    # add the preview / state at the beginning of the search
    output_alignmentstate.append(align_df.sort_index())
    singlescore, components, rawscores = score_state(align_df)
    output_alignmentscore.append(singlescore)
    output_alignmentscorecomponents.append(components)
    output_alignmentscoreraw.append(rawscores)

    # perform the search!
    i = 0
    greedy_stuck = 0
    while (i < NUM_STEPS) and (greedy_stuck < GREEDY_CUTOFF):
        # Build up the list of what operations we can make right now
        # TODO refine this so operations are fairly represented ... ?
        # (split / merge operations are currently not offered)
        valid_operations = [('none', 0)]
        valid_operations += _searchShiftOperations(align_df)
        output_alignmentopset.append(valid_operations)

        # pull move from the move prob distribution; fall back to the last mode
        # when rounding leaves the final cumulative weight just below the draw
        draw = random.uniform(0, 1)
        move = next(
            (mode_name for mode_name, cumulative in STEP_MOVES_DIST if cumulative >= draw),
            STEP_MOVES_DIST[-1][0],
        )
        output_alignmentopmode.append(move)
        if debug_print:
            print(f'step {i}: {move}')
            print(f'    # valid ops: {len(valid_operations)}')

        # run through all of the operations and calculate what their moves would be
        candidates = []
        for operation_i, selected_operation in enumerate(valid_operations, start=1):
            operated = _applySearchOperation(align_df, selected_operation)
            singlescore, components, rawscores = score_state(operated)
            candidates.append((operated, singlescore, selected_operation))
            if debug_print:
                print(f'\r    completed calculating operation#{operation_i}', end='', flush=True)
        if debug_print:
            print()

        # sort all operations by score descending (stable: ties keep list order)
        candidates.sort(key=lambda x: -1 * x[1])

        # keep track of if it's still possible to improve from here
        if candidates[0][2][0]=='none':
            # we would run greedy=none
            greedy_stuck += 1
            if debug_print:
                print(f'    greedy=none: {greedy_stuck} times in a row')
        else:
            greedy_stuck = 0

        # If we haven't hit the quit condition, make the alignment search step, depending on greedy or random...
        if (greedy_stuck < GREEDY_CUTOFF):
            if move == 'greedy':
                # Make the change that results in best score!
                align_df = candidates[0][0]
                output_alignmentop.append(candidates[0][2])
            elif move == 'randomwalk':
                selected = random.randint(0, len(candidates)-1)
                align_df = candidates[selected][0]
                output_alignmentop.append(candidates[selected][2])
            else:
                raise ValueError('uh oh, undefined search move/mode')
        else:
            output_alignmentop.append(('ending-nogreedymove'))
            if debug_print:
                print(f'    hit quit condition (greedy move = None for last {GREEDY_CUTOFF} steps); not moving')
        i += 1

        # record the state we ended this step in, together with its score
        singlescore, components, rawscores = score_state(align_df)
        output_alignmentstate.append(align_df.sort_index())
        output_alignmentscore.append(singlescore)
        output_alignmentscorecomponents.append(components)
        output_alignmentscoreraw.append(rawscores)

    return output_alignmentstate, output_alignmentop, output_alignmentopset, output_alignmentopmode, \
        output_alignmentscore, output_alignmentscorecomponents, output_alignmentscoreraw, output_hyperparams

### do multiple searches

In [None]:
# import pickle

# nameprefix = 'row4score-'

# for align_name in ['a']:#, 'b', 'c', 'd', 'e']:
#     temp_search = alignmentStateSearch(
#         alignment_file='interactive_input/alignment',
#         default_score_weighting=[0, 0, 1, 0, 0, 0], 
#         default_step_moves=[('greedy', 1), ('randomwalk', 1)],
#         default_num_steps=5,
#         debug_print=True
#     )

#     pickle.dump(temp_search, open(f'temp/search_temp_{nameprefix}{align_name}.pkl', 'wb'))
    
# #     temp_search = pickle.load(open(f'temp/search_temp_search_{nameprefix}{align_name}.pkl', 'rb'))
# #     output_alignmentstate, output_alignmentop, output_alignmentopset, output_alignmentopmode, output_alignmentscore, \
# #             output_alignmentscorecomponents, output_alignmentscoreraw, output_hyperparams = temp_search

# #     print(output_alignmentscore)

# # other sample searches
# # alignmentStateSearch(alignment_file='interactive_input/alignment_nopunct') # "nice alignment (manually removed punctuation)"
# # alignmentStateSearch(alignment_file='interactive_input/alignment_handmade') # "handmade nice alignment"
# # alignmentStateSearch(alignment_file='interactive_input/alignment_diabetes') # "nice alignment (different topic)"
# # alignmentStateSearch(alignment_file='interactive_input/alignment_hyphenmerge') # "nice alignment (merged hyphen phrases)"

### manually tweak alignment and see where it leads us

In [None]:
# spreadsheetFormat(output_alignmentstate[13])

In [None]:
# temp_search_manually_adjusted = alignmentStateSearch(
#     alignment_file='interactive_input/manuallyadjusted', 
#     default_score_weighting=[0, 0, 0, 0, 0, 1], 
#     debug_print=True
# )
# output_alignmentstate, output_alignmentop, output_alignmentopset, output_alignmentopmode, output_alignmentscore, \
#         output_alignmentscorecomponents, output_alignmentscoreraw, output_hyperparams = temp_search_manually_adjusted

# print('===   assert sanity   ===')
# print()

# print('all of these should be the same:')
# print(len(output_alignmentstate), len(output_alignmentop), len(output_alignmentopset), len(output_alignmentopmode), len(output_alignmentscore))

# print()
# print('=== search algorithm! ===')
# print()

# output_hyperparams
# print()
# print(f'score components: [score_numcolumns     score_colptxtembed     score_coltokncount     score_colttuivarcount]')
# print()
# for i in range(len(output_alignmentop)):
#     print(f'step {i}: {output_alignmentopmode[i]}')
#     print(f'    performed op: {output_alignmentop[i]}, out of {len(output_alignmentopset[i])} valid op(s)')
#     print(f'        ops={output_alignmentopset[i]}')
#     print(f'    result score: {output_alignmentscore[i]}')
#     print(f'    components: {output_alignmentscorecomponents[i]}')
# #     output_alignmentstate[i].append(output_alignmentscoreraw[i])
#     extractTup(output_alignmentstate[i], tup_i='segment').append(output_alignmentscoreraw[i])