In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell,
# not only the last one (several cells below rely on this to show frames).
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np

import pandas as pd
# Show frames in full: no truncation of rows or columns in cell output.
pd.set_option("display.max_rows", None)
pd.set_option('display.max_columns', None)

import gensim

# Load Google's pre-trained Word2Vec model.
# model source: https://code.google.com/archive/p/word2vec/
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin', 
    binary=True)
# Memo of phrase text -> embedding frame; filled lazily by alignRowMajorLocal.
cached_word2vec_phrases = {}

import spacy

sp = spacy.load('en_core_web_sm')

import scispacy
from scispacy.linking import EntityLinker

scisp = spacy.load('en_core_sci_sm')
# NOTE(review): passing a component *object* to add_pipe looks like the
# spaCy 2.x calling convention -- confirm against the pinned spaCy/scispacy versions.
linker = EntityLinker(resolve_abbreviations=True, name="umls")
scisp.add_pipe(linker)

In [None]:
# Get the word2vec embedding of a phrase
def get_phrase_embed_word2vec(word2vec, phrase, remove_label=False):
    """Average the word2vec vectors of the whitespace-separated words of a phrase.

    Parameters
    ----------
    word2vec : mapping-like
        Indexable by word; must raise ``KeyError`` for out-of-vocabulary
        words (a plain ``dict`` or gensim ``KeyedVectors`` both qualify).
    phrase : str
        Phrase to embed. Non-string values (``None``, ``NaN``, ...) are
        tolerated and produce an empty frame.
    remove_label : bool, default False
        When False, a ``'word'`` column holding ``phrase`` is appended.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per embedding dimension (plus
        ``'word'`` unless ``remove_label``). An empty frame is returned when
        ``phrase`` is not a string or none of its words has a vector.
    """
    try:
        words = phrase.split()
    except AttributeError:
        # phrase has no .split(), i.e. it is not a string.
        return pd.DataFrame()
    vectors = []
    for word in words:
        try:
            vectors.append(word2vec[word])
        except KeyError:
            # out-of-vocabulary word: contributes nothing to the sum
            continue
    if len(vectors) == 0:
        return pd.DataFrame()
    # The sum is divided by the TOTAL word count, so unknown words behave
    # like zero vectors and pull the average toward the origin.
    emb_sum = pd.DataFrame(vectors).sum() / len(words)
    if not remove_label:
        emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke test: embed a two-word phrase with the loaded model and display the row.
get_phrase_embed_word2vec(
    word2vec, 
    'test sentence')
# get_phrase_embed_word2vec(
#     word2vec, 
#     'This is a test sentence !')
# Import sample dataset
(The code to construct the file `temp/ebm-pio_consegments.hdf` is in analyze.ipynb)

In [None]:
# Import the data we've already constructed out of constituency parse of specific phrases in specific sentences
# Read the pre-built constituency-segment table stored under the 'mydata' key.
con_segments = pd.read_hdf('temp/ebm-pio_consegments.hdf', key='mydata')
con_segments

In [None]:
# Transform that data into the format that is more readable for alignment
# (sorry, this is sort of an abuse of DataFrame datatypes)

def transformTuples(row):
    """Turn one segment row into a single-row frame of alignment tuples.

    ``row`` must provide the parallel lists ``'alignsegments'``,
    ``'aligntypes'`` and ``'alignctypes'``. Column ``txt{k}`` of the result
    holds the tuple ``(segment_k, type_k, ctype_k)``; the result is indexed
    by ``row.name``.
    """
    segments = row['alignsegments']
    types = row['aligntypes']
    ctypes = row['alignctypes']
    cells = {
        f'txt{k}': [(segments[k], types[k], ctypes[k])]
        for k in range(len(segments))
    }
    frame = pd.DataFrame(cells)
    return frame.set_index(pd.Series([row.name]))

# Show the tuple form of one sample row (index label 7298).
transformTuples(con_segments.loc[7298])

In [None]:
# Convert every index label to tuple form; only the first row of each group is used.
alignment_df = con_segments.groupby(con_segments.index, group_keys=False).apply(
    lambda group: transformTuples(group.iloc[0]))
# Rows are shorter than the widest one, so missing cells are padded with an empty tuple.
# NOTE(review): `x is np.nan` is an identity test; it only matches cells that hold
# the np.nan object itself -- confirm the fill values produced above are that object.
alignment_df = alignment_df.applymap(lambda x: ('', '', []) if x is np.nan else x)
alignment_df

In [None]:
def splitAlignTuples(row, split_token=' '):
    """Re-segment a row of alignment tuples into one tuple per token.

    The segment texts of all cells are joined and split on whitespace, and
    the per-cell ``cpos`` lists are flattened; token ``k`` is paired with the
    ``k``-th flattened tag as ``(token, tag, [tag])`` in column ``txt{k}``.
    The result is a single-row frame indexed by ``row.name``.

    Note: ``split_token`` is accepted but not used; splitting is always on
    whitespace.
    """
    words = ' '.join([cell[0] for cell in row]).split()
    tags = [tag for cell in row for tag in cell[2]]
    per_token = [(word, tag, [tag]) for word, tag in zip(words, tags)]
    # Index by token position so a missing tag still fails loudly.
    cells = {f'txt{k}': [per_token[k]] for k in range(len(words))}
    return pd.DataFrame(cells).set_index(pd.Series([row.name]))

# Show the token-level form of the row with index label 1.
splitAlignTuples(alignment_df.loc[1])

In [None]:
# Token-level variant of alignment_df: one column per token instead of per phrase.
alignment_df_tseg = alignment_df.groupby(alignment_df.index, group_keys=False).apply(
    lambda group: splitAlignTuples(group.iloc[0]))
# Pad rows that have fewer tokens than the widest one with empty tuples.
# NOTE(review): identity test against np.nan -- see whether the fill cells are that object.
alignment_df_tseg = alignment_df_tseg.applymap(lambda x: ('', '', []) if x is np.nan else x)
alignment_df_tseg

# Alignment operations

In [None]:
def mergeAdjacentNP(row):
    """Merge runs of adjacent noun phrases ('NP') in a segment row.

    ``row`` provides the parallel lists ``'alignsegments'``,
    ``'aligntypes'`` and ``'alignctypes'``. Neighbouring entries that are both
    typed ``'NP'`` are fused: their texts are joined with a space and their
    ``ctype`` lists concatenated.

    Returns a new Series with the merged lists and without the (now stale)
    ``'aligntup'`` entry, if one was present. The input row and its lists are
    left untouched.

    TODO adapt this for alignment df format!!!
    """
    # Work on copies so the caller's lists are never modified.
    segments = list(row['alignsegments'])
    types = list(row['aligntypes'])
    ctypes = [list(c) for c in row['alignctypes']]
    # Walk right-to-left so deleting position i+1 never shifts an index
    # that is still to be visited.
    for i in reversed(range(len(segments) - 1)):
        if types[i] == 'NP' and types[i + 1] == 'NP':
            segments[i] = segments[i] + ' ' + segments[i + 1]
            ctypes[i] = ctypes[i] + ctypes[i + 1]
            del segments[i + 1]
            del types[i + 1]
            del ctypes[i + 1]
    merged = row.drop('aligntup', errors='ignore')
    merged['alignsegments'] = segments
    merged['aligntypes'] = types
    merged['alignctypes'] = ctypes
    return merged

# temp_df = con_segments
# temp_df = temp_df.apply(
#     lambda row: mergeAdjacentNP(row), 
#     axis=1, result_type='expand')
# temp_df

In [None]:
# TODO this is still buggy, don't use it
def mergeParentheses(row):
    """Report the segment spans that form parenthetical clauses.

    Scans ``row['alignsegments']`` character by character, tracking the
    parenthesis nesting depth. Each outermost ``( ... )`` that starts and
    ends in different segments yields an inclusive span ``(start, end)``;
    an opening parenthesis that is never closed yields a span running to the
    last segment. A closing parenthesis with no matching opener is ignored.

    For every span (last span first) the matching slices of
    ``row['aligntypes']`` and ``row['alignsegments']`` are printed, followed
    by one blank line. No merging is performed yet: ``row`` is returned
    unchanged.
    """
    segments = row['alignsegments']
    depth = 0
    span_start = -1
    spans = []
    for i, segment in enumerate(segments):
        for c in segment:
            if c == '(':
                depth += 1
                if depth == 1:
                    span_start = i
            elif c == ')':
                if depth == 0:
                    # stray ')': do not let the depth go negative, or the
                    # next '(' would not be seen as an outermost opener
                    continue
                depth -= 1
                if depth == 0 and span_start != i:
                    # close the parentheses
                    spans.append((span_start, i))
    if depth > 0:
        # unclosed clause: runs to the last segment (inclusive index)
        spans.append((span_start, len(segments) - 1))
    # sorted() keeps the report order deterministic
    spans = sorted(set(spans))
    if spans:
        for start, end in reversed(spans):
            print(row['aligntypes'][start:end + 1])
            print(segments[start:end + 1])
        print()
    return row

# temp_df = con_segments
# temp_df.apply(
#     lambda row: mergeParentheses(row), 
#     axis=1, result_type='expand')

In [None]:
def extractTup(data, tup_i=0, is_frame=True):
    """Project every ``(segment, pos, cpos)`` cell onto one of its fields.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Container whose cells are alignment tuples.
    tup_i : {'segment', 'pos', 'cpos'} or {0, 1, 2}, default 0
        Field to extract, by name or by tuple position.
    is_frame : bool, default True
        True when ``data`` is a DataFrame, False when it is a Series.

    Returns
    -------
    Same container type as ``data`` with each cell replaced by the chosen field.

    Raises
    ------
    ValueError
        If ``tup_i`` is neither a known field name nor a valid position.
    """
    types = {
        'segment': 0,
        'pos': 1,
        'cpos': 2
    }
    if tup_i in types:
        position = types[tup_i]
    elif isinstance(tup_i, int) and tup_i in types.values():
        # positional form; this is what makes the default (0) usable
        position = int(tup_i)
    else:
        raise ValueError(f'tup_i not in types: {types.keys()}')

    def pick(cell):
        return cell[position]

    if is_frame:
        # Element-wise map, column by column; avoids DataFrame.applymap,
        # which is deprecated in newer pandas releases.
        return data.apply(lambda column: column.map(pick))
    else:
        return data.map(pick)

# extractTup(transformTuples(temp_df.loc[7298]), tup_i='segment', is_frame=True)
# Show only the segment text of one row (double brackets keep it a one-row frame).
extractTup(alignment_df.loc[[7298]], tup_i='segment', is_frame=True)

In [None]:
def removeEmptyColumns(align_df):
    """Drop columns whose segment text is blank in every row.

    A column is kept when at least one of its cells has a segment that is
    non-empty after stripping whitespace. The surviving columns are
    renumbered ``txt0, txt1, ...`` in their original order.
    """
    kept = []
    for col in align_df.columns:
        texts = extractTup(align_df.loc[:, col], tup_i='segment', is_frame=False)
        non_blank = [text for text in texts if text.strip() != '']
        if non_blank:
            kept.append(col)
    trimmed = align_df[kept]
    trimmed.columns = [f'txt{n}' for n in range(len(trimmed.columns))]
    return trimmed
# Show two sample rows with the columns that are blank in both removed.
removeEmptyColumns(alignment_df.loc[[7298, 7321]])

In [None]:
import math
from nltk.metrics import edit_distance

def alignRowMajorLocal(align_a, align_b, use_types=False, remove_empty_cols=False, debug_print=False):
    """Locally align the columns of two alignment frames (Smith-Waterman style).

    Every cell of ``align_a`` / ``align_b`` is a ``(segment, pos, cpos)``
    tuple; each frame may hold several rows that are already aligned with
    each other. Columns are expected to be named ``txt0, txt1, ...`` (the
    unaligned prefix is looked up by those names).

    Scoring of a column pair:
      * if both columns contain at least one phrase with a cached word2vec
        embedding, ``10 * (6 - ||mean_a - mean_b||)``;
      * otherwise a fallback of ``60 * (1 - mean scaled edit distance)``;
      * with ``use_types``, ``-inf`` when both columns have non-blank phrase
        types and the two type sets differ.

    Parameters
    ----------
    align_a, align_b : pandas.DataFrame
        The two alignment frames.
    use_types : bool, default False
        Forbid aligning columns whose phrase-type sets differ.
    remove_empty_cols : bool, default False
        Drop all-blank columns from both inputs first (via the
        notebook-level ``removeEmptyColumns``).
    debug_print : bool, default False
        Print element lists, per-pair scores, matrices and traceback points.

    Returns
    -------
    (pandas.DataFrame, float)
        The rows of ``align_a`` followed by the rows of ``align_b`` on a
        shared ``txt0..txtN`` layout (unaligned prefix of a, unaligned prefix
        of b, aligned region, suffix of a, suffix of b), with gaps filled by
        ``('', '', [])``; and the maximum value of the score matrix.

    Raises
    ------
    ValueError
        If the traceback produces index lists of different length.
    IndexError
        If no column pair aligns with a positive score, because the aligned
        range is then empty.
    """
    if remove_empty_cols:
        align_a = removeEmptyColumns(align_a)
        align_b = removeEmptyColumns(align_b)
    align_a_segment = extractTup(align_a, tup_i='segment')
    align_b_segment = extractTup(align_b, tup_i='segment')
    align_a_type = extractTup(align_a, tup_i='pos')
    align_b_type = extractTup(align_b, tup_i='pos')
    # Doing a general alignment
    align_a_elems = [i for i in range(len(align_a.columns))]
    align_b_elems = [i for i in range(len(align_b.columns))]
    if debug_print:
        print(align_a_elems)
        print(align_b_elems)
        print()
    def getScoreAligningIndices(index_a, index_b):
        # A higher score is better / more match!
        # make sure all the segment texts are precomputed lol
        text_a = list(align_a_segment[align_a.columns[index_a]])
        text_b = list(align_b_segment[align_b.columns[index_b]])
        for text in text_a+text_b:
            if text not in cached_word2vec_phrases:
                try:
                    # drop(columns=...) instead of the positional axis argument,
                    # which newer pandas no longer accepts
                    cached_word2vec_phrases[text] = get_phrase_embed_word2vec(
                        word2vec, text).drop(columns='word')
                except KeyError:
                    # no embedding at all -> empty frame without a 'word'
                    # column; leave the phrase out of the cache
                    pass
        # start off with phrase embedding distance (current max is 60 for perfect match)
        # if we have embeds for any word in each set, ignore others and just use words we have embeds for
        if any(s in cached_word2vec_phrases for s in text_a)\
                and any(s in cached_word2vec_phrases for s in text_b):
            # calculate overall embeds
            embed_a = pd.concat([cached_word2vec_phrases[text] for text
                                 in text_a if text in cached_word2vec_phrases]).apply(lambda x: x.mean())
            embed_b = pd.concat([cached_word2vec_phrases[text] for text
                                 in text_b if text in cached_word2vec_phrases]).apply(lambda x: x.mean())
            # TODO can tweak this scoring calculation a little for performance
            score = 10 * (6 - np.linalg.norm(embed_a-embed_b))
        else:
            # use levenshtein dist as fallback... if either set has NO words with embeds available
            scaled_edits_sum = 0
            for phrase_a in [p for p in text_a if len(p) != 0]:
                for phrase_b in [p for p in text_b if len(p) != 0]:
                    scaled_edits_sum += edit_distance(phrase_a,phrase_b) / max(len(phrase_a), len(phrase_b))
            score = 60 * (1 - (scaled_edits_sum / (len(text_a) * len(text_b))))
        # add a component based on phrase type if that flag is set
        # TODO improve this?; this currently just returns -inf if mismatch of type sets
        # Might want to add support for aligning different types of phrase together...
        if use_types:
            # reduce to set
            types_a = set([t for t in align_a_type[align_a.columns[index_a]] if t.strip() != ''])
            types_b = set([t for t in align_b_type[align_b.columns[index_b]] if t.strip() != ''])
#             # check if we are handling a hard pos match
#             if any([((p in types_a) or (p in types_b)) for p in pos_must_match]):
            if len(types_a) != 0 and len(types_b) != 0 and types_a != types_b:
                score = -1 * math.inf
        # TODO: add a component based on phrase ctype (phrase POS breakdown) (?)
        if debug_print:
            print(f'scoring between '
                  +f'"{list(align_a_segment[align_a.columns[index_a]])}" and '
                  +f'"{list(align_b_segment[align_b.columns[index_b]])}": {score}')
        return score
    def getGapPenalty(length, size=1):
        # -1 for opening a gap, -0.1 for every further gap position
        return -1 * (1 * min(length,1) + 0.1 * max(length-1,0)) #* (1 + math.log(size))
    # Build score matrix of size (a-alignables + 1)x(b-alignables + 1)
    scores = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Build traceback matrix
    # traceback = 0 for end, 4 for W, 7 for NW, 9 for N (to calculate traceback, t%2 is N-ness, t%3 is W-ness)
    traceback = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Iterate through all of the cells to populate both the score and traceback matrices
    for i in range(1, scores.shape[0]):
        for j in range(1, scores.shape[1]):
            # candidate score -> traceback code; equal scores overwrite each
            # other, so the move written last wins a tie
            score_map = {}
            # calculate score for aligning nouns a[i] and b[j]
            score_map[
                scores[i-1,j-1] + getScoreAligningIndices(align_a_elems[i-1], align_b_elems[j-1])
            ] = 7
            # calculate score for gap in i
            for i_gap in range(1, i):
                igap_score = scores[i-i_gap,j] + getGapPenalty(i_gap, size=len(align_a_elems))
                score_map[igap_score] = 9
            # calculate score for gap in j
            for j_gap in range(1, j):
                jgap_score = scores[i,j-j_gap] + getGapPenalty(j_gap, size=len(align_b_elems))
                score_map[jgap_score] = 4
            # add the possibility for unrelatedness
            score_map[0] = 0
            scores[i,j] = max(score_map.keys())
            traceback[i,j] = score_map[max(score_map.keys())]
    if debug_print:
        print()
        print(scores)
        print(traceback)
        print()
    # Do traceback to build our final alignment
    tracepoint = np.unravel_index(np.argmax(scores, axis=None), scores.shape)
    points_a = []
    points_b = []
    while traceback[tracepoint] != 0:
        # contribute to the align information
        if traceback[tracepoint] == 7:
            # this is a point where two elements were aligned
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 4:
            # this is a point where there was a gap inserted for row_a
            points_a.append(-1)
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 9:
            # this is a point where there was a gap inserted for row_b
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(-1)
        # step backwards
        tracepoint = (
            tracepoint[0] - int(traceback[tracepoint] % 2),
            tracepoint[1] - int(traceback[tracepoint] % 3))
    points_a = list(reversed(points_a))
    points_b = list(reversed(points_b))
    if len(points_a) != len(points_b):
        # enforce that align_a and align_b are the same length (they should be)
        raise ValueError('should not occur; bug in S-W local alignment?')
    if debug_print:
        print(points_a)
        print(points_b)
        print()
    # Create a nice neat form of this alignment
    # TODO add support for NP-only alignment gaps?
    range_a = [i for i in points_a if i >= 0]
    range_b = [i for i in points_b if i >= 0]
    range_a = (range_a[0], range_a[-1])
    range_b = (range_b[0], range_b[-1])
    # Only used as the source of the shared output column names.
    output = pd.DataFrame(columns=[f'txt{i}' for i in range(
        (range_a[0] + range_b[0]) + len(points_a)
        + max(0, (len(align_a.columns) - range_a[1]) - 1)
        + max(0, (len(align_b.columns) - range_b[1]) - 1)
    )])
    # build the segment from align_a
    realign_a = align_a.loc[:, [f'txt{i}' for i in range(range_a[0])]]
    for i in range(range_b[0]):
        realign_a.insert(len(realign_a.columns), f'insx{i}', np.nan, True)
    for i in points_a:
        if i >= 0:
            realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
        else:
            realign_a.insert(len(realign_a.columns), f'ins{len(realign_a.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_a.insert(len(realign_a.columns), f'insx{i+range_b[0]}', np.nan, True)
    # build the segment from align_b
    realign_b = align_b.loc[:, [f'txt{i}' for i in range(range_b[0])]]
    for i in range(range_a[0]):
        realign_b.insert(0, f'insx{i}', np.nan, True)
    for i in points_b:
        if i >= 0:
            realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
        else:
            realign_b.insert(len(realign_b.columns), f'ins{len(realign_b.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_b.insert(len(realign_b.columns), f'insx{i+range_a[0]}', np.nan, True)
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
    # build final output
    # (assigning the names also checks that both halves have the expected width)
    realign_a.columns = output.columns
    realign_b.columns = output.columns
    # pd.concat instead of DataFrame.append, which newer pandas removed
    aligned = pd.concat([realign_a, realign_b])

    def fillGap(cell):
        # Gap cells are float NaN; test the value rather than object identity
        # so the result does not depend on how pandas stored the NaN.
        if isinstance(cell, float) and math.isnan(cell):
            return ('', '', [])
        return cell

    return aligned.apply(lambda column: column.map(fillGap)), np.amax(scores, axis=None)

# toy_align, toy_align_score = alignRowMajorLocal(
#     alignment_df.loc[[7298]],
#     alignment_df.loc[[7321]],
#     remove_empty_cols=True)
# print(toy_align_score)
# toy_align
# Token-level alignment of two sample rows with phrase types enforced;
# prints the best score and displays the aligned frame.
toy_align, toy_align_score = alignRowMajorLocal(
    alignment_df_tseg.loc[[7298]],
    alignment_df_tseg.loc[[7321]],
    remove_empty_cols=True, use_types=True)
print(toy_align_score)
toy_align

In [None]:
# Create the toy data
# Hand-written phrases used to exercise the column split / merge helpers;
# a single 'txt' column on the default integer index.
toy_phrases = [
    'Asperger syndrome',
    'high - functioning ASD',
    'unrecognized and untreated anxiety',
    'generalized anxiety disorders',
    'anxiety',
    'high - functioning autism spectrum disorders and anxiety',
    'high - functioning ASD and anxiety',
    'high - functioning ASD',
    'high - functioning autism spectrum disorders',
    'previously undetected anxiety',
    'untreated anxiety',
]
toy_data = pd.DataFrame({'txt': toy_phrases})
del toy_phrases
toy_data

In [None]:
# Column split step 1: Build word tree with node = word units running right->left

# add text to the given trienode
def wordTreeHelper(tree_node, text, id_data=None, right_align=False):
    """Insert one phrase into a word trie, in place.

    The phrase is consumed one space-separated word at a time -- from the
    left by default, from the right when ``right_align`` is set -- creating
    nested dict nodes as needed. The node reached after the last word gets
    the terminal entry ``{id_data: id_data}``.

    Returns ``tree_node`` (the same object that was passed in).
    """
    node = tree_node
    remaining = text.strip()
    while remaining != '':
        words = remaining.split(' ')
        if right_align:
            key, rest = words[-1], words[:-1]
        else:
            key, rest = words[0], words[1:]
        # descend, creating the child node on first use
        node = node.setdefault(key, {})
        remaining = ' '.join(rest).strip()
    # every word consumed: mark this node as the end of phrase id_data
    node[id_data] = id_data
    return tree_node

def wordTree(df, colname, right_align=False):
    """Build a word trie from column ``colname`` of ``df``.

    Each row's text is inserted with its index label as the terminal id;
    ``right_align`` makes the trie read each phrase from its last word.
    """
    root = {}
    for row_id in df.index:
        phrase = df.loc[row_id][colname]
        root = wordTreeHelper(root, phrase, id_data=row_id, right_align=right_align)
    return root

# Left-to-right word trie of the toy phrases.
st = wordTree(toy_data, 'txt')
st

In [None]:
# Column split step 2: Collapse the suffix trie (merge nodes with only one child)

# edits the input trie
def wordTreeCollapse(tree, right_align=False):
    """Collapse single-child chains of a word trie, in place.

    Children are collapsed first. A child whose node then has exactly one
    entry, keyed by a word, is fused with that entry: the two words are
    joined with a space (child word first, or last when ``right_align``)
    and the grandchild's subtree is attached under the joined key.
    Fused keys are re-inserted after all removals. Returns ``tree``.
    """
    merges = []
    for word in tree:
        if type(word) is not str:
            # terminal id entry, not a word node
            continue
        subtree = wordTreeCollapse(tree[word], right_align=right_align)
        tree[word] = subtree
        entries = list(subtree)
        if len(entries) == 1 and type(entries[0]) is str:
            next_word = entries[0]
            if right_align:
                label = next_word + ' ' + word
            else:
                label = word + ' ' + next_word
            merges.append((word, label, subtree[next_word]))
    # Apply the queued edits only after iteration: first all removals ...
    for word, _, _ in merges:
        tree.pop(word)
    # ... then all additions
    for _, label, subtree in merges:
        tree[label] = subtree
    return tree

# Collapse the trie; wordTreeCollapse edits its argument in place and returns it.
st = wordTreeCollapse(st)
st

In [None]:
# Column split step 3: Output the suffix trie to multiple columns

# Calculate how many output columns we'll need
# Get the depth of the trie (a trie with one terminal node {0:0} has depth 0)
def wordTreeDepth(tree):
    """Return the number of word levels on the longest path of the trie.

    Only string keys count as levels; a node holding nothing but terminal
    id entries has depth 0.
    """
    return max(
        (1 + wordTreeDepth(subtree)
         for key, subtree in tree.items()
         if type(key) is str),
        default=0,
    )

def wordTreeSplitHelper(tree, max_depth, output, so_far=[], right_align=False):
    """Flatten a word trie into ``{id: [cell, ...]}`` rows of equal length.

    ``so_far`` is the list of keys on the path to ``tree``. At each terminal
    id entry the path is padded with ``''`` to ``max_depth`` cells -- on the
    right by default, on the left when ``right_align`` -- and stored in
    ``output`` under that id. With ``right_align`` deeper keys are prepended
    to the path instead of appended. Returns ``output``.

    (``so_far`` is never modified in place, only used to build new lists.)
    """
    for key, subtree in tree.items():
        if type(key) is str:
            # word node: extend the path and keep descending
            path = [key] + so_far if right_align else so_far + [key]
            output = wordTreeSplitHelper(subtree,
                                         max_depth,
                                         output,
                                         path,
                                         right_align=right_align)
            continue
        # terminal id entry: emit the padded row
        padding = [''] * (max_depth - len(so_far))
        output[key] = padding + so_far if right_align else so_far + padding
    return output

def wordTreeSplit(tree, colname, right_align=False):
    """Render a word trie as a frame with one column per trie level.

    Columns are named ``{colname}0 .. {colname}{depth-1}``; each terminal id
    of the trie becomes one row, labelled by that id.
    """
    depth = wordTreeDepth(tree)
    rows_by_id = wordTreeSplitHelper(tree, depth, {}, right_align=right_align)
    table = pd.DataFrame(columns=[f'{colname}{level}' for level in range(depth)])
    for row_id, cells in rows_by_id.items():
        table.loc[row_id] = cells
    return table
    
# Show the collapsed toy trie as columns split0, split1, ...
wordTreeSplit(st, 'split')

In [None]:
def splitCol(src_alignment, split_col, right_align):
    """Split one text column of an alignment into several columns.

    The texts of ``split_col`` are put into a word trie, single-child chains
    are collapsed, and every trie level becomes a column. The new columns
    replace ``split_col`` at its position and all columns are renumbered
    ``txt0, txt1, ...``.

    Parameters
    ----------
    src_alignment : pandas.DataFrame
        Frame of plain segment strings with columns named ``txt<N>``.
    split_col : str
        Name of the column to split.
    right_align : bool
        Build the trie from the last word of each phrase and right-justify
        the resulting cells.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``src_alignment`` is not modified.
    """
    # TODO make splitCol preserve some POS information?
    def columnOrder(name):
        # 'txt3' -> (3,), 'txt3.12' -> (3, 12). Integer parts keep
        # 'txt3.10' after 'txt3.9'; a float key would read it as 3.1.
        return tuple(int(part) for part in name[3:].split('.'))

    splitted = wordTreeSplit(
        wordTreeCollapse(wordTree(
            src_alignment,
            split_col,
            right_align=right_align), right_align=right_align),
        f'{split_col}.',
        right_align=right_align)
    result = src_alignment.join(splitted)
    # keyword form: the positional axis argument is gone in newer pandas
    result = result.drop(columns=split_col)
    result = result.reindex(columns=sorted(result.columns, key=columnOrder))
    result.columns = [f'txt{i}' for i in range(len(result.columns))]
    return result

# Split the first column of the toy alignment, right-aligned.
# Note: toy_align is rebound from tuples to plain strings here, so re-running
# this cell alone feeds already-extracted strings back into extractTup.
toy_align = splitCol(extractTup(toy_align, tup_i='segment'), 'txt0', right_align=True)
toy_align

In [None]:
# TODO re-apply the old POS tags and info?

def applyEmptyTup(row):
    """Wrap each text cell of a row back into an alignment tuple.

    Every value ``v`` becomes ``(v, '', [])`` -- the segment with blank type
    information -- under the same label. The result is a single-row frame
    indexed by ``row.name``.
    """
    cells = {label: [(row[label], '', [])] for label in row.index}
    return pd.DataFrame(cells).set_index(pd.Series([row.name]))

# applyEmptyTup(toy_align.loc[7298])

# Put the split result back into tuple form (blank types), one row per index label.
toy_align = toy_align.groupby(toy_align.index, group_keys=False).apply(
    lambda group: applyEmptyTup(group.iloc[0])
)
toy_align

In [None]:
def mergeCol(src_alignment, merge_col):
    """Merge a text column with its right-hand neighbour.

    The two texts are joined with a single space and stored under
    ``merge_col``; the neighbour column is removed and all columns are
    renumbered ``txt0, txt1, ...``. Returns a new frame; ``src_alignment``
    is not modified.
    """
    # TODO make mergeCol preserve some POS information?
    names = list(src_alignment.columns)
    right_col = src_alignment.columns[names.index(merge_col) + 1]
    combined = src_alignment[merge_col] + ' ' + src_alignment[right_col]
    result = src_alignment.copy()
    result[merge_col] = combined
    result = result.drop(columns=[right_col])
    result.columns = [f'txt{n}' for n in range(len(result.columns))]
    return result

# Merge the first two columns of the toy alignment (segment text only).
mergeCol(extractTup(toy_align, tup_i='segment'), 'txt0')

In [None]:
# Display each sample row used in the alignments below. The bare expression
# inside the loop is a no-op as plain Python; showing anything depends on the
# notebook's display configuration.
for i in [7298, 7321, 5126, 5134, 4594, 4618, 6507, 6474, 7308, 5130, 2552]:
    alignment_df.loc[[i]]

In [None]:
# # Alignment in manually selected "nice" order with types enforced
# temp_align = []
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[5126]], alignment_df.loc[[5134]],
#                                     remove_empty_cols=True, use_types=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[7298]], alignment_df.loc[[7321]],
#                                     remove_empty_cols=True, use_types=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[4594]], alignment_df.loc[[4618]],
#                                     remove_empty_cols=True, use_types=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[5130]], alignment_df.loc[[2552]],
#                                     remove_empty_cols=True, use_types=True)[0])
# temp_align.append(alignRowMajorLocal(alignment_df.loc[[6474]], alignment_df.loc[[7308]],
#                                     remove_empty_cols=True, use_types=True)[0])
# update_temp_align = []
# update_temp_align.append(alignRowMajorLocal(temp_align[2], temp_align[3], 
#                                             remove_empty_cols=True, use_types=True)[0])
# update_temp_align.append(alignRowMajorLocal(temp_align[1], temp_align[4], 
#                                             remove_empty_cols=True, use_types=True)[0])
# update_temp_align.append(alignRowMajorLocal(temp_align[0], alignment_df.loc[[6507]], 
#                                             remove_empty_cols=True, use_types=True)[0])
# temp_align = update_temp_align
# update_temp_align = []
# update_temp_align.append(alignRowMajorLocal(temp_align[0], temp_align[2], 
#                                             remove_empty_cols=True, use_types=True)[0])
# manually_aligned_group, manually_aligned_group_score = alignRowMajorLocal(
#     update_temp_align[0], temp_align[1], remove_empty_cols=True, use_types=True)
# print(manually_aligned_group_score)
# extractTup(manually_aligned_group, tup_i='segment').sort_index()

In [None]:
# Alignment in manually selected "nice" order without types enforced
# Round 1: align five hand-picked pairs of rows.
temp_align = [
    alignRowMajorLocal(alignment_df.loc[[id_a]], alignment_df.loc[[id_b]],
                       remove_empty_cols=True)[0]
    for id_a, id_b in [(5126, 5134), (7298, 7321), (4594, 4618), (5130, 2552), (6474, 7308)]
]
# Round 2: combine the round-1 groups pairwise; the first group takes the one remaining row.
update_temp_align = [
    alignRowMajorLocal(temp_align[2], temp_align[3])[0],
    alignRowMajorLocal(temp_align[1], temp_align[4])[0],
    alignRowMajorLocal(temp_align[0], alignment_df.loc[[6507]],
                       remove_empty_cols=True)[0],
]
temp_align = update_temp_align
# Round 3 and final: fold the three remaining groups into one alignment.
update_temp_align = [alignRowMajorLocal(temp_align[0], temp_align[2])[0]]
manually_aligned_group, manually_aligned_group_score = alignRowMajorLocal(
    update_temp_align[0], temp_align[1])
print(manually_aligned_group_score)
extractTup(manually_aligned_group, tup_i='segment').sort_index()

In [None]:
# # Alignment in random-ish order without types enforced
# temp_align = alignRowMajorLocal(alignment_df.loc[[7298]], alignment_df.loc[[7321]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[5126]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[5134]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[4594]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[4618]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[6507]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[6474]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[7308]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[5130]], remove_empty_cols=True)[0]
# manually_aligned_group2, temp_align_score = alignRowMajorLocal(
#     temp_align, alignment_df.loc[[2552]], remove_empty_cols=True)
# print(temp_align_score)
# extractTup(manually_aligned_group2, tup_i='segment').sort_index()

In [None]:
# # Alignment in random-ish order without types enforced (of a slightly different dataset!!!)
# temp_align = alignRowMajorLocal(alignment_df.loc[[7494]], alignment_df.loc[[7541]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[7549]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[7585]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[7594]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[416]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[423]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[443]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[447]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[409]], remove_empty_cols=True)[0]
# temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[1960]], remove_empty_cols=True)[0]
# manually_aligned_group2, temp_align_score = alignRowMajorLocal(
#     temp_align, alignment_df.loc[[1989]], remove_empty_cols=True)
# print(temp_align_score)
# extractTup(manually_aligned_group2, tup_i='segment').sort_index()

In [None]:
# Demonstrate a merge operation
# Four successive merges on the segment text. Columns are renumbered after
# every merge, so each 'txtN' refers to the frame produced by the previous line.
manually_aligned_group_merge = mergeCol(extractTup(manually_aligned_group, tup_i='segment'), 'txt4')
manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt4')
manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt1')
manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, 'txt4')
manually_aligned_group_merge.sort_index()

In [None]:
# Demonstrate a split operation
# Four successive splits. Columns are renumbered after every split, so each
# 'txtN' refers to the frame produced by the previous line.
manually_aligned_group_split = splitCol(manually_aligned_group_merge, 'txt0', right_align=True)
manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt4', right_align=False)
manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt5', right_align=True)
manually_aligned_group_split = splitCol(manually_aligned_group_split, 'txt7', right_align=False)
manually_aligned_group_split.sort_index()

In [None]:
# Put the split operation output into something more standard alignment DF format
# Wrap every text cell as (text, '', []) so the frame can be passed to the
# alignment functions again; only the first row per index label is used.
manually_aligned_group_split = manually_aligned_group_split.groupby(
    manually_aligned_group_split.index, group_keys=False).apply(
    lambda group: applyEmptyTup(group.iloc[0])
)
manually_aligned_group_split

In [None]:
# # Re-align the output of the split function using the same ordering as initial alignment
# manually_aligned_group_realign = []
# manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_split.loc[[5126]],
#                        manually_aligned_group_split.loc[[5134]],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_split.loc[[7298]],
#                        manually_aligned_group_split.loc[[7321]],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_split.loc[[4594]],
#                        manually_aligned_group_split.loc[[4618]],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_split.loc[[5130]],
#                        manually_aligned_group_split.loc[[2552]],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_split.loc[[6474]],
#                        manually_aligned_group_split.loc[[7308]],
#                        remove_empty_cols=True)[0])
# update_manually_aligned_group_realign = []
# update_manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_realign[2],
#                        manually_aligned_group_realign[3],
#                        remove_empty_cols=True)[0])
# update_manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_realign[1],
#                        manually_aligned_group_realign[4],
#                        remove_empty_cols=True)[0])
# update_manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_realign[0],
#                        manually_aligned_group_split.loc[[6507]],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign = update_manually_aligned_group_realign
# update_manually_aligned_group_realign = []
# update_manually_aligned_group_realign.append(
#     alignRowMajorLocal(manually_aligned_group_realign[0],
#                        manually_aligned_group_realign[2],
#                        remove_empty_cols=True)[0])
# manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign[0], manually_aligned_group_realign[1], remove_empty_cols=True)[0]
# extractTup(manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# # Re-align the output of the split function by a manual ordering
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     manually_aligned_group_split.loc[[7298]],
#     manually_aligned_group_split.loc[[7321]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5126]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[7308]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[6474]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[4594]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[4618]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5130]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5134]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[2552]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[6507]],
#     remove_empty_cols=True)[0]
# extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# # Re-align the output of the split function using a DIFFERENT manual ordering
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     manually_aligned_group_split.loc[[2552]],
#     manually_aligned_group_split.loc[[4618]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[4594]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5130]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[7298]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[7308]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[7321]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[6474]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5134]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[6507]],
#     remove_empty_cols=True)[0]
# update_manually_aligned_group_realign = alignRowMajorLocal(
#     update_manually_aligned_group_realign,
#     manually_aligned_group_split.loc[[5126]],
#     remove_empty_cols=True)[0]
# extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# extractTup(update_manually_aligned_group_realign, tup_i='segment').loc[
#     [2552, 4594, 4618, 5126, 5130, 5134, 6474, 6507, 7298, 7308, 7321]
# ]
# extractTup(update_manually_aligned_group_realign, tup_i='segment').loc[
#     [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507]
# ]

In [None]:
# TODO: implement a similarity score / phylo multiple sequence alignment ordering

# Alignment scoring

In [None]:
#          [5130, 5126, 5134, 4618, 6507, 4594, 7321, 7298, 2552, 6474, 7308]
# Progressive alignment: start from the first row in `id_order`, then align
# each remaining row (in the listed order) against the running result.
id_order = [5130, 5126, 5134, 4618, 6507, 4594, 7321, 7298, 2552, 6474, 7308]

alignment_wordsonly = alignment_df_tseg.loc[[id_order[0]]]
for i in range(1, len(id_order)):
    # `alignment_score` ends up holding the score of the final pairwise step
    alignment_wordsonly, alignment_score = alignRowMajorLocal(
        alignment_wordsonly,
        alignment_df_tseg.loc[[id_order[i]]],
        remove_empty_cols=True,
        use_types=True,
    )

# Show the segment text of the final alignment, then the raw alignment frame
extractTup(alignment_wordsonly, tup_i='segment').sort_index()
alignment_wordsonly

In [None]:
# Three parallel lists: the alignment itself, the row ordering that produced
# it, and the score reported after every pairwise alignment step.
reference_alignment = []
reference_alignment_orderings = []
reference_alignment_scores = []

# Row orderings to align progressively; the trailing note on each line is a
# subjective judgement of the alignment that ordering produces.
temp_ids_list = [
    [5130, 5126, 6507, 6474, 7308, 5134, 2552, 4618, 7298, 4594, 7321], # decent
    [7321, 5134, 4594, 6507, 2552, 5130, 7298, 7308, 4618, 6474, 5126], # decent
    [5134, 7298, 4618, 6507, 7321, 5126, 6474, 5130, 4594, 2552, 7308], # bad
    [5126, 5134, 4618, 7298, 6507, 5130, 6474, 4594, 7308, 2552, 7321], # quite bad (chaotic)
    [5126, 7321, 7298, 5130, 6474, 4618, 4594, 7308, 2552, 6507, 5134]  # quite bad (sharply split)
]

for i, temp_ids in enumerate(temp_ids_list):
    # Seed with the first row; the leading 0 is the placeholder score for it
    alignment = alignment_df.loc[[temp_ids[0]]]
    temp_scores = [0]
    for j in range(1, len(temp_ids)):
        alignment, alignment_score = alignRowMajorLocal(
            alignment,
            alignment_df.loc[[temp_ids[j]]],
            remove_empty_cols=True,
        )
        temp_scores.append(alignment_score)
    reference_alignment.append(alignment)
    reference_alignment_orderings.append(temp_ids)
    reference_alignment_scores.append(temp_scores)

# Add the "flat smush": the selected rows with no alignment step applied,
# passed only through removeEmptyColumns
reference_alignment.append(
    removeEmptyColumns(
        alignment_df.loc[[2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507]]
    )
)
reference_alignment_orderings.append(['flat smush'])
reference_alignment_scores.append(['no scores'])

# Add manually tuned (?) alignment
reference_alignment.append(manually_aligned_group)
reference_alignment_orderings.append(['manually tuned'])
reference_alignment_scores.append(['no scores'])

In [None]:
# Pick four of the reference alignments as toy examples of increasing quality
toy_alignment_bad = reference_alignment[5]    # an "alignment" that's really bad
toy_alignment_poor = reference_alignment[4]   # fairly bad
toy_alignment_good = reference_alignment[6]   # a little better
toy_alignment_great = reference_alignment[1]  # a lot better

# Display the segment text of each one, worst first
extractTup(toy_alignment_bad, tup_i='segment').sort_index()
extractTup(toy_alignment_poor, tup_i='segment').sort_index()
extractTup(toy_alignment_good, tup_i='segment').sort_index()
extractTup(toy_alignment_great, tup_i='segment').sort_index()

In [None]:
def scoreNumColumns(align_df):
    """Return how many columns the alignment frame has (lower is better)."""
    _, column_count = align_df.shape
    return column_count

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreNumColumns(toy_alignment))

In [None]:
import math

def scoreColumnPhraseEmbedVariance(align_df, colname):
    """Return the total embedding variance of the phrases in one column.

    Each cell's segment text is embedded with `get_phrase_embed_word2vec`
    (results are memoised in the module-level `cached_word2vec_phrases`).
    The score is the trace of the covariance matrix of those embeddings,
    i.e. the sum of the per-dimension variances. Lower is better.

    Reasoning for using trace(covariance matrix) as a scalar variance:
    https://stats.stackexchange.com/questions/225434/a-measure-of-variance-from-the-covariance-matrix

    Args:
        align_df: alignment frame; `align_df[colname]` is handed to `extractTup`.
        colname: name of the column to score.

    Returns:
        The trace of the covariance matrix, or 0 when fewer than two cells
        of the column could be embedded.
    """
    texts = list(extractTup(align_df[colname], tup_i='segment', is_frame=False))
    embeddable = []
    for phrase in texts:
        # make sure the phrase's embedding is cached
        if phrase not in cached_word2vec_phrases:
            embedding = get_phrase_embed_word2vec(word2vec, phrase)
            if 'word' not in embedding.columns:
                # No label column means no embedding was produced for this
                # phrase (the helper returned an empty frame): leave it out.
                continue
            # Keyword form: the positional `axis` argument of DataFrame.drop
            # was deprecated in pandas 1.3 and removed in 2.0.
            cached_word2vec_phrases[phrase] = embedding.drop(columns='word')
        embeddable.append(phrase)
    if len(embeddable) > 1:
        output = pd.concat([cached_word2vec_phrases[phrase] for phrase in embeddable])
        result = np.trace(output.cov())
    else:
        # one of two scenarios:
        # 1. none of the contents of this column could be embedded, so pretend they're all the same
        # 2. there is only one row in this column that could be embedded, so it has no variation
        # TODO is there a theoretically better way to handle them?
        result = 0
    return result

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnPhraseEmbedVariance(toy_alignment, 'txt0'))

In [None]:
def scoreColumnTextCount(align_df, colname):
    """Count the distinct cell texts in one column (lower is better).

    Blank cells are ignored, and runs of whitespace are collapsed to a single
    space before comparing, so texts differing only in spacing count once.
    """
    distinct_texts = set()
    for cell in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        if cell.strip() == '':
            continue
        # normalise whitespace so equal phrases compare equal
        distinct_texts.add(' '.join(cell.split()))
    return len(distinct_texts)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnTextCount(toy_alignment, 'txt0'))

In [None]:
def scoreColumnTokenCount(align_df, colname):
    """Count the distinct whitespace-separated tokens in one column.

    Blank cells contribute nothing. Lower is better.
    """
    distinct_tokens = set()
    for cell in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        if cell.strip() != '':
            # split the cell text into tokens and pool them across all cells
            distinct_tokens.update(cell.split())
    return len(distinct_tokens)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnTokenCount(toy_alignment, 'txt0'))

In [None]:
def scoreColumnTokenEntityCount(align_df, colname):
    """Count the distinct UMLS concepts and semantic types found in one column.

    Every non-blank cell text is run through the `scisp` pipeline; for each
    detected entity that has at least one UMLS candidate, the first candidate
    is kept. Lower is better.

    Returns:
        (number of distinct concept ids, number of distinct TUIs).

    Raises:
        ValueError: if any kept concept does not have exactly one TUI.
    """
    cell_texts = [
        cell
        for cell in extractTup(align_df[colname], tup_i='segment', is_frame=False)
        if cell.strip() != ''
    ]
    # process the texts through spacy and pool the detected entities into a set
    entities_per_cell = [scisp(cell).ents for cell in cell_texts]
    unique_entities = set()
    for cell_entities in entities_per_cell:
        if len(cell_entities) != 0:
            unique_entities.update(cell_entities)
    # keep the first UMLS candidate of every entity that has any
    concept_ids = []
    for entity in unique_entities:
        candidates = entity._.umls_ents
        if len(candidates) > 0:
            concept_ids.append(candidates[0][0])
    # look up the TUI list of each concept
    # An informal guide to all of the TUIs: https://gist.github.com/joelkuiper/4869d148333f279c2b2e
    tui_lists = []
    for concept_id in concept_ids:
        tui_lists.append(linker.umls.cui_to_entity[concept_id].types)
    # edge case: a concept with zero or more than one TUI is not handled
    for tui_list in tui_lists:
        if len(tui_list) != 1:
            raise ValueError('!=1 tui for a UMLS entity')
    tuis = {tui_list[0] for tui_list in tui_lists}
    # TODO implement larger groupings of types / more general type groups...
    # https://semanticnetwork.nlm.nih.gov/download/SemGroups.txt
    # from https://semanticnetwork.nlm.nih.gov/
    return len(set(concept_ids)), len(tuis)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnTokenEntityCount(toy_alignment, 'txt0'))

In [None]:
def scoreColumnPhrasePOSCount(align_df, colname):
    """Count the distinct phrase-level part-of-speech labels in one column.

    Reads the 'pos' element of each cell; blank labels are ignored.
    Lower is better.
    """
    distinct_labels = set()
    for phrase_pos in extractTup(align_df[colname], tup_i='pos', is_frame=False):
        if phrase_pos.strip() == '':
            continue
        distinct_labels.add(phrase_pos)
    return len(distinct_labels)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnPhrasePOSCount(toy_alignment, 'txt0'))

In [None]:
def scoreColumnPOSCount(align_df, colname):
    """Count the distinct token-level part-of-speech tags in one column.

    Reads the 'cpos' element of each cell (a sequence of tags per cell);
    blank tags are ignored. Lower is better.
    """
    tags_per_cell = list(extractTup(align_df[colname], tup_i='cpos', is_frame=False))
    distinct_tags = set()
    for cell_tags in tags_per_cell:
        for tag in cell_tags:
            if tag.strip() != '':
                distinct_tags.add(tag)
    return len(distinct_tags)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnPOSCount(toy_alignment, 'txt0'))

In [None]:
def scoreColumnRepresentation(align_df, colname):
    """Return the fraction of rows that have non-blank text in one column.

    Gaps (blank cells) lower the score, so higher is better.

    Returns:
        A float in [0, 1]; 0.0 when the column has no rows at all (instead
        of failing with a division by zero).
    """
    cells = list(extractTup(align_df[colname], tup_i='segment', is_frame=False))
    if len(cells) == 0:
        # nothing to be represented in: treat as entirely unrepresented
        return 0.0
    non_empty_count = sum(1 for cell in cells if cell.strip() != '')
    return non_empty_count / len(cells)

# Higher is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnRepresentation(toy_alignment, 'txt0'))

In [None]:
def scoreColumnTotalTokens(align_df, colname):
    """Count the words (repeats included) across all cells of one column.

    Cell text is split on single space characters only, and the empty pieces
    produced by leading, trailing or repeated spaces are not counted.
    Higher is better.
    """
    total_tokens = 0
    for cell in extractTup(align_df[colname], tup_i='segment', is_frame=False):
        total_tokens += sum(1 for piece in cell.split(' ') if piece != '')
    return total_tokens

# Higher is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreColumnTotalTokens(toy_alignment, 'txt0'))

In [None]:
def scoreRowAlignment(align_df, focus_row):
    """Return the normalised score of aligning `focus_row` against `align_df`.

    The raw score is the second element returned by `alignRowMajorLocal`;
    it is divided by the combined column count of the two inputs.
    Higher is better.
    """
    # (the aligner's score matrix is len(mat_a)*len(mat_b) dimensions)
    # TODO there should be a way to re-derive this based on the direct alignment?
    _, raw_score = alignRowMajorLocal(align_df, focus_row, remove_empty_cols=True)
    combined_width = len(align_df.columns) + len(focus_row.columns)
    return raw_score / combined_width

# Higher is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreRowAlignment(toy_alignment, toy_alignment.loc[[5130]]))

In [None]:
def scoreTermColumnCount(align_df, term):
    """Count the columns in which a phrase or term appears (lower is better).

    Matching is a case-insensitive substring test against each cell's text.
    If the term does not appear at all, 1 is returned (TODO that might not
    be ideal?).
    """
    # TODO should this be a fraction instead? what would that imply?
    # TODO add support for regex patterns (eg numbers?)
    matching_columns = 0
    for colname in align_df.columns:
        cells = list(extractTup(align_df[colname], tup_i='segment', is_frame=False))
        hits = [cell for cell in cells if term.lower() in cell.lower()]
        if len(hits) != 0:
            matching_columns += 1
    return max(1, matching_columns)

def scoreTermListColumnCount(align_df, term_list, term_weights=None):
    """Return the weighted mean of `scoreTermColumnCount` over several terms.

    Args:
        align_df: alignment frame to score.
        term_list: terms to look up; an empty list yields 1 (the default
            column count).
        term_weights: one weight per term; equal weights when None. The
            weights are normalised to sum to 1 here.

    Returns:
        The weighted column count (lower is better).
    """
    if len(term_list) == 0:
        return 1
    raw_weights = [1] * len(term_list) if term_weights is None else term_weights
    # normalise the weights (assume that hasn't been done already)
    weight_total = sum(raw_weights)
    normalised_weights = [weight / weight_total for weight in raw_weights]
    column_counts = [scoreTermColumnCount(align_df, term) for term in term_list]
    return np.dot(column_counts, normalised_weights)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreTermColumnCount(toy_alignment, 'anxiety'))
print()

# Test of using scoreTermListColumnCount with multiple terms (weighted equally)
# Lower is better
temp_list = ['anxiety', 'patient', 'children', 'child']
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    scores = scoreTermListColumnCount(toy_alignment, temp_list)
    print(label, scores)

In [None]:
def scoreRowLayoutCount(align_df):
    """Count the distinct content/gap layouts among the alignment's rows.

    Each row is reduced to a pattern string with '.' for a cell that has
    text and ' ' for a blank cell; the number of different patterns is
    returned. Lower is better.
    """
    layouts = set()
    for row_number in range(len(align_df)):
        cells = list(extractTup(align_df.iloc[row_number], tup_i='segment', is_frame=False))
        pattern = ''.join('.' if cell.strip() != '' else ' ' for cell in cells)
        layouts.add(pattern)
    return len(layouts)

# Lower is better
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    print(label, scoreRowLayoutCount(toy_alignment))

In [None]:
def tempScoreVector(align_df, term_list=[], term_weights=None):
    """Score an alignment with every metric and combine them into one number.

    Per-column metrics are averaged over the columns of `align_df`.

    Args:
        align_df: alignment frame to score.
        term_list: terms handed to `scoreTermListColumnCount`.
        term_weights: optional weights for those terms.

    Returns:
        (singlescore, scores): `scores` is the array of 12 raw metric
        values; `singlescore` is their weighted, direction-adjusted sum
        (higher is better).
    """
    # TODO make this an actual nice function later
    column_names = align_df.columns

    def mean_over_columns(values):
        return sum(values) / len(values)

    embed_variances = [scoreColumnPhraseEmbedVariance(align_df, name) for name in column_names]
    text_counts = [scoreColumnTextCount(align_df, name) for name in column_names]
    token_counts = [scoreColumnTokenCount(align_df, name) for name in column_names]
    entity_pairs = [scoreColumnTokenEntityCount(align_df, name) for name in column_names]
    entity_counts = [pair[0] for pair in entity_pairs]
    tui_counts = [pair[1] for pair in entity_pairs]
    phrase_pos_counts = [scoreColumnPhrasePOSCount(align_df, name) for name in column_names]
    token_pos_counts = [scoreColumnPOSCount(align_df, name) for name in column_names]
    representations = [scoreColumnRepresentation(align_df, name) for name in column_names]
    term_column_count = scoreTermListColumnCount(align_df, term_list, term_weights)

    scores = np.array([
        scoreNumColumns(align_df),             # lower is better
        mean_over_columns(text_counts),        # lower is better
        mean_over_columns(embed_variances),    # lower is better
        mean_over_columns(token_counts),       # lower is better
        mean_over_columns(entity_counts),      # lower is better
        mean_over_columns(tui_counts),         # lower is better
        mean_over_columns(phrase_pos_counts),  # lower is better
        mean_over_columns(token_pos_counts),   # lower is better
        mean_over_columns(representations),    # higher is better
        0,  # placeholder for scoreRowAlignment(align_df, align_df.loc[[5130]]); higher is better
        term_column_count,                     # lower is better
        scoreRowLayoutCount(align_df),         # lower is better
    ])
    # -1 flips "lower is better" metrics so that a higher total is better;
    # only the embedding-variance and term-column-count metrics currently
    # carry a non-zero weight
    score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
    score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
    signed_weights = score_weights * score_direction
    singlescore = np.dot(signed_weights, scores)
    return singlescore, scores

In [None]:
def alignmentTerms(align_df, all_stopwords=None, priority_pos=['NN', 'JJ', 'RB']):
    """Pick the words shared between rows of an alignment and weight them.

    A word's base weight is the number of rows it occurs in. Stopwords and
    words occurring in only one row are dropped. If the tag a word carries
    most often contains any of the `priority_pos` strings (substring match,
    so 'NN' also matches 'NNS'), its weight is squared.

    Args:
        align_df: alignment frame; each row is read through `extractTup`
            for its 'segment' texts and its 'cpos' tag lists.
        all_stopwords: container of words to exclude; defaults to the
            stop words of the `sp` spaCy pipeline.
        priority_pos: tag fragments whose words get the squared weight.

    Returns:
        dict mapping word -> weight, in order of first appearance (so the
        result is reproducible between runs).

    Raises:
        IndexError: if there are fewer non-empty tags than words, so words
            cannot be paired with tags by position.
    """
    if all_stopwords is None:
        all_stopwords = sp.Defaults.stop_words
    rows = [align_df.iloc[rownum] for rownum in range(len(align_df))]
    # per row: the words of all cells, and the tags of all cells
    row_words = [
        ' '.join(extractTup(row, tup_i='segment', is_frame=False)).split()
        for row in rows
    ]
    row_tags = [
        [tag
         for cell_tags in extractTup(row, tup_i='cpos', is_frame=False)
         for tag in cell_tags]
        for row in rows
    ]
    # count how many rows each word is present in; dict.fromkeys de-duplicates
    # within a row while keeping first-appearance order
    row_frequency = {}
    for words in row_words:
        for word in dict.fromkeys(words):
            row_frequency[word] = row_frequency.get(word, 0) + 1
    # keep the non-stopwords that show up in more than one row
    tokens_df = {
        word: count
        for word, count in row_frequency.items()
        if count > 1 and word not in all_stopwords
    }
    # flatten the word and tag lists; words and tags are paired by position
    flat_words = [word for words in row_words for word in words]
    flat_tags = [tag for tags in row_tags for tag in tags if tag != '']
    if len(flat_tags) < len(flat_words):
        raise IndexError(
            'alignmentTerms: %d words but only %d non-empty cpos tags'
            % (len(flat_words), len(flat_tags)))
    # count how often each tag is assigned to each word
    tag_counts = {}
    for word, tag in zip(flat_words, flat_tags):
        counts_for_word = tag_counts.setdefault(word, {})
        counts_for_word[tag] = counts_for_word.get(tag, 0) + 1
    # square the weight of words whose most frequent tag is in a class we
    # care about (ties go to the tag seen first)
    for word in tokens_df:
        counts_for_word = tag_counts[word]
        majority_tag = max(counts_for_word, key=counts_for_word.get)
        if any(pos in majority_tag for pos in priority_pos):
            tokens_df[word] = pow(tokens_df[word], 2)
    return tokens_df

alignmentTerms(toy_alignment_bad)

In [None]:
# Check how sensible the alignmentTerms weighting is when fed to scoreTermListColumnCount
# (terms and weights are derived from the "bad" toy alignment and reused for all four)
tokens_df = alignmentTerms(toy_alignment_bad)
temp_list = list(tokens_df)
temp_weights = list(tokens_df.values())
for label, toy_alignment in [
    ('  bad', toy_alignment_bad),
    (' poor', toy_alignment_poor),
    (' good', toy_alignment_good),
    ('great', toy_alignment_great),
]:
    scores = scoreTermListColumnCount(toy_alignment, temp_list, term_weights=temp_weights)
    print(label, scores)

In [None]:
# tokens_df = alignmentTerms(toy_alignment_bad)
# temp_list = list(tokens_df)
# temp_weights = list(tokens_df.values())
# score_vector_bad = tempScoreVector(
#     toy_alignment_bad, 
#     term_list=temp_list,
#     term_weights=temp_weights)
# score_vector_poor = tempScoreVector(
#     toy_alignment_poor, 
#     term_list=temp_list,
#     term_weights=temp_weights)
# score_vector_good = tempScoreVector(
#     toy_alignment_good, 
#     term_list=temp_list,
#     term_weights=temp_weights)
# score_vector_great = tempScoreVector(
#     toy_alignment_great, 
#     term_list=temp_list,
#     term_weights=temp_weights)

# score_vector_bad
# score_vector_poor
# score_vector_good
# score_vector_great

In [None]:
# ref_alignment_scores = []
# for i in range(len(reference_alignment)):
#     ref_alignment_scores.append(
#         tempScoreVector(
#             reference_alignment[i], 
#             term_list=temp_list,
#             term_weights=temp_weights))

In [None]:
# for i in range(len(reference_alignment)):
#     print('   ', ref_alignment_scores[i][0])
#     print(ref_alignment_scores[i][1])
#     print()
# #     ref_alignment_scores[i][0]
# #     extractTup(reference_alignment[i], tup_i='segment').sort_index()

In [None]:
# TODO: how do we design this score function so that its values are comparable across different alignments?

In [None]:
# import random

# # Experiment to see how much random ordering impacts alignment readability
# temp_ids = [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507] # experiment 1
# # temp_ids = [2030, 2078, 2380, 2437, 2711, 2849, 3194, 3285, 4887, 5437, 6915] # experiment 2
# # temp_ids = [1248, 1275, 1381, 1387, 3871, 4039, 5202, 5204, 6563, 6569] # experiment 3
# temp_alignment_orderings = []
# temp_alignment_outputs = []
# temp_alignment_score_progressions = []
# for i in range(20):
#     temp_ids = random.sample(temp_ids, len(temp_ids))
#     alignment, alignment_score = alignRowMajorLocal(
#         alignment_df.loc[[temp_ids[0]]], 
#         alignment_df.loc[[temp_ids[1]]], 
#         remove_empty_cols=True
#     )
#     temp_alignment_orderings.append(temp_ids)
#     temp_alignment_outputs.append(alignment)
#     temp_alignment_score_progressions.append([alignment_score])
#     for j in range(2, len(temp_ids)):
#         alignment, alignment_score = alignRowMajorLocal(
#             temp_alignment_outputs[i],
#             alignment_df.loc[[temp_ids[j]]], 
#             remove_empty_cols=True
#         )
#         temp_alignment_outputs[i] = alignment
#         temp_alignment_score_progressions[i].append(alignment_score)
# # extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# import pickle
# pickle.dump((
#     temp_alignment_orderings, temp_alignment_outputs, temp_alignment_score_progressions,
#     temp_alignment2_orderings, temp_alignment2_outputs, temp_alignment2_score_progressions,
#     temp_alignment3_orderings, temp_alignment3_outputs, temp_alignment3_score_progressions,
# ), open('temp/ebm-alignmentswithscores.pkl', 'wb'))

In [None]:
# import pickle
# temp_alignment_orderings, temp_alignment_outputs, temp_alignment_score_progressions,\
# temp_alignment2_orderings, temp_alignment2_outputs, temp_alignment2_score_progressions,\
# temp_alignment3_orderings, temp_alignment3_outputs, temp_alignment3_score_progressions \
#     = pickle.load(open('temp/ebm-alignmentswithscores.pkl', 'rb'))

In [None]:
# tokens_df = alignmentTerms(toy_alignment_bad)
# temp_list = list(tokens_df)
# temp_weights = list(tokens_df.values())

# temp_alignment_scores = []
# temp_alignment_scores_detail = []
# for i in range(len(temp_alignment_outputs)):
#     # score the alignments
#     alignment_score = tempScoreVector(
#         temp_alignment_outputs[i], 
#         term_list=temp_list,
#         term_weights=temp_weights)
#     temp_alignment_scores.append(alignment_score[0])
#     temp_alignment_scores_detail.append(alignment_score[1])
    
# # the "best" alignment
# index = temp_alignment_scores.index(max(temp_alignment_scores))
# temp_alignment_orderings[index]
# temp_alignment_scores[index]
# extractTup(temp_alignment_outputs[index], tup_i='segment').sort_index()

# # the "worst" alignment
# index = temp_alignment_scores.index(min(temp_alignment_scores))
# temp_alignment_orderings[index]
# temp_alignment_scores[index]
# extractTup(temp_alignment_outputs[index], tup_i='segment').sort_index()

# # some random alignment
# index = int(0.5*len(temp_alignment_scores))
# temp_alignment_orderings[index]
# temp_alignment_scores[index]
# extractTup(temp_alignment_outputs[index], tup_i='segment').sort_index()

In [None]:
# for index in range(len(temp_alignment_scores)):
#     temp_alignment_orderings[index]
#     temp_alignment_scores[index]
#     extractTup(temp_alignment_outputs[index], tup_i='segment').sort_index()

In [None]:
def scoreTermListColumnCountDetail(align_df, term_list, term_weights=None):
    """Weighted mean of `scoreTermColumnCount` over terms, plus a breakdown.

    Args:
        align_df: alignment frame to score.
        term_list: terms to look up.
        term_weights: one weight per term; equal weights when None. The
            weights are normalised to sum to 1 here.

    Returns:
        (score, detail): `score` is the weighted column count (lower is
        better) and `detail` maps each term to its
        (normalised weight, column count) pair. With an empty `term_list`
        this is (1, {}), so callers can always unpack two values.
    """
    # if we don't have any terms to investigate, return 1 (default col count)
    # together with an empty breakdown
    if len(term_list) == 0:
        return 1, {}
    # by default, weight each term equally
    if term_weights is None:
        term_weights = [1]*len(term_list)
    # And normalize the weights (assume that hasn't been done already)
    tw_sum = sum(term_weights)
    term_weights = [(tw/tw_sum) for tw in term_weights]
    scores = [scoreTermColumnCount(align_df, term) for term in term_list]
    return np.dot(scores, term_weights), dict(zip(term_list, zip(term_weights, scores)))

def tempScoreVectorDetail(align_df, term_list=[], term_weights=None):
    """Score an alignment and return the weighted score together with its ingredients.

    Returns a 4-tuple ``(singlescore, scores, rawscores, term_detail)``:
      * ``singlescore`` - weighted, direction-corrected sum of ``scores``
        (higher is better),
      * ``scores`` - array of the 12 aggregate scores,
      * ``rawscores`` - DataFrame with one row per per-column statistic,
      * ``term_detail`` - per-term breakdown from ``scoreTermListColumnCountDetail``.
    """
    # TODO make this an actual nice function later
    columns = align_df.columns

    def per_column(score_fn):
        # run one column-level scorer over every column, in column order
        return [score_fn(align_df, colname) for colname in columns]

    def average(values):
        return sum(values)/len(values)

    # per-column statistics
    embed_variance = per_column(scoreColumnPhraseEmbedVariance)
    text_count = per_column(scoreColumnTextCount)
    token_count = per_column(scoreColumnTokenCount)
    entity_pairs = per_column(scoreColumnTokenEntityCount)
    entity_count = [pair[0] for pair in entity_pairs]
    entity_tui_count = [pair[1] for pair in entity_pairs]
    phrase_pos_count = per_column(scoreColumnPhrasePOSCount)
    token_pos_count = per_column(scoreColumnPOSCount)
    rows_filled = per_column(scoreColumnRepresentation)
    total_tokens = per_column(scoreColumnTotalTokens)
    term_score, term_detail = scoreTermListColumnCountDetail(align_df, term_list, term_weights)

    # aggregate scores; total_tokens is not part of the aggregate (yet)
    num_columns = scoreNumColumns(align_df)
    aggregate = [
        num_columns,                # lower is better
        average(text_count),        # lower is better
        average(embed_variance),    # lower is better
        average(token_count),       # lower is better
        average(entity_count),      # lower is better
        average(entity_tui_count),  # lower is better
        average(phrase_pos_count),  # lower is better
        average(token_pos_count),   # lower is better
        average(rows_filled),       # higher is better
        0,  # placeholder for scoreRowAlignment(align_df, align_df.loc[[5130]]); higher is better
        term_score,                 # lower is better
    ]
    aggregate.append(scoreRowLayoutCount(align_df))  # lower is better
    scores = np.array(aggregate)

    # weight and sum up the score (higher total score is better)
    direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
    weights = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
    singlescore = np.dot(np.multiply(weights, direction), scores)

    # each column's share of the filled rows / of the tokens
    rows_filled_total = sum(rows_filled)
    relevance_rows_filled = [value/rows_filled_total for value in rows_filled]
    total_tokens_sum = sum(total_tokens)
    relevance_num_tokens = [value/total_tokens_sum for value in total_tokens]

    # collect the raw stats as a displayable df (one labelled row per statistic)
    labelled_stats = [
        ('embed variance', embed_variance),
        ('unique texts', text_count),
        ('unique tokens', token_count),
        ('unique entity', entity_count),
        ('unique entity TUI', entity_tui_count),
        ('unique phrase pos', phrase_pos_count),
        ('unique token pos', token_pos_count),
        ('fract rows filled', rows_filled),
        ('relevance1 (rowsfilled)', relevance_rows_filled),
        ('num tokens', total_tokens),
        ('relevance2 (numtokens)', relevance_num_tokens),
    ]
    rawscores = pd.DataFrame(
        [values for _, values in labelled_stats],
        index=[label for label, _ in labelled_stats],
    )
    return singlescore, scores, rawscores, term_detail

In [None]:
# Display `alignment_wordsonly`, the alignment that the next cell scores.
alignment_wordsonly

In [None]:
# Score `alignment_wordsonly` and display the score breakdown underneath the alignment.
alignment_being_scored = alignment_wordsonly
alignment_terms = alignmentTerms(alignment_being_scored)
singlescore, scores, rawscores, termdetail = tempScoreVectorDetail(
    alignment_being_scored, 
    term_list=list(alignment_terms), 
    term_weights=list(alignment_terms.values())
)
# Recompute the weighted score here; this repeats the weighting done inside
# tempScoreVectorDetail (presumably so the weights can be tuned in this cell).
score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
singlescore = np.dot(np.multiply(score_weights, score_direction), scores)
singlescore
scores
# aggregate entries 0, 8, 9 and 10 of the score vector, shown as strings
pd.DataFrame([str(s) for s in scores]).loc[[0, 8, 9, 10]]
alignment_detail = extractTup(alignment_being_scored, tup_i='segment').sort_index()
# Stack the per-column raw scores under the alignment. DataFrame.append was
# removed in pandas 2.0; pd.concat produces the same stacked frame.
pd.concat([
    alignment_detail,
    rawscores.rename(columns=dict(zip(rawscores.columns, alignment_detail.columns))),
])
termdetail

In [None]:
def parse_string_to_alignment_df(alignment_text):
    """Parse a tab-separated, hand-editable alignment dump into an alignment DataFrame.

    Each line is split on tabs into
    ``row id, <unused>, line kind, <unused>, cell, cell, ...``.
    The line kind selects which part of each cell tuple the line fills:
    ``s-txt`` (segment text), ``s-ppos`` (phrase POS) or ``s-pos`` (token POS
    list). Lines sharing a row id are merged cell by cell. Lines with fewer
    than three fields (e.g. blank lines) or with another kind are skipped.

    Args:
        alignment_text: the whole dump as one string, lines separated by ``\\n``.

    Returns:
        DataFrame indexed by row id (kept as strings), one column per alignment
        column; every cell is a tuple ``(text, phrase_pos, [token_pos, ...])``.
        Rows shorter than the widest row are padded with ``('', '', [])``.
        An empty DataFrame is returned when no usable line is found.
    """
    data_start = 4
    nonpos_characters = [' ', '\'', '[', ']']

    def parse_token_pos(cell):
        # e.g. "[['CD', 'NNS']," -> ['CD', 'NNS']
        tags = [''.join([c for c in pos if c not in nonpos_characters])
                for pos in cell.split(',')]
        return [pos for pos in tags if pos != '']

    # line kind -> (slot in the cell tuple, decoder for the raw cell text)
    field_parsers = {
        's-txt': (0, str.strip),
        's-ppos': (1, str.strip),
        's-pos': (2, parse_token_pos),
    }

    rows = {}
    for line in alignment_text.split('\n'):
        cells = line.split('\t')
        # blank/short lines have no kind field; unknown kinds carry no data we use
        if len(cells) < 3 or cells[2] not in field_parsers:
            continue
        slot, parse_cell = field_parsers[cells[2]]
        row = rows.setdefault(cells[0], [])
        data = cells[data_start:]
        # a later line for the same row may be wider than the first one seen
        row.extend(('', '', []) for _ in range(len(data) - len(row)))
        for i, raw in enumerate(data):
            updated = list(row[i])
            updated[slot] = parse_cell(raw)
            row[i] = tuple(updated)

    if not rows:
        return pd.DataFrame()
    # fill in the blank cells so that every row has the same width
    output_width = max(len(row) for row in rows.values())
    for row in rows.values():
        row.extend(('', '', []) for _ in range(output_width - len(row)))
    return pd.DataFrame(list(rows.values()), index=list(rows.keys()))

# parse_string_to_alignment_df(test_text)

## TODO live update score

In [None]:
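# Example contents for the file `interactive_input/alignment`, which a later cell reads and
# passes to parse_string_to_alignment_df (presumably without the leading "# " on each line).
# Fields are tab-separated: row id, the phrase text (not used by the parser), line kind
# (s-txt / s-pos / s-ppos), one unused field, then one field per alignment column.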

# 2552	children with Asperger syndrome ( AS ) :	s-pos					[['NNS'], 			['IN'], 		['NNP',		'NN'],			['-LRB-'],	['NNP'], 	['-RRB-'], 									[':']]
# 2552	children with Asperger syndrome ( AS ) :	s-txt					children			with		Asperger		syndrome			 (	AS	)									:
# 4594	Fifty children with high-functioning ASD and anxiety	s-pos		[['CD',			'NNS'],			['IN'], 	['RB'], ['HYPH'], ['VBG',							NN', 					CC', 			NN']]
# 4594	Fifty children with high-functioning ASD and anxiety	s-txt		Fifty			children			with	high - functioning							 ASD. 					and			anxiety
# 4618	children with high-functioning autism spectrum disorders and anxiety :	s-pos					[['NNS'], 			['IN'], 	['RB'], ['HYPH'], ['VBG',			NN', 	NN', 	NNS',							CC', 			NN'], 		[':']]
# 4618	children with high-functioning autism spectrum disorders and anxiety :	s-txt					children			with	high - functioning			 autism 	spectrum 	disorders							and			anxiety 		:
# 5126	high-functioning autism spectrum disorders ( ASD ) and clinically significant anxiety	s-pos									['RB'], ['HYPH'], ['VBG',			NN', 	NN', 	NNS'],	['-LRB-'], 	['NNP'], 	['-RRB-'],				['CC'], 	['RB', 	JJ'],	['NN']]
# 5126	high-functioning autism spectrum disorders ( ASD ) and clinically significant anxiety	s-txt									high - functioning			 autism 	spectrum 	disorders	(	ASD	)				and	clinically	significant	anxiety
# 5130	45 children ( 7-11 years of age ) with high-functioning ASD and clinically significant anxiety	s-pos		[['CD',			'NNS'], 	['-LRB-'], ['CD', 'SYM', 'CD', 'NNS'], ['IN'], ['NN'], ['-RRB-'],		['IN'], 	['RB'], ['HYPH'], ['VBG',							NNP'], 					['CC'],	['RB', 	JJ'],	['NN']]
# 5130	45 children ( 7-11 years of age ) with high-functioning ASD and clinically significant anxiety	s-txt		45			children	( 7 - 11 years of age )		with	high - functioning							 ASD  					and	clinically	significant	anxiety
# 5134	high-functioning ASD and clinically significant anxiety	s-pos									['RB'], ['HYPH'], ['VBG',							['NNP'], 					['CC'], 	['RB', 	JJ'],	['NN']]
# 5134	high-functioning ASD and clinically significant anxiety	s-txt									high - functioning							ASD					and	clinically 	significant	anxiety
# 6474	92 outpatients affected by generalized anxiety disorders	s-pos		[['CD',			'NNS'],		['VBN'], 	['IN'], 															['VBN',	'NN', 	NNS']]
# 6474	92 outpatients affected by generalized anxiety disorders	s-txt		92			 outpatients		affected 	by															generalized	anxiety 	disorders
# 6507	anxiety :	s-pos																								[['NN'], 		[':']]
# 6507	anxiety :	s-txt																								anxiety		:
# 7298	patients with previously undetected anxiety	s-pos					[['NNS'], 			['IN'], 														['RB', 	'JJ'],	['NN']]
# 7298	patients with previously undetected anxiety	s-txt					patients			with														previously 	undetected	anxiety
# 7308	573 patients who had unrecognized and untreated anxiety	s-pos		[['CD',			'NNS'],													['WP'],	['VBD'], 	['JJ',	 'CC',		 'JJ'], 	['NN']]
# 7308	573 patients who had unrecognized and untreated anxiety	s-txt		573			patients													who	had	unrecognized	 and		 untreated	anxiety
# 7321	primary care patients with untreated anxiety .	s-pos			[['JJ', 	'NN',	NNS'],			['IN'], 															['JJ',	'NN'], 		['.']]
# 7321	primary care patients with untreated anxiety .	s-txt			primary 	care	patients			with															untreated	anxiety		.

In [None]:
# Read the hand-edited alignment file, re-score it, and display the score
# table underneath the alignment.
with open('interactive_input/alignment', 'r') as f:
    file = f.read()
# splitlines + join normalizes the line endings to '\n'
file = file.splitlines()
input_alignment_text = '\n'.join(file)

alignment_being_scored = parse_string_to_alignment_df(input_alignment_text)
alignment_terms = alignmentTerms(alignment_being_scored)
singlescore, scores, rawscores, termdetail = tempScoreVectorDetail(
    alignment_being_scored, 
    term_list=list(alignment_terms), 
    term_weights=list(alignment_terms.values())
)
# Recompute the weighted score here; this repeats the weighting done inside
# tempScoreVectorDetail (presumably so the weights can be tuned in this cell).
score_direction = np.array([-1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1])
score_weights   = np.array([ 0,  0, 10,  0,  0,  0,  0,  0,  0, 0., 15,  0])
singlescore = np.dot(np.multiply(score_weights, score_direction), scores)
singlescore
# scores
# aggregate (whole-table) scores: entries 0, 9 and 10 of the score vector
rawspreadscores = pd.DataFrame([str(s) for s in scores]).iloc[[0, -3, -2]]
rawspreadscores.index = ['colcount', 'row-alignment', 'termcolcount']
rawspreadscores.rename(columns={0:'aggregate'}, inplace=True)
# rawspreadscores
alignment_detail = extractTup(alignment_being_scored, tup_i='segment').sort_index()
rawscores = rawscores.rename(columns=dict(zip(rawscores.columns, alignment_detail.columns)))
# per-column statistics to display ('unique phrase pos' is left out)
rawscores = rawscores.loc[[
    'embed variance', 'unique texts', 'unique tokens', 
    'unique entity', 'unique entity TUI', 'unique token pos', 
    'fract rows filled', 'relevance1 (rowsfilled)', 
    'num tokens', 'relevance2 (numtokens)']]
rawscores.insert(0, column='aggregate', value=['']*(len(rawscores.index)))
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
scoretable = pd.concat([rawscores, rawspreadscores]).replace(np.nan, '', regex=True)
scoretable
pd.concat([alignment_detail, scoretable])
termdetail

# Alignment ordering exploration

In [None]:
def alignmentTerms(align_df, all_stopwords=None, priority_pos=['NN', 'JJ', 'RB']):
    """Collect the recurring, non-stopword tokens of an alignment with a weight each.

    Args:
        align_df: alignment table readable by ``extractTup`` (``'segment'`` and
            ``'cpos'`` fields).
        all_stopwords: words to ignore; defaults to ``sp.Defaults.stop_words``.
        priority_pos: POS-tag prefixes; a word whose most frequent tag starts
            with one of them gets its weight squared.

    Returns:
        dict mapping word -> weight. The base weight is the number of rows the
        word occurs in; words occurring in only one row are dropped. Keys are
        in order of first appearance in the alignment.
    """
    if all_stopwords is None:
        all_stopwords = sp.Defaults.stop_words
    # collect, per row, the whitespace tokens of the segment texts
    row_tokens = [
        ' '.join(list(extractTup(align_df.iloc[rownum], tup_i='segment', is_frame=False))).split()
        for rownum in range(len(align_df))
    ]
    # collect the token-level POS tags of every row, flattened, skipping blanks
    flat_cpos = [
        tag
        for rownum in range(len(align_df))
        for cell_pos in extractTup(align_df.iloc[rownum], tup_i='cpos', is_frame=False)
        for tag in cell_pos
        if tag != ''
    ]
    flat_text = [token for tokens in row_tokens for token in tokens]
    # count up how many times each POS is assigned to each word; the i-th
    # token is paired with the i-th tag (IndexError if there are fewer tags)
    pos_counts = {}
    for i, word in enumerate(flat_text):
        tag = flat_cpos[i]
        counts = pos_counts.setdefault(word, {})
        counts[tag] = counts.get(tag, 0) + 1
    # pick the single POS that each word is tagged as most often
    # (on a tie, the tag seen first wins)
    pos_mapping = {word: max(counts, key=counts.get) for word, counts in pos_counts.items()}
    # get count of how many rows each word is present in, removing stopwords
    # and words that show up in at most one row at the same time.
    # Membership is tested against each row's token set: the flattened token
    # list would turn `word in row` into a substring test against single tokens.
    row_sets = [set(tokens) for tokens in row_tokens]
    tokens_df = {}
    for word in dict.fromkeys(flat_text):  # unique words, deterministic order
        if word in all_stopwords:
            continue
        row_count = sum(word in row for row in row_sets)
        if row_count > 1:
            tokens_df[word] = row_count
    # square the count of all of the words in the dict that are in POS classes we care about
    # (prefix match, so that 'RB' does not match bracket tags such as '-RRB-')
    priority_prefixes = tuple(priority_pos)
    for word in tokens_df:
        if pos_mapping[word].startswith(priority_prefixes):
            tokens_df[word] = pow(tokens_df[word], 2)
    return tokens_df

# Quick check: display the term weights computed for `toy_alignment_bad`
# (not defined in this part of the notebook; presumably created in an earlier cell).
alignmentTerms(toy_alignment_bad)

In [None]:
import random

# Do some variant of beam search to build up an alignment one at a time!!!
# TODO implement a method to group multiple together at a time? / dynamic programming sort of approach?
# TODO implement a faster method to do a sort of random walk
def buildAlignmentBeamSearch(align_df_src, indices, size_seed=10, size_beam=10, size_filter=5, use_fullscore=True,
                             seed=None):
    """Build a multi-row alignment one row at a time with a sampled beam search.

    Args:
        align_df_src: source table; single rows are selected with ``.loc``.
        indices: labels of the rows to align (any iterable; copied to a list).
        size_seed: number of single-row starting points to sample.
        size_beam: maximum number of candidate next rows sampled per beam entry.
        size_filter: number of candidates kept after each round.
        use_fullscore: rank candidates by the first element of
            ``tempScoreVector``'s result if True, otherwise by the score
            returned by ``alignRowMajorLocal``.
        seed: optional seed for a private ``random.Random`` so a run can be
            reproduced; ``None`` (default) uses the module-level ``random`` state.

    Returns:
        The best-ranked alignment that contains every row in ``indices``.

    Raises:
        ValueError: if ``indices`` is empty.
    """
    # random.sample needs a sequence; this also accepts sets / pandas Index objects
    indices = list(indices)
    if len(indices) == 0:
        raise ValueError('buildAlignmentBeamSearch needs at least one row index')
    rng = random if seed is None else random.Random(seed)
    # Do a basic adjustment of size parameters if there is mismatch
    size_seed = min(size_seed, len(indices))
    size_beam = min(size_beam, len(indices))
    # Set up our term scoring
    alignment_terms = alignmentTerms(align_df_src.loc[indices])
    # Seed the beam
    beam = []
    for seed_index in rng.sample(indices, size_seed):
        # beam format: (alignment, alignment ordering, single penalty matrix score, single alignment score)
        beam.append((align_df_src.loc[[seed_index]], [seed_index], 0, 0))
    # Run the main body of beam search...
    while len(beam[0][1]) < len(indices):
        beam_update = []
        for b in beam:
            # sample a number of possible next indices
            indices_sampling = [i for i in indices if (i not in b[1])]
            indices_sampling = rng.sample(indices_sampling, min(size_beam, len(indices_sampling)))
            # special handling to account for first round: avoid AxB & BxA alignment duplication
            if len(b[1]) == 1:
                # get the list of all alignment orderings already tried and filter the samples out
                attempted = [bu[1] for bu in beam_update]
                indices_sampling = [i for i in indices_sampling if ([i]+b[1] not in attempted)]
            # run through all of the indices sampled to search for next step
            for next_i in indices_sampling:
                alignment, alignment_score = alignRowMajorLocal(
                    b[0],
                    align_df_src.loc[[next_i]],
                    remove_empty_cols=True
                )
                full_score = tempScoreVector(
                    alignment, 
                    term_list=list(alignment_terms), 
                    term_weights=list(alignment_terms.values())
                )[0]
                beam_update.append((alignment, b[1]+[next_i], alignment_score, full_score))
        # sort by descending score (since higher is better)
        # TODO want to see if there's a huge difference between using full score vs alignment score
        beam_update.sort(key=lambda e: e[3] if use_fullscore else e[2], reverse=True)
#         for b in beam_update:
#             print(f'BEAM {b[1]}    {b[2]}      {b[3]}')
#             print(extractTup(b[0], tup_i='segment', is_frame=True))
#             print('=====================')
#         print('===================== END BEAM =====================')
        beam = beam_update[:min(size_filter, len(beam_update))]
    # the beam is sorted in order of descending quality already
    return beam[0][0]

# buildAlignmentBeamSearch(alignment_df, [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507])
# Build a three-row alignment, ranking candidates by the score returned from
# alignRowMajorLocal (use_fullscore=False), display it, then display its tempScoreVector result.
# NOTE(review): the beam search samples with `random` and no seed is set in this cell,
# so the output may differ between runs.
temp_alignment = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130], use_fullscore=False)
temp_alignment
# term weights are recomputed from the finished alignment before scoring it
alignment_terms = alignmentTerms(temp_alignment)
tempScoreVector(
    temp_alignment, 
    term_list=list(alignment_terms), 
    term_weights=list(alignment_terms.values())
)

In [None]:
# bsp5 = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=5, use_fullscore=False)
# bspx = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=10, use_fullscore=False)
# bspl = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=100, use_fullscore=False)
# bsf5 = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=5, use_fullscore=True)
# bsfx = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=10, use_fullscore=True)
# bsfl = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 5130, 6474, 7298], size_filter=100, use_fullscore=True)
# blfx = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507], 
#                                 size_filter=10, use_fullscore=True)
# blfl = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507], 
#                                 size_filter=100, use_fullscore=True)
# blpx = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507], 
#                                 size_filter=10, use_fullscore=False)
# blpl = buildAlignmentBeamSearch(alignment_df, [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507], 
#                                 size_filter=100, use_fullscore=False)

In [None]:
# import pickle
# pickle.dump((
#     bsp5, bspx, bspl,
#     bsf5, bsfx, bsfl,
#     blfx, blfl,
#     blpx, blpl,
# ), open('temp/ebm-beamsearchalignments.pkl', 'wb'))

In [None]:
import pickle
# Reload the ten beam-search alignments from the cache file written by the
# (commented-out) pickle.dump cell above.
# NOTE: pickle.load can execute arbitrary code; only load a cache file you produced yourself.
# The `with` block closes the file handle once the data is loaded.
with open('temp/ebm-beamsearchalignments.pkl', 'rb') as cache_file:
    (bsp5, bspx, bspl,
     bsf5, bsfx, bsfl,
     blfx, blfl,
     blpx, blpl) = pickle.load(cache_file)

# Pick one of the loaded alignments to inspect:
# bsp5, bspx, bspl, bsf5, bsfx, bsfl, blfx, blfl, blpx, blpl
focus_table = blpx
alignment_terms = alignmentTerms(focus_table)
tempScoreVector(
    focus_table, 
    term_list=list(alignment_terms), 
    term_weights=list(alignment_terms.values())
)
extractTup(focus_table, tup_i='segment').sort_index()
# blpx
# blpx

# Alignment state exploration

In [None]:
# At each step, either:
# 1. Greedy step :3
# 2. Random step
# 3. Random restart