In [None]:
import numpy as np

import pandas as pd
pd.set_option("display.max_rows", None)

import gensim

# Load Google's pre-trained Word2Vec model (binary word2vec format).
# model source: https://code.google.com/archive/p/word2vec/
# The path is relative to the notebook's working directory.
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    'model/GoogleNews-vectors-negative300.bin', 
    binary=True)
# Cache of phrase text -> phrase-embedding DataFrame; it is filled lazily by the
# scoring helper inside alignRowMajorLocal so each phrase is embedded only once.
cached_word2vec_phrases = {}

In [None]:
# Get the word2vec embedding of a phrase
def get_phrase_embed_word2vec(word2vec, phrase):
    """Return a one-row DataFrame holding the averaged embedding of `phrase`.

    Parameters
    ----------
    word2vec : mapping from word to vector
        Anything indexable by a word that raises KeyError for unknown words.
    phrase : str
        Whitespace-separated phrase. A value without a `.split` method
        (e.g. NaN or None) is treated as "no phrase".

    Returns
    -------
    pandas.DataFrame
        One row with one column per embedding dimension plus a 'word' column
        holding the original phrase, or an empty DataFrame when the phrase is
        not a string or none of its words has an embedding.
    """
    try:
        phraseS = phrase.split()
    except AttributeError:
        # not a string (missing value): there is nothing to embed
        return pd.DataFrame()
    emb = []
    for w in phraseS:
        try:
            emb.append(word2vec[w])
        except KeyError:
            # out-of-vocabulary word: skip it
            continue
    if len(emb) == 0:
        return pd.DataFrame()
    # The sum is divided by the TOTAL word count, so out-of-vocabulary words
    # effectively contribute zero vectors to the average.
    emb_sum = pd.DataFrame(emb).sum() / len(phraseS)
    emb_sum['word'] = phrase
    return pd.DataFrame([emb_sum])

# Smoke test: embed a short phrase and display the resulting one-row frame.
get_phrase_embed_word2vec(
    word2vec, 
    'test sentence')
# Alternative input (a longer phrase with punctuation), kept for manual experimentation:
# get_phrase_embed_word2vec(
#     word2vec, 
#     'This is a test sentence !')

# Import sample dataset
(The code to construct the file `temp/ebm-pio_consegments.hdf` is in analyze.ipynb)

In [None]:
# Import the data we've already constructed out of the constituency parse of specific phrases in specific sentences.
# The HDF store key is 'mydata'. Later cells read the 'alignsegments', 'aligntypes',
# 'alignctypes' and 'aligntup' columns of this frame.
con_segments = pd.read_hdf(f'temp/ebm-pio_consegments.hdf','mydata')
con_segments

In [None]:
# Transform that data into the format that is more readable for alignment
# (sorry, this is sort of an abuse of DataFrame datatypes)

def transformTuples(row):
    """Turn one con_segments row into a one-row alignment frame.

    Column `txt{i}` holds the tuple (segment text, phrase type, ctype) built
    from the i-th entries of the row's 'alignsegments', 'aligntypes' and
    'alignctypes' lists. The returned frame is indexed by the row's own label.
    """
    segments = row['alignsegments']
    phrase_types = row['aligntypes']
    child_types = row['alignctypes']
    output = pd.DataFrame()
    for position in range(len(segments)):
        cell = (segments[position], phrase_types[position], child_types[position])
        output[f'txt{position}'] = [cell]
    row_label = pd.Series([row.name])
    return output.set_index(row_label)

# Smoke test on a single row (index label 7298) of the sample dataset.
transformTuples(con_segments.loc[7298])

In [None]:
# One alignment row per distinct index label: only the first row of each group is transformed.
alignment_df = con_segments.groupby(con_segments.index, group_keys=False).apply(
    lambda group: transformTuples(group.iloc[0]))
# Rows have different numbers of segments, so the shorter ones are padded with NaN;
# replace that padding with an empty (segment, type, ctype) tuple.
# The NaN test is value-based rather than `x is np.nan`, because identity only holds
# for the np.nan singleton and not for NaNs read back out of float storage.
# DataFrame.map replaces the deprecated DataFrame.applymap where it exists (pandas >= 2.1).
alignment_df = (alignment_df.map if hasattr(pd.DataFrame, 'map') else alignment_df.applymap)(
    lambda x: ('', '', []) if isinstance(x, float) and np.isnan(x) else x)
alignment_df

# Alignment operations / transformations

In [None]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy.
temp_df = con_segments

def mergeAdjacentNP(row):
    """Merge runs of adjacent noun phrases ('NP') within one row.

    Neighbouring segments whose types are both 'NP' are joined with a space and
    their ctype entries are concatenated. Returns a new row without the
    'aligntup' entry; a KeyError is raised if the row has no 'aligntup'.

    The input row's lists are left untouched. They are shared with the source
    DataFrame, so editing them in place would make re-running the cell merge
    again.
    """
    # TODO adapt this for alignment df format!!!
    # Work on shallow copies so the caller's data is never modified.
    segments = list(row['alignsegments'])
    types = list(row['aligntypes'])
    ctypes = list(row['alignctypes'])
    # Walk right-to-left so deleting position i+1 never shifts a position still to be visited.
    for i in reversed(range(len(segments) - 1)):
        if types[i] == 'NP' and types[i+1] == 'NP':
            segments[i] = segments[i] + ' ' + segments[i+1]
            # build a new object instead of `+=` so nested ctype lists are not extended in place
            ctypes[i] = ctypes[i] + ctypes[i+1]
            # Delete by position rather than marking with [] and filtering afterwards:
            # a genuinely empty ctype list must survive so the three lists stay the same length.
            del segments[i+1]
            del types[i+1]
            del ctypes[i+1]
    row = row.drop('aligntup')
    row['alignsegments'] = segments
    row['aligntypes'] = types
    row['alignctypes'] = ctypes
    return row

# Apply the noun-phrase merge to every row; each returned Series becomes one output row.
temp_df = temp_df.apply(mergeAdjacentNP, axis=1, result_type='expand')
temp_df

In [None]:
# TODO this is still buggy, don't use it
# Re-point temp_df at the original segments frame (same object, not a copy),
# discarding the merged frame produced by the previous cell.
temp_df = con_segments

def mergeParentheses(row):
    """Find runs of segments that together form a parenthetical clause.

    Currently diagnostic only: for every parenthetical that spans more than one
    segment (plus any parenthesis left open at the end of the row), the covered
    'aligntypes' and 'alignsegments' slices are printed. The row itself is
    returned unchanged.
    """
    segments = row['alignsegments']
    open_count = 0
    span_start = -1
    spans = []
    for i, segment in enumerate(segments):
        for c in segment:
            if c == '(':
                open_count += 1
                if open_count == 1:
                    # outermost opener: remember where the clause starts
                    span_start = i
            elif c == ')':
                if open_count == 0:
                    # Unmatched closer: ignore it. Letting the counter go negative
                    # would stop every later '(' from being recognised as outermost.
                    continue
                open_count -= 1
                if open_count == 0 and span_start != i:
                    # close the parentheses
                    spans.append((span_start, i))
    if open_count > 0:
        # unclosed parenthesis: the clause runs to the last segment (inclusive index)
        spans.append((span_start, len(segments) - 1))
    # De-duplicate and visit right-to-left in a deterministic order.
    spans = sorted(set(spans), reverse=True)
    if spans:
        for start, end in spans:
            print(row['aligntypes'][start:end+1])
            print(segments[start:end+1])
        print()
    return row

# Run the parenthesis detector over every row; the result is displayed but not stored.
temp_df.apply(mergeParentheses, axis=1, result_type='expand')

In [None]:
def extractTup(data, tup_i=0, is_frame=True):
    """Project every (segment, type, ctype) cell onto one of its components.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Container whose cells are indexable tuples.
    tup_i : int or str
        Position inside each tuple, or one of the names 'segment' (0),
        'type' (1), 'ctype' (2).
    is_frame : bool
        True when `data` is a DataFrame, False when it is a Series.

    Returns
    -------
    Same kind of container as `data`, with each cell replaced by the chosen
    component.
    """
    types = {
        'segment': 0,
        'type': 1,
        'ctype': 2
    }
    if tup_i in types:
        tup_i = types[tup_i]
    if not is_frame:
        return data.map(lambda x: x[tup_i])
    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # The check is made on the class so a column that happens to be called 'map'
    # cannot be mistaken for the method on older versions.
    elementwise = data.map if hasattr(pd.DataFrame, 'map') else data.applymap
    return elementwise(lambda x: x[tup_i])

# Smoke test: show only the segment text of one alignment row.
# Equivalent call going through transformTuples, kept for reference:
# extractTup(transformTuples(temp_df.loc[7298]), tup_i='segment', is_frame=True)
extractTup(alignment_df.loc[[7298]], tup_i='segment', is_frame=True)

In [None]:
def removeEmptyColumns(align_df):
    """Return a copy of `align_df` without the columns whose segment text is '' in every row.

    The surviving columns are renamed to txt0, txt1, ... in their original
    order. The input frame is not modified.

    NOTE(review): the helper of the same name nested inside alignRowMajorLocal
    also treats whitespace-only segments as empty; this one only matches the
    exact empty string. Confirm which behaviour is wanted here.
    """
    kept = [
        c for c in align_df.columns
        if any(e != '' for e in extractTup(align_df.loc[:, c], tup_i='segment', is_frame=False))
    ]
    # Select and copy instead of `del`, so the caller's frame keeps all of its columns.
    trimmed = align_df.loc[:, kept].copy()
    trimmed.columns = [f'txt{i}' for i in range(len(trimmed.columns))]
    return trimmed

# Smoke test on two rows: columns that are empty in both rows are dropped.
removeEmptyColumns(alignment_df.loc[[7298, 7321]])

In [None]:
import math
from nltk.metrics import edit_distance

def alignRowMajorLocal(align_a, align_b, use_types=False, remove_empty_cols=False, debug_print=False):
    """Align the columns of two alignment frames with Smith-Waterman local alignment.

    Parameters
    ----------
    align_a, align_b : pandas.DataFrame
        Alignment frames whose cells are (segment, type, ctype) tuples and whose
        columns are named txt0, txt1, ... . Each frame may hold several rows
        that are already aligned with one another.
    use_types : bool
        When True, two columns whose non-empty phrase-type sets differ can never
        be matched (their score is -inf).
    remove_empty_cols : bool
        When True, columns whose segments are blank in every row are dropped
        from (copies of) both inputs first. The callers' frames are not modified.
    debug_print : bool
        Print the score map, the score/traceback matrices and the alignment.

    Returns
    -------
    pandas.DataFrame
        The rows of both inputs stacked on a shared set of txt0..txtN columns,
        with empty ('', '', []) tuples wherever a gap was inserted.

    Raises
    ------
    IndexError
        If no pair of columns could be aligned at all.
    ValueError
        If the traceback produced sequences of different length (internal error).
    """
    def removeEmptyColumns(align_df):
        # Work on a trimmed copy so the frame owned by the caller keeps its columns.
        kept = []
        for c in align_df.columns:
            align_df_c = extractTup(align_df.loc[:, c], tup_i='segment', is_frame=False)
            if any(e.strip() != '' for e in align_df_c):
                kept.append(c)
        trimmed = align_df.loc[:, kept].copy()
        trimmed.columns = [f'txt{i}' for i in range(len(trimmed.columns))]
        return trimmed
    def isMissing(x):
        # Value-based NaN test: `x is np.nan` only recognises the np.nan singleton,
        # not NaNs that were read back out of a float column.
        return isinstance(x, float) and np.isnan(x)
    if remove_empty_cols:
        align_a = removeEmptyColumns(align_a)
        align_b = removeEmptyColumns(align_b)
    align_a_segment = extractTup(align_a, tup_i='segment')
    align_b_segment = extractTup(align_b, tup_i='segment')
    align_a_type = extractTup(align_a, tup_i='type')
    align_b_type = extractTup(align_b, tup_i='type')
    # Extracted for the planned ctype-based scoring (see TODO below); not used yet.
    align_a_ctype = extractTup(align_a, tup_i='ctype')
    align_b_ctype = extractTup(align_b, tup_i='ctype')
    # If we are aligning purely on NP elements... not implemented currently.
#     align_a_elems = [i for i in range(len(align_a.columns)) 
#                      if 'NP' in set(align_a_type[align_a.columns[i]])]
#     align_b_elems = [i for i in range(len(align_b.columns)) 
#                      if 'NP' in set(align_b_type[align_a.columns[i]])]
    # If we are doing a general alignment
    align_a_elems = [i for i in range(len(align_a.columns))]
    align_b_elems = [i for i in range(len(align_b.columns))]
    if debug_print:
        print(align_a_elems)
        print(align_b_elems)
        print()
    def getScoreAligningIndices(index_a, index_b):
        # A higher score is better / more match!
        # Make sure the embeddings of all segment texts are cached first.
        text_a = list(align_a_segment[align_a.columns[index_a]])
        text_b = list(align_b_segment[align_b.columns[index_b]])
        for text in text_a+text_b:
            if text not in cached_word2vec_phrases:
                try:
                    cached_word2vec_phrases[text] = get_phrase_embed_word2vec(
                        word2vec, text).drop(columns='word')
                except KeyError:
                    # No embedding available: the returned frame is empty, so it has no
                    # 'word' column to drop. Such texts are simply left out of the cache.
                    pass
        # start off with phrase embedding distance (current max is 60 for perfect match)
        # if we have embeds for any word in each set, ignore others and just use words we have embeds for
        if any(s in cached_word2vec_phrases for s in text_a)\
                and any(s in cached_word2vec_phrases for s in text_b):
            # calculate overall embeds
            embed_a = pd.concat([cached_word2vec_phrases[text] for text 
                                 in text_a if text in cached_word2vec_phrases]).apply(lambda x: x.mean())
            embed_b = pd.concat([cached_word2vec_phrases[text] for text 
                                 in text_b if text in cached_word2vec_phrases]).apply(lambda x: x.mean())
            # TODO can tweak this scoring calculation a little for performance
            score = 10 * (6 - np.linalg.norm(embed_a-embed_b))
        else:
            # use levenshtein dist as fallback... if either set has NO words with embeds available
            scaled_edits_sum = 0
            for phrase_a in [p for p in text_a if len(p) != 0]:
                for phrase_b in [p for p in text_b if len(p) != 0]:
                    scaled_edits_sum += edit_distance(phrase_a,phrase_b) / max(len(phrase_a), len(phrase_b))
            score = 60 * (1 - (scaled_edits_sum / (len(text_a) * len(text_b))))
        # add a component based on phrase type if the use_types flag is set
        # TODO improve this?; this currently just returns -inf if mismatch of type sets
        # Might want to add support for aligning different types of phrase together...
        if use_types:
            types_a = set([t for t in align_a_type[align_a.columns[index_a]] if t != ''])
            types_b = set([t for t in align_b_type[align_b.columns[index_b]] if t != ''])
            if len(types_a) != 0 and len(types_b) != 0 and types_a != types_b:
                score = -1 * math.inf
        # TODO: add a component based on phrase ctype (phrase POS breakdown) (?)
        if debug_print:
            print(f'scoring between '
                  +f'"{list(align_a_segment[align_a.columns[index_a]])}" and '
                  +f'"{list(align_b_segment[align_b.columns[index_b]])}": {score}')
        return score
    def getGapPenalty(length):
        # the first gap position costs 1, every further position costs 0.5
        return -1 * (1 * min(length,1) + 0.5 * max(length-1,0))
    # Build score matrix of size (a-alignables + 1)x(b-alignables + 1)
    scores = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Build traceback matrix
    # traceback = 0 for end, 4 for W, 7 for NW, 9 for N (to calculate traceback, t%2 is N-ness, t%3 is W-ness)
    traceback = np.zeros((len(align_a_elems)+1, len(align_b_elems)+1))
    # Iterate through all of the cells to populate both the score and traceback matrices
    for i in range(1, scores.shape[0]):
        for j in range(1, scores.shape[1]):
            # Maps candidate score -> traceback code. Candidates with equal scores
            # overwrite one another, so the last one written decides the direction.
            score_map = {}
            # calculate score for aligning nouns a[i] and b[j]
            score_map[
                scores[i-1,j-1] + getScoreAligningIndices(align_a_elems[i-1], align_b_elems[j-1])
            ] = 7
            # calculate score for gap in i
            for i_gap in range(1, i):
                igap_score = scores[i-i_gap,j] + getGapPenalty(i_gap)
                score_map[igap_score] = 9
            # calculate score for gap in j
            for j_gap in range(1, j):
                jgap_score = scores[i,j-j_gap] + getGapPenalty(j_gap)
                score_map[jgap_score] = 4
            # add the possibility for unrelatedness
            score_map[0] = 0
            if debug_print:
                print(score_map)
            scores[i,j] = max(score_map.keys())
            traceback[i,j] = score_map[max(score_map.keys())]
    if debug_print:
        print()
        print(scores)
        print(traceback)
        print()
    # Do traceback to build our final alignment
    # NOTE(review): a gap score may stem from a gap longer than one cell, yet the
    # traceback below always steps back a single cell at a time - confirm this is intended.
    tracepoint = np.unravel_index(np.argmax(scores, axis=None), scores.shape)
    points_a = []
    points_b = []
    while traceback[tracepoint] != 0:
        # contribute to the align information
        if traceback[tracepoint] == 7:
            # this is a point where two elements were aligned
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 4:
            # this is a point where there was a gap inserted for row_a
            points_a.append(-1)
            points_b.append(align_b_elems[tracepoint[1]-1])
        elif traceback[tracepoint] == 9:
            # this is a point where there was a gap inserted for row_b
            points_a.append(align_a_elems[tracepoint[0]-1])
            points_b.append(-1)
        # step backwards
        tracepoint = (
            tracepoint[0] - int(traceback[tracepoint] % 2),
            tracepoint[1] - int(traceback[tracepoint] % 3))
    points_a = list(reversed(points_a))
    points_b = list(reversed(points_b))
    if len(points_a) != len(points_b):
        # enforce that align_a and align_b are the same length (they should be)
        raise ValueError('should not occur; bug in S-W local alignment?')
    if debug_print:
        print(points_a)
        print(points_b)
        print()
    # Create a nice neat form of this alignment
    # TODO add support for NP-only alignment gaps?
    range_a = [i for i in points_a if i >= 0]
    range_b = [i for i in points_b if i >= 0]
    if not range_a or not range_b:
        # Same exception type the indexing below would raise, with a usable message.
        raise IndexError('local alignment is empty: no pair of columns could be aligned')
    range_a = (range_a[0], range_a[-1])
    range_b = (range_b[0], range_b[-1])
    # Layout of the output columns: both unaligned prefixes, the aligned core,
    # then both unaligned suffixes.
    output_columns = [f'txt{i}' for i in range(
        (range_a[0] + range_b[0]) + len(points_a)
        + max(0, (len(align_a.columns) - range_a[1]) - 1) 
        + max(0, (len(align_b.columns) - range_b[1]) - 1)
    )]
    # build the segment from align_a: [a prefix][gaps for b prefix][core][a suffix][gaps for b suffix]
    realign_a = align_a.loc[:, [f'txt{i}' for i in range(range_a[0])]]
    for i in range(range_b[0]):
        realign_a.insert(len(realign_a.columns), f'insx{i}', np.nan, True)
    for i in points_a:
        if i >= 0:
            realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
        else:
            realign_a.insert(len(realign_a.columns), f'ins{len(realign_a.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_a[align_a.columns[i]] = align_a.loc[:, align_a.columns[i]]
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_a.insert(len(realign_a.columns), f'insx{i+range_b[0]}', np.nan, True)
    # build the segment from align_b: [gaps for a prefix][b prefix][core][gaps for a suffix][b suffix]
    realign_b = align_b.loc[:, [f'txt{i}' for i in range(range_b[0])]]
    for i in range(range_a[0]):
        realign_b.insert(0, f'insx{i}', np.nan, True)
    for i in points_b:
        if i >= 0:
            realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
        else:
            realign_b.insert(len(realign_b.columns), f'ins{len(realign_b.columns)}', np.nan, True)
    for i in range(range_a[1]+1, len(align_a.columns)):
        realign_b.insert(len(realign_b.columns), f'insx{i+range_a[0]}', np.nan, True)
    for i in range(range_b[1]+1, len(align_b.columns)):
        realign_b[align_b.columns[i]] = align_b.loc[:, align_b.columns[i]]
    # build final output
    realign_a.columns = output_columns
    realign_b.columns = output_columns
    # pd.concat instead of DataFrame.append, which no longer exists in pandas 2.0+
    output = pd.concat([realign_a, realign_b])
    # DataFrame.map replaces the deprecated DataFrame.applymap where available (pandas >= 2.1)
    elementwise = output.map if hasattr(pd.DataFrame, 'map') else output.applymap
    return elementwise(lambda x: ('', '', []) if isMissing(x) else x)

# Toy alignment of two single rows, built directly from temp_df and without removing empty columns.
toy_align = alignRowMajorLocal(transformTuples(temp_df.loc[7298]), transformTuples(temp_df.loc[7321]))
toy_align

In [None]:
# Same two rows, this time taken from alignment_df and with all-empty columns removed first.
# This overwrites the toy_align from the previous cell.
toy_align = alignRowMajorLocal(alignment_df.loc[[7298]], alignment_df.loc[[7321]], remove_empty_cols=True)
toy_align

In [None]:
# Create the toy data: a single 'txt' column of example phrases
toy_phrases = [
    'Asperger syndrome',
    'high - functioning ASD',
    'unrecognized and untreated anxiety',
    'generalized anxiety disorders',
    'anxiety',
    'high - functioning autism spectrum disorders and anxiety',
    'high - functioning ASD and anxiety',
    'high - functioning ASD',
    'high - functioning autism spectrum disorders',
    'previously undetected anxiety',
    'untreated anxiety',
]
toy_data = pd.DataFrame({'txt': toy_phrases})
toy_data

In [None]:
# Column split step 1: Build word tree with node = word units running right->left

# add text to the given trienode
def wordTreeHelper(tree_node, text, id_data=None, right_align=False):
    """Insert `text` into the word trie rooted at `tree_node` and return that root.

    The text is consumed one space-separated word at a time, from the left by
    default or from the right when `right_align` is True. Each word becomes a
    nested dict key. Once the text is exhausted, the entry `id_data: id_data`
    is stored in the node that was reached, marking the end of that phrase.
    The trie is modified in place.
    """
    node = tree_node
    remaining = text.strip()
    while remaining != '':
        words = remaining.split(' ')
        # pick the word at the end we are consuming from; keep the rest for the next round
        if right_align:
            key, rest = words[-1], words[0:-1]
        else:
            key, rest = words[0], words[1:]
        if key not in node:
            node[key] = {}
        node = node[key]
        remaining = ' '.join(rest).strip()
    # text exhausted: record which entry ends at this node
    node[id_data] = id_data
    return tree_node

def wordTree(df, colname, right_align=False):
    """Build a word trie from column `colname` of `df`, one entry per index label.

    Every phrase is inserted with its index label as the terminal marker.
    """
    root = {}
    for row_id in df.index:
        phrase = df.loc[row_id][colname]
        root = wordTreeHelper(root, phrase, id_data=row_id, right_align=right_align)
    return root

# Build the (left-aligned) word trie for the toy phrases and display it.
st = wordTree(toy_data, 'txt')
st

In [None]:
# Column split step 2: Collapse the suffix trie (merge nodes with only one child)

# edits the input trie
def wordTreeCollapse(tree, right_align=False):
    """Collapse single-child word chains of the trie in place and return it.

    A word node whose collapsed subtree has exactly one child, and that child is
    itself a word (string key), is fused with the child into one multi-word key.
    Words are joined in reading order: 'child grandchild' by default, or
    'grandchild child' when `right_align` is True. Terminal markers (non-string
    keys) are never merged.
    """
    fused = {}
    obsolete = []
    for key in tree:
        if type(key) is not str:
            continue
        # collapse bottom-up, so the subtree is already in final form when inspected
        subtree = wordTreeCollapse(tree[key], right_align=right_align)
        tree[key] = subtree
        if len(subtree) != 1:
            continue
        only_key = list(subtree)[0]
        if type(only_key) is not str:
            continue
        # queue the edit; the dict must not change size while it is being iterated
        obsolete.append(key)
        joined = only_key + ' ' + key if right_align else key + ' ' + only_key
        fused[joined] = subtree[only_key]
    for key in obsolete:
        tree.pop(key)
    for joined, subtree in fused.items():
        tree[joined] = subtree
    return tree

# Collapse the trie built above; wordTreeCollapse edits the dict in place and returns it.
st = wordTreeCollapse(st)
st

In [None]:
# Column split step 3: Output the suffix trie to multiple columns

# Calculate how many output columns we'll need
# Get the depth of the trie (a trie with one terminal node {0:0} has depth 0)
def wordTreeDepth(tree):
    """Return the number of word levels on the longest path through the trie.

    Only string keys count as levels; terminal markers (non-string keys) add
    nothing, so a trie holding just a terminal marker has depth 0.
    """
    branch_depths = [
        1 + wordTreeDepth(subtree)
        for key, subtree in tree.items()
        if type(key) is str
    ]
    return max(branch_depths, default=0)

def wordTreeSplitHelper(tree, max_depth, output, so_far=[], right_align=False):
    """Walk the trie and record, for every terminal marker, its path padded to `max_depth`.

    `so_far` is the list of word keys on the path from the root to `tree`; it is
    never modified, a new list is built at each step. Results are written into
    the `output` dict as marker -> list of `max_depth` cells. Empty-string
    padding goes on the right by default and on the left when `right_align`
    is True. Returns `output`.
    """
    for key, subtree in tree.items():
        if type(key) is str:
            # word node: extend the path on the side we are aligned to and descend
            path = [key] + so_far if right_align else so_far + [key]
            output = wordTreeSplitHelper(subtree,
                                         max_depth,
                                         output,
                                         path,
                                         right_align=right_align)
            continue
        # terminal marker: emit the padded path for this entry
        padding = [''] * (max_depth - len(so_far))
        output[key] = padding + so_far if right_align else so_far + padding
    return output

def wordTreeSplit(tree, colname, right_align=False):
    """Flatten a (collapsed) word trie into a DataFrame with one row per terminal marker.

    The frame has one column per trie level, named `{colname}0`, `{colname}1`,
    and so on; rows are labelled by the terminal markers stored in the trie.
    """
    depth = wordTreeDepth(tree)
    rows_by_id = wordTreeSplitHelper(tree, depth, {}, right_align=right_align)
    output = pd.DataFrame(columns=[f'{colname}{level}' for level in range(depth)])
    for row_id, cells in rows_by_id.items():
        output.loc[row_id] = cells
    return output
    
# Smoke test: spread the collapsed toy trie over columns split0, split1, ...
wordTreeSplit(st, 'split')

In [None]:
def splitCol(src_alignment, split_col, right_align):
    """Split one text column into several columns along shared word prefixes or suffixes.

    The phrases in `split_col` are put into a word trie (read right-to-left when
    `right_align` is True), single-child chains are collapsed, and every trie
    level becomes its own column. The new columns take the place of `split_col`
    and all columns are then renumbered txt0, txt1, ...

    Column names are expected to look like 'txt<number>'.
    """
    # TODO make splitCol partially preserve POS information?
    tree = wordTree(src_alignment, split_col, right_align=right_align)
    tree = wordTreeCollapse(tree, right_align=right_align)
    splitted = wordTreeSplit(tree, f'{split_col}.', right_align=right_align)
    # `columns=` keyword instead of the positional axis argument, which pandas 2.0 removed
    result = src_alignment.join(splitted).drop(columns=split_col)

    def column_position(name):
        # 'txt3' -> (3,), 'txt0.12' -> (0, 12). Comparing integer tuples keeps
        # 'txt0.10' after 'txt0.9'; parsing the suffix as a float would read
        # '0.10' as 0.1 and sort it before 'txt0.2'.
        return tuple(int(part) for part in name[3:].split('.'))

    result = result.reindex(columns=sorted(result.columns, key=column_position))
    result.columns = [f'txt{i}' for i in range(len(result.columns))]
    return result

# Demo: split the first column of the toy alignment along shared word suffixes.
# NOTE: toy_align now holds plain segment strings instead of (segment, type, ctype) tuples.
toy_align = splitCol(extractTup(toy_align, tup_i='segment'), 'txt0', right_align=True)
toy_align

In [None]:
# TODO is it worth it to apply the old POS tags again, or can those just be discarded at this point?

def applyEmptyTup(row):
    """Wrap every text cell of `row` as a (text, '', []) alignment tuple.

    Returns a one-row DataFrame with the same column labels as the row's index,
    indexed by the row's own label. Phrase type and ctype are left empty.
    """
    output = pd.DataFrame()
    for label, text in row.items():
        output[label] = [(text, '', [])]
    row_label = pd.Series([row.name])
    return output.set_index(row_label)

# Single-row variant, kept for manual experimentation:
# applyEmptyTup(toy_align.loc[7298])

# Convert the split toy alignment back into tuple cells, using the first row of each index label.
toy_align = toy_align.groupby(toy_align.index, group_keys=False).apply(
    lambda group: applyEmptyTup(group.iloc[0])
)
toy_align

In [None]:
def mergeCol(src_alignment, merge_col):
    """Merge column `merge_col` with the column directly to its right.

    The two texts are joined with a single space and surrounding whitespace is
    stripped, so merging with an empty cell does not leave a stray blank (two
    empty cells merge to '' rather than ' '). The result is a new frame whose
    columns are renumbered txt0, txt1, ...; the input frame is left unchanged.

    Raises
    ------
    ValueError
        If `merge_col` is not a column of `src_alignment`.
    IndexError
        If `merge_col` is the last column, so there is nothing to merge with.
    """
    # TODO make mergeCol partially preserve POS information?
    columns = list(src_alignment.columns)
    position = columns.index(merge_col)
    if position + 1 >= len(columns):
        raise IndexError(f'cannot merge {merge_col!r}: it has no column to its right')
    merge_col_next = columns[position + 1]
    merged = (src_alignment[merge_col] + ' ' + src_alignment[merge_col_next]).str.strip()
    result = src_alignment.copy()
    result[merge_col] = merged
    del result[merge_col_next]
    result.columns = [f'txt{i}' for i in range(len(result.columns))]
    return result

# Smoke test: merge the first two segment columns of the toy alignment.
mergeCol(extractTup(toy_align, tup_i='segment'), 'txt0')

In [None]:
# Print the raw segment list of each row used in the alignment experiments below
# (the first row is taken when an index label occurs more than once).
for i in [7298, 7321, 5126, 5134, 4594, 4618, 6507, 6474, 7308, 5130, 2552]:
    print(i, temp_df[temp_df.index==i].iloc[0]['alignsegments'])

In [None]:
# Alignment in manually selected "nice" order with types enforced
# Round 1: align hand-picked pairs of rows.
temp_align = [
    alignRowMajorLocal(alignment_df.loc[[first_id]], alignment_df.loc[[second_id]],
                       remove_empty_cols=True, use_types=True)
    for first_id, second_id in [(5126, 5134), (7298, 7321), (4594, 4618), (5130, 2552), (6474, 7308)]
]
# Round 2: align the pair results with one another (and with the one leftover row).
update_temp_align = [
    alignRowMajorLocal(temp_align[2], temp_align[3], remove_empty_cols=True, use_types=True),
    alignRowMajorLocal(temp_align[1], temp_align[4], remove_empty_cols=True, use_types=True),
    alignRowMajorLocal(temp_align[0], alignment_df.loc[[6507]], remove_empty_cols=True, use_types=True),
]
temp_align = update_temp_align
# Round 3: fold the three groups into one.
update_temp_align = [
    alignRowMajorLocal(temp_align[0], temp_align[2], remove_empty_cols=True, use_types=True)
]
update_temp_align[0] = alignRowMajorLocal(update_temp_align[0], temp_align[1],
                                          remove_empty_cols=True, use_types=True)
manually_aligned_group = update_temp_align[0]
extractTup(manually_aligned_group, tup_i='segment').sort_index()

In [None]:
# Alignment in manually selected "nice" order without types enforced
# Round 1: align hand-picked pairs of rows.
temp_align = [
    alignRowMajorLocal(alignment_df.loc[[first_id]], alignment_df.loc[[second_id]],
                       remove_empty_cols=True)
    for first_id, second_id in [(5126, 5134), (7298, 7321), (4594, 4618), (5130, 2552), (6474, 7308)]
]
# Round 2: only the alignment that involves a raw row strips empty columns.
update_temp_align = [
    alignRowMajorLocal(temp_align[2], temp_align[3]),
    alignRowMajorLocal(temp_align[1], temp_align[4]),
    alignRowMajorLocal(temp_align[0], alignment_df.loc[[6507]], remove_empty_cols=True),
]
temp_align = update_temp_align
# Round 3: fold the three groups into one.
update_temp_align = [alignRowMajorLocal(temp_align[0], temp_align[2])]
update_temp_align[0] = alignRowMajorLocal(update_temp_align[0], temp_align[1])
manually_aligned_group = update_temp_align[0]
extractTup(manually_aligned_group, tup_i='segment').sort_index()

In [None]:
# Alignment in random-ish order without types enforced
# Start from one pair, then fold the remaining rows in one at a time.
temp_align = alignRowMajorLocal(alignment_df.loc[[7298]], alignment_df.loc[[7321]], remove_empty_cols=True)
for next_row_id in (5126, 5134, 4594, 4618, 6507, 6474, 7308, 5130, 2552):
    temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[next_row_id]], remove_empty_cols=True)
manually_aligned_group2 = temp_align
extractTup(manually_aligned_group2, tup_i='segment').sort_index()

In [None]:
# Alignment in random-ish order without types enforced (of a slightly different dataset!!!)
# Start from one pair, then fold the remaining rows in one at a time.
temp_align = alignRowMajorLocal(alignment_df.loc[[7494]], alignment_df.loc[[7541]], remove_empty_cols=True)
for next_row_id in (7549, 7585, 7594, 416, 423, 443, 447, 409, 1960, 1989):
    temp_align = alignRowMajorLocal(temp_align, alignment_df.loc[[next_row_id]], remove_empty_cols=True)
manually_aligned_group2 = temp_align
extractTup(manually_aligned_group2, tup_i='segment').sort_index()

In [None]:
# Demonstrate a merge operation
# Columns are renumbered after every merge, so each name refers to the frame as it is at that step.
manually_aligned_group_merge = extractTup(manually_aligned_group, tup_i='segment')
for merge_target in ('txt4', 'txt4', 'txt1', 'txt4'):
    manually_aligned_group_merge = mergeCol(manually_aligned_group_merge, merge_target)
manually_aligned_group_merge.sort_index()

In [None]:
# Demonstrate a split operation
# Each step names the column to split (in the frame as renumbered so far) and the side to align on.
manually_aligned_group_split = manually_aligned_group_merge
for split_target, align_right in (('txt0', True), ('txt4', False), ('txt5', True), ('txt7', False)):
    manually_aligned_group_split = splitCol(manually_aligned_group_split, split_target,
                                            right_align=align_right)
manually_aligned_group_split.sort_index()

In [None]:
# Put the split operation output into something more like the standard alignment DF format:
# every text cell becomes a (text, '', []) tuple, using the first row of each index label.
manually_aligned_group_split = manually_aligned_group_split.groupby(
    manually_aligned_group_split.index, group_keys=False).apply(
    lambda group: applyEmptyTup(group.iloc[0])
)
manually_aligned_group_split

In [None]:
# Re-align the output of the split function using the same ordering as initial alignment
# Round 1: align hand-picked pairs of rows.
manually_aligned_group_realign = [
    alignRowMajorLocal(manually_aligned_group_split.loc[[first_id]],
                       manually_aligned_group_split.loc[[second_id]],
                       remove_empty_cols=True)
    for first_id, second_id in [(5126, 5134), (7298, 7321), (4594, 4618), (5130, 2552), (6474, 7308)]
]
# Round 2: align the pair results with one another (and with the one leftover row).
update_manually_aligned_group_realign = [
    alignRowMajorLocal(manually_aligned_group_realign[2],
                       manually_aligned_group_realign[3],
                       remove_empty_cols=True),
    alignRowMajorLocal(manually_aligned_group_realign[1],
                       manually_aligned_group_realign[4],
                       remove_empty_cols=True),
    alignRowMajorLocal(manually_aligned_group_realign[0],
                       manually_aligned_group_split.loc[[6507]],
                       remove_empty_cols=True),
]
manually_aligned_group_realign = update_manually_aligned_group_realign
# Round 3: fold the three groups into one.
update_manually_aligned_group_realign = [
    alignRowMajorLocal(manually_aligned_group_realign[0],
                       manually_aligned_group_realign[2],
                       remove_empty_cols=True)
]
update_manually_aligned_group_realign[0] = alignRowMajorLocal(
    update_manually_aligned_group_realign[0],
    manually_aligned_group_realign[1],
    remove_empty_cols=True)
manually_aligned_group_realign = update_manually_aligned_group_realign[0]
extractTup(manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# Re-align the output of the split function by a manual ordering
# Start from one pair, then fold the remaining rows in one at a time.
update_manually_aligned_group_realign = alignRowMajorLocal(
    manually_aligned_group_split.loc[[7298]],
    manually_aligned_group_split.loc[[7321]],
    remove_empty_cols=True)
for next_row_id in (5126, 7308, 6474, 4594, 4618, 5130, 5134, 2552, 6507):
    update_manually_aligned_group_realign = alignRowMajorLocal(
        update_manually_aligned_group_realign,
        manually_aligned_group_split.loc[[next_row_id]],
        remove_empty_cols=True)
extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# Re-align the output of the split function using a DIFFERENT manual ordering
# Start from one pair, then fold the remaining rows in one at a time.
update_manually_aligned_group_realign = alignRowMajorLocal(
    manually_aligned_group_split.loc[[2552]],
    manually_aligned_group_split.loc[[4618]],
    remove_empty_cols=True)
for next_row_id in (4594, 5130, 7298, 7308, 7321, 6474, 5134, 6507, 5126):
    update_manually_aligned_group_realign = alignRowMajorLocal(
        update_manually_aligned_group_realign,
        manually_aligned_group_split.loc[[next_row_id]],
        remove_empty_cols=True)
extractTup(update_manually_aligned_group_realign, tup_i='segment').sort_index()

In [None]:
# Show the segment text of the final re-alignment in a hand-picked row order.
# Alternative: the same rows in ascending index order.
# extractTup(update_manually_aligned_group_realign, tup_i='segment').loc[
#     [2552, 4594, 4618, 5126, 5130, 5134, 6474, 6507, 7298, 7308, 7321]
# ]
extractTup(update_manually_aligned_group_realign, tup_i='segment').loc[
    [2552, 4594, 4618, 5130, 6474, 7298, 7308, 7321, 5126, 5134, 6507]
]