# Pairwise Merge

First, load the summarized tutorial data.

In [1]:
import json
# JSON is UTF-8 by specification; pin the encoding so loading does not depend
# on the platform's default locale.
with open('./summarized_responses.json', encoding='utf-8') as f:
    data = json.load(f)

In [2]:
# Peek at the fields available on the first entry under 'basic-analysis'.
data['basic-analysis'][0].keys()

dict_keys(['text', 'code', 'relevant_API'])

We'll flatten tutorials into a list for convenience.
Tutorial boundaries are marked by a sentinel token.

In [3]:
# Flatten the API calls of every tutorial into a single token stream.
# A sentinel token is appended after each tutorial to mark its boundary.
SENTINEL = '<SENTINEL>'
calls = []

for tutorials in data.values():
    for tutorial in tutorials:
        calls.extend(tutorial['relevant_API'])
        calls.append(SENTINEL)

# Quick sanity check: total token count and the first few tokens.
print(len(calls))
print(calls[:20])

553
['scanpy.logging.print_versions', 'scanpy.settings.set_figure_params', 'scanpy.datasets.pbmc3k_processed', 'scanpy.datasets.pbmc68k_reduced', '<SENTINEL>', 'scanpy.pp.pca', 'scanpy.pp.neighbors', 'scanpy.tl.umap', 'scanpy.pl.umap', '<SENTINEL>', 'scanpy.tl.ingest', 'scanpy.pl.umap', '<SENTINEL>', 'scanpy.pl.umap', '<SENTINEL>', 'scanpy.tl.pca', 'scanpy.external.pp.bbknn', 'scanpy.tl.umap', '<SENTINEL>', 'scanpy.pl.umap']


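Next, we'll count how often each pair of consecutive calls (a bigram) occurs. Pairs that span a tutorial boundary (i.e. contain the sentinel) are not counted.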

In [4]:
from collections import Counter


def count_bigrams(tokens, sentinel=None):
    """Count how often each pair of adjacent tokens occurs.

    Pairs in which either element is the boundary sentinel are skipped, so
    the result only contains pairs that lie inside a single tutorial.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of call names).
        sentinel: Boundary marker to exclude. Defaults to the module-level
            ``SENTINEL`` when not given.

    Returns:
        A ``Counter`` mapping ``(first, second)`` tuples to their frequency.
    """
    if sentinel is None:
        sentinel = SENTINEL
    # Pair each token with its successor and filter boundary pairs up front,
    # rather than counting them first and deleting them afterwards.
    return Counter(
        pair for pair in zip(tokens, tokens[1:]) if sentinel not in pair
    )


# Round 0: start from a copy of the flattened calls and count its bigrams.
r0 = list(calls)
c0 = count_bigrams(r0)
c0.most_common(10)

[(('scanpy.tl.rank_genes_groups', 'scanpy.pl.rank_genes_groups'), 8),
 (('scanpy.pp.pca', 'scanpy.pp.neighbors'), 6),
 (('scanpy.pp.neighbors', 'scanpy.tl.umap'), 6),
 (('scanpy.pl.umap', 'scanpy.pl.diffmap'), 6),
 (('scanpy.pp.filter_cells', 'scanpy.pp.filter_genes'), 5),
 (('scanpy.pp.highly_variable_genes', 'scanpy.pl.highly_variable_genes'), 5),
 (('scanpy.tl.leiden', 'scanpy.pl.umap'), 4),
 (('scanpy.pl.rank_genes_groups', 'scanpy.pl.rank_genes_groups_violin'), 4),
 (('scanpy.pp.normalize_per_cell', 'scanpy.pp.log1p'), 4),
 (('scanpy.read', 'scanpy.tl.louvain'), 4)]

We'll merge each pair of consecutive calls whose frequency is at least some threshold into a single token.

In [5]:
def merge(tokens: [str], counts: Counter, threshold: int, verbose=False):
    """Fuse adjacent tokens whose pair frequency reaches ``threshold``.

    The sequence is scanned once from left to right. When the pair formed by
    the current token and its successor has a count of at least ``threshold``
    in ``counts``, both are replaced by the single token ``'w1 -> w2'`` and
    the scan resumes after the pair; otherwise the current token is kept
    unchanged.

    Args:
        tokens: Sequence of tokens to scan.
        counts: Pair frequencies keyed by ``(w1, w2)`` tuples.
        threshold: Minimum frequency at which a pair is merged.
        verbose: When true, print a line for every merge performed.

    Returns:
        A new list holding the merged token sequence.
    """
    result = []
    pos = 0
    last = len(tokens) - 1
    while pos <= last:
        current = tokens[pos]
        if pos == last:
            # Final token has no successor to pair with; keep it as is.
            result.append(current)
            break
        following = tokens[pos + 1]
        freq = counts[(current, following)]
        if freq < threshold:
            result.append(current)
            pos += 1
            continue
        joined = '{} -> {}'.format(current, following)
        if verbose:
            print('Merging {} of frequency {} >= {}'.format(joined, freq, threshold))
        result.append(joined)
        # Both members of the pair are consumed by the merge.
        pos += 2
    return result

# Single merge pass over the round-0 tokens, fusing pairs seen at least 5 times.
r1 = merge(r0, c0, threshold=5, verbose=True)

Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5
Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5
Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5
Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5
Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5
Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5
Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 5
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 5
Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 5
Merging scanpy.tl.rank_genes_groups -> scanpy.pl.rank_genes_groups of frequency 8 >= 5
Merging scanpy.pl.umap -> scanpy.pl.diffmap of f

We'll iteratively merge until there are no more call-chains left to be merged.

In [6]:
def iterative_merge(tokens, threshold, verbose=False):
    """Repeat count-and-merge rounds until a round merges nothing.

    Each round recounts the bigrams of the current sequence and runs one
    ``merge`` pass over it. Iteration stops as soon as a pass leaves the
    sequence length unchanged.

    Args:
        tokens: Initial token sequence; it is copied, not modified.
        threshold: Minimum pair frequency passed on to ``merge``.
        verbose: When true, print per-round statistics and every merge.

    Returns:
        The token list produced by the final merge pass.
    """
    current = tokens[:]
    round_no = 0
    banner = '-' * 10
    while True:
        round_no += 1
        pair_counts = count_bigrams(current)
        if verbose:
            print(banner, 'Round {}'.format(round_no), banner)
            print('Top 5 pairs')
            for pair, freq in pair_counts.most_common(5):
                print('{} occurred {} times'.format(pair, freq))
        reduced = merge(current, pair_counts, threshold=threshold, verbose=verbose)
        # An unchanged length means no pair was fused in this round.
        converged = len(reduced) == len(current)
        current = reduced
        if converged:
            break
    if verbose:
        print('Stopping after Round {}'.format(round_no))
    return current

# Merge repeatedly over the flattened calls, fusing pairs seen at least 3 times.
merged = iterative_merge(calls, threshold=3, verbose=True)

---------- Round 1 ----------
Top 5 pairs
('scanpy.tl.rank_genes_groups', 'scanpy.pl.rank_genes_groups') occurred 8 times
('scanpy.pp.pca', 'scanpy.pp.neighbors') occurred 6 times
('scanpy.pp.neighbors', 'scanpy.tl.umap') occurred 6 times
('scanpy.pl.umap', 'scanpy.pl.diffmap') occurred 6 times
('scanpy.pp.filter_cells', 'scanpy.pp.filter_genes') occurred 5 times
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3
Merging scanpy.pp.pca -> scanpy.pp.neighbors of frequency 6 >= 3
Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3
Merging scanpy.tl.paga -> scanpy.pl.paga of frequency 3 >= 3
Merging scanpy.pp.filter_cells -> scanpy.pp.filter_genes of frequency 5 >= 3
Merging scanpy.pp.normalize_total -> scanpy.pp.log1p of frequency 3 >= 3
Merging scanpy.pp.highly_variable_genes -> scanpy.pl.highly_variable_genes of frequency 5 >= 3
Merging scanpy.tl.leiden -> scanpy.pl.umap of frequency 4 >= 3
Merging s

## Merged call chains

In [7]:
def recover_call_chains(calls, sentinel=None):
    """Split a flat token list back into one call chain per tutorial.

    The sentinel closes the chain in progress. Tokens are grouped directly,
    not joined into a string and re-split, so chains contain no spurious
    empty-string entries and tokens may contain any characters.

    Args:
        calls: Flat list of tokens with a sentinel after each tutorial.
        sentinel: Boundary marker. Defaults to the module-level ``SENTINEL``
            when not given.

    Returns:
        A list of chains, each a list of tokens. A tutorial with no tokens
        yields an empty list so that chain indices stay aligned.
    """
    if sentinel is None:
        sentinel = SENTINEL
    chains = []
    current = []
    for token in calls:
        if token == sentinel:
            chains.append(current)
            current = []
        else:
            current.append(token)
    # Keep trailing tokens that were not terminated by a sentinel.
    if current:
        chains.append(current)
    return chains

# Rebuild the per-tutorial chains from the raw and the merged token streams.
before = recover_call_chains(calls)
after = recover_call_chains(merged)
assert len(before) == len(after)

# Show each tutorial's chain before and after merging.
rule = '-' * 10
for index, (original_chain, merged_chain) in enumerate(zip(before, after)):
    print(rule, 'Tutorial {}'.format(index), rule)
    print('BEFORE')
    for call in original_chain:
        print(call)
    print()
    print('AFTER')
    for call in merged_chain:
        print(call)

---------- Tutorial 0 ----------
BEFORE
scanpy.logging.print_versions
scanpy.settings.set_figure_params
scanpy.datasets.pbmc3k_processed
scanpy.datasets.pbmc68k_reduced


AFTER
scanpy.logging.print_versions
scanpy.settings.set_figure_params
scanpy.datasets.pbmc3k_processed
scanpy.datasets.pbmc68k_reduced

---------- Tutorial 1 ----------
BEFORE

scanpy.pp.pca
scanpy.pp.neighbors
scanpy.tl.umap
scanpy.pl.umap


AFTER

scanpy.pp.pca -> scanpy.pp.neighbors -> scanpy.tl.umap
scanpy.pl.umap

---------- Tutorial 2 ----------
BEFORE

scanpy.tl.ingest
scanpy.pl.umap


AFTER

scanpy.tl.ingest
scanpy.pl.umap

---------- Tutorial 3 ----------
BEFORE

scanpy.pl.umap


AFTER

scanpy.pl.umap

---------- Tutorial 4 ----------
BEFORE

scanpy.tl.pca
scanpy.external.pp.bbknn
scanpy.tl.umap


AFTER

scanpy.tl.pca
scanpy.external.pp.bbknn
scanpy.tl.umap

---------- Tutorial 5 ----------
BEFORE

scanpy.pl.umap
scanpy.read


AFTER

scanpy.pl.umap
scanpy.read

---------- Tutorial 6 ----------
BEFORE

scanpy.