In [None]:
%load_ext autoreload

In [None]:
# Work from the project root so the relative data/ and ref/ paths used by
# later cells resolve. A KeyError here means PROJECT_ROOT is not set.
import os as _os
_project_root = _os.environ['PROJECT_ROOT']
_os.chdir(_project_root)
_os.path.realpath(_os.curdir)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp

In [None]:
profiles = {}

# Genes whose per-position depth is extracted and compared below.
gene_list = [
    'UHGG100745_03107',
    'UHGG041606_00976',
    'UHGG037134_01291',
    'UHGG029160_00499',
    'UHGG019871_03555',
    'UHGG182529_06021',
    'UHGG261920_02856',
    'UHGG000139_00452',
    'UHGG029950_00110',
    'UHGG286966_00934',
]

# Alternation pattern for grep's basic-regex syntax (the same `\|`-separated
# form used by the %%bash extraction cell). A raw string is required:
# '\|' in a normal string literal is an invalid escape sequence.
grep_pattern = r'\|'.join(gene_list)
print(grep_pattern)

In [None]:
%%bash
# For each value of q, keep only the depth rows that mention one of the genes
# of interest and write them to test<q>.tsv.
# NOTE(review): q presumably is a minimum mapping-quality cutoff (the input
# file names end in "mapq<q>") - confirm.
depth_prefix='data/group/xjin_hmp2/species/sp-101380/genome/Ruminococcus-gnavus-ATCC-29149_MinIONHybrid.tiles-l100-o99.pangenomes99.position_depth-mapq'
gene_pattern='UHGG100745_03107\|UHGG041606_00976\|UHGG037134_01291\|UHGG029160_00499\|UHGG019871_03555\|UHGG182529_06021\|UHGG261920_02856\|UHGG000139_00452\|UHGG029950_00110\|UHGG286966_00934'
for q in 0 2 10; do
    echo "$q"
    lz4cat "${depth_prefix}${q}.tsv.lz4" | grep "$gene_pattern" > "test${q}.tsv"
done

In [None]:
# Compare per-position depth of each gene across test0.tsv, test2.tsv and
# test10.tsv: one panel per gene plus a per-gene summary table.


def _read_depth(q):
    """Read test{q}.tsv as a frame indexed by (gene_id, position) with a `tally` column."""
    return pd.read_table(
        f'test{q}.tsv',
        names=['gene_id', 'position', 'tally'],
        index_col=['gene_id', 'position'],
    )


q0, q2, q10 = (_read_depth(q) for q in (0, 2, 10))
# Left joins onto the q0 table; the overlapping `tally` columns end up as
# tally0 / tally2 / tally10, and rows missing from q2 / q10 are filled with 0.
data = q0.join(q2.join(q10, rsuffix='10'), lsuffix='0', rsuffix='2').fillna(0)

gene_list = data.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
# squeeze=False always returns a 2-D array of Axes. Without it a table holding
# a single gene yields a bare Axes object and the zip() below fails.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes), squeeze=False)
axs = axs[:, 0]

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d = (
        data
        .xs(gene_id, level='gene_id')
        .assign(
            # 0 / 0 is NaN where tally0 is 0; those positions count as 0 in
            # the "unmasked length" sums below.
            ratio2=lambda x: x.tally2 / x.tally0,
            ratio10=lambda x: x.tally10 / x.tally0,
        )
    )

    # Depths on the left axis, ratios relative to tally0 on the right axis.
    ax2 = ax.twinx()
    for column, color in [('tally0', 'tab:blue'), ('tally2', 'tab:orange'), ('tally10', 'tab:green')]:
        ax.plot(column, data=d, color=color, label=column)
    for column, color in [('ratio2', 'tab:purple'), ('ratio10', 'tab:pink')]:
        ax2.plot(column, data=d, color=color, label=column)
    ax.set_ylim(0, 100)
    ax2.set_ylim(0, 1)

    tally2_sum = d.tally2.sum()
    tally10_sum = d.tally10.sum()
    # Sum over positions of tallyN / tally0.
    unmasked2_length = d.ratio2.fillna(0).sum()
    unmasked10_length = d.ratio10.fillna(0).sum()
    ax2.annotate(f"{unmasked2_length:.1f} {tally2_sum:.1f}\n{unmasked10_length:.1f} {tally10_sum:.1f}", xy=(0.9, 0.9), xycoords="axes fraction", ha='right', va='top')

    results[gene_id] = dict(
        tally0_sum=d.tally0.sum(),
        tally2_sum=tally2_sum,
        tally10_sum=tally10_sum,
        unmasked2_length=unmasked2_length,
        unmasked10_length=unmasked10_length,
        raw_length=len(d),
    )

# Only the last panel carries a legend (depth lines only).
axs[-1].legend(loc='upper center')

results = pd.DataFrame(results).T
print(results.sum(), results.raw_length.mean())
print(
    (results.tally0_sum / results.raw_length).sum(),
    results.tally2_sum.sum() / results.unmasked2_length.sum(),
    results.tally10_sum.sum() / results.unmasked10_length.sum(),
)
results

In [None]:
!sort -k2,2 data/species/sp-101380/genome/Ruminococcus-gnavus-ATCC-29149_MinIONHybrid.midas_uhgg_pangenome-blastn.gene_matching-c75-t95.tsv | head -10

In [None]:
# ID searched for in cluster_info.txt here and passed to the `%%bash -s` cell.
# NOTE(review): the name suggests a 75%-identity cluster centroid - confirm.
centroid_75 = 'UHGG000135_00003'

# Print columns 1 and 7 of every cluster_info.txt line that contains the ID
# anywhere on the line (grep is not restricted to one column).
!grep "{centroid_75}" ref/midasdb_uhgg/pangenomes/101380/cluster_info.txt | cut -f1,7

In [None]:
%%bash -s "$centroid_75"
# $1 is the value of the notebook variable `centroid_75`.
# For every pangenome version suffix and q in 0 1 2, keep the depth rows that
# match any column-1 entry of the cluster_info.txt lines mentioning $1, and
# write them to test<q><suffix>.tsv. All 36 extractions run in the background.
if [ -z "$1" ]; then
    echo "centroid ID argument is empty" >&2
    exit 1
fi

# Build the pattern list once instead of re-running the lookup in every job.
gene_ids=$(mktemp)
trap 'rm -f "$gene_ids"' EXIT
grep "$1" ref/midasdb_uhgg/pangenomes/101380/cluster_info.txt | cut -f1 > "$gene_ids"

depth_prefix='data/group/xjin_hmp2/species/sp-101380/genome/Ruminococcus-gnavus-ATCC-29149_MinIONHybrid.tiles-l100-o99.pangenomes99'
for v in '' '-v2' '-v3' '-v4' '-v5' '-v6' '-v7' '-v8' '-v9' '-v10' '-v11' '-v12'
do
    for q in 0 1 2
    do
        lz4cat "${depth_prefix}${v}.position_depth-mapq${q}.tsv.lz4" \
        | grep -f "$gene_ids" \
        > "test${q}${v}.tsv" && echo "${q}${v}" &
    done
done
# Block until every background extraction has finished (the EXIT trap then
# removes the temporary pattern file).
wait

In [None]:
# Per-position depth for each gene in the table, overlaid on one axis together
# with the depth summed across genes.
# NOTE(review): the file name says "UHGG000135_0003" while its directory says
# "UHGG000135_00003" - confirm the path is spelled as on disk.
d0 = pd.read_table(
    'UHGG000135_00003/Ruminococcus-gnavus-ATCC-29149_MinIONHybrid.tiles-l100-o99.pangenomes99-v10.position_depth.UHGG000135_0003-centroid99.tsv',
    names=['gene_id', 'position', 'depth'],
)

fig, ax = plt.subplots(figsize=(20, 5))
# Group by the bare column name: a one-element list makes pandas >= 2.0 yield
# 1-tuples as keys, which would show up as tuple labels in the legend.
for gene_id, gene_depth in d0.groupby('gene_id'):
    ax.plot('position', 'depth', data=gene_depth, label=gene_id)

# Sum only the depth column; summing the whole frame would also "sum" the
# string gene_id column.
total_depth = d0.groupby('position').depth.sum().reset_index()
ax.plot('position', 'depth', data=total_depth, color='k', label='total depth', linestyle='--')
ax.set_ylim(-1, 101)

ax.legend(ncol=3)

In [None]:
# One record per line: species ID | centroid_75 ID | gene ID.
gene_mapping_str = """101380|UHGG000135_00003|UHGG000135_00003
101380|UHGG000135_00003|UHGG000328_03129
101380|UHGG000135_00003|UHGG000717_02777
101380|UHGG000135_00003|UHGG000803_02647
101380|UHGG000135_00003|UHGG002889_00742
101380|UHGG000135_00003|UHGG005969_03139
102445|UHGG031469_00702|UHGG031469_00702
100262|UHGG033842_02675|UHGG033842_02675
100148|UHGG034175_02040|UHGG034175_02040
103326|UHGG038204_00998|UHGG038204_00998
100242|UHGG038378_03087|UHGG038378_03087
101380|UHGG000135_00003|UHGG046224_02195
100262|UHGG033842_02675|UHGG060747_00843
102528|UHGG062423_02644|UHGG062423_02644
101439|UHGG074707_01708|UHGG074707_01708
102528|UHGG062423_02644|UHGG076943_01560
102528|UHGG062423_02643|UHGG076943_01561
101400|UHGG078729_00486|UHGG078729_00486
101380|UHGG000135_00003|UHGG078890_01518
100148|UHGG034175_02040|UHGG080201_02760
100249|UHGG082137_00962|UHGG082137_00962
100076|UHGG082590_00893|UHGG082590_00893
101345|UHGG087535_00667|UHGG087535_00667
101704|UHGG089287_00941|UHGG089287_00941
101374|UHGG097301_02087|UHGG097301_02087
101374|UHGG097301_02088|UHGG097301_02088
101374|UHGG097301_02089|UHGG097301_02089
100200|UHGG100226_02260|UHGG100226_02260
103326|UHGG038204_00998|UHGG116197_02451
100142|UHGG130379_02202|UHGG130379_02202
100038|UHGG130558_02563|UHGG130558_02563
104115|UHGG159718_01407|UHGG159718_01407
102625|UHGG162126_01581|UHGG162126_01581
100249|UHGG082137_00963|UHGG184663_02234
100249|UHGG082137_00962|UHGG184663_02235
100249|UHGG082137_00961|UHGG184663_02236
101380|UHGG000135_00004|UHGG184738_02089
102327|UHGG187275_00532|UHGG187275_00532
102327|UHGG187275_00533|UHGG187275_00533
101380|UHGG166739_02489|UHGG187321_02321
101380|UHGG000135_00003|UHGG187321_02322
101439|UHGG074707_01708|UHGG227266_02075
102478|UHGG230188_00491|UHGG230188_00491
101380|UHGG166739_02489|UHGG231246_01460
101380|UHGG000135_00003|UHGG244152_01492"""

# Build gene -> species and gene -> centroid_75 lookups. Comprehensions keep
# the per-record names local: a module-level loop variable called
# `centroid_75` would overwrite the notebook-level `centroid_75` that the
# `%%bash -s "$centroid_75"` cell reads. Blank lines are skipped.
_mapping_records = [
    record.split('|')
    for record in gene_mapping_str.splitlines()
    if record.strip()
]
gene_species_mapping = {gene: species for species, _, gene in _mapping_records}
gene_centroid_75_mapping = {gene: centroid for _, centroid, gene in _mapping_records}

In [None]:
# ORF-tile depth per gene, labelled with species and centroid_75 through the
# lookup tables built from gene_mapping_str. Genes missing from those tables
# get NaN labels and are left out of the panels below.
d0 = pd.read_table(
    'UHGG000135_00003/Ruminococcus-gnavus-ATCC-29149_MinIONHybrid.tiles-l100-o99.pangenomes99-v10.orf-tiles.depth.tsv',
    names=['gene_id', 'position', 'depth'],
).assign(
    species=lambda x: x.gene_id.map(gene_species_mapping),
    centroid_75=lambda x: x.gene_id.map(gene_centroid_75_mapping),
)

# Species ordered by how many distinct genes each contributes (most first).
species_order = d0[['species', 'gene_id']].drop_duplicates().species.value_counts().index

# NOTE(review): the original cell was left mid-refactor and did not parse
# (dangling " = 0" and "d2 = "). This is a reconstruction of the apparent
# intent: one panel per (species, centroid_75) pair - confirm.
panels = [
    (species, centroid_id, centroid_depth)
    for species in species_order
    for centroid_id, centroid_depth in d0[d0.species == species].groupby('centroid_75')
]
num_centroids = len(panels)

# squeeze=False keeps `axs` indexable even when there is a single panel.
fig, axs = plt.subplots(num_centroids, figsize=(10, num_centroids * 3), squeeze=False)
axs = axs[:, 0]
for ax, (species, centroid_id, centroid_depth) in zip(axs, panels):
    for gene_id, gene_depth in centroid_depth.groupby('gene_id'):
        ax.plot('position', 'depth', data=gene_depth, label=gene_id)
    # Depth summed over all genes of this centroid at each position.
    total_depth = centroid_depth.groupby('position').depth.sum().reset_index()
    ax.plot('position', 'depth', data=total_depth, color='k', label='total depth', linestyle='--')
    ax.set_title(f"{species}: {centroid_id}")
    ax.legend(bbox_to_anchor=(1, 1), ncol=2)

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            frac_multimapped_pc=lambda x: ((x.tally0_pc - x.tally2) / x.tally0_pc).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    # ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    ax2.plot('frac_multimapped', data=d1, color='tab:pink', linestyle='--', label='multimapping_rate')

    ax.plot('tally0', data=d1, color='tab:blue', label='depth')
    # ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v2.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v2.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v2.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v3.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v3.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v3.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v4.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v4.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v4.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v5.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v5.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v5.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v6.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v6.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v6.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
read_length = 100
read_frac = 0.5

q0 = pd.read_table('test0-v7.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q1 = pd.read_table('test1-v7.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q2 = pd.read_table('test2-v7.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = q0.join(
    q1.join(q2, rsuffix='2'),
    lsuffix='0',
    rsuffix='1'
).fillna(0)

gene_list = d0.index.get_level_values(0).unique()
print(len(gene_list))

ngenes = len(gene_list)
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if type(axs) is not np.ndarray:
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    d1 = (
        d0
        .xs(gene_id, level='gene_id')
        .assign(
            tally0_pc=lambda x: x.tally0 + 1,
            tally1_pc=lambda x: x.tally1 + 1,
            tally2_pc=lambda x: x.tally2 + 1,
            frac_not_multimapped_pc=lambda x: (1 - ((x.tally0_pc - x.tally2) / x.tally0_pc)),
            frac_not_multimapped=lambda x: (1 - ((x.tally0 - x.tally2) / x.tally0)),
            frac_multimapped=lambda x: ((x.tally0 - x.tally2) / x.tally0).fillna(0),
            gene_left=lambda x: x.index.to_frame().position.min(),  # 1
            gene_right=lambda x: x.index.to_frame().position.max(),  # gene length
            leftmost_start=lambda x: (x.index.to_frame().position - read_length).clip(lower=x.gene_left),
            rightmost_end=lambda x: (x.index.to_frame().position + read_length).clip(upper=x.gene_right),
            frac_not_clipped=lambda x: (x.rightmost_end - x.leftmost_start) / (read_length * 2),
            not_masked2=lambda x: x.frac_multimapped > 0.2,
            frac_not_filtered=lambda x: x.frac_not_clipped * x.frac_not_multimapped,
        )
    )
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    ax.plot('tally0', data=d1, color='tab:blue', label='tally0')
    ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    results[gene_id] = dict(
        nominal_length=d1.index.to_frame().position.max() - d1.index.to_frame().position.min(),
        edge_corrected_length=d1.frac_not_clipped.sum(),
        edge_and_multi_corrected_length=d1.frac_not_filtered.sum(),
        # edge_corrected_unmasked_length=d1.frac_not_clipped[d1.not_masked2].sum(),
        # tally0_unmasked=d1.tally0[d1.not_masked2].sum(),
        tally0=d1.tally0.sum(),
        tally1=d1.tally1.sum(),
        tally2=d1.tally2.sum(),
    )
ax.legend(loc='upper center')
d2 = pd.DataFrame(results).T.assign(corrected_depth=lambda x: (x.tally2 + 1) / (x.edge_and_multi_corrected_length + read_length))
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    ((d2.tally2 + 1) / (d2.edge_and_multi_corrected_length + read_length)).sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Half-width (bp) of the per-position window used by the edge-clipping correction.
read_length = 100
read_frac = 0.5  # not referenced again within this cell

# All three v8 depth tables are read with the same column names and (gene_id, position) index.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q0 = pd.read_table('test0-v8.tsv', **_depth_kw)
q1 = pd.read_table('test1-v8.tsv', **_depth_kw)
q2 = pd.read_table('test2-v8.tsv', **_depth_kw)

# Left joins anchored on q0's index. The suffixes turn the shared 'tally' column into
# tally0 / tally1 / tally2, and positions missing from q1 or q2 are filled with 0.
_q1_q2 = q1.join(q2, rsuffix='2')
d0 = q0.join(_q1_q2, lsuffix='0', rsuffix='1').fillna(0)

gene_list = d0.index.get_level_values(0).unique()
ngenes = len(gene_list)
print(ngenes)

# One stacked panel per gene. With a single gene plt.subplots hands back a bare Axes
# rather than an array, so wrap it to keep the zip below working.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if not isinstance(axs, np.ndarray):
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    # assign() evaluates its keywords in order, so later columns may read earlier ones.
    d1 = d0.xs(gene_id, level='gene_id').assign(
        # +1 pseudocounted tallies.
        tally0_pc=lambda t: t.tally0 + 1,
        tally1_pc=lambda t: t.tally1 + 1,
        tally2_pc=lambda t: t.tally2 + 1,
        # 1 - (tally0 - tally2) / tally0, with and without the pseudocounted denominator.
        frac_not_multimapped_pc=lambda t: 1 - (t.tally0_pc - t.tally2) / t.tally0_pc,
        # NOTE(review): where tally0 == 0 this is 0/0 -> NaN (no fillna here, unlike
        # frac_multimapped), and .sum() skips NaN, so such positions add nothing to
        # edge_and_multi_corrected_length -- confirm that is intended.
        frac_not_multimapped=lambda t: 1 - (t.tally0 - t.tally2) / t.tally0,
        frac_multimapped=lambda t: ((t.tally0 - t.tally2) / t.tally0).fillna(0),
        # First / last position present for this gene (original notes: "1" and "gene length").
        gene_left=lambda t: t.index.to_frame()['position'].min(),
        gene_right=lambda t: t.index.to_frame()['position'].max(),
        # Window [position - read_length, position + read_length], clipped to the gene bounds.
        leftmost_start=lambda t: (t.index.to_frame()['position'] - read_length).clip(lower=t.gene_left),
        rightmost_end=lambda t: (t.index.to_frame()['position'] + read_length).clip(upper=t.gene_right),
        # Clipped window width as a fraction of the full 2 * read_length window.
        frac_not_clipped=lambda t: (t.rightmost_end - t.leftmost_start) / (read_length * 2),
        # NOTE(review): True where frac_multimapped exceeds 0.2, which reads as the opposite
        # of "not masked"; only the commented-out code below uses it -- confirm the direction.
        not_masked2=lambda t: t.frac_multimapped > 0.2,
        frac_not_filtered=lambda t: t.frac_not_clipped * t.frac_not_multimapped,
    )

    # Secondary axis carries the 0..1 fraction; the primary axis carries the raw tallies.
    ax2 = ax.twinx()
    ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    for _column, _colour in [('tally0', 'tab:blue'), ('tally1', 'tab:green'), ('tally2', 'tab:orange')]:
        ax.plot(_column, data=d1, color=_colour, label=_column)
    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    _positions = d1.index.to_frame()['position']
    results[gene_id] = {
        # NOTE(review): max - min is one less than the number of positions when they are
        # contiguous -- confirm whether a +1 is wanted.
        'nominal_length': _positions.max() - _positions.min(),
        'edge_corrected_length': d1.frac_not_clipped.sum(),
        'edge_and_multi_corrected_length': d1.frac_not_filtered.sum(),
        # 'edge_corrected_unmasked_length': d1.frac_not_clipped[d1.not_masked2].sum(),
        # 'tally0_unmasked': d1.tally0[d1.not_masked2].sum(),
        'tally0': d1.tally0.sum(),
        'tally1': d1.tally1.sum(),
        'tally2': d1.tally2.sum(),
    }

# Runs once after the loop, so only the last panel gets a legend.
ax.legend(loc='upper center')

# One row per gene; corrected_depth adds a +1 / +read_length pseudocount to the ratio.
d2 = pd.DataFrame(results).T.assign(
    corrected_depth=lambda t: (t.tally2 + 1) / (t.edge_and_multi_corrected_length + read_length),
)
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    d2.corrected_depth.sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Half-width (bp) of the per-position window used by the edge-clipping correction.
read_length = 100
read_frac = 0.5  # not referenced again within this cell

# All three v9 depth tables are read with the same column names and (gene_id, position) index.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q0 = pd.read_table('test0-v9.tsv', **_depth_kw)
q1 = pd.read_table('test1-v9.tsv', **_depth_kw)
q2 = pd.read_table('test2-v9.tsv', **_depth_kw)

# Left joins anchored on q0's index. The suffixes turn the shared 'tally' column into
# tally0 / tally1 / tally2, and positions missing from q1 or q2 are filled with 0.
_q1_q2 = q1.join(q2, rsuffix='2')
d0 = q0.join(_q1_q2, lsuffix='0', rsuffix='1').fillna(0)

gene_list = d0.index.get_level_values(0).unique()
ngenes = len(gene_list)
print(ngenes)

# One stacked panel per gene. With a single gene plt.subplots hands back a bare Axes
# rather than an array, so wrap it to keep the zip below working.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if not isinstance(axs, np.ndarray):
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    # assign() evaluates its keywords in order, so later columns may read earlier ones.
    d1 = d0.xs(gene_id, level='gene_id').assign(
        # +1 pseudocounted tallies.
        tally0_pc=lambda t: t.tally0 + 1,
        tally1_pc=lambda t: t.tally1 + 1,
        tally2_pc=lambda t: t.tally2 + 1,
        # 1 - (tally0 - tally2) / tally0, with and without the pseudocounted denominator.
        frac_not_multimapped_pc=lambda t: 1 - (t.tally0_pc - t.tally2) / t.tally0_pc,
        # NOTE(review): where tally0 == 0 this is 0/0 -> NaN (no fillna here, unlike
        # frac_multimapped), and .sum() skips NaN, so such positions add nothing to
        # edge_and_multi_corrected_length -- confirm that is intended.
        frac_not_multimapped=lambda t: 1 - (t.tally0 - t.tally2) / t.tally0,
        frac_multimapped=lambda t: ((t.tally0 - t.tally2) / t.tally0).fillna(0),
        # First / last position present for this gene (original notes: "1" and "gene length").
        gene_left=lambda t: t.index.to_frame()['position'].min(),
        gene_right=lambda t: t.index.to_frame()['position'].max(),
        # Window [position - read_length, position + read_length], clipped to the gene bounds.
        leftmost_start=lambda t: (t.index.to_frame()['position'] - read_length).clip(lower=t.gene_left),
        rightmost_end=lambda t: (t.index.to_frame()['position'] + read_length).clip(upper=t.gene_right),
        # Clipped window width as a fraction of the full 2 * read_length window.
        frac_not_clipped=lambda t: (t.rightmost_end - t.leftmost_start) / (read_length * 2),
        # NOTE(review): True where frac_multimapped exceeds 0.2, which reads as the opposite
        # of "not masked"; only the commented-out code below uses it -- confirm the direction.
        not_masked2=lambda t: t.frac_multimapped > 0.2,
        frac_not_filtered=lambda t: t.frac_not_clipped * t.frac_not_multimapped,
    )

    # The twin axis is still created and scaled even though both of its traces are disabled.
    ax2 = ax.twinx()
    # ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('not_masked2', data=d1, color='tab:pink', linestyle='--', label='not_masked2')

    # Only tally0 is drawn in this version, labelled 'depth'.
    ax.plot('tally0', data=d1, color='tab:blue', label='depth')
    # ax.plot('tally1', data=d1, color='tab:green', label='tally1')
    # ax.plot('tally2', data=d1, color='tab:orange', label='tally2')
    ax.set_ylim(-1, 120)
    ax2.set_ylim(-0.1, 1.1)

    _positions = d1.index.to_frame()['position']
    results[gene_id] = {
        # NOTE(review): max - min is one less than the number of positions when they are
        # contiguous -- confirm whether a +1 is wanted.
        'nominal_length': _positions.max() - _positions.min(),
        'edge_corrected_length': d1.frac_not_clipped.sum(),
        'edge_and_multi_corrected_length': d1.frac_not_filtered.sum(),
        # 'edge_corrected_unmasked_length': d1.frac_not_clipped[d1.not_masked2].sum(),
        # 'tally0_unmasked': d1.tally0[d1.not_masked2].sum(),
        'tally0': d1.tally0.sum(),
        'tally1': d1.tally1.sum(),
        'tally2': d1.tally2.sum(),
    }

# Runs once after the loop, so only the last panel gets a legend.
ax.legend(loc='upper center')

# One row per gene; corrected_depth adds a +1 / +read_length pseudocount to the ratio.
d2 = pd.DataFrame(results).T.assign(
    corrected_depth=lambda t: (t.tally2 + 1) / (t.edge_and_multi_corrected_length + read_length),
)
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    d2.corrected_depth.sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Half-width (bp) of the per-position window used by the edge-clipping correction.
read_length = 100
read_frac = 0.5  # not referenced again within this cell

# All three v10 depth tables are read with the same column names and (gene_id, position) index.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q0 = pd.read_table('test0-v10.tsv', **_depth_kw)
q1 = pd.read_table('test1-v10.tsv', **_depth_kw)
q2 = pd.read_table('test2-v10.tsv', **_depth_kw)

# Left joins anchored on q0's index. The suffixes turn the shared 'tally' column into
# tally0 / tally1 / tally2, and positions missing from q1 or q2 are filled with 0.
_q1_q2 = q1.join(q2, rsuffix='2')
d0 = q0.join(_q1_q2, lsuffix='0', rsuffix='1').fillna(0)

gene_list = d0.index.get_level_values(0).unique()
ngenes = len(gene_list)
print(ngenes)

# One stacked panel per gene. With a single gene plt.subplots hands back a bare Axes
# rather than an array, so wrap it to keep the zip below working.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if not isinstance(axs, np.ndarray):
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    # assign() evaluates its keywords in order, so later columns may read earlier ones.
    d1 = d0.xs(gene_id, level='gene_id').assign(
        # +1 pseudocounted tallies.
        tally0_pc=lambda t: t.tally0 + 1,
        tally1_pc=lambda t: t.tally1 + 1,
        tally2_pc=lambda t: t.tally2 + 1,
        # 1 - (tally0 - tally2) / tally0, with and without the pseudocounted denominator.
        frac_not_multimapped_pc=lambda t: 1 - (t.tally0_pc - t.tally2) / t.tally0_pc,
        # NOTE(review): where tally0 == 0 this is 0/0 -> NaN (no fillna here, unlike
        # frac_multimapped), and .sum() skips NaN, so such positions add nothing to
        # edge_and_multi_corrected_length -- confirm that is intended.
        frac_not_multimapped=lambda t: 1 - (t.tally0 - t.tally2) / t.tally0,
        frac_multimapped=lambda t: ((t.tally0 - t.tally2) / t.tally0).fillna(0),
        # tally0 per pseudocounted tally2. The denominator is tally2 + 1, so for
        # non-negative tallies the ratio is never NaN and fillna(1) has nothing to fill.
        multimapping_multiplier=lambda t: (t.tally0 / t.tally2_pc).fillna(1),
        # First / last position present for this gene (original notes: "1" and "gene length").
        gene_left=lambda t: t.index.to_frame()['position'].min(),
        gene_right=lambda t: t.index.to_frame()['position'].max(),
        # Window [position - read_length, position + read_length], clipped to the gene bounds.
        leftmost_start=lambda t: (t.index.to_frame()['position'] - read_length).clip(lower=t.gene_left),
        rightmost_end=lambda t: (t.index.to_frame()['position'] + read_length).clip(upper=t.gene_right),
        # Clipped window width as a fraction of the full 2 * read_length window.
        frac_not_clipped=lambda t: (t.rightmost_end - t.leftmost_start) / (read_length * 2),
        # NOTE(review): True where frac_multimapped exceeds 0.2, which reads as the opposite
        # of "not masked"; only the commented-out code below uses it -- confirm the direction.
        not_masked2=lambda t: t.frac_multimapped > 0.2,
        frac_not_filtered=lambda t: t.frac_not_clipped * t.frac_not_multimapped,
    )

    # The twin axis is still created and scaled even though both of its traces are disabled.
    ax2 = ax.twinx()
    # ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('frac_multimapped', data=d1, color='tab:pink', linestyle='--', label='multimapping_rate')

    for _column, _colour, _label in [
        ('tally0', 'tab:blue', 'depth'),
        # ('tally1', 'tab:green', 'tally1'),
        ('tally2', 'tab:orange', 'tally2'),
        ('multimapping_multiplier', 'tab:pink', 'multimapping'),
    ]:
        ax.plot(_column, data=d1, color=_colour, label=_label)

    # Primary y-limits are left to autoscale in this version.
    # ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    _positions = d1.index.to_frame()['position']
    results[gene_id] = {
        # NOTE(review): max - min is one less than the number of positions when they are
        # contiguous -- confirm whether a +1 is wanted.
        'nominal_length': _positions.max() - _positions.min(),
        'edge_corrected_length': d1.frac_not_clipped.sum(),
        'edge_and_multi_corrected_length': d1.frac_not_filtered.sum(),
        # 'edge_corrected_unmasked_length': d1.frac_not_clipped[d1.not_masked2].sum(),
        # 'tally0_unmasked': d1.tally0[d1.not_masked2].sum(),
        'tally0': d1.tally0.sum(),
        'tally1': d1.tally1.sum(),
        'tally2': d1.tally2.sum(),
    }

# Runs once after the loop, so only the last panel gets a legend.
ax.legend(loc='upper center')

# One row per gene; corrected_depth adds a +1 / +read_length pseudocount to the ratio.
d2 = pd.DataFrame(results).T.assign(
    corrected_depth=lambda t: (t.tally2 + 1) / (t.edge_and_multi_corrected_length + read_length),
)
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    d2.corrected_depth.sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Half-width (bp) of the per-position window used by the edge-clipping correction.
read_length = 100
read_frac = 0.5  # not referenced again within this cell

# All three v11 depth tables are read with the same column names and (gene_id, position) index.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q0 = pd.read_table('test0-v11.tsv', **_depth_kw)
q1 = pd.read_table('test1-v11.tsv', **_depth_kw)
q2 = pd.read_table('test2-v11.tsv', **_depth_kw)

# Left joins anchored on q0's index. The suffixes turn the shared 'tally' column into
# tally0 / tally1 / tally2, and positions missing from q1 or q2 are filled with 0.
_q1_q2 = q1.join(q2, rsuffix='2')
d0 = q0.join(_q1_q2, lsuffix='0', rsuffix='1').fillna(0)

gene_list = d0.index.get_level_values(0).unique()
ngenes = len(gene_list)
print(ngenes)

# One stacked panel per gene. With a single gene plt.subplots hands back a bare Axes
# rather than an array, so wrap it to keep the zip below working.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if not isinstance(axs, np.ndarray):
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    # assign() evaluates its keywords in order, so later columns may read earlier ones.
    d1 = d0.xs(gene_id, level='gene_id').assign(
        # +1 pseudocounted tallies.
        tally0_pc=lambda t: t.tally0 + 1,
        tally1_pc=lambda t: t.tally1 + 1,
        tally2_pc=lambda t: t.tally2 + 1,
        # 1 - (tally0 - tally2) / tally0, with and without the pseudocounted denominator.
        frac_not_multimapped_pc=lambda t: 1 - (t.tally0_pc - t.tally2) / t.tally0_pc,
        # NOTE(review): where tally0 == 0 this is 0/0 -> NaN (no fillna here, unlike
        # frac_multimapped), and .sum() skips NaN, so such positions add nothing to
        # edge_and_multi_corrected_length -- confirm that is intended.
        frac_not_multimapped=lambda t: 1 - (t.tally0 - t.tally2) / t.tally0,
        frac_multimapped=lambda t: ((t.tally0 - t.tally2) / t.tally0).fillna(0),
        # tally0 per pseudocounted tally2. The denominator is tally2 + 1, so for
        # non-negative tallies the ratio is never NaN and fillna(1) has nothing to fill.
        multimapping_multiplier=lambda t: (t.tally0 / t.tally2_pc).fillna(1),
        # First / last position present for this gene (original notes: "1" and "gene length").
        gene_left=lambda t: t.index.to_frame()['position'].min(),
        gene_right=lambda t: t.index.to_frame()['position'].max(),
        # Window [position - read_length, position + read_length], clipped to the gene bounds.
        leftmost_start=lambda t: (t.index.to_frame()['position'] - read_length).clip(lower=t.gene_left),
        rightmost_end=lambda t: (t.index.to_frame()['position'] + read_length).clip(upper=t.gene_right),
        # Clipped window width as a fraction of the full 2 * read_length window.
        frac_not_clipped=lambda t: (t.rightmost_end - t.leftmost_start) / (read_length * 2),
        # NOTE(review): True where frac_multimapped exceeds 0.2, which reads as the opposite
        # of "not masked"; only the commented-out code below uses it -- confirm the direction.
        not_masked2=lambda t: t.frac_multimapped > 0.2,
        frac_not_filtered=lambda t: t.frac_not_clipped * t.frac_not_multimapped,
    )

    # The twin axis is still created and scaled even though both of its traces are disabled.
    ax2 = ax.twinx()
    # ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('frac_multimapped', data=d1, color='tab:pink', linestyle='--', label='multimapping_rate')

    for _column, _colour, _label in [
        ('tally0', 'tab:blue', 'depth'),
        # ('tally1', 'tab:green', 'tally1'),
        ('tally2', 'tab:orange', 'tally2'),
        ('multimapping_multiplier', 'tab:pink', 'multimapping'),
    ]:
        ax.plot(_column, data=d1, color=_colour, label=_label)

    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    _positions = d1.index.to_frame()['position']
    results[gene_id] = {
        # NOTE(review): max - min is one less than the number of positions when they are
        # contiguous -- confirm whether a +1 is wanted.
        'nominal_length': _positions.max() - _positions.min(),
        'edge_corrected_length': d1.frac_not_clipped.sum(),
        'edge_and_multi_corrected_length': d1.frac_not_filtered.sum(),
        # NOTE(review): this sums (tally0_pc - tally2) / tally0_pc, the quantity that
        # frac_not_multimapped_pc subtracts from 1 -- confirm it matches the "corrected length" name.
        'multi_corrected_length2': ((d1.tally0_pc - d1.tally2) / d1.tally0_pc).sum(),
        # 'edge_corrected_unmasked_length': d1.frac_not_clipped[d1.not_masked2].sum(),
        # 'tally0_unmasked': d1.tally0[d1.not_masked2].sum(),
        'tally0': d1.tally0.sum(),
        'tally1': d1.tally1.sum(),
        'tally2': d1.tally2.sum(),
    }

# Runs once after the loop, so only the last panel gets a legend.
ax.legend(loc='upper center')

# One row per gene; corrected_depth adds a +1 / +read_length pseudocount to the ratio.
d2 = pd.DataFrame(results).T.assign(
    corrected_depth=lambda t: (t.tally2 + 1) / (t.edge_and_multi_corrected_length + read_length),
)
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    d2.corrected_depth.sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Half-width (bp) of the per-position window used by the edge-clipping correction.
read_length = 100
read_frac = 0.5  # not referenced again within this cell

# All three v12 depth tables are read with the same column names and (gene_id, position) index.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
q0 = pd.read_table('test0-v12.tsv', **_depth_kw)
q1 = pd.read_table('test1-v12.tsv', **_depth_kw)
q2 = pd.read_table('test2-v12.tsv', **_depth_kw)

# Left joins anchored on q0's index. The suffixes turn the shared 'tally' column into
# tally0 / tally1 / tally2, and positions missing from q1 or q2 are filled with 0.
_q1_q2 = q1.join(q2, rsuffix='2')
d0 = q0.join(_q1_q2, lsuffix='0', rsuffix='1').fillna(0)

gene_list = d0.index.get_level_values(0).unique()
ngenes = len(gene_list)
print(ngenes)

# One stacked panel per gene. With a single gene plt.subplots hands back a bare Axes
# rather than an array, so wrap it to keep the zip below working.
fig, axs = plt.subplots(ngenes, figsize=(10, 4*ngenes))
if not isinstance(axs, np.ndarray):
    axs = np.array([axs])

results = {}
for ax, gene_id in zip(axs, gene_list):
    ax.set_title(gene_id)
    # assign() evaluates its keywords in order, so later columns may read earlier ones.
    d1 = d0.xs(gene_id, level='gene_id').assign(
        # +1 pseudocounted tallies.
        tally0_pc=lambda t: t.tally0 + 1,
        tally1_pc=lambda t: t.tally1 + 1,
        tally2_pc=lambda t: t.tally2 + 1,
        # 1 - (tally0 - tally2) / tally0, with and without the pseudocounted denominator.
        frac_not_multimapped_pc=lambda t: 1 - (t.tally0_pc - t.tally2) / t.tally0_pc,
        # NOTE(review): where tally0 == 0 this is 0/0 -> NaN (no fillna here, unlike
        # frac_multimapped), and .sum() skips NaN, so such positions add nothing to
        # edge_and_multi_corrected_length -- confirm that is intended.
        frac_not_multimapped=lambda t: 1 - (t.tally0 - t.tally2) / t.tally0,
        frac_multimapped=lambda t: ((t.tally0 - t.tally2) / t.tally0).fillna(0),
        # tally0 per pseudocounted tally2. The denominator is tally2 + 1, so for
        # non-negative tallies the ratio is never NaN and fillna(1) has nothing to fill.
        multimapping_multiplier=lambda t: (t.tally0 / t.tally2_pc).fillna(1),
        # First / last position present for this gene (original notes: "1" and "gene length").
        gene_left=lambda t: t.index.to_frame()['position'].min(),
        gene_right=lambda t: t.index.to_frame()['position'].max(),
        # Window [position - read_length, position + read_length], clipped to the gene bounds.
        leftmost_start=lambda t: (t.index.to_frame()['position'] - read_length).clip(lower=t.gene_left),
        rightmost_end=lambda t: (t.index.to_frame()['position'] + read_length).clip(upper=t.gene_right),
        # Clipped window width as a fraction of the full 2 * read_length window.
        frac_not_clipped=lambda t: (t.rightmost_end - t.leftmost_start) / (read_length * 2),
        # NOTE(review): True where frac_multimapped exceeds 0.2, which reads as the opposite
        # of "not masked"; only the commented-out code below uses it -- confirm the direction.
        not_masked2=lambda t: t.frac_multimapped > 0.2,
        frac_not_filtered=lambda t: t.frac_not_clipped * t.frac_not_multimapped,
    )

    # The twin axis is still created and scaled even though both of its traces are disabled.
    ax2 = ax.twinx()
    # ax2.plot('frac_not_filtered', data=d1, color='tab:purple', linestyle='--', label='frac_not_filtered')
    # ax2.plot('frac_multimapped', data=d1, color='tab:pink', linestyle='--', label='multimapping_rate')

    for _column, _colour, _label in [
        ('tally0', 'tab:blue', 'depth'),
        # ('tally1', 'tab:green', 'tally1'),
        ('tally2', 'tab:orange', 'tally2'),
        ('multimapping_multiplier', 'tab:pink', 'multimapping'),
    ]:
        ax.plot(_column, data=d1, color=_colour, label=_label)

    ax.set_ylim(-1, 100)
    ax2.set_ylim(-0.1, 1.1)

    _positions = d1.index.to_frame()['position']
    results[gene_id] = {
        # NOTE(review): max - min is one less than the number of positions when they are
        # contiguous -- confirm whether a +1 is wanted.
        'nominal_length': _positions.max() - _positions.min(),
        'edge_corrected_length': d1.frac_not_clipped.sum(),
        'edge_and_multi_corrected_length': d1.frac_not_filtered.sum(),
        # NOTE(review): this sums (tally0_pc - tally2) / tally0_pc, the quantity that
        # frac_not_multimapped_pc subtracts from 1 -- confirm it matches the "corrected length" name.
        'multi_corrected_length2': ((d1.tally0_pc - d1.tally2) / d1.tally0_pc).sum(),
        # 'edge_corrected_unmasked_length': d1.frac_not_clipped[d1.not_masked2].sum(),
        # 'tally0_unmasked': d1.tally0[d1.not_masked2].sum(),
        'tally0': d1.tally0.sum(),
        'tally1': d1.tally1.sum(),
        'tally2': d1.tally2.sum(),
    }

# Runs once after the loop, so only the last panel gets a legend.
ax.legend(loc='upper center')

# One row per gene; corrected_depth adds a +1 / +read_length pseudocount to the ratio.
d2 = pd.DataFrame(results).T.assign(
    corrected_depth=lambda t: (t.tally2 + 1) / (t.edge_and_multi_corrected_length + read_length),
)
print(
    (d2.tally0.sum(), d2.tally1.sum(), d2.tally2.sum()),
    (d2.tally0 / d2.nominal_length).sum(),
    (d2.tally0 / d2.edge_corrected_length).sum(),
    # (d2.tally1 / d2.edge_corrected_length).sum(),
    d2.corrected_depth.sum(),
    # ((d2.tally0 + 1) / (d2.edge_corrected_length + read_length)).sum(),
    # (d2.tally0_unmasked.sum() / d2.edge_corrected_unmasked_length.sum())
)
d2

In [None]:
# Preview the per-position table left over from the last gene of the preceding loop.
# The original cell was a dangling `d1.` (a SyntaxError, presumably an abandoned
# tab-completion), which stopped "Restart & Run All" at this point.
d1.head()

In [None]:
# Per-position depth for the v7 run, indexed by (gene_id, position).
v7 = pd.read_table('test-v7.depth', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
# v11 = pd.read_table('test-v11.tsv', names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
d0 = v7  # .join(v7, lsuffix='_10', rsuffix='_11')

gene_list = d0.index.get_level_values('gene_id').unique()

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single-gene table yields one bare Axes and the zip() below raises TypeError.
# ravel() restores the 1-D array of panels that the multi-gene case already produced.
fig, axs = plt.subplots(
    len(gene_list),
    figsize=(5, len(gene_list)*2),
    sharex=True,
    sharey=True,
    squeeze=False,
)
axs = axs.ravel()
for gene_id, ax in zip(gene_list, axs):
    d1 = d0.xs(gene_id, level='gene_id')
    ax.plot(d1.tally, label='v7')
    # ax.plot(d1.tally_11, label='--score-min G,20,8.')
    ax.set_title(gene_id)

# `ax` is the last panel here. The y-axis is shared, so this limit applies to every
# panel; the legend is drawn on the last panel only.
ax.set_ylim(0, 100)
ax.legend()
# ax.set_yscale('symlog')
fig.tight_layout()

In [None]:
# Per-gene trimmed mean of the `tally_13_75` column; proportiontocut=0.3 is forwarded
# to scipy's trim_mean for each group.
# NOTE(review): the preceding cell sets d0 = v7, which only has a `tally` column.
# `tally_13_75` is created by the pd.concat in a cell further down, so this cell only
# works after that later cell has been run.
d0.groupby('gene_id').tally_13_75.apply(sp.stats.trim_mean, proportiontocut=0.3)

In [None]:
# Histogram of the v13_75 tallies with unit-width bin edges 0, 1, ..., 249.
# NOTE(review): v13_75 is first assigned in the following cell (read from
# 'test75-v13.tsv'), so this cell depends on that one having been run already.
plt.hist(v13_75, bins=np.arange(0, 250))
None  # a bare None as the final expression keeps the cell from echoing plt.hist's return value

In [None]:
# Per-position depth from each run being compared. Every table has the same layout,
# is indexed by (gene_id, position), and is squeezed to a Series of tallies.
_depth_kw = dict(names=['gene_id', 'position', 'tally'], index_col=['gene_id', 'position'])
v10 = pd.read_table('test0-v10.tsv', **_depth_kw).squeeze()
v11 = pd.read_table('test0-v11.tsv', **_depth_kw).squeeze()
v12 = pd.read_table('test0-v12.tsv', **_depth_kw).squeeze()
v12_sec = pd.read_table('test0-v12-keepsecondary.tsv', **_depth_kw).squeeze()
v12_sec95 = pd.read_table('test95-v12-keepsecondary.tsv', **_depth_kw).squeeze()
v12_95 = pd.read_table('test95-v12.tsv', **_depth_kw).squeeze()
v12_75 = pd.read_table('test75-v12.tsv', **_depth_kw).squeeze()
v13_75 = pd.read_table('test75-v13.tsv', **_depth_kw).squeeze()

# One column per run, aligned on (gene_id, position); positions a run lacks become 0.
d0 = pd.concat(
    dict(
        tally_10=v10,
        tally_11=v11,
        tally_12=v12,
        tally_12s=v12_sec,
        tally_12s95=v12_sec95,
        tally_95=v12_95,
        tally_75=v12_75,
        tally_13_75=v13_75,
    ),
    axis=1,
).fillna(0)

gene_list = d0.index.get_level_values('gene_id').unique()

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single-gene table yields one bare Axes and the zip() below raises TypeError.
# ravel() restores the 1-D array of panels that the multi-gene case already produced.
fig, axs = plt.subplots(
    len(gene_list),
    figsize=(5, len(gene_list)*2),
    sharex=True,
    sharey=True,
    squeeze=False,
)
axs = axs.ravel()
for gene_id, ax in zip(gene_list, axs):
    d1 = d0.xs(gene_id, level='gene_id')
    ax.plot(d1.tally_10, label='--score-min G,5,0')
    # ax.plot(d1.tally_11, label='--score-min G,20,8.')
    # ax.plot(d1.tally_12, label='--score-min G,5,0 --all')
    ax.plot(d1.tally_12s, label='--score-min G,5,0 --all | -g SECONDARY')
    ax.plot(d1.tally_12s95, label='centroid95 | --score-min G,5,0 --all | -g SECONDARY')
    ax.plot(d1.tally_95, label='centroid95 | --score-min G,5,0 --all')
    ax.plot(d1.tally_75, label='centroid75 | --score-min G,5,0 --all')
    ax.plot(d1.tally_13_75, label='centroid75 | --all')
    ax.set_title(gene_id)

# `ax` is the last panel here. The y-axis is shared, so this limit applies to every
# panel; the legend is drawn on the last panel only.
ax.set_ylim(0, 500)
ax.legend()
# ax.set_yscale('symlog')
fig.tight_layout()

In [None]:
# Two per-position depth tables with the same layout, each indexed by
# (gene_id, position) and squeezed down to a Series of tallies.
_depth_columns = ['gene_id', 'position', 'tally']
_depth_index = ['gene_id', 'position']
depth_all_genes_nosecondary = pd.read_table(
    'test.depth-nosecondary.tsv',
    names=_depth_columns,
    index_col=_depth_index,
).squeeze()
depth_all_genes = pd.read_table(
    'test.depth.tsv',
    names=_depth_columns,
    index_col=_depth_index,
).squeeze()

In [None]:
# Per-gene trimmed mean of per-position depth; proportiontocut=0.1 is forwarded to
# scipy's trim_mean for each gene's group of tallies.
_depth_by_gene = depth_all_genes.groupby('gene_id')
_depth_by_gene.apply(sp.stats.trim_mean, proportiontocut=0.1)
