# Preamble

## Imports

In [None]:
import sqlite3

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

from lib.pandas_util import idxwhere
from lib.plot import construct_ordered_pallete
from lib.project_style import color_palette, major_allele_frequency_bins

## Style

In [None]:
# Use 200 dpi for every figure created in this notebook.
mpl.rcParams.update({'figure.dpi': 200})

## Load Data

In [None]:
# Open the SQLite database that every query in this notebook reads from.
con = sqlite3.connect('data/all_drplt.a.proc.gtpro.2.denorm.db')

# Connection-scoped helpers built on top of the `snp`, `snp_x_lib` and `lib` tables:
#   species             - number of SNP positions per species (position_total)
#   lib_x_species_tally - positions seen and summed allele tallies per (lib, species)
#   drplt_combined      - per-position tallies pooled over the droplet libs of a sample
#   mgen_combined       - per-position tallies pooled over the metagenome libs of a sample
temp_schema_sql = """
    CREATE TEMP TABLE species AS
    SELECT species_id, COUNT(species_position) AS position_total
    FROM snp
    GROUP BY species_id
    ;

    CREATE TEMP VIEW lib_x_species_tally AS
    SELECT
      lib_id
    , species_id
    , COUNT(species_position) AS position_tally
    , SUM(reference_tally + alternative_tally) AS depth_tally
    FROM snp_x_lib
    GROUP BY lib_id, species_id
    ;
    
    CREATE TEMP VIEW drplt_combined AS
    SELECT
      sample_id
    , species_id
    , species_position
    , COUNT(lib_id) AS lib_tally
    , SUM(reference_tally) AS reference_tally
    , SUM(alternative_tally) AS alternative_tally
    , SUM(reference_tally + alternative_tally) AS depth_tally
    FROM snp_x_lib
    JOIN lib USING (lib_id)
    WHERE lib_type = 'droplet'
    GROUP BY sample_id, species_id, species_position
    ;
    
    CREATE TEMP VIEW mgen_combined AS
    SELECT
      sample_id
    , species_id
    , species_position
    , COUNT(lib_id) AS lib_tally
    , SUM(reference_tally) AS reference_tally
    , SUM(alternative_tally) AS alternative_tally
    , SUM(reference_tally + alternative_tally) AS depth_tally
    FROM snp_x_lib
    JOIN lib USING (lib_id)
    WHERE lib_type = 'metagenome'
    GROUP BY sample_id, species_id, species_position
    ;
"""
con.executescript(temp_schema_sql)

In [None]:
# Per (sample, species) coverage of the pooled droplet libraries; every
# statistic is divided by the species' total number of SNP positions.
drplt_cvrg_query = """
SELECT
  sample_id
, species_id
, (1.0 * SUM(lib_tally)) / position_total AS mean_incidence
, (1.0 * COUNT(*)) / position_total AS horizontal_coverage
, (1.0 * SUM(depth_tally)) / position_total AS mean_depth
FROM drplt_combined
JOIN species USING (species_id)
GROUP BY sample_id, species_id
;
"""
drplt_cvrg = pd.read_sql(
    drplt_cvrg_query,
    con=con,
    index_col=['sample_id', 'species_id'],
)

In [None]:
# Per (sample, species) coverage of the pooled metagenome libraries; every
# statistic is divided by the species' total number of SNP positions.
mgen_cvrg_query = """
SELECT
  sample_id
, species_id
, (1.0 * SUM(lib_tally)) / position_total AS mean_incidence
, (1.0 * COUNT(*)) / position_total AS horizontal_coverage
, (1.0 * SUM(depth_tally)) / position_total AS mean_depth
FROM mgen_combined
JOIN species USING (species_id)
GROUP BY sample_id, species_id
;
"""
mgen_cvrg = pd.read_sql(
    mgen_cvrg_query,
    con=con,
    index_col=['sample_id', 'species_id'],
)

In [None]:
# Per-species position totals from the temp table.
# NOTE(review): `species` is reassigned to the taxonomy table a few cells below,
# so this frame is not what later cells see under that name.
species = pd.read_sql(
    "SELECT * FROM species",
    con=con,
    index_col=['species_id'],
)

In [None]:
# Per (library, species) tallies plus the same tallies divided by the
# species' total number of SNP positions.
lib_cvrg_query = """
    SELECT
      *
    , (1.0 * position_tally) / position_total AS horizontal_coverage
    , (1.0 * depth_tally) / position_total AS mean_depth
    FROM lib_x_species_tally
    JOIN species USING (species_id)
    ;
"""
lib_cvrg = pd.read_sql(
    lib_cvrg_query,
    con=con,
    index_col=["lib_id", "species_id"],
)

In [None]:
# Library metadata table, one row per lib_id.
lib = pd.read_sql(
    "SELECT * FROM lib;",
    con=con,
    index_col=["lib_id"],
)

In [None]:
def _split_taxon_string(taxon_string):
    """Split a ';'-delimited taxon string into one entry per rank label."""
    return pd.Series(
        taxon_string.split(';'),
        index=['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__'],
    )


# Taxonomy table: one row per species_id (cast to str), one column per rank.
taxonomy_raw = pd.read_table(
    'ref/gtpro/species_taxonomy_ext.tsv',
    names=['_1', 'species_id', 'taxon_string'],
)
taxon_strings = (
    taxonomy_raw
    .assign(species_id=lambda x: x.species_id.astype(str))
    .set_index('species_id')
    .taxon_string
)
species = taxon_strings.apply(_split_taxon_string)
species

# Analysis

## Data Quality

In [None]:
# Per-library summary of how the allele tallies are spread across species.
# Built from vectorized group aggregations (instead of a Python-level
# `apply` per library), so the groupings are computed once and numeric
# columns keep numeric dtypes.
depth_tally = lib_cvrg.depth_tally
tally_by_lib = depth_tally.groupby(level='lib_id')
mean_depth_by_lib = lib_cvrg.mean_depth.groupby(level='lib_id')


def _num_species_above(threshold):
    """Count, per library, the species whose depth_tally exceeds `threshold`."""
    return depth_tally.gt(threshold).groupby(level='lib_id').sum()


lib_stats = (
    pd.DataFrame(dict(
        total_tally=tally_by_lib.sum(),
        num_species_gt0=_num_species_above(0),
        num_species_gt1=_num_species_above(1),
        num_species_gt5=_num_species_above(5),
        num_species_gt10=_num_species_above(10),
        max_species_tally=tally_by_lib.max(),
        # idxmax yields (lib_id, species_id) index tuples; keep the species part.
        species_id=tally_by_lib.idxmax().map(lambda key: key[1]),
    ))
    # Share of the library's tally that belongs to its top species.
    .assign(frac_dominant=lambda x: x.max_species_tally / x.total_tally)
    .assign(
        max_depth=mean_depth_by_lib.max(),
        # Same idea as frac_dominant, but on position-normalized depth.
        purity=mean_depth_by_lib.max() / mean_depth_by_lib.sum(),
    )
    .assign(contamination=lambda x: 1 - x.purity)
)

In [None]:
# Map each droplet library to its sample.
# Select the column explicitly: `.squeeze()` would collapse the result to a
# bare scalar if the query ever returned a single row, breaking the
# `groupby(drplt_to_sample)` calls that expect a Series indexed by lib_id.
drplt_to_sample = pd.read_sql(
    "SELECT lib_id, sample_id FROM lib WHERE lib_type = 'droplet'",
    con=con,
    index_col='lib_id',
).sample_id
drplt_to_sample.value_counts()

### Are most droplets dominated by one species?

In [None]:
# Sum of total_tally over the droplet libraries of each sample
# (grouping key is the lib_id -> sample_id map built from droplet libraries only).
lib_stats.groupby(drplt_to_sample).total_tally.sum()

In [None]:
# Mean total_tally per droplet library, by sample.
lib_stats.groupby(drplt_to_sample).total_tally.mean()

In [None]:
# Median of every numeric statistic per library type. `numeric_only=True`
# skips the string-valued `species_id` column, which would otherwise make
# `median()` raise on current pandas.
lib_stats.groupby(lib.lib_type).median(numeric_only=True)

In [None]:
# Density of log10(total_tally) across droplet libraries.
d = lib_stats.join(lib, on='lib_id')
d = d[d.lib_type == 'droplet']

fig, ax = plt.subplots()
sns.kdeplot(np.log10(d.total_tally), ax=ax)

In [None]:
# NOTE(review): called without any data - looks like a leftover scratch cell; confirm and remove.
sns.kdeplot()

In [None]:
# Density of log10(total_tally) across droplet libraries, one curve per sample.
d = (
    lib_stats
    .join(lib, on='lib_id')
    .assign(log10_total_tally=lambda x: x.total_tally.pipe(np.log10))
    [lambda x: x.lib_type == 'droplet']
)

# `height` is the current name of FacetGrid's former `size` argument, which
# newer seaborn releases no longer accept.
g = sns.FacetGrid(d, hue='sample_id', height=3, aspect=2)
g.map(
    sns.kdeplot,
    'log10_total_tally',
    cut=0,
)
# Reference lines computed over all droplets (both samples pooled).
g.ax.axvline(np.log10(d.total_tally.median()), lw=1, linestyle='--', color='k', label='median')
g.ax.axvline(np.log10(d.total_tally.mean()), lw=1, linestyle=':', color='k', label='mean')

g.ax.legend()

In [None]:
# Total tally vs. tally of the top species, for droplet libraries only.
droplet_stats = lib_stats.join(lib, on='lib_id')
droplet_stats = droplet_stats[droplet_stats.lib_type == 'droplet']

g = sns.JointGrid(
    data=droplet_stats,
    x='total_tally',
    y='max_species_tally',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set_yscale('log')
g.ax_joint.set_xscale('log')

g.plot_joint(sns.scatterplot, s=2, alpha=0.8)
g.plot_marginals(sns.kdeplot, common_norm=False)

# Diagonal: libraries whose whole tally belongs to a single species.
identity_span = [1e-1, 4e6]
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k', alpha=0.1)

# Vertical markers at the droplet median and mean total tally.
total_tally_by_type = lib_stats.groupby(lib.lib_type).total_tally
g.ax_joint.axvline(total_tally_by_type.median()['droplet'], lw=1, linestyle=':', color='k')
g.ax_joint.axvline(total_tally_by_type.mean()['droplet'], lw=1, linestyle=':', color='k')

In [None]:
# Depth of the top species vs. contamination (1 - purity), droplets only.
g = sns.JointGrid(
    x='max_depth',
    y='contamination',
    data=(
        lib_stats
        .join(lib, on='lib_id')
        [lambda x: x.lib_type == 'droplet']
    ),
    hue='sample_id',
    palette=color_palette,
)
# symlog keeps contamination == 0 visible. `linthresh` is the current
# spelling of the removed axis-specific `linthreshy` keyword.
g.ax_joint.set_yscale('symlog', linthresh=1e-3)
g.ax_joint.set_ylim(-1e-4)
g.ax_joint.set_xscale('log')

g.plot_joint(sns.scatterplot, s=3, alpha=0.8)
g.plot_marginals(sns.kdeplot, common_norm=False)

In [None]:
# Purity histogram over the libraries that remain after dropping the two
# '.m' libraries and requiring max_depth > 0.1.
excluded_lib_ids = ['SS01009.m', 'SS01057.m']
purity_values = (
    lib_stats
    .drop(excluded_lib_ids)
    [lambda x: x.max_depth > 1e-1]
    .purity
)
plt.hist(purity_values, bins=51)
plt.ylabel('count')
plt.xlabel('purity')

## Sanity check genome coverage across species

### Depth and horizontal coverage across libraries

In [None]:
# Mean depth vs. horizontal coverage for every (library, species) pair.
min_value = 1e-6
identity_span = [min_value, 1]

g = sns.JointGrid(
    data=lib_cvrg.join(lib, on='lib_id'),
    x='mean_depth',
    y='horizontal_coverage',
    hue='lib_type',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

# Reference lines: horizontal coverage of 0.1 and the y = x diagonal.
g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

#### Just in metagenomes

In [None]:
# Mean depth vs. horizontal coverage per (sample, species), pooled metagenome libraries.
min_value = 1e-6
identity_span = [min_value, 1]

g = sns.JointGrid(
    data=mgen_cvrg,
    x='mean_depth',
    y='horizontal_coverage',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

# Reference lines: horizontal coverage of 0.1 and the y = x diagonal.
g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

#### Just in droplets

In [None]:
# Mean depth vs. horizontal coverage per (sample, species), pooled droplet libraries.
min_value = 1e-6
identity_span = [min_value, 1]

g = sns.JointGrid(
    data=drplt_cvrg,
    x='mean_depth',
    y='horizontal_coverage',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

# Reference lines: horizontal coverage of 0.1 and the y = x diagonal.
g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

## Do summed droplets resemble the full metagenomes?

### Sequencing depth

In [None]:
# Metagenome (x, suffix _m) vs. pooled-droplet (y, suffix _d) mean depth.
# Species missing from the droplet table are placed at min_value.
min_value = 1e-6
identity_span = [min_value, 1]

cvrg_both = (
    mgen_cvrg
    .join(drplt_cvrg, lsuffix='_m', rsuffix='_d')
    .fillna(min_value)
)

g = sns.JointGrid(
    data=cvrg_both,
    x='mean_depth_m',
    y='mean_depth_d',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

### Horizontal coverage

In [None]:
# Metagenome (x, suffix _m) vs. pooled-droplet (y, suffix _d) horizontal coverage.
# Species missing from the droplet table are placed at min_value.
min_value = 1e-6
identity_span = [min_value, 1]

cvrg_both = (
    mgen_cvrg
    .join(drplt_cvrg, lsuffix='_m', rsuffix='_d')
    .fillna(min_value)
)

g = sns.JointGrid(
    data=cvrg_both,
    x='horizontal_coverage_m',
    y='horizontal_coverage_d',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

### Relative abundance

In [None]:
# Relative abundance of each species within its sample: mean depth divided by
# the sample's summed mean depth.
# `transform('sum')` broadcasts the per-sample total back onto the original
# (sample_id, species_id) index. Unlike `groupby.apply` with a Series-returning
# lambda, the result is always a Series with that index, even when there is
# only one sample, and does not depend on the pandas version.
mgen_rabund = (
    mgen_cvrg.mean_depth
    / mgen_cvrg.groupby(level='sample_id').mean_depth.transform('sum')
).rename('rabund')
drplt_rabund = (
    drplt_cvrg.mean_depth
    / drplt_cvrg.groupby(level='sample_id').mean_depth.transform('sum')
).rename('rabund')

In [None]:
# Metagenome (x) vs. pooled-droplet (y) relative abundance.
# NOTE(review): missing values are filled with 1e-9 while the diagonal starts
# at min_value (1e-6) - confirm the different floors are intentional.
min_value = 1e-6
identity_span = [min_value, 1]

rabund_both = (
    mgen_rabund
    .to_frame()
    .join(drplt_rabund, lsuffix='_m', rsuffix='_d')
    .fillna(1e-9)
)

g = sns.JointGrid(
    data=rabund_both,
    x='rabund_m',
    y='rabund_d',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=3)
g.plot_marginals(sns.kdeplot, common_norm=False)

g.ax_joint.axhline(0.1, lw=1, linestyle=':', color='k')
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

In [None]:
# Log ratio of droplet to metagenome relative abundance per species, with the
# two samples plotted against each other (missing values filled with 1e-4).
rabund_pair = (
    mgen_rabund
    .to_frame()
    .join(drplt_rabund, lsuffix='_m', rsuffix='_d')
    .fillna(1e-4)
)
log_ratio_by_sample = np.log(rabund_pair.rabund_d / rabund_pair.rabund_m).unstack('sample_id')

sns.regplot(x='SS01009', y='SS01057', data=log_ratio_by_sample)

## Do droplet frequencies resemble the full metagenomes?

In [None]:
# Libraries in which a single species holds more than half of the total tally.
species_dominated_libs = lib_stats.frac_dominant > 0.5

# Species with the highest mean depth in each of those libraries.
# NOTE(review): the filter is not restricted to lib_type == 'droplet'; confirm
# that no metagenome library passes the frac_dominant threshold.
dominant_species = (
    lib_cvrg
    .loc[idxwhere(species_dominated_libs)]
    .groupby('lib_id')
    .mean_depth
    .idxmax()
    # idxmax yields (lib_id, species_id) index tuples; keep the species part.
    .map(lambda key: key[1])
    .rename('species_id')
)

# Number of dominated libraries per (sample, species) ...
dominant_species_tally = dominant_species.groupby(lib.sample_id).value_counts()

# ... converted to a fraction of the sample's dominated libraries.
# `transform('sum')` keeps the (sample_id, species_id) index intact, whereas
# `groupby('sample_id').apply(lambda x: x / x.sum())` prepends a second
# `sample_id` level on recent pandas and breaks the joins on this Series.
frac_droplets = (
    dominant_species_tally
    / dominant_species_tally.groupby(level='sample_id').transform('sum')
).rename('frac')

In [None]:
# Metagenome relative abundance vs. fraction of dominated libraries, log axes.
# Species without a dominated library are placed at 1e-4.
rabund_and_frac = mgen_rabund.to_frame().join(frac_droplets).fillna(1e-4)

g = sns.JointGrid(
    data=rabund_and_frac,
    x='rabund',
    y='frac',
    hue='sample_id',
    palette=color_palette,
)
g.ax_joint.set(yscale='log', xscale='log')

g.plot_joint(sns.scatterplot, s=10, alpha=0.5)
g.plot_marginals(sns.kdeplot, common_norm=False)

identity_span = [1e-4, 1]
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

In [None]:
# Same comparison as the log-scaled version, drawn on linear axes.
rabund_and_frac = mgen_rabund.to_frame().join(frac_droplets).fillna(1e-4)

g = sns.JointGrid(
    data=rabund_and_frac,
    x='rabund',
    y='frac',
    hue='sample_id',
    palette=color_palette,
)

g.plot_joint(sns.scatterplot, s=10, alpha=0.5)
g.plot_marginals(sns.kdeplot, common_norm=False)

identity_span = [1e-4, 1.0]
g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

In [None]:
# NOTE(review): displays whatever `d` an earlier cell left behind (no cell in
# this section assigns it before this point) - likely leftover debugging.
d

In [None]:
# Metagenome relative abundance vs. fraction of dominated libraries, coloured
# by taxonomic class (c__). Species without a dominated library sit at 1e-8.
pal = construct_ordered_pallete(species.sort_values(['d__', 'p__', 'c__']).c__)

d = (
    mgen_rabund
    .to_frame()
    .join(frac_droplets)
    # Fill before joining the taxonomy so only the numeric columns are touched.
    .fillna(1e-8)
    .join(species)
    .sort_values(['d__', 'p__', 'c__'])
)

# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(d, hue='c__', height=5, palette=pal)
g.map(
    sns.scatterplot,
    'rabund',
    'frac',
)

g.ax.set_yscale('log')
g.ax.set_xscale('log')
g.ax.plot([1e-8, 1], [1e-8, 1], lw=1, linestyle='--', color='k')
plt.legend(bbox_to_anchor=(1, 1))

In [None]:
# log(frac / rabund) per species, sample SS01009 vs. SS01057, coloured by
# taxonomic class. Species without a dominated library get frac = 1e-4.
pal = construct_ordered_pallete(species.sort_values(['d__', 'p__', 'c__']).c__)

d = (
    mgen_rabund
    .to_frame()
    .join(frac_droplets)
    .fillna(1e-4)
    # Vectorized log ratio (replaces a row-by-row `apply(..., axis=1)`).
    .pipe(lambda x: np.log(x.frac / x.rabund))
    .unstack('sample_id')
    .join(species)
    .sort_values(['d__', 'p__', 'c__'])
)

# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(d, hue='c__', height=5, palette=pal)
g.map(
    sns.scatterplot,
    'SS01009',
    'SS01057',
)
# y = x reference line.
plt.plot([-5, 10], [-5, 10], linestyle='--', lw=1, color='k')

plt.legend(bbox_to_anchor=(1, 1))

In [None]:
# log(frac / rabund) per species, sample SS01009 vs. SS01057, coloured by
# taxonomic class. The inner join keeps only (sample, species) pairs that
# have at least one dominated library.
pal = construct_ordered_pallete(species.sort_values(['d__', 'p__', 'c__']).c__)

d = (
    mgen_rabund
    .to_frame()
    .join(frac_droplets, how='inner')
    .fillna(1e-4)
    # Vectorized log ratio (replaces a row-by-row `apply(..., axis=1)`).
    .pipe(lambda x: np.log(x.frac / x.rabund))
    .unstack('sample_id')
    .join(species)
    .sort_values(['d__', 'p__', 'c__'])
)

# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(d, hue='c__', height=5, palette=pal)
g.map(
    sns.scatterplot,
    'SS01009',
    'SS01057',
)
# y = x reference line.
plt.plot([-5, 10], [-5, 10], linestyle='--', lw=1, color='k')

plt.legend(bbox_to_anchor=(1, 1))

In [None]:
# NOTE(review): this cell repeats the preceding inner-join cell verbatim;
# confirm whether a variation was intended or the cell can be removed.
# log(frac / rabund) per species, sample SS01009 vs. SS01057, coloured by
# taxonomic class, restricted to pairs with at least one dominated library.
pal = construct_ordered_pallete(species.sort_values(['d__', 'p__', 'c__']).c__)

d = (
    mgen_rabund
    .to_frame()
    .join(frac_droplets, how='inner')
    .fillna(1e-4)
    # Vectorized log ratio (replaces a row-by-row `apply(..., axis=1)`).
    .pipe(lambda x: np.log(x.frac / x.rabund))
    .unstack('sample_id')
    .join(species)
    .sort_values(['d__', 'p__', 'c__'])
)

# `height` is the current name of FacetGrid's former `size` argument.
g = sns.FacetGrid(d, hue='c__', height=5, palette=pal)
g.map(
    sns.scatterplot,
    'SS01009',
    'SS01057',
)
# y = x reference line.
plt.plot([-5, 10], [-5, 10], linestyle='--', lw=1, color='k')

plt.legend(bbox_to_anchor=(1, 1))

### Do droplets have much strain admixture?

In [None]:
# Per (library, species, position) allele fractions: reference/alternative
# tallies divided by their sum, plus the larger of the two fractions.
lib_genotype_query = """
WITH
summed AS
    (
    SELECT *, reference_tally + alternative_tally AS total_tally
    FROM snp_x_lib
    ),
drplt_normalized AS
    (
    SELECT
        lib_id
      , species_id
      , species_position
      , 1.0 * reference_tally / total_tally AS reference_tally
      , 1.0 * alternative_tally / total_tally AS alternative_tally
      , total_tally
    FROM summed
    )
SELECT
  lib_id
, species_id
, species_position
, drplt_normalized.total_tally AS total_tally
, drplt_normalized.reference_tally AS reference_tally
, drplt_normalized.alternative_tally AS alternative_tally
, MAX(drplt_normalized.reference_tally, drplt_normalized.alternative_tally) AS max_allele_frac
FROM drplt_normalized
JOIN lib USING (lib_id)
"""
lib_genotype = pd.read_sql(
    lib_genotype_query,
    con=con,
    index_col=['lib_id', 'species_id', 'species_position'],
)
lib_genotype

In [None]:
# Major-allele-fraction distribution at positions with total_tally > 15:
# the two '.m' libraries vs. the species-dominated libraries.
d = lib_genotype[lambda x: x.total_tally > 15].max_allele_frac
bins = np.linspace(0.5, 1.0)  # not used below; the histograms use major_allele_frequency_bins

shared_hist_kwargs = dict(
    bins=major_allele_frequency_bins,
    alpha=0.5,
    density=True,
)
plt.hist(
    d.loc[['SS01009.m', 'SS01057.m']],
    color=color_palette['metagenome'],
    label='metagenome',
    **shared_hist_kwargs,
)
plt.hist(
    d.loc[idxwhere(species_dominated_libs)],
    color=color_palette['droplet'],
    label='droplet',
    **shared_hist_kwargs,
)

plt.legend()
plt.yscale('log')
plt.ylim(1e-4)
None

### Are droplets representative of the associated metagenomes?

In [None]:
# Allele tallies summed over all droplet libraries of a sample, per position.
summed_drplt_query = """
SELECT
  sample_id
, species_id
, species_position
, SUM(reference_tally) AS reference_tally
, SUM(alternative_tally) AS alternative_tally
FROM snp_x_lib
JOIN lib USING (lib_id)
WHERE lib_type = 'droplet'
GROUP BY sample_id, species_id, species_position
"""
summed_drplt_tallies = pd.read_sql(
    summed_drplt_query,
    con=con,
    index_col=['sample_id', 'species_id', 'species_position'],
)

# Add the pooled depth and the fraction held by the more common allele.
summed_drplt_genotype = summed_drplt_tallies.assign(
    total_tally=lambda x: x.reference_tally + x.alternative_tally,
).assign(
    max_allele_frac=lambda x: (
        x[['reference_tally', 'alternative_tally']].max(axis=1) / x.total_tally
    ),
)

In [None]:
# Metagenome genotypes re-keyed by sample: take the two '.m' libraries from
# lib_genotype, relabel them with the sample IDs, rename the `lib_id` index
# level to `sample_id`, and rename the (already normalized) tally columns to
# *_frac so they line up with virtual_mgen_genotype.
mgen_genotype = (
    lib_genotype
    .loc[['SS01009.m', 'SS01057.m']]
    .rename({'SS01009.m': 'SS01009', 'SS01057.m': 'SS01057'})
    .rename_axis(index={'lib_id': 'sample_id'})
    .rename(columns={'reference_tally': 'reference_frac', 'alternative_tally': 'alternative_frac'})
)
# FIXME: Careful! This only works because there is exactly one metagenome
# library per sample. With several libraries per sample the tallies must be
# aggregated across libraries first.

# TODO: Build this the same way as summed_drplt_genotype, but for metagenomes.
# Unfinished sketch of that aggregation:

# (
#     lib_genotype
#     .loc[['SS01009.m', 'SS01057.m']]
#     .join(lib)
#     .groupby(['sample_id', 'species_id', 'species_position'])
#     .sum()
#     [['total_tally', 'max_allele_frac']]
# )

mgen_genotype

In [None]:
# "Virtual metagenome": per (sample, species, position), the unweighted mean
# of the per-library allele fractions. `total_tally` here is the number of
# libraries contributing to the position, not a read count.
#
# The query is restricted to droplet libraries (matching the `drplt_combined`
# view). Without the filter, each sample's real metagenome library was averaged
# into the virtual metagenome that is later compared against that same library.
virtual_mgen_query = """
WITH
position_total AS
(
    SELECT *, reference_tally + alternative_tally AS total_tally
    FROM snp_x_lib
),

drplt_genotype AS
(
    SELECT *
  , 1.0 * reference_tally / total_tally AS reference_frac
  , 1.0 * alternative_tally / total_tally AS alternative_frac
    FROM position_total
)

SELECT
    sample_id
  , species_id
  , species_position
  , AVG(reference_frac) AS reference_frac
  , AVG(alternative_frac) AS alternative_frac
  , COUNT(*) AS total_tally
FROM drplt_genotype
JOIN lib USING (lib_id)
WHERE lib_type = 'droplet'
GROUP BY sample_id, species_id, species_position
;
"""
virtual_mgen_genotype = pd.read_sql(
    virtual_mgen_query,
    con=con,
    index_col=['sample_id', 'species_id', 'species_position'],
).assign(
    # Fraction held by the more common of the two averaged alleles.
    max_allele_frac=lambda x: x[['reference_frac', 'alternative_frac']].max(axis=1)
)

In [None]:
# The 20 (sample, species) pairs with the highest mean total_tally per position.
(
    virtual_mgen_genotype
    .groupby(['sample_id', 'species_id'])
    .total_tally
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Libraries dominated by one species (> 50% of the tally) where that species
# has a tally above 100.
well_covered_dominated = lib_stats[
    lambda x: (x.frac_dominant > 0.5) & (x.max_species_tally > 100)
]

# Count such libraries per dominant species, one column per sample.
dominant_tally_by_sample = (
    well_covered_dominated
    .groupby(lib.sample_id)
    .species_id
    .value_counts()
    .rename('tally')
    .unstack('sample_id', fill_value=0)
)

# Top 40 species by count across samples, annotated with taxonomy.
(
    dominant_tally_by_sample
    .assign(total=dominant_tally_by_sample.sum(axis=1))
    .join(species)
    .sort_values('total', ascending=False)
    .head(40)
)

In [None]:
# Sample / species pair used by the focused per-species plots that follow.
sample_of_interest = 'SS01009'
species_of_interest = '104345'  # species_id as a string, matching the str-typed taxonomy index

In [None]:
def _plot_major_allele_hist(values, source):
    """Draw one density histogram, coloured and labelled by `source`."""
    plt.hist(
        values,
        bins=major_allele_frequency_bins,
        density=True,
        alpha=0.5,
        color=color_palette[source],
        label=source,
    )


focal_key = (sample_of_interest, species_of_interest)
focal_levels = ('sample_id', 'species_id')

# Droplet libraries of the focal sample that have genotype rows.
focal_droplet_libs = idxwhere(
    (lib.lib_type == 'droplet') &
    (lib.sample_id == sample_of_interest) &
    lib.index.isin(lib_genotype.index.get_level_values('lib_id'))
)
droplet_maf = (
    lib_genotype
    .loc[focal_droplet_libs]
    [lambda x: x.total_tally > 10]
    .xs(species_of_interest, level='species_id')
    .max_allele_frac
)
mgen_maf = (
    mgen_genotype
    .xs(focal_key, level=focal_levels)
    [lambda x: x.total_tally > 10]
    .max_allele_frac
)
virtual_mgen_maf = (
    virtual_mgen_genotype
    .xs(focal_key, level=focal_levels)
    [lambda x: x.total_tally > 10]
    .max_allele_frac
)

_plot_major_allele_hist(droplet_maf, 'droplet')
_plot_major_allele_hist(mgen_maf, 'metagenome')
_plot_major_allele_hist(virtual_mgen_maf, 'virtual-metagenome')

plt.yscale('log')
plt.legend()

In [None]:
# Reference-allele fraction in the metagenome (_m) vs. the virtual
# metagenome (_d) for the focal sample/species.
min_value = 1e-6
identity_span = [min_value, 1]

focal_key = (sample_of_interest, species_of_interest)
focal_levels = ('sample_id', 'species_id')

mgen_focal = mgen_genotype.xs(focal_key, level=focal_levels)
virtual_focal = virtual_mgen_genotype.xs(focal_key, level=focal_levels)

d = mgen_focal.join(virtual_focal, how='inner', lsuffix='_m', rsuffix='_d')
# NOTE(review): max_allele_frac is the larger of two fractions, so the
# `> 0.05` bound looks like it can never exclude anything - confirm intent.
d = d[
    (d.total_tally_m > 100)
    & (d.max_allele_frac_m > 0.05)
    & (d.max_allele_frac_m < 0.95)
]

g = sns.JointGrid(
    data=d,
    x='reference_frac_m',
    y='reference_frac_d',
    palette=color_palette,
)

g.plot_joint(sns.regplot, line_kws={'color': 'k'})
g.plot_marginals(sns.kdeplot, common_norm=False)

g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')

d

In [None]:
# Major-allele-fraction distribution at positions with total_tally > 15:
# metagenome vs. virtual metagenome, all samples and species pooled.
bins = np.linspace(0.5, 1.0)  # not used below; the histograms use major_allele_frequency_bins

shared_hist_kwargs = dict(
    bins=major_allele_frequency_bins,
    alpha=0.5,
    density=True,
)
plt.hist(
    mgen_genotype[lambda x: x.total_tally > 15].max_allele_frac,
    color=color_palette['metagenome'],
    label='metagenome',
    **shared_hist_kwargs,
)
plt.hist(
    virtual_mgen_genotype[lambda x: x.total_tally > 15].max_allele_frac,
    color=color_palette['virtual-metagenome'],
    label='virtual-metagenome',
    **shared_hist_kwargs,
)

plt.legend()
plt.yscale('log')
plt.ylim(1e-4)
None

In [None]:
# Major-allele-fraction distribution at positions with total_tally > 15 for
# the metagenome, the virtual metagenome and the species-dominated libraries.
bins = np.linspace(0.5, 1.0)  # not used below; the histograms use major_allele_frequency_bins

maf_by_source = [
    (
        'metagenome',
        mgen_genotype[lambda x: x.total_tally > 15].max_allele_frac,
    ),
    (
        'virtual-metagenome',
        virtual_mgen_genotype[lambda x: x.total_tally > 15].max_allele_frac,
    ),
    (
        'droplet',
        (
            lib_genotype
            .loc[idxwhere(species_dominated_libs)]
            [lambda x: x.total_tally > 15]
            .max_allele_frac
        ),
    ),
]
for source, maf in maf_by_source:
    plt.hist(
        maf,
        bins=major_allele_frequency_bins,
        color=color_palette[source],
        alpha=0.5,
        density=True,
        label=source,
    )

plt.legend()
plt.yscale('log')
plt.ylim(1e-4)
None

In [None]:
# Reference-allele fraction in the metagenome (_m) vs. the virtual
# metagenome (_d), all samples and species pooled, drawn as a hexbin.
min_value = 1e-6
identity_span = [min_value, 1]

d = mgen_genotype.join(virtual_mgen_genotype, how='inner', lsuffix='_m', rsuffix='_d')
d = d[
    (d.total_tally_m > 100)
    & (d.max_allele_frac_m > 0.05)
    & (d.max_allele_frac_m < 0.95)
]

g = sns.jointplot(
    data=d,
    x='reference_frac_m',
    y='reference_frac_d',
    kind='hex',
    palette=color_palette,
)

# Overlay KDE curves on the marginal axes.
g.plot_marginals(sns.kdeplot, common_norm=False)

g.ax_joint.plot(identity_span, identity_span, lw=1, linestyle='--', color='k')