In [None]:
# # Run this for CoLab setup
# !rm -r sample_data .config
# !git clone https://github.com/bsmith89/gtpro-strain-factorization .
# !wget -O core.sp-102506.gtpro-pileup.filt.nc https://www.dropbox.com/s/9ortwudkk81n3x0/core.sp-102506.gtpro-pileup.filt.nc?dl=0
# !wget -O shi2019s13.tsv https://www.dropbox.com/s/gz2cn1vecsx3q05/shi2019s13.tsv?dl=0
# !pip install pyro-ppl

## Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sfacts as sf
from sfacts.model_zoo import (
    full_metagenotype,
    full_metagenotype_pp_pi,
    full_metagenotype_dirichlet_rho,
    full_metagenotype_special_meta
)

In [None]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy as sp
import pyro
import pyro.distributions as dist
import torch
from functools import partial
from tqdm import tqdm
import xarray as xr
import warnings
from torch.jit import TracerWarning

In [None]:
# Render figures at 100 dots per inch.
mpl.rc('figure', dpi=100)

In [None]:
# Hide the TracerWarning about constant tensors being registered in the trace.
# `message` is a regular expression matched against the start of the warning text.
warnings.filterwarnings(
    action="ignore",
    message="torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.",
    category=TracerWarning,
    # FIXME: also narrow the filter by module/lineno once the correct module
    # regex is known (earlier attempt: module="trace_elbo", lineno=5).
)

## Prototype

### Load Data

In [None]:
species_id = 100022

# Seed NumPy's global RNG before any data handling
# (original note: "Sanity check on sfacts/data.py").
np.random.seed(1)

# Full metagenotype table for the selected species, loaded without validation.
mgen_all = sf.data.Metagenotypes.load(
    f'data/core.sp-{species_id}.gtpro-pileup.nc',
    validate=False,
)

# Keep positions / samples passing the 0.05 thresholds of the two selectors.
mgen_filt = mgen_all.select_variable_positions(thresh=0.05)
mgen_filt = mgen_filt.select_samples_with_coverage(0.05)

# Report dimension sizes before and after filtering, then drop the
# unfiltered table so it can be garbage collected.
print(mgen_all.sizes)
print(mgen_filt.sizes)
del mgen_all

### Fitting

In [None]:
nstrain = 1000
# Fit on at most 500 positions (fewer if the filtered data has fewer).
nposition = min(int(5e2), mgen_filt.sizes['position'])

# Prefer the GPU, but fall back to the CPU so the cell still runs on machines
# where torch cannot see a CUDA device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Alternatives tried in earlier revisions of this cell:
#   - workflow: sf.workflow.fit_metagenotypes_simple
#   - model:    full_metagenotype_special_meta.model_structure
# The second return value (previously unpacked as `(genotypes, missingness)`)
# is not used below.
est, _ = (
    sf.workflow.fit_subsampled_metagenotype_collapse_strains_then_iteratively_refit_full_genotypes(
        full_metagenotype_dirichlet_rho.model_structure,
        mgen_filt.random_sample(position=nposition),
        nstrain=nstrain,
        nposition=nposition,
        hyperparameters=dict(
            gamma_hyper=0.1,
            delta_hyper_r=0.8,
            delta_hyper_temp=0.1,
            rho_hyper=1e-1,
            pi_hyper=1e0,
            epsilon_hyper_mode=0.01,
            epsilon_hyper_spread=1.5,
            alpha_hyper_hyper_mean=1000.0,
            alpha_hyper_hyper_scale=0.5,
            alpha_hyper_scale=1.0,
        ),
        diss_thresh=0.05,
        coverage_thresh=1e-4,
        stage2_hyperparameters=dict(gamma_hyper=1.0),
        device=device,
        dtype=torch.float64,
        quiet=False,
        estimation_kwargs=dict(
            lagA=10,
            lagB=100,
            opt=pyro.optim.Adamax({"lr": 1e-0}, {"clip_norm": 100}),
            seed=2,
            jit=True,
        )
    )
)

In [None]:
# Largest community value of each strain over all samples, ranked high to low.
strain_peak = est.data.communities.max("sample").to_series()
plt.plot(strain_peak.sort_values(ascending=False).to_numpy())
plt.xlim(-5, 200)

In [None]:
# Largest community value of each sample over all strains, ranked high to low.
sample_peak = est.data.communities.max("strain").to_series()
plt.plot(sample_peak.sort_values(ascending=False).to_numpy())

In [None]:
# Per-strain sum over samples of (community value x mu), ranked high to low,
# on a log axis with a reference line at 1.
strain_total = (est.data.communities * est.data.mu).sum("sample").to_series()
plt.plot(strain_total.sort_values(ascending=False).to_numpy())
plt.yscale('log')
plt.axhline(1.)

### Viz.

In [None]:
# Community plot of the fitted world. Each *_func argument is a callback that
# receives a world `w` and returns the annotation or linkage to use.
sf.plot.plot_community(
    est,
    # Column annotations: sqrt of the per-sample mean (over positions) of the
    # allele-summed metagenotype counts. Other candidates are kept commented out.
    col_colors_func=lambda w: xr.Dataset(dict(
#         mgen_entropy=w.metagenotypes.entropy(),
#         expect_entropy=w.data['p'].pipe(sf.math.binary_entropy).mean("position"),
        mean_cvrg=w.metagenotypes.sum("allele").mean("position").pipe(np.sqrt),
#         m_hyper_r=w.data['m_hyper_r'],
#         alpha=w.data['alpha'].pipe(np.sqrt),
#         flag=(w.data".alpha < 10) & (w.metagenotypes.sum("allele").mean("position") > 20),
    )),
    # Row annotations: genotype entropy, and sqrt of the community-weighted
    # sum over samples of the same per-sample mean count.
    row_colors_func=lambda w: xr.Dataset(dict(
        entropy=w.genotypes.entropy(),
        mean_cvrg=(w.communities.data * w.metagenotypes.sum("allele").mean("position")).sum("sample").pipe(np.sqrt),
    )),
#     row_linkage_func=lambda w: w.genotypes.cosine_linkag(),
    col_linkage_func=lambda w: sf.data.latent_metagenotypes_linkage(w),
    row_linkage_func=lambda w: w.genotypes.linkage(),
    # Power-law colour normalisation with exponent 1/3.
    norm=mpl.colors.PowerNorm(1/3),
)

Samples of interest:
- One of (in priority order) DS0097_035, SS01009, SS01084, SS01073, SS01006
- SS01172
- SS01057
- One of (in priority order) SS01055, DS0485_002, SS01165
- SS01047
- One of (in priority order) SS01017, SS01080, SS01037
- SS01131

In [None]:
# Genotype plot with one row annotation: the cube root of each strain's sum
# over samples of (community value x mu).
# The callback uses the world `w` it is handed rather than the global `est`,
# so the annotation always describes the object actually being plotted.
sf.plot.plot_genotype(
    est,
    row_colors_func=lambda w: xr.Dataset(dict(
        cvrg=(w.data.communities * w.data.mu).sum("sample").pipe(np.cbrt),
    )),
    row_linkage_func=lambda w: w.genotypes.linkage(),
)

In [None]:
# Missingness plot with one row annotation: the cube root of each strain's sum
# over samples of (community value x mu). Rows are ordered by cosine linkage
# of the genotypes.
# The callback uses the world `w` it is handed rather than the global `est`,
# so the annotation always describes the object actually being plotted.
sf.plot.plot_missing(
    est,
    row_colors_func=lambda w: xr.Dataset(dict(
        cvrg=(w.data.communities * w.data.mu).sum("sample").pipe(np.cbrt),
    )),
    row_linkage_func=lambda w: w.genotypes.cosine_linkage(),
)

In [None]:
# Metagenotype plot of the fitted world; columns are ordered by the linkage
# returned from sf.data.latent_metagenotypes_linkage.
sf.plot.plot_metagenotype(
    est,
    col_linkage_func=lambda w: sf.data.latent_metagenotypes_linkage(w),
)

In [None]:
# Distribution of each strain's largest community value over samples.
plt.hist(est.data["communities"].max("sample"), bins=50)

In [None]:
# Distribution of log10(alpha).
plt.hist(np.log10(est.data["alpha"]), bins=21)

In [None]:
# Distribution of log10(epsilon).
plt.hist(np.log10(est.data["epsilon"]), bins=21)

In [None]:
# Distribution of log10(m_hyper_r).
plt.hist(np.log10(est.data["m_hyper_r"]), bins=21)

In [None]:
# m_hyper_r against alpha on log-log axes, coloured by mu.
plt.scatter(
    est.data["m_hyper_r"],
    est.data["alpha"],
    c=est.data["mu"],
    s=5,
    alpha=0.5,
)
plt.xscale('log')
plt.yscale('log')

### Big Data Viz.

In [None]:
# Genotype plot for the full fit, drawn with scaley=0.02.
# Row annotations:
#   e - reciprocal of the genotype entropy
#   c - cube root of the per-strain sum over samples of (mu x community value)
sf.plot.plot_genotype(
    est,
    scaley=0.02,
    row_linkage_func=lambda w: w.genotypes.cosine_linkage(),
    row_colors_func=lambda w: xr.Dataset(dict(
        e=1 / w.genotypes.entropy(),
        c=(w.data.mu * w.data.communities).sum("sample").pipe(np.cbrt),
    ))
)

In [None]:
# Community plot on a random subsample of 500 samples.
# NOTE(review): random_sample is not given a seed here, so the subsample
# presumably depends on the current RNG state - confirm.
sf.plot.plot_community(
    est.random_sample(sample=500),
    row_linkage_func=lambda w: w.genotypes.cosine_linkage(),
    col_linkage_func=lambda w: w.communities.linkage(dim="sample", quiet=False),
)

In [None]:
# For every sample take the strain with the largest community value, then list
# the 20 strains that come out on top in the most samples.
est.communities.data.idxmax("strain").to_series().value_counts().head(20)

In [None]:
# Clustered heatmap of the transposed genotype table on a diverging 0..1 scale
# centred at 0.5; columns follow the genotype linkage.
sns.clustermap(
    est.genotypes.to_pandas().T,
    cmap='coolwarm',
    center=0.5,
    vmin=0,
    vmax=1,
    col_linkage=est.genotypes.linkage(quiet=False),
)

In [None]:
# Distribution of each sample's largest community value, in 50 bins over [0, 1].
plt.hist(
    est.communities.max("strain"),
    bins=np.linspace(0, 1, 51),
)

In [None]:
# Largest community value per sample against the per-sample mean (over
# positions) of allele-summed counts, coloured by log(mu).
plt.scatter(
    est.communities.max("strain"),
    est.metagenotypes.sum("allele").mean("position"),
    c=np.log(est.data['mu']),
    s=1,
)
plt.yscale('log')

In [None]:
# Joint hexbin density of each sample's largest community value (x) against
# its metagenotype entropy (y).
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.communities.max("strain"),
    y=est.metagenotypes.entropy("sample"),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/5),
)

In [None]:
# Per-sample mean (over positions) of allele-summed counts.
max_inferred_sample_coverage = est.metagenotypes.sum('allele').mean('position')
# The same quantity weighted by the community values and summed over samples.
total_inferred_strain_coverage = (max_inferred_sample_coverage * est.communities.data).sum('sample')

bins = np.linspace(-2, 4, num=51)

# Only strictly positive values can be log-transformed.
positive_sample_coverage = max_inferred_sample_coverage[max_inferred_sample_coverage > 0]
plt.hist(np.log10(positive_sample_coverage), bins=bins, density=True, alpha=1.)
# plt.hist(total_inferred_strain_coverage.pipe(np.log10), density=True, bins=bins, alpha=0.7)

plt.ylabel('density')
plt.xlabel('log10(mean species/sample coverage)')
None

In [None]:
# Per-sample mean (over positions) of allele-summed counts.
max_inferred_sample_coverage = est.metagenotypes.sum('allele').mean('position')
# The same quantity weighted by the community values and summed over samples.
total_inferred_strain_coverage = (max_inferred_sample_coverage * est.communities.data).sum('sample')

bins = np.linspace(-2, 4, num=51)

# Overlay both distributions on a shared set of log10 bins.
positive_sample_coverage = max_inferred_sample_coverage[max_inferred_sample_coverage > 0]
plt.hist(np.log10(positive_sample_coverage), bins=bins, density=True, alpha=1.0)
plt.hist(np.log10(total_inferred_strain_coverage), bins=bins, density=True, alpha=0.7)

plt.ylabel('density')
plt.xlabel('log10(mean species/sample coverage)')
None

In [None]:
# Distribution of log10(total inferred strain coverage).
plt.hist(np.log10(total_inferred_strain_coverage), bins=50)
None

In [None]:
# For each strain, the largest single-sample value of
# (per-sample mean count x community value).
sample_mean_count = est.metagenotypes.sum('allele').mean('position')
max_single_sample_inferred_strain_coverage = (sample_mean_count * est.communities.data).max('sample')
plt.hist(np.log10(max_single_sample_inferred_strain_coverage), bins=50)
None

In [None]:
# Largest single-sample strain coverage against total strain coverage on
# log-log axes, with the y = x line for reference.
plt.scatter(
    max_single_sample_inferred_strain_coverage,
    total_inferred_strain_coverage,
    s=5,
)
plt.plot([0, 1e3], [0, 1e3])
plt.xscale('log')
plt.yscale('log')

In [None]:
# Clustered heatmap of the transposed missingness table on a fixed 0..1 scale.
sns.clustermap(
    est.missingness.to_pandas().transpose(),
    vmin=0,
    vmax=1,
)

In [None]:
# Clustered heatmap of the community table with a square-root colour scale.
# The 0..1 limits are set on the norm itself: recent seaborn releases drop
# vmin/vmax whenever `norm` is supplied, which would otherwise let the scale
# follow the data range. vmin/vmax are still passed for older releases.
sns.clustermap(
    est.communities.to_pandas(),
    vmin=0,
    vmax=1,
    norm=mpl.colors.PowerNorm(1/2, vmin=0, vmax=1),
)

In [None]:
# Each strain's largest community value over samples, in 50 bins over [0, 1].
plt.hist(
    est.communities.max('sample'),
    bins=np.linspace(0, 1, 51),
)
None

In [None]:
# All genotype entries pooled together, in 50 bins over [0, 1].
plt.hist(
    est.genotypes.values.ravel(),
    bins=np.linspace(0, 1, 51),
)
None

In [None]:
# All missingness entries pooled together, in 50 bins over [0, 1].
plt.hist(
    est.missingness.values.ravel(),
    bins=np.linspace(0, 1, 51),
)
None

### What sorts of wonky things does this big-data fit show?

In [None]:
# log(mu) against each sample's largest community value: hexbin joint density
# followed by a LOWESS trend line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.data.mu.pipe(np.log),
    y=est.communities.max("strain"),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/2),
)
sns.regplot(
    x=est.data.mu.pipe(np.log),
    y=est.communities.max("strain"),
    lowess=True,
    scatter=False,
)

In [None]:
# log(mu) against the log of the per-sample mean (over positions) of
# allele-summed counts, as a hexbin joint density.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.data.mu.pipe(np.log),
    y=est.metagenotypes.sum("allele").mean("position").pipe(np.log),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/2),
)

In [None]:
# log(mu) against log(alpha): hexbin joint density followed by a LOWESS trend line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.data['mu'].pipe(np.log),
    y=est.data['alpha'].pipe(np.log),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/2),
)
sns.regplot(
    x=est.data['mu'].pipe(np.log),
    y=est.data['alpha'].pipe(np.log),
    lowess=True,
    scatter=False,
)

In [None]:
# Each sample's largest community value against log(alpha): hexbin joint
# density followed by a LOWESS trend line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.communities.max("strain"),
    y=est.data['alpha'].pipe(np.log),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/2),
)
sns.regplot(
    x=est.communities.max("strain"),
    y=est.data['alpha'].pipe(np.log),
    lowess=True,
    scatter=False,
)

In [None]:
# Each sample's largest community value against the square root of its
# metagenotype entropy: hexbin joint density followed by a LOWESS trend line.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and the keyword form also works on older releases.
sns.jointplot(
    x=est.communities.max("strain"),
    y=est.metagenotypes.entropy('sample').pipe(np.sqrt),
    kind='hex',
    norm=mpl.colors.PowerNorm(1/2),
)
sns.regplot(
    x=est.communities.max("strain"),
    y=est.metagenotypes.entropy('sample').pipe(np.sqrt),
    lowess=True,
    scatter=False,
)

### Biogeography

In [None]:
# Tab-separated sample metadata, keyed by NCBI accession number.
sample_meta = (
    pd.read_csv('raw/shi2019s13.tsv', sep='\t')
    .set_index('NCBI Accession Number')
)
# Number of metadata rows per (study, continent) pair.
sample_meta.groupby(['Study', 'Continent']).apply(len)

In [None]:
# Studies used for the restricted biogeography analyses below.
select_studies = [
    'CM_madagascar',
    'Bengtsson-PalmeJ_2015',
    'FengQ_2015',
    'LiJ_2017',
    'LomanNJ_2013',
]

In [None]:
# Construct composition matrix for samples with biogeography data

# Community table, with samples on the index.
composition = est.communities.to_pandas()

# Metadata lined up on the composition's samples; rows with no 'Sample ID'
# (i.e. samples absent from the metadata) are dropped.
meta = (
    sample_meta
    .reindex(index=composition.index)
    .dropna(subset=['Sample ID'])
)

# Composition restricted to the samples that have metadata.
composition_bg = composition.reindex(index=meta.index)

In [None]:
from sfacts.pandas_util import idxwhere

# Composition rows belonging to the VatanenT_2016 study.
d = composition_bg.loc[meta['Study'].isin(['VatanenT_2016'])]
# Strains whose community value exceeds 0.5 in more than one of those samples.
strains = idxwhere((d > 0.5).sum() > 1)

# sf.plot.plot_community(
#     d.loc[:, strains],
#     yticklabels=1,
#     norm=mpl.colors.PowerNorm(1/3),
# )

In [None]:
# TODO: This is a giant contingency table,
# and the p-value on a chisq test shows clearly that strains clump
# into countries.

# Country x strain counts of how often each strain has the largest community
# value in a sample.
contingency = (
    composition_bg
    .groupby(meta['Country'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)

# Permutation null: relabel the rows with a shuffled copy of the sample index
# so compositions are paired with random countries. The shuffle is seeded so
# the sanity check below does not depend on the RNG state left by other cells.
null_contingency = (
    composition_bg
    .set_index(composition_bg.sample(frac=1.0, random_state=0).index)
    .groupby(meta['Country'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)
assert sp.stats.chi2_contingency(null_contingency)[1] > 0.01, \
    "permuted (null) table unexpectedly shows country structure"

print(sp.stats.chi2_contingency(contingency))

In [None]:
# Same analysis, but carefully selecting studies that I don't believe have
# multiple metagenomes from same/related individuals.

# Country x strain counts of the top strain per sample, selected studies only.
contingency2 = (
    composition_bg
    [meta['Study'].isin(select_studies)]
    .groupby(meta['Country'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)

# Permutation null within the selected studies. The shuffle is seeded so the
# sanity check below does not depend on the RNG state left by other cells.
null_contingency2 = (
    composition_bg
    [meta['Study'].isin(select_studies)]
    .set_index(
        composition_bg[meta['Study'].isin(select_studies)]
        .sample(frac=1.0, random_state=0)
        .index
    )
    .groupby(meta['Country'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)
assert sp.stats.chi2_contingency(null_contingency2)[1] > 0.01, \
    "permuted (null) table unexpectedly shows country structure"

print(sp.stats.chi2_contingency(contingency2))

In [None]:
# Same analysis, but carefully selecting studies that I don't believe have
# multiple metagenomes from same/related individuals.
# And clustering by study rather than country.

# Study x strain counts of the top strain per sample, selected studies only.
contingency3 = (
    composition_bg
    [meta['Study'].isin(select_studies)]
    .groupby(meta['Study'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)

# Permutation null within the selected studies. The shuffle is seeded so the
# sanity check below does not depend on the RNG state left by other cells.
null_contingency3 = (
    composition_bg
    [meta['Study'].isin(select_studies)]
    .set_index(
        composition_bg[meta['Study'].isin(select_studies)]
        .sample(frac=1.0, random_state=0)
        .index
    )
    .groupby(meta['Study'])
    .apply(lambda d: d.idxmax(axis=1).value_counts())
    .unstack(fill_value=0)
)
assert sp.stats.chi2_contingency(null_contingency3)[1] > 0.01, \
    "permuted (null) table unexpectedly shows study structure"

print(sp.stats.chi2_contingency(contingency3))

In [None]:
# Number of metadata rows in each of the selected studies.
(
    meta
    .loc[meta['Study'].isin(select_studies)]
    .groupby('Study')
    .apply(len)
)

In [None]:
count_individuals = meta[meta['Study'].isin(select_studies)].groupby('Country').apply(len)

# Turn each country's counts into fractions of that country's samples.
country_fraction = contingency2.div(contingency2.sum(axis=1), axis=0)

# The 20 strains with the highest mean fraction across countries.
top_20_strains = (
    country_fraction
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .index
)

# Stacked bars for a fixed list of countries, one tab20 colour per strain.
tab20_colors = mpl.cm.tab20(np.linspace(0, 1, num=20))
ax = (
    country_fraction
    .loc[['CHN', 'MDG', 'AUT', 'DEU', 'SWE'], top_20_strains]
    .plot
    .bar(stacked=True, color=tab20_colors)
)
#ax.legend_.set_visible(False)
ax.legend(title='Top 20 Strains', bbox_to_anchor=(1, 1))

ax.set_ylabel('Fraction samples where dominant')

In [None]:
# Country x study table of sample counts, restricted to the selected studies.
(
    meta
    .groupby(['Study', 'Country'])
    .apply(len)
    .unstack(fill_value=0)
    .loc[select_studies]
    .transpose()
)

In [None]:
count_individuals = meta[meta['Study'].isin(select_studies)].groupby('Country').apply(len)

# Turn each study's counts into fractions of that study's samples.
study_fraction = contingency3.div(contingency3.sum(axis=1), axis=0)

# The 20 strains with the highest mean fraction across studies.
top_20_strains = (
    study_fraction
    .mean()
    .sort_values(ascending=False)
    .head(20)
    .index
)

# Stacked bars for every study, one tab20 colour per strain.
tab20_colors = mpl.cm.tab20(np.linspace(0, 1, num=20))
ax = (
    study_fraction
    .loc[:, top_20_strains]
    .plot
    .bar(stacked=True, color=tab20_colors)
)
#ax.legend_.set_visible(False)
ax.legend(title='Top 20 Strains', bbox_to_anchor=(1, 1))

ax.set_ylabel('Fraction samples where dominant')

In [None]:
from lib.plot import rotate_xticklabels

# Sample count for every (continent, country, study) group.
count_individuals = meta.groupby([meta['Continent'], meta['Country'], meta['Study']]).apply(len)

# Per group, the fraction of samples in which each strain has the largest
# community value.
dominant_counts = (
    composition_bg
    .groupby([meta['Continent'], meta['Country'], meta['Study']])
    .apply(lambda d: d.idxmax(1).value_counts())
    .unstack(fill_value=0)
    .sort_index()
)
d = dominant_counts.apply(lambda x: x / x.sum(), axis=1)

# The 15 strains with the highest mean fraction across groups.
top_strains = d.mean().sort_values(ascending=False).head(15).index

# Keep the top strains, pool everything else into `other`, and drop groups
# with fewer than 10 samples.
top_fraction = d.loc[:, top_strains]
d = (
    top_fraction
    .assign(other=1 - top_fraction.sum(1))
    .drop(idxwhere(count_individuals < 10))
)

ax = d.plot.bar(
    stacked=True,
    figsize=(10, 5),
    color=mpl.cm.tab20(np.linspace(0, 1, num=20)),
)
#ax.legend_.set_visible(False)
ax.legend(title='Top Strains', bbox_to_anchor=(1, 1))

ax.set_ylabel('Fraction samples where dominant')
rotate_xticklabels()

In [None]:
# Community table grouped by the continent recorded for each sample.
communities_by_continent = est.communities.data.to_pandas().groupby(meta['Continent'])

# Per continent and strain: number of samples above 0.5, fraction of samples
# above 0.5, and fraction of samples above 0.1.
continent_dominance_total = communities_by_continent.apply(lambda d: (d > 0.5).sum())
continent_dominance_frequency = communities_by_continent.apply(lambda d: (d > 0.5).mean())
continent_detect_frequency = communities_by_continent.apply(lambda d: (d > 0.1).mean())


In [None]:
# Among strains that exceed 0.5 in at least one sample anywhere, find the
# continent where each is most frequently above 0.5, then count strains per
# continent.
continent_dominance_frequency.loc[:, lambda x: x.sum() != 0].idxmax().value_counts()

In [None]:
# Display colour for each continent code.
continent_colormap = {
    'AF': 'green',
    'EU': 'blue',
    'AS': 'purple',
    'OC': 'pink',
    'SA': 'orange',
}
# A strain is flagged for a continent when its count there exceeds this value.
thresh = 0

# One colour column per continent (the continent's colour where the strain's
# count there exceeds `thresh`, white otherwise), plus a column coloured by
# the continent with the highest detection frequency (white when the strain
# is detected nowhere).
genotype_colors = pd.DataFrame({
    **{
        continent: (
            (continent_dominance_total.loc[continent] > thresh)
            .map({False: 'white', True: continent_colormap[continent]})
        )
        for continent in ('AF', 'EU', 'AS', 'OC', 'SA')
    },
    'highest_detect_frequency': (
        continent_detect_frequency
        .idxmax()
        .map(continent_colormap)
        .where(continent_detect_frequency.sum() != 0, 'white')
    ),
})


# Large clustered heatmap of the transposed genotype table with the
# per-continent colour bars from `genotype_colors` on the columns.
cgrid = sns.clustermap(
    est.genotypes.to_pandas().T,
    vmin=0, vmax=1, center=0.5, cmap='coolwarm',
    # Average linkage without optimal leaf ordering.
    col_linkage=est.genotypes.linkage(quiet=False, method='average', optimal_ordering=False),
    col_colors=genotype_colors,
    figsize=(35, 15),
    dendrogram_ratio=(0.01, 0.75),
    # Tick labels are switched off on both axes.
    xticklabels=0, yticklabels=0,
    tree_kws=dict(lw=1.5),
)
# Hide the colour bar.
cgrid.ax_cbar.set_visible(False)

In [None]:
# Display name for each species id.
species_names = {
    102506: 'E. coli',
    100022: 'F. prausnitzii',
    102492: 'A. rectalis',
    100113: 'S. salivarius',
}

# Saved fit for every listed species, loaded without validation.
estimates = {
    spid: sf.data.World.load(
        f'data/core.sp-{spid}.gtpro-pileup.sfacts.nc',
        validate=False,
    )
    for spid in species_names
}

In [None]:
def rarifaction(world, thresh=0.5, shuffle=False, random_state=None):
    """Cumulative count of distinct columns seen above `thresh`, row by row.

    The table from ``world.communities.to_pandas()`` is thresholded, rows with
    no value above `thresh` are dropped, and for every remaining row (in
    order) the number of columns that exceeded `thresh` in that row or any
    earlier row is returned.

    Parameters
    ----------
    world
        Object exposing ``communities.to_pandas()`` that returns a DataFrame.
    thresh : float
        A value counts as present when it is strictly greater than this.
    shuffle : bool
        If true, put the rows in random order before accumulating.
    random_state : optional
        Passed to ``DataFrame.sample`` to make the shuffle reproducible.
        The default (None) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.Series
        Indexed by the retained row labels, in accumulation order.
    """
    present = world.communities.to_pandas() > thresh
    # axis must be passed by keyword: pandas 2.x rejects positional `any(1)`.
    present = present[present.any(axis=1)]
    if shuffle:
        present = present.sample(frac=1., replace=False, random_state=random_state)
    # A column has been seen once its running count of hits is positive.
    seen = present.cumsum() > 0
    return seen.sum(axis=1)

# One rarefaction curve per species, with rows taken in random order.
for spid, world in estimates.items():
    curve = rarifaction(world, shuffle=True)
    plt.plot(curve.values, label=species_names[spid])

# Dashed black y = x reference line.
plt.plot([1, 600], [1, 600], color='k', linestyle='--', lw=1)
plt.legend()
# plt.yscale('log')
# plt.xscale('log')
# Start both axes at 1.
plt.ylim(1)
plt.xlim(1)

### LD

In [None]:
# SNP dictionary (no header row in the file, so column names are supplied),
# indexed by position and restricted to species 100022 and 102506.
all_species_position_meta_ = (
    pd.read_table(
        '/pollard/data/gt-pro-db/variants_main.covered.hq.snp_dict.tsv',
        names=['species_id', 'position', 'contig', 'contig_position', 'ref', 'alt'],
    )
    .set_index('position')
    .loc[lambda x: x.species_id.isin([100022, 102506])]
)

In [None]:
# Column dtypes, non-null counts and memory footprint of the SNP dictionary.
all_species_position_meta_.info()

In [None]:
# Saved genotype array for species 102506.
# NOTE(review): the species id is hard-coded here and differs from the
# `species_id` set in the Load Data section - confirm this is intended.
genotypes = xr.open_dataarray('data/core.sp-102506.gtpro-pileup.sfacts-genotypes.nc')

In [None]:
# Saved missingness array for species 102506 (same hard-coded species as the
# genotype array loaded for the LD analysis).
missingness = xr.open_dataarray('data/core.sp-102506.gtpro-pileup.sfacts-missingness.nc')

In [None]:
# SNP dictionary rows for species 102506 only.
position_meta = all_species_position_meta_.loc[
    all_species_position_meta_.species_id == 102506
]

In [None]:
# Metadata rows for exactly the positions present in the genotype array
# (label-based lookup: a position missing from the metadata would raise).
position_meta.loc[genotypes.position]

In [None]:
from scipy.spatial.distance import squareform, pdist

def pos_psim(genotypes):
    """Squared Pearson correlation between every pair of columns.

    Returns a square array with one row and column per column of `genotypes`.
    """
    # scipy's 'correlation' metric is 1 - r, computed between rows of its input.
    corr_dist = squareform(pdist(genotypes.T, metric='correlation'))
    corr = 1 - corr_dist
    return corr ** 2

def linear_distance(linear_index):
    """Pairwise absolute difference between the values of a Series.

    Returns a square DataFrame whose rows and columns both carry the index of
    `linear_index`.
    """
    labels = linear_index.index
    # pdist needs a 2-D input, so the Series becomes a one-column frame; with
    # a single column the city-block distance is the absolute difference.
    coords = linear_index.to_frame()
    dist = squareform(pdist(coords, metric='cityblock'))
    return pd.DataFrame(dist, index=labels, columns=labels)

# Squared correlation between positions, labelled by position on both axes.
position_sim = pd.DataFrame(
    pos_psim(genotypes),
    index=genotypes.position,
    columns=genotypes.position,
)

In [None]:
# Mean similarity over distinct position pairs. squareform only condenses
# matrices with a zero diagonal, hence the round trip through 1 - similarity.
np.mean(1 - squareform(1 - position_sim))

In [None]:
# Number of SNP positions per contig: in the full dictionary (total_count)
# and among the positions that were fit (fit_count; 0 when a contig has none).
total_per_contig = position_meta.groupby('contig').apply(len)
fit_per_contig = position_meta.loc[genotypes.position].groupby('contig').apply(len)

snp_info = (
    total_per_contig
    .to_frame(name='total_count')
    .assign(fit_count=fit_per_contig)
    .fillna(0)
    .sort_values('fit_count', ascending=False)
)

snp_info.head(10)

In [None]:
# Pairwise absolute difference in within-contig coordinate between the fitted
# positions, with rows and columns sorted by position label.
# NOTE: differences between positions on different contigs are not meaningful
# and must be masked out downstream.
# The axis is passed by keyword: pandas 2.x rejects positional `sort_index(1)`.
position_ldist_ = linear_distance(
    position_meta.loc[genotypes.position]['contig_position']
).sort_index(axis=0).sort_index(axis=1)

In [None]:
import patsy

# Position x position indicator of whether two fitted positions lie on the
# same contig.
# The patsy formula 'contig - 1' presumably one-hot encodes the contig label
# (one indicator column per contig, no intercept) - TODO confirm. Under that
# encoding the Jaccard distance between two rows is 0 for equal contigs and 1
# otherwise, so `1 - distance` is 1 for same-contig pairs and 0 for the rest.
same_contig = pd.DataFrame(
    1 - squareform(
        pdist(
            patsy.dmatrix(
                'contig - 1', data=position_meta.loc[genotypes.position]['contig'].to_frame(), return_type='dataframe'
            ),
            'jaccard'),
    ),
    index=genotypes.position, columns=genotypes.position,

)
#sns.heatmap(same_contig.sort_index().sort_index(1))

In [None]:
# `position_ldist_` is sorted by position label, whereas `same_contig` and
# `position_sim` follow the order of `genotypes.position`. Put the distance
# matrix in that same order before condensing, so that entry i of every
# column below refers to the same pair of positions.
position_ldist_aligned = position_ldist_.loc[position_sim.index, position_sim.columns]

# One row per unordered pair of positions.
ld_data = pd.DataFrame(dict(
    linear_distance=squareform(position_ldist_aligned.values),
    # squareform needs a zero diagonal, so condense 1 - indicator and test for 0.
    same_contig=(squareform(1 - same_contig.values) == 0),
    ld=1 - squareform(1 - position_sim),
))
# Distances are only meaningful within a contig.
ld_data = ld_data[ld_data.same_contig]
ld_data

In [None]:
# Same-contig pairs strictly between 150 and 2000 apart.
in_window = (
    ld_data.same_contig
    & (150 < ld_data.linear_distance)
    & (ld_data.linear_distance < 2000)
)
d = ld_data[in_window]

plt.scatter('linear_distance', 'ld', data=d, s=1, alpha=0.1)


In [None]:
# Hexbin joint density of LD against distance for same-contig pairs strictly
# between 0 and 2000 apart.
sns.jointplot(
    data=ld_data[
        ld_data.same_contig
        & (0 < ld_data.linear_distance)
        & (ld_data.linear_distance < 2000)
    ],
    x='linear_distance',
    y='ld',
    kind='hex',
    norm=mpl.colors.PowerNorm(1/3)
)

In [None]:
# Mean LD of same-contig pairs strictly between 0 and 100 apart.
ld_data.loc[
    ld_data.same_contig
    & (0 < ld_data.linear_distance)
    & (ld_data.linear_distance < 100),
    'ld',
].mean()

In [None]:
# Mean LD of same-contig pairs strictly between 100 and 200 apart.
ld_data.loc[
    ld_data.same_contig
    & (100 < ld_data.linear_distance)
    & (ld_data.linear_distance < 200),
    'ld',
].mean()

In [None]:
stepsize = 5
right = 2500

# Pairs closer than `right`.
d = ld_data.loc[ld_data.linear_distance < right]

# Mean LD in each half-open distance window [start, start + stepsize),
# keyed by the window's start.
bins = {
    start: d.loc[
        (d.linear_distance >= start) & (d.linear_distance < start + stepsize),
        'ld',
    ].mean()
    for start in range(0, right, stepsize)
}

# Faint point cloud of individual pairs; excluded from the legend because the
# markers are too small to read there.
plt.scatter(
    'linear_distance',
    'ld',
    data=d,
    color='black',
    alpha=0.05,
    s=0.1,
    label='__nolegend__',
)
# Empty series that only provides a legible legend marker for the cloud.
plt.scatter([], [], s=10, color='black', label='Locus Pair')
plt.plot(pd.Series(bins), color='red', label=f'Mean LD ({stepsize} bp Bin)')
plt.axhline(0, lw=1, color='red', linestyle='--')
plt.xlabel("Distance")
plt.ylabel(r"LD")
plt.legend(bbox_to_anchor=(0.85, 1.15), ncol=2)

print(sp.stats.spearmanr(d['linear_distance'], d['ld']))

In [None]:
stepsize = 5
right = 200

# Pairs closer than `right`.
d = ld_data.loc[ld_data.linear_distance < right]

# Mean LD in each half-open distance window [start, start + stepsize),
# keyed by the window's start.
bins = {
    start: d.loc[
        (d.linear_distance >= start) & (d.linear_distance < start + stepsize),
        'ld',
    ].mean()
    for start in range(0, right, stepsize)
}

# Faint point cloud of individual pairs; excluded from the legend because the
# markers are too small to read there.
plt.scatter(
    'linear_distance',
    'ld',
    data=d,
    color='black',
    alpha=0.05,
    s=0.5,
    label='__nolegend__',
)
# Empty series that only provides a legible legend marker for the cloud.
plt.scatter([], [], s=10, color='black', label='Locus Pair')
plt.plot(pd.Series(bins), color='red', label=f'Mean LD ({stepsize} bp Bin)')
plt.axhline(0, lw=1, color='red', linestyle='--')
plt.xlabel("Distance")
plt.ylabel(r"LD")
plt.legend(bbox_to_anchor=(0.85, 1.15), ncol=2)

print(sp.stats.spearmanr(d['linear_distance'], d['ld']))

In [None]:
# NOTE(review): this cell repeats the SNP-dictionary load from the start of
# the LD section verbatim.
# SNP dictionary (no header row in the file, so column names are supplied),
# indexed by position and restricted to species 100022 and 102506.
all_species_position_meta_ = (
    pd.read_table(
        '/pollard/data/gt-pro-db/variants_main.covered.hq.snp_dict.tsv',
        names=['species_id', 'position', 'contig', 'contig_position', 'ref', 'alt'],
    )
    .set_index('position')
    .loc[lambda x: x.species_id.isin([100022, 102506])]
)

In [None]:
# Column dtypes, non-null counts and memory footprint of the reloaded SNP dictionary.
all_species_position_meta_.info()