In [None]:
import pandas as pd
from lib.util import info, idxwhere
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import logit
import numpy as np
import scipy as sp
import xarray as xr

In [None]:
# Which tally file to inspect; both values are substituted into the path below.
species_id, group = 100022, 'core'
inpath = f'data/{group}/{species_id}/gtpro.nc'

# Open the file as a single DataArray, then drop every length-1 dimension.
data = xr.open_dataarray(inpath)
data = data.squeeze()

In [None]:
# Display the DataArray that was just loaded.
data

In [None]:
# Coverage: the tallies summed over both the 'allele' and the 'read' dimension.
collapsed_dims = ['allele', 'read']
cvrg = data.sum(dim=collapsed_dims)

In [None]:
# Display the summed coverage array.
cvrg

In [None]:
# Bin counts for the tallies at integer index 1 along 'position',
# using 50 evenly spaced edges (49 bins) between 0 and 1000.
edges = np.linspace(0, 1000, num=50)
counts = np.histogram(data.isel(position=1), bins=edges)[0]
counts

In [None]:
# Bin edges: the integers 0..99, followed by the squares 10**2, 11**2, ..., 43**2.
linear_edges = np.arange(100)
square_edges = np.arange(int(np.sqrt(100)), int(np.sqrt(2000))) ** 2
bins = np.concatenate([linear_edges, square_edges])


def _cvrg_density(values):
    """Density-normalised histogram of one column over the `bins` edges."""
    return np.histogram(values, bins=bins, density=True)[0]


# One histogram per column of the coverage table; rows are labelled by the
# upper edge of each bin.
cvrg_hist = cvrg.to_pandas().apply(_cvrg_density)
cvrg_hist = cvrg_hist.set_index(bins[1:]).rename_axis(index='bin_high')

In [None]:
# Heatmap of the coverage histograms. Columns are ordered by their value in
# the first bin, and colours use a symmetric-log scale.
fig = plt.figure(figsize=(20, 10))
column_order = cvrg_hist.iloc[0].sort_values().index
sns.heatmap(
    cvrg_hist.loc[:, column_order],
    norm=mpl.colors.SymLogNorm(linthresh=1e-7),
    xticklabels=0,
)

In [None]:
# Relative coverage: every value divided by the mean taken over 'position'.
rcvrg = cvrg / cvrg.mean('position')

# 100 log-spaced edges from 10**0 to 10**2.5, shifted by -1 so the first edge is 0.
bins = np.logspace(0, 2.5, num=100) - 1


def _rcvrg_density(values):
    """Density-normalised histogram of one column over the `bins` edges."""
    return np.histogram(values, bins=bins, density=True)[0]


# One histogram per column; rows are labelled by the upper edge of each bin.
rcvrg_hist = rcvrg.to_pandas().apply(_rcvrg_density)
rcvrg_hist = rcvrg_hist.set_index(bins[1:]).rename_axis(index='bin_high')

In [None]:
# Heatmap of the relative-coverage histograms. Columns are ordered by their
# value in the first bin, and colours use a symmetric-log scale.
fig = plt.figure(figsize=(20, 10))
column_order = rcvrg_hist.iloc[0].sort_values().index
sns.heatmap(
    rcvrg_hist.loc[:, column_order],
    norm=mpl.colors.SymLogNorm(linthresh=1e-5),
    xticklabels=0,
)

In [None]:
species_id = 100022
pathfmt = 'data/core/{species_id}/gtpro.read_r{r}.tsv.bz2'

# Column layout of the header-less per-read TSV files. The "_3".."_6" fields
# are parsed but never used below.
GTPRO_READ_COLUMNS = [
    "library_id",
    "species_id",
    "position",
    "_3",
    "_4",
    "_5",
    "_6",
    "ref",
    "alt",
]
GTPRO_READ_INDEX = ["library_id", "species_id", "position"]


def load_read_tally(species_id, r):
    """Load the ref/alt tallies for one read file as a long-format Series.

    Parameters
    ----------
    species_id
        Value substituted for ``{species_id}`` in ``pathfmt``.
    r
        Value substituted for ``{r}`` in ``pathfmt`` (1 or 2 in this notebook).

    Returns
    -------
    pandas.Series
        Integer tallies indexed by (library_id, species_id, position, allele),
        where the 'allele' level takes the values "ref" and "alt".
    """
    table = pd.read_table(
        pathfmt.format(species_id=species_id, r=r),
        names=GTPRO_READ_COLUMNS,
        index_col=GTPRO_READ_INDEX,
    )
    # Move the two allele columns into the index. `.stack()` already returns a
    # Series, so no `.squeeze()` is applied: it would be a no-op except on a
    # one-element result, which it would collapse to a scalar.
    return (
        table[["ref", "alt"]]
        .rename_axis(columns="allele")
        .stack()
        .astype(int)
    )


r1 = load_read_tally(species_id, r=1)
r2 = load_read_tally(species_id, r=2)

In [None]:
def _with_read_level(tally, label):
    """Append a 'read' index level holding `label` to a per-read tally Series."""
    return tally.to_frame(label).rename_axis(columns='read').stack()


# Concatenate both reads in long format, convert the MultiIndex levels into
# array dimensions, and store combinations absent from the input as 0.
long_tally = pd.concat([_with_read_level(r1, 'r1'), _with_read_level(r2, 'r2')])
data = long_tally.to_xarray().fillna(0).astype(int)

In [None]:
# Keep only the non-zero entries of the array as a one-column ('tally') frame
# and write it to a gzip-compressed parquet file.
tally_series = data.to_series()
nonzero_tally = tally_series[tally_series != 0]
sdata = nonzero_tally.astype(int).to_frame('tally')
sdata.to_parquet('test.parquet', compression='gzip')

In [None]:
# Display the non-zero tally frame.
sdata

In [None]:
# NOTE(review): this repeats the identical write performed in the cell that
# builds `sdata`; it overwrites 'test.parquet' with the same content.
sdata.to_parquet('test.parquet', compression='gzip')

In [None]:
# Read the frame back from disk, replacing the in-memory `sdata`.
sdata = pd.read_parquet('test.parquet')

In [None]:
# Combine the two per-read tallies into one array: put r1 and r2 side by side
# as columns, name that column axis 'read', then stack it into the row index
# so 'read' becomes a dimension next to library_id, species_id, position and
# allele.
#
# `.stack()` is required here. `.unstack()` would instead pivot 'allele' out of
# the index into the columns, and `.to_xarray()` would then return a Dataset
# with no 'allele' or 'read' dimension, so `data.sum('allele')` in the next
# cell would fail.
#
# As in the concat-based construction earlier in this notebook, combinations
# missing from the input are stored as 0 and the array is cast to int.
data = (
    pd.DataFrame(dict(r1=r1, r2=r2))
    .rename_axis(columns='read')
    .stack()
    .to_xarray()
    .fillna(0)
    .astype(int)
)

In [None]:


# Report the size of every dimension of `data`.
info(data.sizes)

# Coverage: tallies summed over the 'allele' dimension only.
cvrg = data.sum(dim='allele')

# Unfinished idea, kept for reference:
# alt_frac = data.sel(allele='alt') / cvrg
# has_alt_pos_frac = (alt_frac > 0).sum('position') / (alt_frac.notnull()).sum('position')


In [None]:
# Histogram (100 bins) of coverage averaged over 'library_id'.
pos_mean_cvrg = cvrg.mean(dim='library_id')
plt.hist(pos_mean_cvrg.values, bins=100);

In [None]:
# Incidence: the fraction of 'library_id' entries with non-zero coverage,
# shown as a 100-bin histogram.
is_covered = cvrg > 0
pos_incid = is_covered.mean(dim='library_id')
plt.hist(pos_incid.values, bins=100);

In [None]:
# TODO: For each position, see how the probability of hitting that
# position increases with increasing library coverage.

In [None]:
def _trimmed_mean(values, axis):
    """Mean along `axis` after trimming 5% of the values from each tail."""
    return sp.stats.trim_mean(values, 0.05, axis)


# log2 ratio of (coverage + 1) to its trimmed mean taken along 'position'.
# The +1 pseudocount keeps zero-coverage entries finite under the log.
# The reduction runs over 'position', which is the name this axis has in the
# data built in this notebook; there is no 'snp_idx' dimension.
pseudo_cvrg = cvrg + 1
log2_cvrg_ratio = np.log2(pseudo_cvrg / pseudo_cvrg.reduce(_trimmed_mean, 'position'))

# Average the ratio over 'library_id' and flag entries that deviate by more
# than 0.5 log2 units in either direction.
pos_log2_cvrg_ratio_mean = log2_cvrg_ratio.mean('library_id')
pos_log2_cvrg_ratio_mean_anomaly = np.abs(pos_log2_cvrg_ratio_mean) > 0.5

plt.hist(pos_log2_cvrg_ratio_mean.values, bins=100)
None

In [None]:
# Standard deviation of the log2 coverage ratio over 'library_id';
# entries above 1.5 are flagged as anomalous.
pos_log2_cvrg_ratio_std = log2_cvrg_ratio.std(dim='library_id')
pos_log2_cvrg_ratio_std_anomaly = pos_log2_cvrg_ratio_std > 1.5

plt.hist(pos_log2_cvrg_ratio_std.values, bins=100);

In [None]:
# Standard deviation of the log2 coverage ratio along 'position'; entries
# above 1.5 are flagged as anomalous. The axis is named 'position' in the data
# built in this notebook; there is no 'snp_idx' dimension.
library_log2_cvrg_ratio_std = log2_cvrg_ratio.std('position')
library_log2_cvrg_ratio_std_anomaly = library_log2_cvrg_ratio_std > 1.5

plt.hist(library_log2_cvrg_ratio_std.values, bins=100)
None

In [None]:
# Number of entries flagged by the std > 1.5 threshold.
library_log2_cvrg_ratio_std_anomaly.sum()

In [None]:
# Show how the number of positions "seen" per library increases with coverage.
# TODO: Drop libraries where this is way out-of-whack, as these suggest
# problems (e.g. some genome not in the database with homology to just a few positions)
#
# Both reductions run along 'position', which is the name this axis has in the
# data built in this notebook; there is no 'snp_idx' dimension.

fig = plt.figure(figsize=(10, 8))
plt.scatter(cvrg.mean('position'), (cvrg > 0).mean('position'), s=1, alpha=0.5)
plt.xscale('log')
plt.xlabel('mean coverage over positions')
plt.ylabel('fraction of positions with coverage > 0')
# plt.yscale('log')

In [None]:
# Fraction of entries flagged by the |mean log2 ratio| > 0.5 threshold.
pos_log2_cvrg_ratio_mean_anomaly.mean()

In [None]:
# Fraction of entries flagged by the std > 1.5 threshold.
pos_log2_cvrg_ratio_std_anomaly.mean()

In [None]:
# Fraction of entries flagged by either the mean-based or the std-based threshold.
(pos_log2_cvrg_ratio_mean_anomaly | pos_log2_cvrg_ratio_std_anomaly).mean()

In [None]:
# Each allele's share of the coverage, averaged over 'library_id';
# the histogram shows the values for the 'alt' allele only.
allele_frac = data / cvrg
mean_allele_frac = allele_frac.mean(dim='library_id')

alt_frac_values = mean_allele_frac.sel(allele='alt').values
plt.hist(alt_frac_values, bins=100);

In [None]:
# Histogram (log-scaled counts) of coverage summed along 'position'.
# The axis is named 'position' in the data built in this notebook; there is
# no 'snp_idx' dimension.
plt.hist(cvrg.sum('position').values, bins=100)
plt.yscale('log')
None

In [None]:
# TODO: Decide if I want to down-sample high-coverage libraries.