In [None]:
import sqlite3
from itertools import chain

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
from sklearn.mixture import BayesianGaussianMixture

def read_list_from_file(path):
    """Read the text file at `path` into a list with one entry per line.

    Leading and trailing whitespace (including the newline) is stripped
    from every line. Blank lines are kept and become empty strings.
    """
    entries = []
    with open(path) as source:
        for raw_line in source:
            entries.append(raw_line.strip())
    return entries

In [None]:
con = sqlite3.connect('res/core.1.denorm.db')
contig_ids = read_list_from_file('res/core.a.mbins.d/Otu0001.contigs.list')
# Render the IDs as single-quoted SQL string literals, doubling any embedded
# single quote. Double quotes denote *identifiers* in SQL; SQLite only treats
# them as strings through a legacy fallback that can be disabled at build time.
contig_ids_sql = ', '.join("'{}'".format(contig_id.replace("'", "''"))
                           for contig_id in contig_ids)

# Coverage summed per (extraction, contig) for the listed contigs, pivoted so
# that rows are extractions and columns are contigs; combinations absent from
# the query result are filled with 0.
cvrg = pd.read_sql("""
SELECT extraction_id, contig_id, SUM(coverage) AS coverage
FROM contig_coverage
JOIN library USING (library_id)
WHERE contig_id IN ({})
GROUP BY extraction_id, contig_id
                   """.format(contig_ids_sql), con=con,
                   index_col=['extraction_id', 'contig_id']).coverage.unstack('contig_id', fill_value=0)

# Shuffle the rows (extractions) reproducibly.
cvrg = cvrg.sample(frac=1, random_state=1)

In [None]:
# Per-extraction metadata: every column of extraction, sample and mouse,
# joined on sample_id / mouse_id and keyed by extraction_id.
extraction_meta = pd.read_sql(
    sql="""
SELECT *
FROM extraction
JOIN sample USING (sample_id)
JOIN mouse USING (mouse_id)
                                  """,
    con=con,
    index_col='extraction_id',
)


In [None]:
# Per-contig metadata (contig_bin joined to contig) restricted to the contigs
# named in contig_ids_sql, keyed by contig_id.
contig_meta = pd.read_sql(
    sql="""
SELECT *
FROM contig_bin
JOIN contig USING (contig_id)
WHERE contig_id IN ({})
                          """.format(contig_ids_sql),
    con=con,
    index_col='contig_id',
)

In [None]:
# Coverage of two contigs against each other, one point per row of cvrg.
# An explicit Axes with axis labels lets the figure stand on its own.
contig_x, contig_y = 'core-k161_1010974', 'core-k161_1020875'
scatter_fig, scatter_ax = plt.subplots()
scatter_ax.scatter(contig_x, contig_y, data=cvrg)
scatter_ax.set_xlabel('{} coverage'.format(contig_x))
scatter_ax.set_ylabel('{} coverage'.format(contig_y));

In [None]:
seed = 'core-k161_1010974'
# Minimum Pearson correlation with the seed for a contig to count as trusted.
MIN_SEED_CORRELATION = 0.999

# Pearson correlation of every contig's coverage (a column of cvrg) with the
# seed contig's coverage. pandas' own corrwith is used so the cell no longer
# depends on `scipy.stats` having been loaded implicitly by `import scipy`.
seed_correlation = cvrg.corrwith(cvrg[seed])
# NaN correlations (e.g. constant columns) compare False and are excluded.
trusted_contigs = seed_correlation[seed_correlation > MIN_SEED_CORRELATION].index

In [None]:
# Scale every row of cvrg by that row's mean coverage over the trusted contigs.
trusted_mean_coverage = cvrg[trusted_contigs].mean(axis=1)
cvrg_norm = cvrg.div(trusted_mean_coverage, axis=0)

# Histogram of the log of each contig's mean normalized coverage.
log_mean_norm_coverage = np.log(cvrg_norm.mean())
_ = plt.hist(log_mean_norm_coverage, bins=50)

-   Normalize each library to the mean coverage of the contigs we're sure are in the bin
-   Normalized coverages of 1 are now the expectation for core genome contigs
-   Sort contigs by CV (promote stably high-abundance contigs)
-   Plot each library

In [None]:
# Cluster contigs by their normalized coverage profile and plot the coverage
# of every extraction with the contigs ordered by cluster.

# The mixture model is fitted on square-root-transformed normalized coverage.
cluster_data = np.sqrt(cvrg_norm)

# Each contig (a column of cvrg_norm) is one observation, hence the transpose.
# n_components is given by keyword because recent scikit-learn releases make
# the estimator's constructor arguments keyword-only.
bgm = BayesianGaussianMixture(n_components=50,
                              covariance_type='full',
                              weight_concentration_prior=0.1,
                              random_state=1
                             ).fit(cluster_data.T)
contig_group = pd.Series(bgm.predict(cluster_data.T), index=cvrg_norm.columns)

# Mean normalized coverage of every group in every extraction
# (rows: groups, columns: extractions). Grouping the transposed frame replaces
# the deprecated `groupby(..., axis='columns')`.
group_profile = cvrg_norm.T.groupby(contig_group).mean()
# Mean and standard deviation of those per-extraction group means.
group_cvrg = pd.DataFrame({'group_mean_coverage': group_profile.mean(axis=1),
                           'group_std_coverage': group_profile.std(axis=1)})
group_cvrg.index.name = 'group'

# One row per contig: its group, the group's summary statistics, and the
# contig's bin and length taken from contig_meta (aligned on contig_id).
group_assign = contig_group.to_frame(name='group').join(group_cvrg, on='group')
group_assign['bin_id'] = contig_meta.bin_id
group_assign['length'] = contig_meta.length
# Lowest group standard deviation first; longer contigs first among ties.
group_assign = group_assign.sort_values(['group_std_coverage', 'length'],
                                        ascending=[True, False])
order = group_assign.index

fig, ax = plt.subplots(figsize=(15, 5))
# Reference line at a normalized coverage of 1.
ax.axhline(y=1, color='k', linestyle='--')
# Colours per metadata value; not applied at the moment (see `color` below).
color_map = {'acarbose': 'goldenrod', 'control': 'darkblue',
             'UM': 'darkblue', 'UT': 'darkgreen',
             'male': 'blue', 'female': 'magenta',
             'C2013': 'blue', 'Glenn': 'red'}
for des, d in cvrg_norm.groupby(extraction_meta.treatment):
    # None lets matplotlib cycle colours per line; use color_map[des] instead
    # to colour the lines by treatment.
    color = None
    # One line per extraction, contigs along the x-axis in `order`.
    _ = ax.plot(d[order].values.T, lw=1, alpha=0.5, color=color)

# Label every group at the mean x-position of its contigs.
group_assign['contig_index'] = range(group_assign.shape[0])
group_positions = group_assign.groupby('group').contig_index.mean()
for group_id, mean_position in group_positions.items():
    ax.annotate(str(group_id), xy=(mean_position, 100))

ax.set_xlabel('contig (sorted by group coverage std, then length)')
ax.set_ylabel('normalized coverage')

# The linear-threshold keyword was renamed from `linthreshy` to `linthresh`
# in newer Matplotlib, where the old name raises TypeError. Try the legacy
# name first so the threshold is honoured on old and new versions alike.
try:
    ax.set_yscale('symlog', linthreshy=1)
except TypeError:
    ax.set_yscale('symlog', linthresh=1)

In [None]:
# Groups singled out for inspection; for each, print how many of its contigs
# fall in each bin.
quality_groups = [1, 28, 0, 12, 44, 9, 38, 35]
for group in quality_groups:
    in_group = group_assign['group'] == group
    bin_counts = group_assign.loc[in_group].groupby('bin_id')['contig_index'].count()
    print(group, bin_counts, sep='\t', end='\n\n')

# The same per-bin count for all contigs outside the groups listed above.
print('others')
not_in_quality_groups = ~group_assign['group'].isin(quality_groups)
print(group_assign.loc[not_in_quality_groups].groupby('bin_id')['contig_index'].count())

In [None]:
# Independent copy of the rows of group_assign that belong to group 12 or 44.
contig_group_12_and_44 = group_assign.loc[group_assign['group'].isin([12, 44])].copy()

In [None]:
# Independent copy of the rows of group_assign that belong to group 9 or 38.
contig_group_9_and_38 = group_assign.loc[group_assign['group'].isin([9, 38])].copy()

In [None]:
# Write the index labels (contig IDs) of groups 9 and 38 to a list file,
# one per line, ending with a newline.
with open('variable_coverage_group2.list', 'w') as out_file:
    listed_ids = (str(contig_id) for contig_id in contig_group_9_and_38.index)
    out_file.write('\n'.join(listed_ids) + '\n')