In [None]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib as mpl

In [None]:
# Open the denormalised SQLite database; the queries in this section pass this
# connection to pd.read_sql as `con=con`.
con = sqlite3.connect(database='data/core.muri2.2.denorm.db')

In [None]:
# GH13 *subfamily* hits only: domain_id must look like 'GH13_<suffix>'; a bare
# 'GH13' hit is excluded (that alternative is left commented out inside the SQL).
# Only hits with score > 50 are kept; rows come back ordered by feature_id.
gh13_domain_all_features = pd.read_sql(
    sql=r"""
    SELECT * FROM feature_x_cazy_domain
    JOIN feature_to_opf USING (feature_id)
    WHERE (
        domain_id LIKE 'GH13\_%' ESCAPE '\'
--        OR domain_id IS 'GH13'
        )
       AND score > 50
""",
    con=con,
).sort_values(by='feature_id')

In [None]:
# Number of features per (OPF, GH13 subfamily), where each feature is assigned
# to exactly one subfamily: the domain_id of its highest-scoring hit.
#
# Left side of the join : feature_id -> opf_id (first value seen per feature).
# Right side of the join: feature_id -> domain_id of the max-score row. Indexing
#   by domain_id first makes idxmax() return the domain label directly, without
#   a Python-level lambda per group (which also triggers the pandas >= 2.2
#   "apply operated on the grouping columns" DeprecationWarning).
# .size() replaces .apply(len): same counts, but it stays a Series even when
# the input frame is empty.
opf_to_gh13_subfamily_feature_counts = (
    gh13_domain_all_features
    .groupby('feature_id').opf_id.first().to_frame()
    .join(
        gh13_domain_all_features
        .set_index('domain_id')
        .groupby('feature_id')
        .score.idxmax()
        .rename('domain_id')
    )
    .groupby(['opf_id', 'domain_id'])
    .size()
)

# Clustered heatmap of the counts pivoted wide (OPFs as rows, GH13 subfamilies
# as columns, missing combinations filled with 0). PowerNorm with gamma=1/3
# rescales the colour mapping; yticklabels=1 labels every row.
sns.clustermap(
    data=opf_to_gh13_subfamily_feature_counts.unstack(fill_value=0),
    yticklabels=1,
    norm=mpl.colors.PowerNorm(gamma=1/3),
)

# Cell output: the same wide OPF x subfamily count table that was plotted.
opf_to_gh13_subfamily_feature_counts.unstack(fill_value=0)

In [None]:
# GH13 hits at either level: subfamily ('GH13_<suffix>') or the bare 'GH13'
# family, restricted to score > 50 and ordered by feature_id.
# Read from feature_x_cazy_minimal_domain; per the original note these are the
# "best hits" (presumably the table is already reduced to them; confirm).
gh13_domain_features = pd.read_sql(
    sql=r"""
    SELECT * FROM feature_x_cazy_minimal_domain
    JOIN feature_to_opf USING (feature_id)
    WHERE (
        domain_id LIKE 'GH13\_%' ESCAPE '\'
        OR domain_id IS 'GH13'
        )
       AND score > 50
""",
    con=con,
).sort_values(by='feature_id')

In [None]:
# Every row whose feature_id occurs more than once, i.e. features carrying more
# than one GH13 domain hit. (Author's observation: one OPF, Opf05857, has two
# GH13 domains.)
two_gh13_domain_features = gh13_domain_features.loc[
    lambda hits: hits['feature_id'].duplicated(keep=False)
]
two_gh13_domain_features

In [None]:
# Opf05857 members that are NOT among the multi-domain features above.
# (Author's observation: almost all Opf05857 members have both domains; only
# two minor exceptions show up here.)
gh13_domain_features.loc[
    lambda hits: hits['opf_id'].eq('Opf05857')
    & ~hits['feature_id'].isin(two_gh13_domain_features['feature_id'])
]

In [None]:
# GH13 domains and OPFs match very nicely.
# Row count per (opf_id, domain_id), sorted by index. .size() gives the same
# counts as .apply(len) but is the built-in aggregation: no per-group Python
# call, no groupby.apply deprecation warning, and still a Series on empty input.
opf_to_gh13_family_feature_counts = (
    gh13_domain_features
    .groupby(['opf_id', 'domain_id'])
    .size()
    .sort_index()
)

# Clustered heatmap of the counts pivoted wide (OPFs as rows, GH13 domains as
# columns, missing combinations filled with 0). PowerNorm with gamma=1/2
# rescales the colour mapping; yticklabels=1 labels every row.
sns.clustermap(
    data=opf_to_gh13_family_feature_counts.unstack(fill_value=0),
    yticklabels=1,
    norm=mpl.colors.PowerNorm(gamma=1/2),
)

# Cell output: the same wide OPF x domain count table that was plotted.
opf_to_gh13_family_feature_counts.unstack(fill_value=0)

In [None]:
# All examples from OPFs with even one member matching a starch-active domain.
#
# The inner SELECT collects every OPF that has at least one feature with a
# starch-active domain hit (score > 50). The outer LEFT JOINs then pull in the
# features of those OPFs together with their domain hits.
#
# NOTE(review): a feature whose only hits are non-starch-active or have
# score <= 50 has a non-NULL domain_id and fails both branches of the outer
# WHERE, so it is dropped rather than labelled 'no_gh_domain'. Confirm that
# this is intended given "All examples" above.
#
# The table is defined before it is displayed so that the notebook runs on a
# fresh kernel (displaying it first raises NameError).
starch_domain_features = (
    pd.read_sql(r"""
        SELECT *
        FROM (
            SELECT DISTINCT opf_id
            FROM starch_active_gh_domain
            JOIN feature_x_cazy_minimal_domain USING (domain_id)
            JOIN feature_to_opf USING (feature_id)
            WHERE score > 50
        )
        LEFT JOIN feature_to_opf USING (opf_id)
        LEFT JOIN feature_x_cazy_minimal_domain USING (feature_id)
        WHERE (score > 50 AND domain_id IN starch_active_gh_domain)
           OR domain_id IS NULL
    """, con=con)
    .sort_values('feature_id')
    # Discard rows that carry no OPF at all.
    .dropna(subset=['opf_id'])
    # Features without any domain hit get an explicit placeholder label.
    .assign(domain_id=lambda x: x.domain_id.fillna('no_gh_domain'))
)

starch_domain_features

In [None]:
# Every row whose feature_id occurs more than once, i.e. features with more
# than one starch-active domain hit. (Author's observation: a few OPFs
# consistently have more than one domain.)
two_starch_domain_features = starch_domain_features.loc[
    lambda hits: hits['feature_id'].duplicated(keep=False)
]
two_starch_domain_features

In [None]:
# Starch-active domains and OPFs match very nicely.
# Row count per (opf_id, domain_id), sorted by index; dropna=False keeps groups
# whose key is missing. .size() gives the same counts as .apply(len) but is the
# built-in aggregation: no per-group Python call, no groupby.apply deprecation
# warning, and still a Series on empty input.
opf_to_starch_subfamily_feature_counts = (
    starch_domain_features
    .groupby(['opf_id', 'domain_id'], dropna=False)
    .size()
    .sort_index()
)

# Clustered heatmap of the counts pivoted wide (OPFs as rows, domains as
# columns, missing combinations filled with 0). PowerNorm with gamma=1/2
# rescales the colour mapping; yticklabels=1 labels every row.
sns.clustermap(
    data=opf_to_starch_subfamily_feature_counts.unstack(fill_value=0),
    yticklabels=1,
    norm=mpl.colors.PowerNorm(gamma=1/2),
)

# Cell output: the same wide OPF x domain count table that was plotted.
opf_to_starch_subfamily_feature_counts.unstack(fill_value=0)

In [None]:
# (1) Collect all features with GH13 domains using the following SQL:
# SELECT feature_id
# FROM feature_x_cazy_domain
# WHERE (domain_id LIKE 'GH13\_%' ESCAPE '\' OR domain_id IS 'GH13')
#   AND score > 50

# (2) Take this list of feature_id and pull the AA sequences:
# echo <SCRIPT> | sqlite3 data/core.muri2.2.denorm.db | seqtk subseq data/core.a.mags.muri2.g.final.cds.fa - | pbcopy

# (3) Run this through the dbCAN meta server: <https://bcb.unl.edu/dbCAN2/blast.php>

# (4) Save the resulting table as `build/gh13_hits_subfamilies.tsv`

# (5) Parse the saved table and count GH13 (sub)family assignments per OPF:

def _parse_hmmer_field_to_gh13(s):
    return [t for t in s.split('(') if (t.startswith('GH13_') or t == 'GH13')]
    
# Load the tab-separated dbCAN results and reduce them to (feature_id, domain_id)
# pairs: 'Gene ID' is renamed to feature_id, the HMMER column is parsed into a
# list of GH13 names, and explode() emits one row per name (a row with an empty
# list becomes a single row with a missing domain_id).
dbcan_domain_assignment = (
    pd.read_csv('build/gh13_hits_subfamilies.tsv', sep='\t')
    .rename(columns={'Gene ID': 'feature_id'})
    .assign(domain_id=lambda hits: hits['HMMER'].apply(_parse_hmmer_field_to_gh13))
    .explode('domain_id')
    .loc[:, ['feature_id', 'domain_id']]
)


# Count dbCAN GH13 assignments per (OPF, domain).
#
# gh13_domain_features holds one row per (feature, GH13 domain) hit, so a
# feature with two GH13 domains appears twice. Joining against it directly
# would repeat every dbCAN row of such a feature and inflate the counts, so the
# feature_id -> opf_id lookup is first collapsed to one entry per feature.
# .size() replaces .apply(len): same counts via the built-in aggregation.
opf_to_dbcan_feature_counts = (
    dbcan_domain_assignment
    .set_index('feature_id')
    .join(gh13_domain_features.groupby('feature_id').opf_id.first())
    .groupby(['opf_id', 'domain_id'])
    .size()
)

# Clustered heatmap of the dbCAN counts pivoted wide (OPFs as rows, GH13
# domains as columns, missing combinations filled with 0). PowerNorm with
# gamma=1/2 rescales the colour mapping; yticklabels=1 labels every row.
sns.clustermap(
    data=opf_to_dbcan_feature_counts.unstack(fill_value=0),
    yticklabels=1,
    norm=mpl.colors.PowerNorm(gamma=1/2),
)

# Cell output: the same wide OPF x domain count table that was plotted.
opf_to_dbcan_feature_counts.unstack(fill_value=0)