In [None]:
import pandas as pd
import sqlite3
import seaborn as sns
import matplotlib as mpl

In [None]:
# Open the denormalised SQLite database used by every query in this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
# A plain sqlite3.connect(path) silently creates a new, empty database when the
# file is missing, and the mistake only surfaces later as "no such table".
# Opening through a URI with mode=rw keeps normal read/write access but raises
# sqlite3.OperationalError immediately if the file does not already exist.
con = sqlite3.connect('file:data/core.muri2.2.denorm.db?mode=rw', uri=True)

In [None]:
# Load the rows of feature_x_cazy_minimal_domain (joined to feature_to_opf on
# feature_id) whose domain_id is exactly 'GH13' or starts with 'GH13_', keeping
# only rows with score > 50. The result is ordered by feature_id.
# The underscore in the LIKE pattern is escaped so it matches a literal '_'
# rather than acting as SQL's single-character wildcard (which would also let
# ids such as 'GH130' through).
gh13_domain_features = (
    pd.read_sql(
        sql=r"""
    SELECT * FROM feature_x_cazy_minimal_domain
    JOIN feature_to_opf USING (feature_id)
    WHERE (
        domain_id LIKE 'GH13\_%' ESCAPE '\'
        OR domain_id IS 'GH13'
        )
       AND score > 50
""",
        con=con,
    )
    .sort_values(by='feature_id')
)

In [None]:
# Keep every row whose feature_id occurs more than once, i.e. features with more
# than one GH13 domain hit (keep=False flags all copies, not just the repeats).
# Observation: one Opf (Opf05857) has two GH13 domains.
two_gh13_domain_features = gh13_domain_features.loc[
    lambda hits: hits['feature_id'].duplicated(keep=False)
]
two_gh13_domain_features

In [None]:
# Show the Opf05857 rows whose feature is NOT among the multi-hit features found
# above, i.e. the members of that OPF with only a single GH13 hit.
# Observation: almost all Opf05857 members have both domains (except for two
# minor examples).
gh13_domain_features.loc[
    lambda hits: hits['opf_id'].eq('Opf05857')
    & ~hits['feature_id'].isin(two_gh13_domain_features['feature_id'])
]

In [None]:
# Count the rows (feature/domain hits) for each (opf_id, domain_id) pair.
# GroupBy.size() is the built-in equivalent of apply(len): it returns the same
# integer Series indexed by (opf_id, domain_id) without calling a Python function
# per group and without the grouping-column DeprecationWarning that apply()
# raises on recent pandas versions.
# Observation: GH13 domains and OPFs match very nicely.
opf_to_gh13_subfamily_feature_counts = (
    gh13_domain_features
    .groupby(['opf_id', 'domain_id'])
    .size()
    .sort_index()
)

# Heatmap of the OPF x domain count matrix (pairs with no rows filled with 0).
# PowerNorm(1/3) applies a cube-root colour scale so small counts stay visible
# next to large ones; yticklabels=1 labels every row.
sns.clustermap(
    opf_to_gh13_subfamily_feature_counts.unstack(fill_value=0),
    norm=mpl.colors.PowerNorm(1/3),
    yticklabels=1,
)

# Display the same matrix as a table below the figure.
opf_to_gh13_subfamily_feature_counts.unstack(fill_value=0)

In [None]:
# Pull the members of every OPF that has at least one feature with a score > 50
# hit to a domain listed in starch_active_gh_domain.
#
# The inner SELECT collects those OPF ids. The outer LEFT JOINs expand each OPF
# to its features and attach their domain rows, and the WHERE clause keeps
# (a) hits with score > 50 to a domain in starch_active_gh_domain and
# (b) features with no domain row at all (domain_id IS NULL).
# NOTE(review): a feature whose domain rows all fail (a) still has a non-NULL
# domain_id after the LEFT JOIN, so it is dropped entirely rather than kept as a
# NULL-domain member — confirm this is intended.
starch_domain_features = (
    pd.read_sql(
        sql=r"""
    SELECT *
    FROM (
        SELECT DISTINCT opf_id
        FROM starch_active_gh_domain
        JOIN feature_x_cazy_minimal_domain USING (domain_id)
        JOIN feature_to_opf USING (feature_id)
        WHERE score > 50
    )
    LEFT JOIN feature_to_opf USING (opf_id)
    LEFT JOIN feature_x_cazy_minimal_domain USING (feature_id)
    WHERE (score > 50 AND domain_id IN starch_active_gh_domain)
       OR domain_id IS NULL
""",
        con=con,
    )
    .sort_values(by='feature_id')
)

In [None]:
# Keep every row whose feature_id occurs more than once in starch_domain_features
# (keep=False flags all copies, not just the repeats).
# NOTE(review): the comment previously on this cell was the GH13 one ("One Opf
# (Opf05857) has two GH13 domains") — confirm whether that observation also
# holds for the starch-active domain table.
two_starch_domain_features = starch_domain_features.loc[
    lambda hits: hits['feature_id'].duplicated(keep=False)
]
two_starch_domain_features

In [None]:
# Count the rows of starch_domain_features for each (opf_id, domain_id) pair.
# dropna=False keeps rows whose grouping key is missing as their own group
# instead of silently discarding them.
# GroupBy.size() is the built-in equivalent of apply(len): it returns the same
# integer Series without calling a Python function per group and without the
# grouping-column DeprecationWarning that apply() raises on recent pandas
# versions.
opf_to_starch_subfamily_feature_counts = (
    starch_domain_features
    .groupby(['opf_id', 'domain_id'], dropna=False)
    .size()
    .sort_index()
)

# Heatmap of the OPF x domain count matrix (pairs with no rows filled with 0).
# PowerNorm(1/3) applies a cube-root colour scale so small counts stay visible
# next to large ones; yticklabels=1 labels every row.
sns.clustermap(
    opf_to_starch_subfamily_feature_counts.unstack(fill_value=0),
    norm=mpl.colors.PowerNorm(1/3),
    yticklabels=1,
)

# Display the same matrix as a table below the figure.
opf_to_starch_subfamily_feature_counts.unstack(fill_value=0)