# Preamble

## Imports

In [None]:
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np
import seaborn as sns
import sqlite3
import matplotlib as mpl

# Use seaborn's "notebook" plotting context for figures drawn after this point.
sns.set_context("notebook")

# Shared SQLite connection; the SQL queries in later cells are run through it.
con = sqlite3.connect("data/core.muri.2.denorm.db")

In [None]:
# Per-genome metadata, read from a tab-separated file and indexed by genome_id.
mag = pd.read_csv('meta/genome.tsv', sep='\t', index_col='genome_id')

## Metadata

# Shared Single-Copy Genes with large length variation

In [None]:
# Copy-number matrix: one row per genome_id, one column per OPF (func_id);
# each value is the number of features in that genome assigned to that OPF.
# (genome, OPF) pairs that the query does not return are filled with 0.
#
# The filter uses the real column and the standard `IS NOT NULL` predicate
# (rather than the SELECT alias with SQLite's postfix `NOT NULL`), matching
# the sibling query that computes feature lengths over the same joins.
data = (pd.read_sql("""
SELECT genome_id, opf_id AS func_id, COUNT(feature_id) AS tally
FROM feature JOIN sequence USING (sequence_id)
JOIN feature_to_opf USING (feature_id)
WHERE opf_id IS NOT NULL
GROUP BY genome_id, func_id
                    """, con=con, index_col=['genome_id', 'func_id'])
          .tally.unstack('func_id', fill_value=0)
        )
# Align the rows with the metadata table. Genomes listed in `mag` but absent
# from the query result become all-NaN rows (not zeros), and genomes missing
# from `mag` are dropped.
data = data.reindex(mag.index)

# Annotation table for OPFs: architecture plus KO and COG identifiers and
# descriptions, indexed by func_id and restricted/ordered to the columns of
# `data`.
# NOTE(review): the LEFT JOINs may return several rows for one OPF if it maps
# to more than one KO or COG, in which case the index is not unique -- confirm.
# NOTE(review): `.loc` raises KeyError if any column of `data` is missing from
# opf_to_architecture -- confirm that cannot happen.
function = (
    pd.read_sql(
        sql="""
SELECT
    opf_id AS func_id
  , architecture
  , ko_id
  , ko.description AS ko_description
  , cog_id
  , cog.description AS cog_description
  , function_category AS cog_category
FROM opf_to_architecture
LEFT JOIN opf_to_ko USING (opf_id)
LEFT JOIN ko USING (ko_id)
LEFT JOIN opf_to_cog USING (opf_id)
LEFT JOIN cog USING (cog_id)
                       """,
        con=con,
        index_col=['func_id'],
    )
    .loc[data.columns]
)

In [None]:
# OPFs whose copy number is exactly 1 in every genome with genome_type 'here'.
# The inner expression is the per-OPF fraction of those genomes with a count
# of 1; only OPFs where that fraction equals 1 are kept.
single_copy_opfs = data.loc[
    :,
    data[mag['genome_type'].eq('here')].eq(1).mean(axis=0).eq(1),
].columns

In [None]:
# Length matrix: one row per genome_id, one column per opf_id; each value is
# the largest ABS(left - right) / 3 over the features of that genome assigned
# to that OPF. Pairs with no feature are filled with 0.
# NOTE(review): presumably left/right are nucleotide coordinates, making this
# an approximate protein length in residues -- confirm. If both columns are
# stored as integers, SQLite's `/ 3` truncates.
# NOTE(review): `left` and `right` are also SQL keywords; SQLite accepts them
# unquoted here, other engines may not.
plength = (
    pd.read_sql(
        sql="""
    SELECT genome_id, opf_id, MAX(ABS(left - right)) / 3 AS plength
    FROM feature
    JOIN sequence USING (sequence_id)
    JOIN feature_to_opf USING (feature_id)
    WHERE opf_id IS NOT NULL
    GROUP BY genome_id, opf_id
                       """,
        con=con,
        index_col=['genome_id', 'opf_id'],
    )['plength']
    .unstack('opf_id', fill_value=0)
)

In [None]:
# For each single-copy OPF, compare its shortest length with its median length
# across the genomes with genome_type 'here', and show the 20 OPFs with the
# smallest min/median ratio (i.e. those with an unusually short member).
(
    plength
    .loc[mag['genome_type'].eq('here'), single_copy_opfs]
    .apply(lambda lengths: lengths.min() / lengths.median())
    .sort_values()
    .head(20)
)