In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt

from bokeh.models import tickers

from plots import scatterplot


Bad key "text.kerning_factor" on line 4 in
/opt/conda/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


## Load data

In [2]:
# Read the table stored in the HDF5 file and preview its first five rows.
swiss_df = pd.read_hdf(path_or_buf='../data/bacterial_swissprot.h5')
swiss_df.head(n=5)

Unnamed: 0_level_0,accessions,sequence_length,sequence,description,InterPro,GO,KO,Gene3D,Pfam,KEGG,...,Superkingdom,Kingdom,Phylum,Class,Order,Family,Subfamily,Genus,Species,Transmembrane
entry_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12AH_CLOS4,P21215,29.0,MIFDGKVAIITGGGKAKSIGYGIAVAYAK,RecName: Full=12-alpha-hydroxysteroid dehydrog...,IPR036291,GO:0047013||GO:0030573||GO:0016042,,,,,...,Bacteria,,Firmicutes,Clostridia,Clostridiales,Clostridiaceae,,Clostridium,,0.0
12KD_MYCSM,P80438,24.0,MFHVLTLTYLCPLDVVXQTRPAHV,RecName: Full=12 kDa protein; Flags: Fragment;,,,,,,,...,Bacteria,,Actinobacteria,Actinobacteria,Corynebacteriales,Mycobacteriaceae,,Mycolicibacterium,,0.0
12OLP_LISIN,Q92AT0,1086.0,MTMLKEIKKADLSAAFYPSGELAWLKLKDIMLNQVIQNPLENRLSQ...,"RecName: Full=1,2-beta-oligoglucan phosphoryla...",IPR008928||IPR012341||IPR033432,GO:0016740,K21298,1.50.10.10,PF17167,lin:lin1839,...,Bacteria,,Firmicutes,Bacilli,Bacillales,Listeriaceae,,Listeria,,
12S_PROFR,Q8GBW6||Q05617,611.0,MAENNNLKLASTMEGRVEQLAEQRQVIEAGGGERRVEKQHSQGKQT...,RecName: Full=Methylmalonyl-CoA carboxyltransf...,IPR034733||IPR000438||IPR029045||IPR011763||IP...,GO:0009317||GO:0003989||GO:0047154||GO:0006633,,,PF01039,,...,Bacteria,,Actinobacteria,Actinobacteria,Propionibacteriales,Propionibacteriaceae,,Propionibacterium,,0.0
14KD_MYCBO,P0A5B8||A0A1R3Y251||P30223||X2BJK6,144.0,MATTLPVQRHPRSLFPEFSELFAAFPSFAGLRPTFDTRLMRLEDEM...,RecName: Full=14 kDa antigen; AltName: Full=16...,IPR002068||IPR008978,GO:0005618||GO:0005576,,2.60.40.790,PF00011,,...,Bacteria,,Actinobacteria,Actinobacteria,Corynebacteriales,Mycobacteriaceae,,Mycobacterium,,0.0


In [3]:
# Read the UMAP coordinates; the first CSV column is used as the index.
umap_df = pd.read_csv(filepath_or_buffer='../data/umap.csv', index_col=0)

# Preview the first five rows.
umap_df.head(n=5)

Unnamed: 0_level_0,umap_0,umap_1
entry_name,Unnamed: 1_level_1,Unnamed: 2_level_1
12AH_CLOS4,14.772205,9.630126
12KD_MYCSM,16.19941,8.673423
12OLP_LISIN,16.839794,4.398923
12S_PROFR,6.8213,1.796317
14KD_MYCBO,11.173403,9.302528


In [4]:
# Attach the UMAP coordinates to the annotation table (left join on the index).
# Any coordinate columns left over from a previous execution of this cell are
# dropped first so the cell is idempotent: re-running a plain
# `swiss_df.join(umap_df)` raises "columns overlap but no suffix specified".
swiss_df = swiss_df.drop(columns=umap_df.columns, errors='ignore').join(umap_df)

## Transmembrane residues

In [5]:
# Scale the 'Transmembrane' column by 100; the result is plotted below with a
# '[%]' colourbar label.
swiss_df['TransmembranePrct'] = swiss_df['Transmembrane'].mul(100)

In [6]:
# UMAP scatter coloured by the TransmembranePrct column: linear colour
# normalisation, colour limits (0, 65), and a colourbar at the bottom for which
# seven ticks are requested.
scatterplot(
    swiss_df, 'umap_1', 'umap_0',
    color='TransmembranePrct',
    clim=(0, 65),
    cnorm='linear',
    clabel='Transmembrane residues [%]',
    colorbar_position='bottom',
    colorbar_opts={'ticker': tickers.BasicTicker(desired_num_ticks=7)},
    title='Transmembrane residues',
    width=800, height=1000,
    invert_yaxis=True,
)



## Sequence length

In [7]:
# Colourbar tick positions: a 1-2-5 series per decade, capped at 2000
# -> [10, 20, 50, 100, 200, 500, 1000, 2000].
ticks = [
    mantissa * decade
    for decade in (10, 100, 1000)
    for mantissa in (1, 2, 5)
    if mantissa * decade <= 2000
]
ticker = tickers.FixedTicker(ticks=ticks)

In [8]:
# UMAP scatter coloured by the sequence_length column: log colour
# normalisation, colour limits (10, 1500), and the fixed tick positions
# defined in the previous cell.
scatterplot(
    swiss_df, 'umap_1', 'umap_0',
    color='sequence_length',
    clim=(10, 1500),
    cnorm='log',
    clabel='Sequence length',
    colorbar_position='bottom',
    colorbar_opts={'ticker': ticker},
    title='Sequence length',
    width=800, height=1000,
    invert_yaxis=True,
)



## Compute Reconstruction Error

In [12]:
import glob
from pathlib import Path

In [13]:
# NOTE(review): absolute, machine-specific location; consider moving it to a
# configurable constant near the top of the notebook.
results_path = Path('/microbiome/kodrzywolek/publication/15-annotation-recovery/bac-iou/ardi-tape-uhgp_run_lmdb_1-avg/')

# Path.glob yields entries in arbitrary order; sorting makes the row order
# (and therefore the preview below) reproducible between runs.
csv_paths = sorted(results_path.glob('*.csv'))
if not csv_paths:
    # Fail with an explicit message instead of pandas' generic
    # "No objects to concatenate" when the directory is missing or empty.
    raise FileNotFoundError(f'No CSV result files found in {results_path}')

# Stack all result files. The dict keys (file stems) form the outer index
# level, which reset_index(level=0) turns into the 'level_0' column.
rec = pd.concat(
    {path.stem: pd.read_csv(path) for path in csv_paths}
).reset_index(level=0)

# Discard the rows that came from the Phylum / Genus / Order / Family files.
rec = rec[~rec['level_0'].isin(['Phylum', 'Genus', 'Order', 'Family'])]
rec.head()

Unnamed: 0,level_0,entry_name,k=3,k=15,k=51
0,EC number,12AH_CLOS4,0.0,0.0,0.0
1,EC number,12OLP_LISIN,0.0,0.0,0.0
2,EC number,12S_PROFR,0.0,0.0,0.0
3,EC number,16ABF_BIFLN,0.0,0.0,0.0
4,EC number,1A1D_ACIAC,1.0,1.0,1.0


In [14]:
# Average the scores of each entry over all remaining result files.
# numeric_only=True keeps the string column 'level_0' out of the mean: pandas
# >= 2.0 raises TypeError on it, while older versions dropped it silently.
rec = rec.groupby('entry_name').mean(numeric_only=True)

# REE = 1 - mean 'k=51' score. The assignment aligns on the index, so rows of
# swiss_df without a matching entry in rec receive NaN.
swiss_df['REE'] = 1 - rec['k=51']

In [15]:
# UMAP scatter coloured by the REE column: log colour normalisation, colour
# limits (0.1, 1.0), and a colourbar at the bottom for which eleven ticks are
# requested.
scatterplot(
    swiss_df, 'umap_1', 'umap_0',
    color='REE',
    clim=(0.1, 1.0),
    cnorm='log',
    clabel='Reconstruction Error Rate',
    colorbar_position='bottom',
    colorbar_opts={'ticker': tickers.BasicTicker(desired_num_ticks=11)},
    title='Reconstruction Error Rate',
    width=800, height=1000,
    invert_yaxis=True,
)

## KO plot

In [10]:
# UMAP scatter coloured by the KO column (square 700x700 canvas).
scatterplot(
    swiss_df, 'umap_1', 'umap_0',
    color='KO',
    width=700, height=700,
    invert_yaxis=True,
)