common.ipy

#!/usr/bin/env ipython
from __future__ import print_function

from petmarrna.libs import *

'''
Libraries, functions, settings common to all petmar notebooks.
'''

#######################
# Functions, settings #
#######################

# File names are relative, so both files are resolved against the current
# working directory of the notebook/script that runs this file.
res_fn = 'resources.json'
cfg_fn = 'config.json'

# Load the data-resource description and the pipeline configuration.
# The notice is printed to stderr only after the file was opened successfully.
with open(res_fn, 'r') as fp:
    print('** Using data resources found in {c}'.format(c=res_fn), file=sys.stderr)
    resources = json.load(fp)
with open(cfg_fn, 'r') as fp:
    print('** Using config found in {c}'.format(c=cfg_fn), file=sys.stderr)
    config = json.load(fp)

# Transposed so that the top-level keys of the resources JSON become rows
# (presumably one row per resource -- confirm against resources.json).
resources_df = pd.DataFrame(resources).transpose()
# Explicit copy: columns are assigned onto sample_df further down in this
# file, and assigning onto a bare filtered selection makes pandas emit
# SettingWithCopyWarning. The copy is an independent frame with the same rows.
sample_df = resources_df[resources_df.meta_type == 'sample'].copy()
# Pipeline-wide prefix taken from the config.
prefix = config['pipeline']['prefix']

def wdir(fn=''):
    """Return ``fn`` joined onto the pipeline work directory.

    The work directory is read from ``config['pipeline']['work_dir']`` on
    every call. With the default empty ``fn`` the result is the work
    directory followed by a path separator.
    """
    work_dir = config['pipeline']['work_dir']
    return os.path.join(work_dir, fn)

def figdir(fn=''):
    """Return ``fn`` joined onto the relative ``./figures/`` directory.

    With the default empty ``fn`` the directory itself (``'./figures/'``)
    is returned.
    """
    figures_root = './figures/'
    return os.path.join(figures_root, fn)

def len_func(fn):
    """Count the lines containing ``>`` in a file under the work directory.

    ``fn`` is resolved with ``wdir`` first. The result equals what
    ``grep -c ">"`` reports for the same file: the number of lines that
    contain a ``>`` anywhere. For a FASTA file this is the record count,
    assuming ``>`` only occurs at the start of header lines.

    The count is done in Python instead of through a shell command so that
    paths containing spaces or shell metacharacters are handled correctly
    and a missing file raises a regular I/O error rather than failing while
    parsing grep's error message.

    Returns:
        int: number of lines containing ``>``.
    """
    path = wdir(fn)
    # Binary mode: only the '>' byte matters, so no decoding is needed and
    # unusual encodings cannot raise.
    with open(path, 'rb') as handle:
        return sum(1 for line in handle if b'>' in line)

# Record count (via len_func) for every resource whose meta_type is
# 'fasta_database'. Requiring meta_type == 'fasta_database' already excludes
# 'sample' rows. Rows that are not selected receive NaN in the 'size' column
# because the assignment aligns on the index.
resources_df['size'] = (
    resources_df.loc[resources_df.meta_type == 'fasta_database', 'filename']
    .apply(len_func)
)


# Column labels for tabular alignment results.
# NOTE(review): looks like the BLAST "-outfmt 6" columns without the leading
# query id (presumably used as the index) -- confirm against the readers.
outfmt6 = [
    'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart',
    'qend', 'sstart', 'send', 'evalue', 'bitscore',
]

import warnings


# Add a per-tissue integer code and a plotting colour to sample_df.
# Every warning raised inside this context is ignored; the previous filters
# are restored when the context exits.
# NOTE(review): presumably this hides pandas' SettingWithCopyWarning, since
# sample_df is a filtered selection of resources_df -- confirm.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Integer codes of a Categorical built from the tissue column
    # (Categorical comes from the petmarrna.libs star import; presumably
    # pandas.Categorical -- confirm).
    sample_df['tissue_c'] = list(Categorical(list(sample_df.tissue)).codes)
    # 'Set3' palette with one colour per distinct tissue code.
    tissue_palette = sns.color_palette('Set3', n_colors=len(sample_df.groupby('tissue_c').count()))
    # Each sample gets the palette entry at the position of its tissue code.
    sample_df['color'] = sample_df.tissue_c.apply(lambda x: tissue_palette[x])

# From here on, ignore DeprecationWarning and UserWarning process-wide.
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)

# Let pandas display up to 200 rows.
pd.set_option('display.max_rows', 200)

# String types accepted by uniprot_str. ``unicode`` only exists on Python 2;
# referencing it unguarded raises NameError on Python 3.
try:
    _STRING_TYPES = (str, unicode)
except NameError:
    _STRING_TYPES = (str,)


def uniprot_str(uid):
    """Extract the second ``|``-separated field from an identifier string.

    For example ``'sp|P12345|NAME_HUMAN'`` yields ``'P12345'``.

    Args:
        uid: identifier to parse. Non-string values (e.g. NaN or None for
            missing entries) are returned unchanged.

    Returns:
        The second ``|``-delimited field when ``uid`` is a string that
        contains at least one ``|``; otherwise ``uid`` itself.
    """
    if not isinstance(uid, _STRING_TYPES):
        return uid
    fields = uid.split('|')
    # A string without any '|' has no second field; pass it through instead
    # of raising IndexError, matching the pass-through of non-string values.
    return fields[1] if len(fields) > 1 else uid

def filter_species(gene_set, mg=None, species='7757'):
    """Return the queries from ``gene_set`` that were not found for ``species``.

    Every entry of ``gene_set`` is looked up with ``mg.querymany`` restricted
    to ``species``; a query is kept when its result record contains the
    ``'notfound'`` key.

    Args:
        gene_set: iterable of gene identifiers to query.
        mg: object providing ``querymany`` (a ``mygene.MyGeneInfo`` client).
            Optional: when None a new ``mygene.MyGeneInfo()`` is created.
        species: species restriction forwarded to ``querymany``.

    Returns:
        list: the distinct not-found queries, in no particular order.
    """
    if mg is None:
        mg = mygene.MyGeneInfo()
    hits = mg.querymany(list(gene_set),
                        scopes='symbol,name,ensemblgene,entrezgene,uniprot',
                        species=species)
    # A set removes duplicates when the same query is reported more than once.
    return list({hit['query'] for hit in hits if 'notfound' in hit})

def gene_from_symbol(gene_list, mg=None):
    """Query gene annotations for ``gene_list`` across all species.

    The terms are matched against the symbol, name and alias scopes, and the
    name, alias, summary, go and refseq fields are requested.

    Args:
        gene_list: gene terms passed to ``querymany`` unchanged.
        mg: object providing ``querymany`` (a ``mygene.MyGeneInfo`` client).
            Optional: when None a new ``mygene.MyGeneInfo()`` is created.

    Returns:
        Whatever ``mg.querymany`` returns; the call asks for a DataFrame
        (``as_dataframe=True``, ``returnall=False``).
    """
    if mg is None:
        mg = mygene.MyGeneInfo()
    return mg.querymany(
        gene_list,
        scopes='symbol,name,alias',
        fields='name,alias,summary,go,refseq',
        species='all',
        as_dataframe=True,
        returnall=False,
    )

def dedupe_gene_func(X):
    """Return the row of ``X`` that has the fewest null values.

    When several rows tie, the first of them is returned.

    Args:
        X: DataFrame of candidate rows (e.g. one group of duplicate genes).

    Returns:
        The selected row as a Series.

    Raises:
        ValueError: if ``X`` has no rows (argmin of an empty sequence).
    """
    null_counts = np.asarray(X.isnull().sum(axis=1))
    # argmin yields a position, so the lookup must be positional. ``.ix``
    # treated an integer as a label whenever the index was integer-typed
    # (wrong row or KeyError) and no longer exists in pandas >= 1.0.
    return X.iloc[null_counts.argmin()]

def _gene_field(label, value):
    """Format ``label=value`` for a header line, or '' when ``value`` is NaN."""
    # isinstance also matches numpy.float64 (a float subclass), which an
    # exact ``type(value) == float`` comparison would miss.
    if isinstance(value, float) and np.isnan(value):
        return ''
    return '{}={}'.format(label, value)


def transcript_write_func(X, gene_map, db, outfp):
    """Write every transcript of one gene record to ``outfp`` in FASTA format.

    Each record is written as ``>{transcript} {tag} {alias} {name}`` followed
    by the sequence on the next line. The tag/alias/name fields are rendered
    as ``tag=...``, ``alias=...`` and ``name=...``; a field whose value is a
    float NaN is left empty (its separating space is still written).

    Args:
        X: record exposing ``tag``, ``alias_str`` and ``name`` attributes.
        gene_map: mapping from gene tag to an iterable of transcript names.
        db: mapping from transcript name to an object with a ``sequence``
            attribute.
        outfp: writable text handle.

    Raises:
        KeyError: if the tag is missing from ``gene_map`` or a transcript
            name is missing from ``db`` (for plain dict-like mappings).
    """
    # The raw tag is the lookup key; it is formatted only afterwards.
    tr_names = gene_map[X.tag]

    gene_tag = _gene_field('tag', X.tag)
    gene_alias = _gene_field('alias', X.alias_str)
    # NOTE(review): if X is a pandas Series, ``X.name`` is the Series' own
    # name (the row label under DataFrame.apply(axis=1)), not a 'name'
    # column -- confirm which one is intended.
    gene_name = _gene_field('name', X.name)

    for tr_name in tr_names:
        h_line = '{0} {1} {2} {3}'.format(tr_name, gene_tag, gene_alias, gene_name)
        outfp.write('>{}\n{}\n'.format(h_line, db[tr_name].sequence))