forked from dib-lab/2015-petMarSB
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common.ipy
109 lines (85 loc) · 3.36 KB
/
common.ipy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/env ipython
from __future__ import print_function
from petmarrna.libs import *
'''
Libraries, functions, settings common to all petmar notebooks.
'''
#######################
# Functions, settings #
#######################
# JSON inputs, looked up relative to the current working directory.
res_fn = 'resources.json'
cfg_fn = 'config.json'

# Data-resource manifest: announce on stderr which file is used, then parse.
with open(res_fn, 'r') as fp:
    print('** Using data resources found in {c}'.format(c=res_fn),
          file=sys.stderr)
    resources = json.load(fp)

# Pipeline configuration, loaded the same way.
with open(cfg_fn, 'r') as fp:
    print('** Using config found in {c}'.format(c=cfg_fn),
          file=sys.stderr)
    config = json.load(fp)
prefix = config['pipeline']['prefix']

# Transposed so that each top-level entry of the manifest becomes one row
# (presumably the manifest is a mapping of name -> attribute mapping).
resources_df = pd.DataFrame(resources).T
# Only the rows whose meta_type is 'sample'.
sample_df = resources_df.loc[resources_df['meta_type'] == 'sample']
def wdir(fn=''):
    """Return the path of ``fn`` inside the pipeline working directory.

    The working directory is read from ``config['pipeline']['work_dir']`` on
    every call. With the default empty ``fn`` the result is the working
    directory itself, terminated by a path separator.
    """
    # os.path.join() with a single argument returns that argument unchanged,
    # so the extra nested join that used to wrap work_dir did nothing.
    return os.path.join(config['pipeline']['work_dir'], fn)
def figdir(fn=''):
    """Return the path of ``fn`` under the relative ``./figures/`` directory.

    With the default empty ``fn`` the directory path itself is returned.
    """
    figure_root = './figures/'
    return os.path.join(figure_root, fn)
def len_func(fn):
    """Count the sequence records of a FASTA file in the working directory.

    ``fn`` is resolved through ``wdir``. Every line that contains ``>`` is
    counted once, which is the same rule the ``grep -c ">"`` shell call that
    this replaces applied.

    Returns the count as an ``int``. Raises ``IOError``/``OSError`` when the
    file cannot be opened.
    """
    path = wdir(fn)
    # Count in Python rather than through an IPython shell escape: the path
    # used to be interpolated unquoted into the command line, so names with
    # spaces or shell metacharacters broke it, and the helper only worked
    # under IPython with an external grep available.
    with open(path, 'rb') as fasta:
        return sum(1 for line in fasta if b'>' in line)
# Sequence count for every FASTA database resource. Assignment aligns on the
# index, so rows that are not selected end up with NaN in 'size'.
# Both original mask clauses are kept; note that `== 'fasta_database'`
# already implies `!= 'sample'`.
resources_df['size'] = resources_df.loc[
    (resources_df['meta_type'] != 'sample')
    & (resources_df['meta_type'] == 'fasta_database'),
    'filename'
].apply(len_func)

# Column names for BLAST-style tabular (outfmt 6) hit tables.
# NOTE(review): the usual leading 'qseqid' column is absent -- presumably it
# is consumed as the index when the tables are read; confirm against callers.
outfmt6 = [
    'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart',
    'qend', 'sstart', 'send', 'evalue', 'bitscore',
]
import warnings
# Add per-sample tissue codes and plot colours with every warning silenced.
# NOTE(review): the extent of this with-block is reconstructed (the source
# indentation was lost); the three sample_df/tissue_palette statements are
# assumed to be its body -- confirm.
# NOTE(review): the blanket "ignore" presumably hides pandas'
# SettingWithCopyWarning, since sample_df is a filtered selection of
# resources_df; taking an explicit .copy() where sample_df is created would
# make that unnecessary -- confirm before changing.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # Integer category code for each sample's tissue label.
    sample_df['tissue_c'] = list(Categorical(list(sample_df.tissue)).codes)
    # 'Set3' palette with one colour per distinct tissue code.
    tissue_palette = sns.color_palette('Set3', n_colors=len(sample_df.groupby('tissue_c').count()))
    # Colour of each sample, looked up in the palette by its tissue code.
    sample_df['color'] = sample_df.tissue_c.apply(lambda x: tissue_palette[x])
# From here on, ignore DeprecationWarning and UserWarning for the rest of the
# session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
warnings.filterwarnings("ignore",category=UserWarning)
# Let pandas display up to 200 rows before truncating output.
pd.set_option('display.max_rows', 200)
# Text types on Python 2 (str, unicode) and Python 3 (str only), built without
# naming ``unicode``, which does not exist on Python 3.
_STRING_TYPES = (str, type(u''))


def uniprot_str(uid):
    """Return the second ``|``-separated field of a string identifier.

    For an identifier such as ``'sp|P12345|NAME_HUMAN'`` this is ``'P12345'``.
    Values that are not text (for example the NaN of a missing table cell)
    are returned unchanged.

    Raises ``IndexError`` when ``uid`` is text that contains no ``|``.
    """
    # isinstance() also accepts str subclasses, which the former exact
    # type() comparison rejected.
    if isinstance(uid, _STRING_TYPES):
        return uid.split('|')[1]
    return uid
def filter_species(gene_set, mg=None, species='7757'):
    """Return the queries from ``gene_set`` that were NOT found for ``species``.

    Despite the name, the result lists the identifiers for which the lookup
    reported ``notfound``, not the ones that matched.

    Parameters:
        gene_set: iterable of gene identifiers to look up.
        mg: object providing ``querymany`` (a ``mygene.MyGeneInfo`` client);
            when ``None`` a new client is created. It defaults to ``None`` so
            the argument can be omitted, which the ``None`` handling already
            implied.
        species: species filter passed through to ``querymany``.

    Returns:
        A list of the distinct not-found queries, in no particular order.
    """
    if mg is None:
        mg = mygene.MyGeneInfo()
    hits = mg.querymany(list(gene_set),
                        scopes='symbol,name,ensemblgene,entrezgene,uniprot',
                        species=species)
    # A set removes duplicates in case a query is reported more than once.
    return list({hit['query'] for hit in hits if 'notfound' in hit})
def gene_from_symbol(gene_list, mg=None):
    """Look up gene annotations for a list of symbols, names or aliases.

    Parameters:
        gene_list: identifiers passed to ``querymany`` unchanged.
        mg: object providing ``querymany`` (a ``mygene.MyGeneInfo`` client);
            when ``None`` a new client is created. It defaults to ``None`` so
            the argument can be omitted, which the ``None`` handling already
            implied.

    Returns:
        Whatever ``querymany`` returns for a request across all species,
        asking for the name, alias, summary, go and refseq fields with
        ``as_dataframe=True`` and ``returnall=False``.
    """
    if mg is None:
        mg = mygene.MyGeneInfo()
    return mg.querymany(gene_list, scopes='symbol,name,alias',
                        fields='name,alias,summary,go,refseq',
                        species='all', as_dataframe=True, returnall=False)
def dedupe_gene_func(X):
    """Return the row of ``X`` that has the fewest null values.

    ``X`` is a DataFrame (for instance one group of duplicate rows). When
    several rows tie, the first of them is returned. An empty frame makes
    ``argmin`` raise ``ValueError``.
    """
    null_counts = np.asarray(X.isnull().sum(axis=1))
    # argmin() yields a position, so the row must be selected positionally.
    # The former .ix lookup treated integers as labels on an integer index
    # (returning the wrong row) and no longer exists in current pandas.
    return X.iloc[null_counts.argmin()]
def _header_field(label, value):
    """Return ``'label=value'``, or ``''`` when ``value`` is a float NaN."""
    # isinstance() also recognises numpy.float64 NaN, which an exact
    # type(value) == float comparison let through as the text 'nan'.
    if isinstance(value, float) and np.isnan(value):
        return ''
    return '{}={}'.format(label, value)


def transcript_write_func(X, gene_map, db, outfp):
    """Write every transcript of one gene to ``outfp`` as FASTA records.

    Parameters:
        X: record exposing ``tag``, ``alias_str`` and ``name`` attributes.
            NOTE(review): if X is a pandas Series, ``X.name`` is the Series
            name (the index label for a DataFrame row) rather than a column
            called 'name' -- confirm that this is intended.
        gene_map: mapping from gene tag to an iterable of transcript names.
        db: mapping from transcript name to an object with ``sequence``.
        outfp: writable text file object.

    Each header has the form ``>NAME tag=... alias=... name=...``. A field
    whose value is NaN is left empty, but its separating space is kept.

    Raises ``KeyError`` when the tag is missing from ``gene_map`` or a
    transcript name is missing from ``db``.
    """
    # Look the transcripts up with the raw tag before it is formatted.
    tr_names = gene_map[X.tag]
    gene_tag = _header_field('tag', X.tag)
    gene_alias = _header_field('alias', X.alias_str)
    gene_name = _header_field('name', X.name)
    for tr_name in tr_names:
        h_line = '{0} {1} {2} {3}'.format(tr_name, gene_tag, gene_alias,
                                          gene_name)
        outfp.write('>{}\n{}\n'.format(h_line, db[tr_name].sequence))