/
cleanup.py
161 lines (134 loc) · 5.13 KB
/
cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import functools
import glob
import json
import os
import pathlib
import shutil
import sys
from os.path import splitext, join, basename, dirname

import click
@click.group()
def cli():
    """Top-level command group; subcommands are registered via ``@cli.command()``."""
    pass
class Rewritter:
    """Rewrite filesystem paths onto a new prefix, remembering each rewrite.

    Every call to :meth:`replace_prefix` records a ``(original_path,
    prefix-stripped_relative_path)`` pair in ``srcdst`` so the caller can
    later stage copies of every rewritten file.
    """
    def __init__(self):
        # (source path, relative destination path) pairs, in call order
        self.srcdst = []

    def replace_prefix(self, old, new, path):
        """Return ``path`` with its leading ``old`` prefix swapped for ``new``.

        ``path`` must start with ``old`` (asserted). The remainder is also
        appended to ``srcdst`` alongside the original path.
        """
        assert path.startswith(old)
        remainder = path[len(old):]
        if remainder.startswith('/'):
            # os.path.join would discard `new` if the remainder were absolute
            remainder = remainder[1:]
        self.srcdst.append((path, remainder))
        return join(new, remainder)
@cli.command()
@click.option('--base', type=click.Path(exists=True),
              help='The results base directory')
def delete_unnecessary_files(base):
    """Remove files we don't need to have persist.

    Each ``*.droplist`` file found three directories below ``base`` lists
    one file path per line; every listed file is deleted.
    """
    drop_files = glob.glob(join(base, '*/*/*/*.droplist'))
    for fp in drop_files:
        with open(fp) as openfp:
            for line in openfp:
                # BUGFIX: iterating a file yields lines with a trailing
                # newline; strip it (and surrounding whitespace) so we remove
                # the actual file rather than "name\n". Skip blank lines.
                target = line.strip()
                if target:
                    os.remove(target)
@cli.command()
@click.option('--base', type=click.Path(exists=True),
              help='The results base directory')
@click.option('--output', type=click.Path(exists=False),
              help='Where to write the configuration too')
@click.option('--port', type=int, default=8082, help='API port')
@click.option('--prefix', type=str, required=True,
              help="Prefix to use on the API server")
@click.option('--copy-prefix', type=str, required=True,
              help="Copy prefix to use when copying files to a dst")
@click.option('--actually-copy', is_flag=True, default=False,
              help="If specified, actually copy the files")
def create_configuration(base, output, port, prefix, copy_prefix,
                         actually_copy):
    """Build the API dataset configuration and stage the referenced files.

    Scans ``base`` for per-dataset detail JSON files, rewrites every result
    path under ``prefix`` for the API server, writes the combined
    configuration to ``output``, then either copies the rewritten files
    under ``copy_prefix`` (``--actually-copy``) or prints the equivalent
    ``mkdir``/``cp`` shell commands. Exits non-zero if any dataset is
    missing its unweighted UniFrac PCoA.
    """
    detail_files = glob.glob(join(base, '*/*/*/*.json'))
    rewritter = Rewritter()
    datasets = {}
    die = False
    for detail_fp in detail_files:
        # use a context manager so the handle is closed promptly rather
        # than leaking until garbage collection
        with open(detail_fp) as detail_fh:
            detail = json.load(detail_fh)
        name = detail.pop('name')
        name_parts = name.split('-')
        datatag = name_parts[0]
        sampletype = name_parts[-1]
        # we'll only keep taxonomy data for TMI subsets
        # as this entity requires a lot of resident memory for the api
        keep_tax = datatag == 'tmi' and sampletype in ('gut', 'skin', 'oral')
        results_dir = dirname(detail_fp)
        d = functools.partial(join, results_dir)
        pre = functools.partial(rewritter.replace_prefix, base, prefix)
        # sanity check for completion: report every incomplete dataset
        # before failing, rather than dying on the first one
        if not os.path.exists(d('beta/pcoa/unweighted_unifrac.qza')):
            click.echo(f"No PCoA: {name}", err=True)
            die = True
            continue
        # bloom-filtered runs carry a "nobloom." infix in their filenames
        bloom = ''
        for f in os.listdir(results_dir):
            if 'nobloom' in f:
                bloom = 'nobloom.'
                break
        metadata = pre(d('raw.columns_of_interest.txt'))
        alpha = {splitext(basename(f))[0]: pre(f)
                 for f in glob.glob(d('alpha/*.qza'))}
        if keep_tax:
            taxtable = pre(d(f'raw.{bloom}minfeat.mindepth.biom.qza'))
            taxtax = pre(d(f'raw.{bloom}minfeat.mindepth.taxonomy.qza'))
        # naively limit to unweighted and all samples right now as we're
        # not doing anything with the other data yet
        pcoa = {splitext(basename(f))[0]: pre(f)
                for f in glob.glob(d('beta/pcoa/*.qza'))
                if f.endswith('unweighted_unifrac.qza')}
        # unweighted_unifrac_neighbors -> unweighted_unifrac
        neigh = {splitext(basename(f))[0].rsplit('_', 1)[0]: pre(f)
                 for f in glob.glob(d('beta/*.tsv'))
                 if f.endswith('neighbors.tsv')}
        datasets[name] = {
            '__dataset_detail__': detail,
            '__metadata__': metadata,
            '__alpha__': alpha,
            '__neighbors__': neigh,
            '__pcoa__': {
                'full-dataset': pcoa
            }
        }
        if keep_tax:
            datasets[name]['__taxonomy__'] = {
                'taxonomy': {
                    'table': taxtable,
                    'feature-data-taxonomy': taxtax
                }
            }
    final = {'validate': False,  # disable route checking for pngs
             'resources': {'datasets': datasets},
             'port': str(port)}
    if die:
        sys.exit(1)
    with open(output, 'w') as fp:
        json.dump(final, fp, indent=2)
    if actually_copy:
        pathlib.Path(copy_prefix).mkdir(parents=True, exist_ok=True)
        shutil.copy(output, copy_prefix)
    else:
        print(f"mkdir -p {copy_prefix}")
    for src, dst in rewritter.srcdst:
        dst = join(copy_prefix, dst)
        dst_dir = dirname(dst)
        if actually_copy:
            print(src, dst)
            pathlib.Path(dst_dir).mkdir(parents=True, exist_ok=True)
            shutil.copy(src, dst)
        else:
            print(f"mkdir -p {dst_dir}")
            print(f"cp {src} {dst}")
if __name__ == '__main__':
    # Dispatch to the Click command group when run as a script.
    cli()