-
Notifications
You must be signed in to change notification settings - Fork 9
/
generate.py
executable file
·528 lines (457 loc) · 20.2 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
#!/usr/bin/env python3
# ----------------------------------------------------------------------------
# Copyright (c) 2018--, rankratioviz development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
#
# Generates two JSON files: one for a rank plot and one for a sample
# scatterplot of log ratios.
#
# A lot of the code for processing input data in this file was based on code
# by Jamie Morton, some of which is now located in ipynb/Figure3.ipynb in
# https://github.com/knightlab-analyses/reference-frames.
# ----------------------------------------------------------------------------
import os
import logging
# import re
from shutil import copyfile, copytree
import pandas as pd
import altair as alt
from rankratioviz._rank_utils import filter_unextreme_features
from rankratioviz._plot_utils import replace_js_plot_json_definitions
def fix_id(fid):
"""Like escape_id() but a lot lazier."""
new_id = ""
for c in fid:
if c == ".":
new_id += ":"
elif c == "]":
new_id += ")"
elif c == "[":
new_id += "("
elif c == "'" or c == '"' or c == "\\":
continue
else:
new_id += c
return new_id
# def escape_id(fid):
# """Escapes certain characters in an ID for a Vega/Vega-Lite spec.
#
# This is in order to prevent Vega-Lite from interpreting stuff from
# these IDs, which results in problems.
#
# See https://vega.github.io/vega-lite/docs/field.html for context.
# """
#
# # Characters that need to be escaped: ."'\[]
# # (JSON doesn't assign special significance to the single-quote (') but
# # Vega-Lite does, which is why we escape it anyway.)
# spec_char_regex = re.compile("[\.\"\'\\\[\]]")
# new_id = ""
# for c in fid:
# if spec_char_regex.match(c):
# new_id += "\\{}".format(c)
# else:
# new_id += c
# return new_id
def matchdf(df1, df2):
"""Filters both DataFrames to just the rows of their shared indices.
Derived from gneiss.util.match() (https://github.com/biocore/gneiss).
"""
idx = set(df1.index) & set(df2.index)
return df1.loc[idx], df2.loc[idx]
def ensure_df_headers_unique(df, df_name):
"""Raises an error if the index or columns of the DataFrame aren't unique.
(If both index and columns are non-unique, the index error will take
precedence.)
If these fields are unique, no errors are raised and nothing (None) is
implicitly returned.
Parameters
----------
df: pandas.DataFrame
df_name: str
The "name" of the DataFrame -- this is displayed to the user in the
error message thrown if the DataFrame has any non-unique IDs.
"""
if len(df.index.unique()) != df.shape[0]:
raise ValueError(
"Indices of the {} DataFrame are not" " unique.".format(df_name)
)
if len(df.columns.unique()) != df.shape[1]:
raise ValueError(
"Columns of the {} DataFrame are not" " unique.".format(df_name)
)
def process_and_generate(
feature_ranks,
sample_metadata,
biom_table,
output_dir,
feature_metadata=None,
extreme_feature_count=None,
):
"""Just calls process_input() and gen_visualization()."""
U, V, ranking_ids, feature_metadata_cols, processed_table = process_input(
feature_ranks,
sample_metadata,
biom_table,
feature_metadata,
extreme_feature_count,
)
return gen_visualization(
V, ranking_ids, feature_metadata_cols, processed_table, U, output_dir
)
def process_input(
feature_ranks,
sample_metadata,
biom_table,
feature_metadata=None,
extreme_feature_count=None,
):
"""Processes the input files to rankratioviz."""
logging.debug("Starting processing input.")
# Assert that the feature IDs and sample IDs contain only unique IDs.
# (This doesn't check that there aren't any identical IDs between the
# feature and sample IDs, but we shouldn't be using sample IDs to query
# feature IDs anyway. And besides, I think that should technically be
# allowed.)
ensure_df_headers_unique(feature_ranks, "feature ranks")
logging.debug("Ensured uniqueness of feature ranks.")
ensure_df_headers_unique(sample_metadata, "sample metadata")
logging.debug("Ensured uniqueness of sample metadata.")
# NOTE although we always call filter_unextreme_features(), no filtering is
# necessarily done (depending on the value of extreme_feature_count and the
# contents of the table/ranks).
filtered_table, filtered_ranks = filter_unextreme_features(
biom_table, feature_ranks, extreme_feature_count
)
logging.debug("Creating a SparseDataFrame from the BIOM table.")
# Old versions of BIOM accidentally produce an effectively-dense DataFrame
# when using biom.Table.to_dataframe() (see
# https://github.com/biocore/biom-format/issues/808). To get around this,
# we extract the scipy.sparse.csr_matrix data from the BIOM table and
# directly convert that to a pandas SparseDataFrame.
sparse_matrix_data = filtered_table.matrix_data
table = pd.SparseDataFrame(sparse_matrix_data, default_fill_value=0.0)
# The csr_matrix doesn't include column/index IDs, so we manually add them
# in to the SparseDataFrame.
table.index = filtered_table.ids(axis="observation")
table.columns = filtered_table.ids(axis="sample")
logging.debug("Converted BIOM table to SparseDataFrame.")
# Match features to BIOM table, and then match samples to BIOM table.
# This should bring us to a point where every feature/sample is
# supported in the BIOM table. (Note that the input BIOM table might
# contain features or samples that are not included in filtered_ranks or
# sample_metadata, respectively -- this is totally fine. The opposite,
# though, is a big no-no.)
table, V = matchdf(table, filtered_ranks)
logging.debug("Matching table with feature ranks done.")
# Ensure that every ranked feature was present in the BIOM table. Raise an
# error if this isn't the case.
if V.shape[0] != filtered_ranks.shape[0]:
unsupported_feature_ct = filtered_ranks.shape[0] - V.shape[0]
# making this error message as pretty as possible
word = "were"
if unsupported_feature_ct == 1:
word = "was"
raise ValueError(
"Of the {} ranked features, {} {} not present in "
"the input BIOM table.".format(
filtered_ranks.shape[0], unsupported_feature_ct, word
)
)
table, U = matchdf(table.T, sample_metadata)
logging.debug("Matching table with sample metadata done.")
# Allow for dropped samples (e.g. negative controls), but ensure that at
# least one sample is supported by the BIOM table.
if U.shape[0] < 1:
raise ValueError(
"None of the samples in the sample metadata file "
"are present in the input BIOM table."
)
dropped_sample_ct = sample_metadata.index.difference(U.index).shape[0]
if dropped_sample_ct > 0:
print(
"NOTE: {} sample(s) in the sample metadata file were not "
"present in the BIOM table, and have been removed from the "
"visualization.".format(dropped_sample_ct)
)
# Now that we've matched up the BIOM table with the feature ranks and
# sample metadata, we're almost done.
# Before we go through feature metadata: convert all rank column IDs
# to strings (since Altair gets angry if you pass in ints as column IDs),
# and (as a temporary fix for #66) call fix_id() on these column IDs.
filtered_ranks.columns = [fix_id(str(c)) for c in filtered_ranks.columns]
ranking_ids = filtered_ranks.columns
feature_metadata_cols = []
# If the user passed in feature metadata corresponding to taxonomy
# information, then we use that to update the feature data to include
# that metadata. Feature metadata will be represented as additional fields
# for each feature in the rank plot. (This can help out in the searching
# part of the visualization, but it isn't necessary.)
if feature_metadata is not None:
try:
feature_metadata_cols = feature_metadata.columns
# Use of suffixes=(False, False) ensures that columns are unique
# between feature metadata and feature ranks.
filtered_ranks = filtered_ranks.merge(
feature_metadata,
how="left",
left_index=True,
right_index=True,
suffixes=(False, False),
)
except ValueError:
# It might be possible to figure out a way to handle this sort of
# situation automatically, but unless it becomes a problem I'm ok
# with the current solution.
print(
"Column names for the feature metadata and feature ranks "
"should be distinct. Try creating a copy of your feature "
"metadata with identical columns renamed, and use that with "
"rankratioviz."
)
raise
logging.debug("Finished input processing.")
return U, filtered_ranks, ranking_ids, feature_metadata_cols, table
def gen_rank_plot(V, ranking_ids, feature_metadata_cols):
"""Generates altair.Chart object describing the rank plot.
Arguments:
V: pd.DataFrame
feature ranks
ranking_ids: pd.Index
IDs of the actual "ranking" columns in V (since V can include
feature metadata)
feature_metadata_cols: pd.Index or list
IDs of the feature metadata columns in V (if no such IDs present, an
empty list should be passed)
Returns:
JSON describing a Vega-Lite specification for the rank plot.
"""
rank_data = V.copy()
# NOTE that until this point we've treated the actual rank values as just
# "objects", as far as pandas is concerned. However, if we continue to
# treat them as objects when sorting them, we'll get a list of feature
# ranks in lexicographic order... which is not what we want. So we just
# ensure that all of the columns contain numeric data.
for col in ranking_ids:
rank_data[col] = pd.to_numeric(rank_data[col])
# The default rank column is just whatever the first rank is. This is what
# the rank plot will use when it's first drawn.
default_rank_col = ranking_ids[0]
# Set default classification of every feature to "None"
# (This value will be updated when a feature is selected in the rank plot
# as part of the numerator, denominator, or both parts of the current log
# ratio.)
rank_data["Classification"] = "None"
# Replace "index" with "Feature ID". looks nicer in the visualization :)
rank_data.rename_axis("Feature ID", axis="index", inplace=True)
rank_data.reset_index(inplace=True)
# Now, we can actually create the rank plot.
rank_chart = (
alt.Chart(
rank_data,
title="Feature Ranks",
background="#FFFFFF",
autosize=alt.AutoSizeParams(resize=True),
)
.mark_bar()
.transform_window(
sort=[alt.SortField(field=default_rank_col, order="ascending")],
# We don't use an alt.WindowFieldDef here because python gets
# confused when you use "as" as an actual argument name. So we just
# use this syntax.
window=[{"op": "row_number", "as": "rankratioviz_x"}],
)
.encode(
# type="ordinal" needed on the scale here to make bars adjacent;
# see https://stackoverflow.com/a/55544817/10730311.
x=alt.X(
"rankratioviz_x",
title="Sorted Features",
type="ordinal",
scale=alt.Scale(paddingOuter=1, paddingInner=0, rangeStep=1),
axis=alt.Axis(ticks=False, labelAngle=0),
),
y=alt.Y(default_rank_col, type="quantitative"),
color=alt.Color(
"Classification",
scale=alt.Scale(
domain=["None", "Numerator", "Denominator", "Both"],
range=["#e0e0e0", "#f00", "#00f", "#949"],
),
),
tooltip=[
alt.Tooltip(
field="rankratioviz_x",
title="Current Ranking",
type="quantitative",
),
"Classification",
"Feature ID",
*feature_metadata_cols,
],
)
.configure_axis(
# Done in order to differentiate "None"-classification features
# from grid lines
gridColor="#f2f2f2",
labelBound=True,
)
.interactive()
)
rank_chart_json = rank_chart.to_dict()
rank_ordering = "rankratioviz_rank_ordering"
fm_col_ordering = "rankratioviz_feature_metadata_ordering"
# Note we don't use rank_data.columns for setting the rank ordering. This
# is because rank_data's columns now include both the ranking IDs and the
# "Feature ID" and "Classification" columns (as well as any feature
# metadata the user saw fit to pass in).
rank_chart_json["datasets"][rank_ordering] = list(ranking_ids)
rank_chart_json["datasets"][fm_col_ordering] = list(feature_metadata_cols)
return rank_chart_json
def gen_sample_plot(table, metadata):
"""Generates altair.Chart object describing the sample scatterplot.
Arguments:
table: pandas DataFrame describing feature abundances for each sample.
metadata: pandas DataFrame describing metadata for each sample.
Returns:
JSON describing altair.Chart for the sample plot.
"""
# Used to set color
default_metadata_col = metadata.columns[0]
# Since we don't bother setting a default log ratio, we set the balance for
# every sample to NaN so that Altair will filter them out (producing an
# empty scatterplot by default, which makes sense). I guess None would
# also work here.
# TODO just make a new column in a vectorized operation, no need to merge
balance = pd.Series(index=table.index).fillna(float("nan"))
df_balance = pd.DataFrame({"rankratioviz_balance": balance})
# At this point, df_balance is a DataFrame with its index as sample IDs
# and one column ("rankratioviz_balance", which is solely NaNs).
# We know that df_balance and metadata's indices should match up since we
# already ran matchdf() on the loaded BIOM table and metadata.
# (Use of (False, False) means that if metadata contains a column called
# rankratiovz_balance, this will throw an error.)
sample_metadata = pd.merge(
df_balance,
metadata,
left_index=True,
right_index=True,
suffixes=(False, False),
)
# "Reset the index" -- make the sample IDs a column (on the leftmost side)
# First we rename the index "Sample ID", just on the off chance that
# there's a metadata column called "index".
# NOTE that there shouldn't be a metadata column called Sample ID or
# something like that, since that should've been used in the merge with
# df_balance above (and "Sample ID" follows the Q2 metadata conventions for
# an "Identifier Column" name).
sample_metadata.rename_axis("Sample ID", axis="index", inplace=True)
sample_metadata.reset_index(inplace=True)
# Very minor thing -- sort the samples by their IDs. This should ensure
# that the sample plot output is deterministic -- and, therefore, when
# running rankratioviz._plot_utils to see if we need to update the specs in
# the JS, rankratioviz._plot_utils.plot_jsons_equal() should be True
# unless we actually change something in the actual spec details.
sample_metadata.sort_values(by=["Sample ID"], inplace=True)
# Create sample plot chart Vega-Lite spec using Altair.
sample_chart = (
alt.Chart(
sample_metadata,
title="Log Ratio of Abundances in Samples",
background="#FFFFFF",
autosize=alt.AutoSizeParams(resize=True),
)
.mark_circle()
.encode(
alt.X("rankratioviz_balance", type="quantitative"),
alt.Y(
"rankratioviz_balance",
title="log(Numerator / Denominator)",
type="quantitative",
),
color=alt.Color(default_metadata_col, type="nominal"),
tooltip=["Sample ID", "rankratioviz_balance"],
)
.configure_axis(labelBound=True)
.interactive()
)
# Replace the "mark": "circle" definition with a more explicit one. This
# will be useful when adding attributes to the boxplot mark in the
# visualization. (We have to resort to this hack because I haven't been
# able to successfully use alt.MarkDef in the alt.Chart definition above.)
sample_chart_dict = sample_chart.to_dict()
sample_chart_dict["mark"] = {"type": "circle"}
# Return the JSONs as dicts for 1) the sample plot JSON (which only
# contains sample metadata), and 2) the feature counts per sample (which
# will be stored separately from the sample plot JSON in order to not hit
# performance too terribly).
return sample_chart_dict, table.to_dict()
def gen_visualization(
V,
ranking_ids,
feature_metadata_cols,
processed_table,
df_sample_metadata,
output_dir,
):
"""Creates a rankratioviz visualization. This function should be callable
from both the QIIME 2 and standalone rankratioviz scripts.
Returns:
index_path: a path to the index.html file for the output visualization.
This is needed when calling q2templates.render().
"""
# https://altair-viz.github.io/user_guide/faq.html#disabling-maxrows
alt.data_transformers.enable("default", max_rows=None)
logging.debug("Generating rank plot JSON.")
rank_plot_json = gen_rank_plot(V, ranking_ids, feature_metadata_cols)
logging.debug("Generating sample plot JSON.")
sample_plot_json, count_json = gen_sample_plot(
processed_table, df_sample_metadata
)
logging.debug("Finished generating both plots.")
os.makedirs(output_dir, exist_ok=True)
# copy files for the visualization
loc_ = os.path.dirname(os.path.realpath(__file__))
# NOTE: We can just join loc_ with support_files/, since support_files/ is
# located within the same directory as generate.py. Previously (when this
# code was contained in q2/_method.py and scripts/_plot.py), I joined loc_
# with .. and then with support_files since we had to first navigate up to
# the directory containing generate.py and support_files/. Now, we don't
# have to do that any more.
support_files_loc = os.path.join(loc_, "support_files")
index_path = None
for file_ in os.listdir(support_files_loc):
if file_ != ".DS_Store":
copy_func = copyfile
# If we hit a directory in support_files/, just copy the entire
# directory to our destination using shutil.copytree()
if os.path.isdir(os.path.join(support_files_loc, file_)):
copy_func = copytree
copy_func(
os.path.join(support_files_loc, file_),
os.path.join(output_dir, file_),
)
if file_ == "index.html":
index_path = os.path.join(output_dir, file_)
if index_path is None:
# This should never happen -- assuming rankratioviz has been installed
# fully, i.e. with a complete set of support_files/ -- but we handle it
# here just in case.
raise FileNotFoundError("Couldn't find index.html in support_files/")
# create JS code that loads these JSON files in main.js
main_loc = os.path.join(support_files_loc, "main.js")
output_loc = os.path.join(output_dir, "main.js")
exit_code = replace_js_plot_json_definitions(
main_loc,
rank_plot_json,
sample_plot_json,
count_json,
output_file_loc=output_loc,
)
if exit_code != 0:
raise ValueError("Wasn't able to replace JSONs and write to main.js.")
logging.debug("Finished writing the visualization contents.")
return index_path