-
Notifications
You must be signed in to change notification settings - Fork 81
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Per sample alpha plots #185
Changes from all commits
bd68cfb
0de875d
e208140
014f2ee
a5be4d3
c0234bc
bf9b929
19a36eb
ce6c9e8
ff4c532
00e78a8
fbc0de0
dbcd8ef
10d426d
0c8a970
1120c9b
c212163
1fdcc40
5691e6d
32b5875
0655859
9c89abe
1e5351c
f74ae60
560dbdc
6c9b4a6
e2519c2
1bc8699
49bbd30
32f7917
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,23 @@ | ||
import os | ||
|
||
from matplotlib import use, rcParams | ||
use('Agg') # noqa | ||
|
||
import biom | ||
import matplotlib.pyplot as plt | ||
import pandas as pd | ||
from qiime.util import qiime_system_call | ||
import seaborn as sn | ||
|
||
import americangut.util as agu | ||
import americangut.notebook_environment as agenv | ||
import americangut.results_utils as agru | ||
|
||
# Sets up plotting parameters so that plots default to the Arial sans-serif | ||
# font | ||
rcParams['font.family'] = 'sans-serif' | ||
rcParams['font.sans-serif'] = ['Arial'] | ||
|
||
|
||
def create_opts(sample_type, chp_path, gradient_color_by, barchart_categories): | ||
"""Create a dict of options for processing functions | ||
|
@@ -247,6 +258,66 @@ def taxa_summaries(opts, sample_ids): | |
return results | ||
|
||
|
||
def alpha_plot(opts, sample_ids):
    """Produces digestible alpha diversity distribution plots per sample

    For every requested sample found in the alpha diversity mapping file, a
    Shannon diversity plot (``shannon_<id>.pdf``) and a PD whole tree
    diversity plot (``pd_<id>.pdf``) are written to that sample's result
    directory.

    Parameters
    ----------
    opts : dict
        A dict of relevant opts.
    sample_ids : Iterable of str
        A list of sample IDs of interest

    Returns
    -------
    dict
        A dict containing each sample ID and any errors observed or None if
        no error was observed for the sample. {str: str or None}

    Raises
    ------
    ValueError
        If the mapping file is missing one of the alpha diversity columns
        (`shannon_1k`, `PD_whole_tree_1k`) or the `SIMPLE_BODY_SITE` column.
    """
    # Everything is read as text so that sample IDs are never coerced to
    # numbers; the alpha diversity columns are cast explicitly below.
    alpha_map = pd.read_csv(
        agu.get_existing_path(opts['collapsed']['100nt']['alpha-map']),
        sep='\t',
        dtype=str,
    )

    alpha_metrics = ['shannon_1k', 'PD_whole_tree_1k']

    # Checks the alpha diversity fields are in the mapping file
    for metric in alpha_metrics:
        if metric not in alpha_map.columns:
            raise ValueError('%s is not a valid alpha diversity field name.'
                             % metric)
    # Checks the group field is in the mapping file
    if 'SIMPLE_BODY_SITE' not in alpha_map.columns:
        raise ValueError('SIMPLE_BODY_SITE is not a valid field name.')

    alpha_map[alpha_metrics] = alpha_map[alpha_metrics].astype(float)
    alpha_map.set_index('#SampleID', inplace=True)

    results = {}
    for id_ in sample_ids:
        if id_ not in alpha_map.index:
            results[id_] = 'ID not found'
            continue

        results[id_] = None
        # Both figures for a sample are written to the same directory
        result_dir = _result_path(opts, id_)

        # Generates the shannon diversity figure
        shannon_path = os.path.join(result_dir, 'shannon_%s.pdf' % id_)
        _plot_alpha(id_, alpha_map, 'shannon_1k',
                    xlabel='Shannon Diversity',
                    fp=shannon_path)

        # Generates the pd whole tree diversity figure
        pd_path = os.path.join(result_dir, 'pd_%s.pdf' % id_)
        _plot_alpha(id_, alpha_map, 'PD_whole_tree_1k',
                    xlabel='PD Whole Tree Diversity',
                    fp=pd_path)

    return results
|
||
|
||
def sufficient_sequence_counts(opts, sample_ids): | ||
"""Errors if the sequence counts post filtering are < 1000 | ||
|
||
|
@@ -515,3 +586,112 @@ def stage_per_sample_specific_statics(opts, sample_ids): | |
result[id_] = "Cannot symlink for statics." | ||
|
||
return result | ||
|
||
|
||
def _plot_alpha(sample, alpha_map, alpha_field, group_field='SIMPLE_BODY_SITE', | ||
output_dir=None, xlabel=None, fp=None, debug=False): | ||
"""Generates a distrbution plot for the data | ||
|
||
Parameters | ||
---------- | ||
sample : str | ||
The sample ID to be plotted | ||
alpha_map_fp : pandas DataFrame | ||
A pandas dataframe containing the sample metadata. The sample ID | ||
should be given in the `'#SampleID'` column, a column with | ||
the name given by `alpha_field` contains alpha diversity values, | ||
and the `group_field` column specifying the groups which should be | ||
used to seperate the data for making the distribution plot. | ||
alpha_field : str | ||
The name of the column in `alpha_map` which includes the alpha | ||
diversity values. | ||
group_field : str | ||
Default is 'SIMPLE_BODY_SITE'. The name of the column in `alpha_map` | ||
which provides the grouping for generating distribution plots. | ||
output_dir : str | ||
The location where the alpha diversity figures should be saved. | ||
xlabel : str | ||
Text describing the quantity on the x-axis. | ||
|
||
Returns | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why return if the return isn't used? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Inline debugging |
||
------- | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This returns is a bit convoluted. What do you think about returning the exact same return types
And just setting the attributes to None if debug is specified and what not. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because having multiple levels of returns means not having a tuple of Nones. This simplifies the input/output and targets whats needed. |
||
If the sample is present, a matplotlib figure with the alpha diversity | ||
distribution and a line indicating the sample value is returned. If a | ||
file path is specified, the figure will be saved at the filepath instead | ||
of returning. | ||
|
||
If debug is passed, the following parameters are returned: | ||
group : str | ||
The value of the `group_field` for the sample | ||
group_alpha : ndarray | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ndarray -> np.ndarray ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ive not seen that notation in doc strings before, typically. Could you direct me to the standard you're using? Thanks There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See bullet 9 here It's good practice to add the module name in front of imported functions/objects There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Although the documentation guidelines don't really make this clear, so I think it is safe to ignore. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just as a note, we started using @mortonjt 's suggestion in Qiita. I think we should start using such notation in any new code as it helps new developers unfamiliar with the code to get up to speed. |
||
The alpha diversity values associated with the group | ||
sample_alpha : float | ||
The alpha diversity for the sample | ||
xlabel : str | ||
The label used for the x-axis of the plot. | ||
|
||
""" | ||
|
||
# Explicitly casts the alpha diversity to a float | ||
alpha_map[alpha_field] = alpha_map[alpha_field].astype(float) | ||
|
||
# Draws the observations and group | ||
group = alpha_map.loc[sample, group_field] | ||
group_alpha = alpha_map.loc[alpha_map[group_field] == group, alpha_field] | ||
sample_alpha = alpha_map.loc[sample, alpha_field] | ||
|
||
if xlabel is None: | ||
xlabel = '%sdiversity' % alpha_field.split('1')[0].replace('_', ' ') | ||
|
||
if debug: | ||
return group, group_alpha, sample_alpha, xlabel | ||
|
||
# Defines the group color. This is currently hardcoded, although the | ||
# longer term plan is to substitute in function which will define the color | ||
# based on the relationship between the sample and a yet to be written | ||
# predicted value. | ||
group_color = '#1f78b4' | ||
sample_color = '#525252' | ||
|
||
with sn.axes_style('ticks', {'axes.facecolor': 'none'}): | ||
# Sets up the axis for plotting | ||
ax = plt.axes() | ||
|
||
# Plots the distribution | ||
sn.kdeplot(group_alpha, | ||
ax=ax, | ||
legend=False, | ||
color=group_color) | ||
ylim = ax.get_ylim() | ||
|
||
# Plots the individual line | ||
ax.plot([sample_alpha, sample_alpha], [-1, 1], color=sample_color) | ||
|
||
# Returns the y-limits to the original value and removes the ticks | ||
ax.set_ylim(ylim) | ||
ax.set_yticks([]) | ||
# Removes the spine | ||
sn.despine(offset=5, trim=True, top=True, left=True, right=True) | ||
# Updates the xticks to match the correct font | ||
ax.set_xticklabels(map(int, ax.get_xticks()), size=11) | ||
ax.set_xlabel(xlabel, size=13) | ||
|
||
# Adds text describing the sample | ||
ax.text(x=ax.get_xticks().max(), | ||
y=ax.get_ylim()[1]*0.85, | ||
s='Your Sample:\t%1.1f\nAverage:\t%1.1f' | ||
% (sample_alpha, group_alpha.mean()), | ||
ha='right', | ||
size=11, | ||
) | ||
|
||
# Sets the figure size | ||
fig = ax.figure | ||
fig.set_size_inches((5, 2.5)) | ||
ax.set_position((0.125, 0.375, 0.75, 0.5)) | ||
|
||
if fp is None: | ||
return fig | ||
else: | ||
fig.savefig(fp, dpi=300) | ||
fig.clear() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
#SampleID TEST_CATEGORY AWESOME_CATEGORY PD_whole_tree_1k shannon_1ksample_a 1 super 5 2sample_b 2 totally 12 4 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Consider rewording? Keys are sampleIDs, values are error messages?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This documentation is modeled after every other function in this library.