-
Notifications
You must be signed in to change notification settings - Fork 4
/
atacseq.py
2964 lines (2495 loc) · 135 KB
/
atacseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
import os
import pickle
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import seaborn as sns
from ngs_toolkit import _LOGGER
from ngs_toolkit.general import Analysis
from ngs_toolkit.decorators import (check_organism_genome, check_has_sites)
class ATACSeqAnalysis(Analysis):
"""
Class to model analysis of ATAC-seq data.
Inherits from the `ngs_toolkit.general.Analysis` class.
:param name: Name to give analysis object. Default is `atacseq_analysis`.
:param list samples: Iterable of peppy.Sample objects use in analysis.
If not provided (`None` is passed) and `prj` is, it will \
default to all samples in the `prj` object (`samples` attribute).
:param peppy.Project prj: Project to tie analysis to.
:param str data_dir: Directory containing relevant data for analysis. Default is `data`.
:param str results_dir: Directory to output relevant analysis results. Default is `results`.
:param str pickle_file: File path to use to save serialized object in `pickle` format.
:param bool from_pickle: If the analysis should be loaded from an \
existing pickle object. Default is `False.
:returns: Self
Remaining keyword arguments will be passed to parent class `ngs_toolkit.general.Analysis`.
:Example:
.. code-block:: python
import os
from peppy import Project
from ngs_toolkit.atacseq import ATACSeqAnalysis
prj = Project(os.path.join("metadata", "project_config.yaml"))
atac_analysis = ATACSeqAnalysis(
name=prj.project_name, prj=prj,
samples=[s for s in prj.samples if s.protocol == "ATAC-seq"])
# Get consensus peak set from all samples
atac_analysis.get_consensus_sites(atac_analysis.samples)
# Annotate regions
atac_analysis.calculate_peak_support(atac_analysis.samples)
atac_analysis.get_peak_gene_annotation()
atac_analysis.get_peak_genomic_location()
atac_analysis.get_peak_chromatin_state(
os.path.join(atac_analysis.data_dir, "external", "E032_15_coreMarks_mnemonics.bed"))
# Get coverage values for each peak in each sample of ATAC-seq
atac_analysis.measure_coverage(atac_analysis.samples)
# Normalize jointly (quantile normalization + GC correction)
atac_analysis.normalize(method="gc_content")
# Annotate quantified peaks with previously calculated metrics and features
atac_analysis.annotate()
# Annotate with sample metadata
atac_analysis.accessibility = atac_analysis.annotate_with_sample_metadata(
quant_matrix="coverage_annotated",
attributes=atac_analysis.sample_variables)
# Save object
atac_analysis.to_pickle()
"""
def __init__(
self,
name="atacseq_analysis",
samples=None,
prj=None,
data_dir="data",
results_dir="results",
pickle_file=None,
from_pickle=False,
**kwargs):
super(ATACSeqAnalysis, self).__init__(
name=name,
data_dir=data_dir,
results_dir=results_dir,
pickle_file=pickle_file,
samples=samples,
prj=prj,
from_pickle=from_pickle,
**kwargs)
self.data_type = self.__data_type__ = "ATAC-seq"
self._var_names = "region"
self._quantity = "accessibility"
self._norm_units = "RPM"
self._raw_matrix_name = "coverage"
self._norm_matrix_name = "coverage_qnorm" # or coverage_gc_corrected
# TODO: have normalization method reset the above value, with info to user
# or just overwrite
self._annot_matrix_name = "accessibility"
def load_data(self, output_mapping=None, only_these_keys=None, permissive=True, n_header_vars=5):
"""
Load the output files of the major functions of the Analysis.
:param dict output_mapping: Dictionary with "attribute name": "path prefix" to load the files.
:param list | None only_these_keys: Iterable of analysis attributes to load up.
Possible attributes:
"sites", "support", "coverage", "coverage_qnorm",
"nuc", "coverage_gc_corrected", "gene_annotation",
"region_annotation", "region_annotation_b",
"chrom_state_annotation", "chrom_state_annotation_b",
"coverage_annotated", "accessibility".
:param permissive bool: Whether an error should be thrown if reading a file causes IOError.
:raises IOError: If not `permissive` and any of the files are not readable.
:var pybedtools.BedTool sites: Sets a `sites` variable.
:var pandas.DataFrame support: Sets a `support` variable.
:var pandas.DataFrame coverage: Sets a `coverage` variable.
:var pandas.DataFrame coverage_qnorm: Sets a `coverage_qnorm` variable.
:var pandas.DataFrame nuc: Sets a `nuc` variable.
:var pandas.DataFrame coverage_gc_corrected: Sets a `coverage_gc_corrected` variable.
:var pandas.DataFrame gene_annotation: Sets a `gene_annotation` variable.
:var pandas.DataFrame region_annotation: Sets a `region_annotation` variable.
:var pandas.DataFrame region_annotation_b: Sets a `region_annotation_b` variable.
:var pandas.DataFrame chrom_state_annotation: Sets a `chrom_state_annotation` variable.
:var pandas.DataFrame chrom_state_annotation_b: Sets a `chrom_state_annotation_b` variable.
:var pandas.DataFrame coverage_annotated: Sets a `coverage_annotated` variable.
:var pandas.DataFrame accessibility: Sets a `accessibility` variable.
:returns: `None`
"""
if output_mapping is None:
# TODO: get default mapping by having functions declare what they output
# perhaps also with a dict of kwargs to pass to pandas.read_csv
output_mapping = {
"support": "_peaks.support.csv",
"coverage": "_peaks.raw_coverage.csv",
"coverage_qnorm": "_peaks.coverage_qnorm.csv",
"nuc": "_peaks.gccontent_length.csv",
"coverage_gc_corrected": "_peaks.coverage_gc_corrected.csv",
"gene_annotation": "_peaks.gene_annotation.csv",
"closest_tss_distances": "_peaks.closest_tss_distances.csv",
"region_annotation": "_peaks.region_annotation.csv",
"region_annotation_b": "_peaks.region_annotation_background.csv",
"region_annotation_mapping": "_peaks.region_annotation_mapping.csv",
"region_annotation_b_mapping": "_peaks.region_annotation_background_mapping.csv",
"chrom_state_annotation": "_peaks.chromatin_state.csv",
"chrom_state_annotation_b": "_peaks.chromatin_state_background.csv",
"chrom_state_annotation_mapping": "_peaks.chrom_state_annotation_mapping.csv",
"chrom_state_annotation_b_mapping": "_peaks.chrom_state_annotation_background_mapping.csv",
"stats": "_peaks.stats_per_region.csv",
"coverage_annotated": "_peaks.coverage_qnorm.annotated.csv",
}
if only_these_keys is None:
only_these_keys = [
"sites", "support", "coverage", "coverage_qnorm",
"nuc", "coverage_gc_corrected", "closest_tss_distances",
"gene_annotation", "region_annotation", "region_annotation_b",
"chrom_state_annotation", "chrom_state_annotation_b",
"chrom_state_annotation_mapping", "chrom_state_annotation_b_mapping",
"stats",
"coverage_annotated", "accessibility"]
output_mapping = {k: v for k, v in output_mapping.items() if k in only_these_keys}
for name, suffix in output_mapping.items():
file = os.path.join(self.results_dir, self.name + suffix)
if name in only_these_keys:
_LOGGER.info("Loading '{}' analysis attribute.".format(name))
try:
setattr(self, name, pd.read_csv(file, index_col=0))
except IOError as e:
if not permissive:
raise e
else:
_LOGGER.warning(e)
# Special cases
if "sites" in only_these_keys:
file = os.path.join(self.results_dir, self.name + "_peak_set.bed")
try:
setattr(self, "sites", pybedtools.BedTool(file))
except IOError as e:
if not permissive:
raise e
else:
_LOGGER.warning(e)
if "accessibility" in only_these_keys:
file = os.path.join(self.results_dir, self.name + ".accessibility.annotated_metadata.csv")
try:
# TODO: add configuration for custom number of header lines
# TODO: or think of smart way to infer
setattr(self, "accessibility", pd.read_csv(file, index_col=0, header=list(range(n_header_vars))))
except IOError as e:
if not permissive:
raise e
else:
_LOGGER.warning(e)
@staticmethod
def check_region_index(matrix):
return matrix.index.str.contains(":").all() and matrix.index.str.contains("-").all()
@staticmethod
def set_region_index(matrix):
from ngs_toolkit.atacseq import ATACSeqAnalysis
if ATACSeqAnalysis.check_region_index(matrix):
_LOGGER.warning("Matrix already has well-formatted index.")
return matrix
else:
req = ['chrom', 'start', 'end']
if all([x in matrix.columns for x in req]):
matrix.index = (
matrix['chrom'].astype(str) +
":" +
matrix['start'].astype(str) +
"-" +
matrix['end'].astype(str))
else:
raise ValueError()
def get_consensus_sites(
self, samples=None, region_type="summits", extension=250,
blacklist_bed=None,
filter_chrM=True,
permissive=False):
"""
Get consensus (union) of enriched sites (peaks) across samples.
There are two modes possible, defined by the value of ``region_type``:
- peaks: simple union of all sites
- summits: peak summits are extended by ``extension`` and a union is made,
:param list samples: Iterable of peppy.Sample objects to restrict
to. Must have a `peaks` attribute set.
If not provided (`None` is passed) if will default to all samples
in the analysis (`samples` attribute).
:param str region_type: One of `summits` or `peaks`. The type of region
to use to create the consensus region set.
If `summits`, peak summits will be extended by `extension`
before union. Otherwise sample peaks will be used with
no modification.
:param int extension: Amount to extend peaks summits by in both directions.
:param str blacklist_bed: A (3 column) BED file with genomic positions to
exclude from consensus peak set.
:param bool filter_chrM: Whether to exclude 'chrM' from peak set.
:param bool permissive: Whether Samples that which `region_type` attribute file does
not exist should be simply skipped or an error thrown.
:raises IOError: If not `permissive` and either the `peaks` or `summits` file
of a sample is not readable. Or if `permissive` but none of the samples
has an existing file.
:var pybedtools.BedTool sites: Sets a `sites` variable with consensus peak set.
:returns: `None`
"""
from tqdm import tqdm
if region_type not in ['summits', 'peaks']:
msg = "`region_type` attribute must be one of 'summits' or 'peaks'!"
_LOGGER.error(msg)
raise ValueError(msg)
if samples is None:
samples = self.samples
# Check whether all or any samples have input files (dependent on permissive)
check = self._check_samples_have_file(attr=region_type, f=all if permissive else any)
if (not permissive) and (not check):
miss = self._get_samples_missing_file(attr=region_type)
msg = "Not all samples have '{}' files.".format(region_type)
hint = " Samples missing files: {}".format(", ".join([s.name for s in miss]))
_LOGGER.error(msg + hint)
raise IOError(msg)
if permissive:
# if permissive, work only with samples with file
samples = self._return_samples_have_file(attr=region_type)
if blacklist_bed is None:
from ngs_toolkit.general import get_blacklist_annotations
_LOGGER.info("Blacklist file not provided. Downloading...")
try:
get_blacklist_annotations(self.organism, self.genome)
except AttributeError:
msg = "Blacklist file was not provided and cannot"
msg += " get one without analysis having `organism` and `genome` set."
_LOGGER.error(msg)
raise AttributeError(msg)
for i, sample in tqdm(enumerate(samples), total=len(samples), desc="Sample"):
# print(sample.name)
if region_type == "summits":
try:
peaks = pybedtools.BedTool(sample.summits).slop(b=extension, genome=sample.genome)
except ValueError as e:
if permissive:
_LOGGER.warning(
"Summits for sample {} ({}) not found!".format(sample, sample.summits))
continue
else:
raise e
else:
try:
peaks = pybedtools.BedTool(sample.peaks)
except ValueError as e:
if permissive:
_LOGGER.warning(
"Peaks for sample {} ({}) not found!".format(sample, sample.peaks))
continue
else:
raise e
# Merge overlaping peaks within a sample
peaks = peaks.merge()
if i == 0:
sites = peaks
else:
# Concatenate all peaks
sites = sites.cat(peaks)
if "sites" not in locals():
msg = "Couldn't read peak file for any sample."
_LOGGER.error(msg)
raise ValueError(msg)
# Merge overlaping peaks across samples
sites = sites.merge()
# Filter
# # remove blacklist regions
if blacklist_bed is not False:
if not isinstance(blacklist_bed, pybedtools.BedTool):
blacklist = pybedtools.BedTool(blacklist_bed)
sites = sites.intersect(v=True, b=blacklist)
# # remove chrM peaks
if filter_chrM:
sites = sites.filter(lambda x: x.chrom != 'chrM')
# Save
sites.saveas(os.path.join(self.results_dir, self.name + "_peak_set.bed"))
# Read up again
self.sites = pybedtools.BedTool(os.path.join(self.results_dir, self.name + "_peak_set.bed"))
def set_consensus_sites(self, bed_file, overwrite=True):
"""
Set consensus (union) sites across samples given a BED file.
:param str bed_file: BED file to use as consensus sites.
:param book overwrite: Whether a possibly existing file with a consensus peak set \
for this analysis should be overwritten in disk.
:var pybedtools.BedTool sites: Sets a `sites` variable with consensus peak set.
"""
self.sites = pybedtools.BedTool(bed_file)
if overwrite:
default_sites = os.path.join(self.results_dir, self.name + "_peak_set.bed")
# pybedtools will pipe to the input file!
if bed_file == default_sites:
self.sites.saveas(default_sites + ".new")
os.rename(default_sites + ".new", default_sites)
else:
self.sites.saveas(default_sites)
@check_has_sites
def calculate_peak_support(self, samples=None, region_type="summits", permissive=False):
"""
Count number of called peaks per sample in the consensus region set.
In addition calculate a measure of peak support (or ubiquitouness) by
observing the ratio of samples containing a peak overlapping each region.
:param list samples: Iterable of peppy.Sample objects to restrict
to. Must have a `peaks` attribute set.
If not provided (`None` is passed) if will default to all samples
in the analysis (`samples` attribute).
:param str region_type: One of `summits` or `peaks`. The type of region
to use to create the consensus region set.
If `summits`, peak summits will be extended by `extension`
before union. Otherwise sample peaks will be used with
no modification.
:param bool permissive: Whether Samples that which `region_type` attribute file does
not exist should be simply skipped or an error thrown.
:raises IOError: If not `permissive` and either the `peaks` or `summits` file
of a sample is not readable. Or if `permissive` but none of the samples
has an existing file.
:var pandas.DataFrame support: Sets a `support` variable with peak set overlap.
"""
from tqdm import tqdm
if samples is None:
samples = self.samples
# Check whether all or any samples have input files (dependent on permissive)
check = self._check_samples_have_file(attr=region_type, f=all if permissive else any)
if (not permissive) and (not check):
miss = self._get_samples_missing_file(attr=region_type)
msg = "Not all samples have '{}' files.".format(region_type)
hint = " Samples missing files: {}".format(", ".join([s.name for s in miss]))
_LOGGER.error(msg + hint)
raise IOError(msg)
if permissive:
# if permissive, work only with samples with file
samples = self._return_samples_have_file(attr=region_type)
# calculate support (number of samples overlaping each merged peak)
for i, sample in tqdm(enumerate(samples), total=len(samples), desc="Sample"):
# print(sample.name)
if region_type == "summits":
peaks = sample.summits
else:
peaks = sample.peaks
if i == 0:
support = self.sites.intersect(peaks, wa=True, c=True)
else:
support = support.intersect(peaks, wa=True, c=True)
try:
support = support.to_dataframe()
except (ValueError, pybedtools.MalformedBedLineError, pybedtools.helpers.BEDToolsError):
_LOGGER.debug("Could not convert support intersection directly to dataframe, saving temporarly file.")
support.saveas("_tmp.peaks.bed")
support = pd.read_csv("_tmp.peaks.bed", sep="\t", header=None)
os.remove("_tmp.peaks.bed")
support.columns = ["chrom", "start", "end"] + [sample.name for sample in samples]
support.index = support['chrom'] + ":" + support['start'].astype(str) + "-" + support['end'].astype(str)
support.to_csv(os.path.join(self.results_dir, self.name + "_peaks.binary_overlap_support.csv"), index=True)
# divide sum (of unique overlaps) by total to get support value between 0 and 1
support["support"] = (
support[[sample.name for sample in samples]]
.apply(lambda x: sum([i if i <= 1 else 1 for i in x]) / float(len(self.samples)), axis=1))
# save
support.to_csv(os.path.join(self.results_dir, self.name + "_peaks.support.csv"), index=True)
setattr(self, 'support', support)
return self.support
def get_supported_peaks(self, samples=None):
"""
Get mask of sites with 0 support in the given samples.
Requires support matrix produced by `ngs_toolkit.atacseq.ATACSeqAnalysis.calculate_peak_support`.
:param list samples: Iterable of peppy.Sample objects to restrict to.
:returns pd.Series: Boolean Pandas Series with sites with at least one of the \
given samples having a peak called.
"""
if samples is None:
samples = self.samples
return self.support.loc[:, [s.name for s in samples]].sum(1) != 0
def measure_coverage(
self, samples=None, sites=None, assign=True,
save=True, output_file=None, permissive=False):
"""
Measure read coverage (counts) of each sample in each region in consensus sites.
Will try to use parallel computing using the `parmap` library.
:param list samples: Iterable of peppy.Sample objects to restrict
to. Must have a `filtered` attribute set.
If not provided (`None` is passed) if will default to all samples
in the analysis (`samples` attribute).
:param pybedtools.BedTool sites: Sites in the genome to quantify.
If `None` the object's `sites` attribute will be used.
:param pybedtools.BedTool assign: Whether to assign the matrix to an attribute of self
named `coverage`.
:param pybedtools.BedTool save: Whether to save to disk the coverage matrix with filename
`output_file`.
:param str output_file: A path to a CSV file with coverage output.
Default is `self.results_dir/self.name + "_peaks.raw_coverage.csv"`.
:param bool permissive: Whether Samples that which `region_type` attribute file does
not exist should be simply skipped or an error thrown.
:raises IOError: If not `permissive` and the 'aligned_filtered_bam' file attribute
of a sample is not readable. Or if `permissive` but none of the samples
has an existing file.
:var pd.DataFrame coverage: Sets a `coverage` variable with DataFrame with read counts
of shape (n_sites, m_samples).
:returns pd.DataFrame: Pandas DataFrame with read counts of shape (n_sites, m_samples).
"""
import multiprocessing
import parmap
from ngs_toolkit.general import count_reads_in_intervals
if samples is None:
samples = self.samples
# Check whether all or any samples have input files (dependent on permissive)
attr = "aligned_filtered_bam"
check = self._check_samples_have_file(attr=attr, f=all)
if not check:
miss = self._get_samples_missing_file(attr=attr)
msg = "Not all samples have '{}' files.".format(attr)
hint = " Samples missing files: {}".format(", ".join([s.name for s in miss]))
_LOGGER.error(msg + hint)
raise IOError(msg)
if permissive:
# if permissive, work only with samples with file
samples = self._return_samples_have_file(attr=attr)
if sites is None:
sites = self.sites
# Count reads with pysam
# make strings with intervals
if isinstance(sites, pybedtools.bedtool.BedTool):
sites_str = [
str(i.chrom) + ":" +
str(i.start) + "-" +
str(i.stop) for i in self.sites]
elif isinstance(sites, pd.core.frame.DataFrame):
sites_str = (
sites.iloc[:, 0] + ":" +
sites.iloc[:, 1].astype(str) + "-" +
sites.iloc[:, 2].astype(str)).astype(str).tolist()
elif isinstance(sites, str):
sites_str = [
str(i.chrom) + ":" +
str(i.start) + "-" +
str(i.stop) for i in pybedtools.bedtool.BedTool(sites)]
# count, create dataframe
coverage = pd.DataFrame(
map(
lambda x:
pd.Series(x),
parmap.map(
count_reads_in_intervals,
[sample.aligned_filtered_bam for sample in samples],
sites_str,
parallel=True
)
),
index=[sample.name for sample in samples]
).T
if assign:
self.coverage = coverage
if save:
if output_file is not None:
coverage.to_csv(output_file, index=True)
else:
self.coverage.to_csv(os.path.join(
self.results_dir, self.name + "_peaks.raw_coverage.csv"), index=True)
return coverage
def normalize_coverage_rpm(
self, matrix=None, samples=None,
mult_factor=1e6, log_transform=True, pseudocount=1,
save=True, assign=True):
"""
Normalization of matrix of (n_features, n_samples) by total in each sample.
:param str matrix: Attribute name of matrix to normalize. Defaults to 'coverage'.
:param list samples: Iterable of peppy.Sample objects to restrict matrix to.
If not provided (`None` is passed) the matrix will not be subsetted.
:param float mult_factor: A constant to multiply values for.
:param bool log_transform: Whether to log transform values or not.
:param [int, float] pseudocount: A constant to add to values.
:param bool save: Whether to write normalized DataFrame to disk.
:param bool assign: Whether to assign the normalized DataFrame to an attribute ``.
"""
to_norm = self.get_matrix(matrix=matrix, samples=samples, matrix_name="coverage")
# apply normalization over total
coverage_rpm = ((pseudocount + to_norm) / (pseudocount + to_norm).sum()) * mult_factor
# Log2 transform
if log_transform:
coverage_rpm = np.log2(coverage_rpm)
# coverage_rpm = coverage_rpm.join(self.coverage[['chrom', 'start', 'end']])
if save:
coverage_rpm.to_csv(os.path.join(self.results_dir, self.name + "_peaks.coverage_rpm.csv"), index=True)
if assign:
self.coverage_rpm = coverage_rpm
return coverage_rpm
def normalize_coverage_quantiles(
self, matrix=None, samples=None, implementation="Python",
log_transform=True, pseudocount=1, save=True, assign=True):
"""
Quantile normalization of matrix of (n_features, n_samples).
:param str matrix: Attribute name of matrix to normalize. Defaults to 'coverage'.
:param list samples: Iterable of peppy.Sample objects to restrict matrix to.
If not provided (`None` is passed) the matrix will not be subsetted.
:param str implementation: One of `"R"` or `"Python"`. Dictates which implementation
is to be used. The R implementation comes from the
`preprocessCore` package, and the Python one is from
here https://github.com/ShawnLYU/Quantile_Normalize.
:param bool log_transform: Whether to log transform values or not.
:param float pseudocount: A constant to add before log transformation.
:param bool save: Whether to write normalized DataFrame to disk.
:param bool assign: Whether to assign the normalized DataFrame to an attribute ``.
"""
if matrix is None:
to_norm = self.get_matrix(matrix=matrix, samples=samples, matrix_name="coverage")
else:
to_norm = matrix
if implementation == "R":
from ngs_toolkit.general import normalize_quantiles_r
coverage_qnorm = pd.DataFrame(
normalize_quantiles_r(to_norm.values),
index=to_norm.index,
columns=to_norm.columns)
elif implementation == "Python":
from ngs_toolkit.general import normalize_quantiles_p
coverage_qnorm = normalize_quantiles_p(to_norm)
else:
msg = "Implementation of quantile normalization must be one of 'R' of 'Python'"
_LOGGER.error(msg)
raise ValueError(msg)
# Log2 transform
if log_transform:
coverage_qnorm = np.log2(pseudocount + coverage_qnorm)
if coverage_qnorm.min().min() <= 0:
coverage_qnorm += coverage_qnorm.abs().min().min()
# # Add back postition columns
# coverage_qnorm = coverage_qnorm.join(self.coverage[['chrom', 'start', 'end']])
if save:
coverage_qnorm.to_csv(os.path.join(
self.results_dir, self.name + "_peaks.coverage_qnorm.csv"), index=True)
if assign:
self.coverage_qnorm = coverage_qnorm
return coverage_qnorm
@check_organism_genome
def get_peak_gccontent_length(
self, bed_file=None, fasta_file=None):
"""
Get length and GC content of features in region set.
:param str bed_file: A BED file with regions to calculate GC content on.
Must be a 3-column BED!
If not provided the calculation will be for the analysis
`sites` attribute.
:param str genome: Genome assembly.
:param str fasta_file: Fasta file of `genome`. Should be indexed.
If not given, will try to download.
:var nuc: DataFrame with nucleotide content and length of each region.
:returns pandas.DataFrame: DataFrame with nucleotide content and length of each region.
"""
if bed_file is None:
sites = self.sites
else:
sites = pybedtools.BedTool(bed_file)
if fasta_file is None:
_LOGGER.info("Reference genome FASTA file was not given, will try to get it.")
_LOGGER.info("Getting genome FASTA file for organism '{}', genome '{}'. "
.format(self.organism, self.genome))
fasta_file = self.get_annotations(steps=['fasta'])['fasta_file']
nuc = sites.nucleotide_content(fi=fasta_file).to_dataframe(comment="#")[["score", "blockStarts"]]
nuc.columns = ["gc_content", "length"]
nuc.index = [str(i.chrom) + ":" + str(i.start) + "-" + str(i.stop) for i in sites]
# get only the sites matching the coverage (not overlapping blacklist)
self.nuc = nuc.ix[self.coverage.index]
self.nuc.to_csv(os.path.join(
self.results_dir, self.name + "_peaks.gccontent_length.csv"), index=True)
return self.nuc
def normalize_gc_content(self, matrix=None, samples=None, save=True, assign=True):
"""
Quantile normalization of matrix of (n_features, n_samples) followed by GC content
correction by regression.
Requires the R package "cqn" to be installed:
>>> source('http://bioconductor.org/biocLite.R')
>>> biocLite('cqn')
:param str matrix: Attribute name of matrix to normalize. Defaults to 'coverage'.
:param list samples: Iterable of peppy.Sample objects to restrict
matrix to.
If not provided (`None` is passed) the matrix will not be subsetted.
:param bool save: Whether to write normalized DataFrame to disk.
:param bool assign: Whether to assign the normalized DataFrame to an attribute ``.
"""
"""
# Run manually:
library("cqn")
gc = read.csv("gccontent_length.csv", sep=",", row.names=1)
cov = read.csv("coverage_qnorm.csv", sep=",", row.names=1)
cov2 = cov[, 1:(length(cov) - 3)]
cqn_out = cqn(cov2, x=gc$gc_content, lengths=gc$length)
y = cqn_out$y +cqn_out$offset
y2 = cbind(y, cov[, c("chrom", "start", "end")])
write.csv(y2, "coverage_gc_corrected.csv", sep=",")
# Fix R's stupid colnames replacement
sed -i 's/ATAC.seq_/ATAC-seq_/g' coverage_gc_corrected.csv
"""
def cqn(cov, gc_content, lengths):
# install R package
# source('http://bioconductor.org/biocLite.R')
# biocLite('cqn')
from rpy2 import robjects
import rpy2.robjects.pandas2ri
import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore", category=RRuntimeWarning)
robjects.numpy2ri.deactivate()
rpy2.robjects.pandas2ri.activate()
robjects.r('require("cqn")')
cqn = robjects.r('cqn')
cqn_out = cqn(cov, x=gc_content, lengths=lengths)
y_r = cqn_out[list(cqn_out.names).index('y')]
y = pd.DataFrame(
np.array(y_r),
index=cov.index,
columns=cov.columns)
offset_r = cqn_out[list(cqn_out.names).index('offset')]
offset = pd.DataFrame(
np.array(offset_r),
index=cov.index,
columns=cov.columns)
return y + offset
# Perform quantile normalization first
if not hasattr(self, "nuc"):
self.normalize_coverage_quantiles(samples)
# Get GC content and length of each feature
if not hasattr(self, "nuc"):
self.get_peak_gccontent_length()
to_norm = self.get_matrix(matrix=matrix, samples=samples, matrix_name="coverage")
coverage_gc_corrected = (
cqn(cov=to_norm, gc_content=self.nuc["gc_content"], lengths=self.nuc["length"])
# .join(self.coverage[['chrom', 'start', 'end']])
)
if save:
coverage_gc_corrected.to_csv(os.path.join(
self.results_dir, self.name + "_peaks.coverage_gc_corrected.csv"), index=True)
if assign:
self.coverage_gc_corrected = coverage_gc_corrected
return coverage_gc_corrected
def normalize(self, method="quantile", matrix=None, samples=None, save=True, assign=True):
    """
    Normalize a matrix of shape (n_features, n_samples) with the chosen method.

    :param str method: Which normalization to perform. One of:
        `total`: reads-per-total normalization (RPM).
        `quantile`: quantile normalization and log2 transformation.
        `gc_content`: quantile normalization followed by GC-content
            correction by regression (cqn R package) and log2 transformation.
    :param str matrix: Attribute name of the matrix to normalize.
    :param list samples: Iterable of peppy.Sample objects used to restrict the
        matrix. When `None` is passed, the matrix is not subsetted.
    :param bool save: Whether the normalized DataFrame is written to disk.
    :param bool assign: Whether the normalized DataFrame is assigned to an
        analysis attribute (`coverage_rpm`, `coverage_qnorm` or
        `coverage_gc_corrected`, depending on `method`).
    :returns pd.DataFrame: The normalized pandas DataFrame.
    :raises ValueError: If `method` is not one of the supported options.
    """
    # Dispatch table: each supported method name maps to the bound routine
    # implementing that normalization.
    routines = {
        "total": self.normalize_coverage_rpm,
        "quantile": self.normalize_coverage_quantiles,
        "gc_content": self.normalize_gc_content,
    }
    if method not in routines:
        msg = "Requested normalization method is not available!"
        _LOGGER.error(msg)
        raise ValueError(msg)
    return routines[method](matrix=matrix, samples=samples, save=save, assign=assign)
@check_has_sites
def get_peak_gene_annotation(self, tss_file=None, max_dist=100000):
    """
    Annotates peaks with closest gene.
    The annotation reference can either be given in the `tss_file` parameter
    but if omitted, it will be fetched if analysis has `genome` and `organism`
    attributes.
    A dataframe with each feature's distance to the nearest gene is also saved.
    :param str tss_file: A valid BED file where the name field (4th column) identifies the gene
                         and the strand column (6th column). Other fields will not be used.
    :param int max_dist: Maximum absolute distance allowed to perform associations.
                         Regions with no genes within the range will have NaN values.
    :var pd.DataFrame gene_annotation: A pandas DataFrame containing the genome
                                       annotations of the region features. If a feature
                                       overlaps more than one gene, the two gene values will
                                       be concatenated with a comma.
    :var pd.DataFrame closest_tss_distances: A pandas DataFrame containing unique region->gene
                                             associations. In contrast to gene_annotation dataframe,
                                             this contains one row per region->gene assignment.
    :returns: A dataframe with genes annotated for the peak set.
    """
    from ngs_toolkit.general import bed_to_index

    # Positions (in the `bedtools closest` output) of the TSS name, strand and
    # distance fields — assumes peaks are BED3, so TSS fields start at column 3
    # and bedtools appends the signed distance last. TODO confirm for wider peak files.
    cols = [6, 8, 9]
    if tss_file is None:
        _LOGGER.info("Reference TSS file was not given, will try to get TSS annotations.")
        _LOGGER.info("Getting TSS annotations for organism '{}', genome '{}'. "
                     .format(self.organism, self.genome))
        tss_file = self.get_annotations(steps=['tss'])['tss_file']
    # extract only relevant columns (first 6 BED fields of the TSS reference)
    tss = pd.read_csv(tss_file, header=None, sep="\t")
    tss = tss.iloc[:, list(range(6))]
    tss = pybedtools.BedTool.from_dataframe(tss)
    # `sites` may still be a path string; promote it to a BedTool lazily
    if isinstance(self.sites, str):
        self.sites = pybedtools.BedTool(self.sites)
    # get closest TSS of each region (D="b": signed distance w.r.t. the TSS strand)
    self.closest_tss_distances = self.sites.closest(tss, D="b").to_dataframe()
    # rename: keep only peak coordinates plus gene name, strand and distance
    self.closest_tss_distances = (self.closest_tss_distances[
        ['chrom', 'start', 'end'] +
        self.closest_tss_distances.columns[cols].tolist()])
    self.closest_tss_distances.columns = [
        'chrom', 'start', 'end', 'gene_name', "strand", 'distance']
    # set NaN to distance without assignment (rather than the default '-1' from bedtools)
    self.closest_tss_distances.loc[
        self.closest_tss_distances['gene_name'] == '.', 'distance'] = np.nan
    # set NaN to assignments out of range (farther than max_dist)
    self.closest_tss_distances.loc[
        self.closest_tss_distances['distance'].abs() > max_dist,
        ['gene_name', 'strand', 'distance']] = np.nan
    # aggregate annotation per peak, concatenate various genes (comma-separated)
    self.gene_annotation = (
        self.closest_tss_distances
        .groupby(['chrom', 'start', 'end'])
        .aggregate(
            lambda x: ",".join(set([str(i) for i in x if i != '.'])))
        .reset_index())
    # index both frames by "chrom:start-end" region identifiers
    self.closest_tss_distances.index = bed_to_index(self.closest_tss_distances)
    self.gene_annotation.index = bed_to_index(self.gene_annotation)
    # save to disk
    self.closest_tss_distances.to_csv(
        os.path.join(self.results_dir,
                     self.name + "_peaks.closest_tss_distances.csv"),
        index=True)
    self.gene_annotation.to_csv(
        os.path.join(self.results_dir,
                     self.name + "_peaks.gene_annotation.csv"),
        index=True)
    return self.gene_annotation
@check_organism_genome
@check_has_sites
def get_peak_genomic_location(
        self, genomic_context_file=None):
    """
    Annotates a consensus peak set (``sites`` attribute of analysis) with their genomic context.
    The genomic context is mostly gene-centric, which includes overlap with
    gene promoters, UTRs, exons, introns and remaining intergenic space.
    If no reference genomic annotation file is given (genomic_context_file kwarg),
    it will use the ngs_toolkit.general.get_genomic_context function to get
    such data. For more customization of the annotations, use that function directly and pass
    the output file to this function.
    :param str genomic_context_file: A 4 column BED file (chrom, start, end, feature),
                                     where feature is a string with the type of region.
                                     If not provided will be get with the get_genomic_context
                                     function.
    :var pd.DataFrame region_annotation: A DataFrame with the genome
                                         annotations of the region features.
    :var pd.DataFrame region_annotation_b: A DataFrame with the genome
                                           annotations of the genome background.
    :var pd.DataFrame region_annotation_mapping: A DataFrame with one row for each
                                                 chromatin state-region mapping.
    :var pd.DataFrame region_annotation_b_mapping: A DataFrame with one row for each
                                                   chromatin state-region mapping
                                                   of the genome background.
    :returns: A dataframe with genomic context annotation for the peak set.
    """
    from ngs_toolkit.general import bed_to_index

    if genomic_context_file is None:
        _LOGGER.info("Reference genomic context file was not given, will try to get it.")
        _LOGGER.info("Getting genomic context annotations for organism '{}', genome '{}'. "
                     .format(self.organism, self.genome))
        genomic_context_file = self.get_annotations(steps=['genomic_context'])['genomic_context_file']
    context = pybedtools.BedTool(genomic_context_file)
    # `sites` may still be a path string; promote it to a BedTool lazily
    if isinstance(self.sites, str):
        self.sites = pybedtools.BedTool(self.sites)
    # create background:
    # shuffle regions in genome to create background (keep them in the same chromosome)
    background = self.sites.shuffle(genome=self.genome, chrom=True)
    # keep peak coords (0-2) plus the context feature name (6) from the intersection
    cols = [0, 1, 2, 6]
    for label, attr, bed in [
            ("background", "region_annotation_b", background),
            ("real", "region_annotation", self.sites)]:
        # require at least 20% of the region to overlap a context feature
        annot = (
            bed.intersect(context, wa=True, wb=True, f=0.2)
            .sort().to_dataframe(usecols=cols))
        annot.index = bed_to_index(annot)
        annot.columns = ['chrom', 'start', 'end', 'genomic_region']
        # remove duplicates (there shouldn't be any anyway)
        annot = annot.drop_duplicates()
        # join various annotations per peak (comma-separated)
        annot_comp = (
            annot.groupby(['chrom', 'start', 'end'])
            .aggregate(lambda x: ",".join(set([str(i) for i in x]))).reset_index())
        annot_comp.index = bed_to_index(annot_comp)
        annot_comp.columns = ['chrom', 'start', 'end', 'genomic_region']
        # save to disk
        # BUGFIX: this previously tested `attr == "real"`, which is never true
        # (attr is "region_annotation"/"region_annotation_b"), so the real peak
        # set's files got a spurious "_region_annotation" suffix. Compare the
        # loop label instead: real peaks get no suffix, background "_background".
        a = "" if label == "real" else "_" + label
        annot.to_csv(os.path.join(
            self.results_dir, self.name + "_peaks.region_annotation{}_mapping.csv".format(a)),
            index=True)
        annot_comp.to_csv(os.path.join(
            self.results_dir, self.name + "_peaks.region_annotation{}.csv".format(a)),
            index=True)
        setattr(self, attr, annot_comp)
        setattr(self, attr + "_mapping", annot)
    return self.region_annotation
@check_organism_genome
@check_has_sites
def get_peak_chromatin_state(
self, chrom_state_file, frac=0.2):
"""
Annotates a consensus peak set (``sites`` attribute of analysis) with their chromatin
state context. This would be given, for example by a chromatin state segmentation
file from projects such as Roadmap Epigenomics.
See examples of such files for various cell types/assemblies here:
https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/
(the *_dense.bed.gz files are optimal).
:param str chrom_state_file: A 4 column BED file (chrom, start, end, feature),
where feature is a string with the type of region.
:param float frac: Minimal fraction of region to overlap with a feature.
:var pd.DataFrame chrom_state_annotation: A DataFrame with the chromatin state
annotations of the region features.
:var pd.DataFrame chrom_state_annotation_b: A DataFrame with the chromatin state
annotations of the genome background.
:var pd.DataFrame chrom_state_annotation_mapping: A DataFrame with one row for each
chromatin state-region mapping.
:var pd.DataFrame chrom_state_annotation_b_mapping: A DataFrame with one row for each
chromatin state-region mapping
of the genome background.
:returns: A dataframe with chromatin state annotation for the peak set.
"""
from ngs_toolkit.general import bed_to_index
states = pybedtools.BedTool(chrom_state_file)
if isinstance(self.sites, str):
self.sites = pybedtools.BedTool(self.sites)
# create background
# shuffle regions in genome to create background (keep them in the same chromossome)
background = self.sites.shuffle(genome=self.genome, chrom=True)
for label, attr, bed in [
("real", "chrom_state_annotation", self.sites),