#!/usr/bin/env python
import os
import numpy as np
import pandas as pd
from ngs_toolkit import _LOGGER
from ngs_toolkit import _CONFIG
from ngs_toolkit.analysis import Analysis
Analysis  # reference the imported class so linters do not flag the import as unused
def get_genome_reference(
organism, genome_assembly=None, output_dir=None,
genome_provider="UCSC", file_format="fasta", dry_run=False):
"""
Get genome FASTA/2bit file.
Saves results to disk and returns path to file.
:param organism: Organism to get annotation for. Currently supported: "human" and "mouse".
:type organism: str
:param genome_assembly: Genome assembly to use. Defaults to "hg19"/"grch37" for "human"
and "mm10"/"grcm38" for "mouse", depending on ``genome_provider``.
:type genome_assembly: str, optional
:param output_dir: Directory to write output to. Defaults to current directory
:type output_dir: str, optional
:param genome_provider: Which genome provider to use. One of 'UCSC' or 'Ensembl'.
:type genome_provider: str, optional
:param file_format: File format to get. One of 'fasta' or '2bit'.
:type file_format: str, optional
:param dry_run: Whether to skip downloading and only return the URL and target file path.
:type dry_run: bool, optional
:returns: If not ``dry_run``, path to genome FASTA/2bit file,
otherwise tuple of URL of reference genome and path to file.
:rtype: str | tuple
"""
from ngs_toolkit.general import download_gzip_file, download_file
import pybedtools
if output_dir is None:
output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
opts = ['UCSC', 'Ensembl']
if genome_provider not in opts:
msg = "`genome_provider` attribute must be one of '{}'.".format(", ".join(opts))
_LOGGER.error(msg)
raise ValueError(msg)
opts = ['fasta', '2bit']
if file_format not in opts:
msg = "`file_format` attribute must be one of '{}'.".format(", ".join(opts))
_LOGGER.error(msg)
raise ValueError(msg)
if (genome_provider == "Ensembl") and (file_format == '2bit'):
msg = "Ensembl does not provide 2bit files."
hint = " Use for example 'faToTwoBit' to convert the FASTA file."
_LOGGER.error(msg + hint)
raise ValueError(msg)
if (genome_provider == "UCSC") and (file_format == 'fasta') and (genome_assembly == 'hg19'):
msg = "UCSC does not provide FASTA files for the hg19 assembly."
hint = " Download a 2bit file and use for example 'TwoBitToFa' to convert."
_LOGGER.error(msg + hint)
raise ValueError(msg)
if genome_provider == "UCSC":
organisms = {
"human": "hg19", "hsapiens": "hg19", "homo_sapiens": "hg19",
"mouse": "mm10", "mmusculus": "mm10", "mus_musculus": "mm10"}
base_link = "http://hgdownload.cse.ucsc.edu/goldenPath/{assembly}/bigZips/{assembly}"
base_link += '.fa.gz' if file_format == 'fasta' else '.2bit'
if genome_assembly is None:
genome_assembly = organisms[organism]
url = base_link.format(assembly=genome_assembly)
elif genome_provider == "Ensembl":
organisms = {
"human": {"long_species": "homo_sapiens", "version": "grch37", "release": "75"},
"hsapiens": {"long_species": "homo_sapiens", "version": "grch37", "release": "75"},
"homo_sapiens": {"long_species": "homo_sapiens", "version": "grch37", "release": "75"},
"mouse": {"long_species": "mus_musculus", "version": "grcm38", "release": "94"},
"mmusculus": {"long_species": "mus_musculus", "version": "grcm38", "release": "94"},
"mus_musculus": {"long_species": "mus_musculus", "version": "grcm38", "release": "94"}}
if genome_assembly is None:
genome_assembly = organisms[organism]['version'].replace("grc", "GRC")
base_link = "ftp://ftp.ensembl.org/pub/release-{release}/fasta/{long_organism}/dna/"
base_link += "{Clong_organism}.{assembly}."
base_link += "{}.".format(organisms[organism]['release']) if genome_assembly.endswith("37") else ""
base_link += "{sequence_type}.{id_type}.{id}fa.gz".format(
sequence_type="dna", id_type="primary_assembly", id="")
url = base_link.format(release=organisms[organism]['release'],
long_organism=organisms[organism]['long_species'],
Clong_organism=organisms[organism]['long_species'].capitalize(),
assembly=genome_assembly)
genome_file = os.path.join(output_dir, "{}.{}.fa".format(organism, genome_assembly))
# create .fai index for fasta file
if file_format == 'fasta':
if not dry_run:
download_gzip_file(url, genome_file)
bed = pd.DataFrame([['chr1', 1, 2]])
try:
bed = pybedtools.BedTool().from_dataframe(bed)
bed.nucleotide_content(fi=genome_file)
# The idea is to use bedtools' "nuc" command (via pybedtools' nucleotide_content)
# to create a FASTA index, avoiding an extra dependency such as samtools.
# The call itself always fails to return nucleotide content due to
# https://github.com/daler/pybedtools/issues/147,
# but it nonetheless creates the index, which is all that is needed here.
except pybedtools.helpers.BEDToolsError:
pass
return genome_file
else:
if not dry_run:
download_file(url, genome_file)
return genome_file
return (url, genome_file)
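# Example usage (a hedged sketch; organism and options are illustrative, not prescribed):
#
#   from ngs_toolkit.general import get_genome_reference
#   # With dry_run=True nothing is downloaded; a (url, path) tuple is returned.
#   url, fasta_path = get_genome_reference(
#       "human", genome_provider="Ensembl", file_format="fasta", dry_run=True)
#   # With dry_run=False the file is downloaded and the local path alone is returned.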
def get_blacklist_annotations(
organism, genome_assembly=None, output_dir=None):
"""
Get annotations of blacklisted genomic regions for a given organism/genome assembly.
Saves results to disk and returns a path to a BED file.
:param organism: Organism to get annotation for. Currently supported: "human" and "mouse".
:type organism: str
:param genome_assembly: Ensembl assembly/version to use.
Default for "human" is "hg19/grch37" and for "mouse" is "mm10/grcm38".
:type genome_assembly: str, optional
:param output_dir: Directory to write output to. Defaults to "reference" in current directory.
:type output_dir: str, optional
:returns: Path to blacklist BED file
:rtype: str
"""
from ngs_toolkit.general import download_gzip_file
if output_dir is None:
output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
organisms = {
"human": "hg19",
"mouse": "mm10"}
if genome_assembly is None:
genome_assembly = organisms[organism]
_LOGGER.warn("Genome assembly not selected. Using assembly '{}' for '{}'."
.format(genome_assembly, organism))
url = "http://mitra.stanford.edu/kundaje/akundaje/release/blacklists/"
if genome_assembly not in ['hg19']:
url += "{0}-{1}/{0}.blacklist.bed.gz".format(genome_assembly, organism)
else:
url += "{0}-human/wgEncodeHg19ConsensusSignalArtifactRegions.bed.gz".format(genome_assembly)
output = os.path.join(output_dir, "{}.{}.blacklist.bed"
.format(organism, genome_assembly))
download_gzip_file(url, output)
return output
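# Example usage (a hedged sketch; the output directory is an assumption):
#
#   from ngs_toolkit.general import get_blacklist_annotations
#   blacklist_bed = get_blacklist_annotations("human", genome_assembly="hg19", output_dir="reference")
#   # -> "reference/human.hg19.blacklist.bed"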
def get_tss_annotations(
organism, genome_assembly=None, save=True, output_dir=None, chr_prefix=True,
gene_types=['protein_coding', 'processed_transcript', 'lincRNA', 'antisense']):
"""
Get annotations of TSS for a given organism/genome assembly.
This is a simple approach using Biomart's API querying the Ensembl database.
Saves results to disk and returns a dataframe.
:param organism: Organism to get annotation for. Currently supported: "human" and "mouse".
:type organism: str
:param genome_assembly: Ensembl assembly/version to use.
Default for "human" is "grch37" and for "mouse" is "grcm38".
:type genome_assembly: str, optional
:param save: Whether to save to disk under ``output_dir``. Defaults to True.
:type save: bool, optional
:param output_dir: Directory to write output to. Defaults to "reference" in current directory.
:type output_dir: str, optional
:param chr_prefix: Whether chromosome names should have the "chr" prefix. Defaults to True
:type chr_prefix: bool, optional
:param gene_types: Subset of transcript biotypes to keep.
See the available biotypes at https://www.ensembl.org/Help/Faq?id=468.
Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
:type gene_types: list, optional
:returns: DataFrame with genome annotations
:rtype: pandas.DataFrame
"""
from ngs_toolkit.general import query_biomart
organisms = {
"human": {"species": "hsapiens", "ensembl_version": "grch37"},
"mouse": {"species": "mmusculus", "ensembl_version": "grcm38"}
}
if genome_assembly is None:
genome_assembly = organisms[organism]['ensembl_version']
if genome_assembly == "hg19":
genome_assembly = "grch37"
if genome_assembly == "hg38":
genome_assembly = "grch38"
if genome_assembly == "mm10":
genome_assembly = "grcm38"
if save:
if output_dir is None:
output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
attributes = [
"chromosome_name", "start_position",
"ensembl_gene_id", "external_gene_name",
"strand", "transcript_biotype"]
res = query_biomart(
attributes=attributes,
species=organisms[organism]['species'],
ensembl_version=genome_assembly)
if gene_types is None:
res = res.drop(['transcript_biotype'], axis=1).drop_duplicates()
else:
res = res.loc[res['transcript_biotype'].isin(gene_types), :]
if chr_prefix:
res.loc[:, 'chromosome_name'] = "chr" + res['chromosome_name']
res.loc[:, 'start_position'] = res['start_position'].astype(int)
res.loc[:, 'strand'] = res['strand'].replace("1", "+").replace("-1", "-")
res.loc[:, 'end'] = res.apply(
lambda x: x['start_position'] + 1 if x['strand'] == "+" else x['start_position'],
axis=1)
res.loc[:, 'start_position'] = res.apply(
lambda x: x['start_position'] - 1 if x['strand'] == "-" else x['start_position'],
axis=1)
# drop duplicates of the same gene that start at the same position
# but are annotated with different biotypes
res = (
res[attributes[:2] + ["end"] + attributes[2:]]
.sort_values(by=res.columns.tolist(), axis=0)
.drop_duplicates(subset=res.columns[:3], keep="last"))
# make real BED format
res.loc[:, 'fill'] = '.'
cols = [
'chromosome_name', 'start_position', 'end', 'external_gene_name',
'fill', 'strand', 'ensembl_gene_id', 'transcript_biotype']
res = res.loc[:, cols]
res.columns = [
'chr', 'start', 'end', 'gene_name', 'score', 'strand',
'ensembl_gene_id', 'transcript_biotype']
# save
if save:
res.to_csv(
os.path.join(output_dir, "{}.{}.gene_annotation.tss.bed"
.format(organism, genome_assembly)),
index=False, header=False, sep="\t")
res[res['transcript_biotype'] == "protein_coding"].drop(attributes[-1], axis=1).to_csv(
os.path.join(output_dir, "{}.{}.gene_annotation.protein_coding.tss.bed"
.format(organism, genome_assembly)),
index=False, header=False, sep="\t")
return res
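# Example usage (a hedged sketch; requires network access to Ensembl BioMart):
#
#   from ngs_toolkit.general import get_tss_annotations
#   tss = get_tss_annotations("human", genome_assembly="grch37", save=False)
#   # BED-like dataframe with columns:
#   # chr, start, end, gene_name, score, strand, ensembl_gene_id, transcript_biotype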
def get_genomic_context(
organism, genome_assembly=None, save=True, output_dir=None, chr_prefix=True,
region_subset=['promoter', 'exon', '5utr', '3utr', 'intron', 'genebody', 'intergenic'],
gene_types=['protein_coding', 'processed_transcript', 'lincRNA', 'antisense'],
promoter_width=3000):
"""
Get annotations of genomic context (promoter, gene body, exon, intron, UTRs, intergenic) for a given organism/genome assembly.
This is a simple approach using Biomart's API querying the Ensembl database.
Saves results to disk and returns a dataframe.
The API call to BioMart can be slow, so the function may take ~4 min to run.
:param organism: Organism to get annotation for. Currently supported: "human" and "mouse".
:type organism: str
:param genome_assembly: Genome assembly to use (UCSC name, e.g. "hg19").
Default for "human" is "hg19" and for "mouse" is "mm10".
:type genome_assembly: str, optional
:param save: Whether to save to disk under ``output_dir``. Defaults to True.
:type save: bool, optional
:param output_dir: Directory to write output to. Defaults to "reference" in current directory.
:type output_dir: str, optional
:param chr_prefix: Whether chromosome names should have the "chr" prefix. Defaults to True
:type chr_prefix: bool, optional
:param gene_types: Subset of transcript biotypes to keep.
See the available biotypes at https://www.ensembl.org/Help/Faq?id=468.
Defaults to 'protein_coding', 'processed_transcript', 'lincRNA', 'antisense'.
:type gene_types: list, optional
:returns: DataFrame with genome annotations
:rtype: pandas.DataFrame
"""
from ngs_toolkit.general import query_biomart
import pybedtools
from pybedtools import BedTool
organisms = {
"human": {"species": "hsapiens", "ensembl_version": "grch37", "ucsc_version": "hg19"},
"mouse": {"species": "mmusculus", "ensembl_version": "grcm38", "ucsc_version": "mm10"}
}
if genome_assembly is None:
genome_assembly = organisms[organism]['ucsc_version']
ensembl_genome_assembly = organisms[organism]['ensembl_version']
elif genome_assembly == "hg19":
ensembl_genome_assembly = "grch37"
elif genome_assembly == "hg38":
ensembl_genome_assembly = "grch38"
elif genome_assembly == "mm10":
ensembl_genome_assembly = "grcm38"
else:
_LOGGER.warning("Unrecognized genome assembly '{}'; assuming it is an Ensembl assembly name.".format(genome_assembly))
ensembl_genome_assembly = genome_assembly
if save:
if output_dir is None:
output_dir = os.path.join(os.path.abspath(os.path.curdir), "reference")
if not os.path.exists(output_dir):
os.makedirs(output_dir)
attrs = [
"ensembl_gene_id", "chromosome_name", "start_position", "end_position",
"exon_chrom_start", "exon_chrom_end",
"5_utr_start", "5_utr_end", "3_utr_start", "3_utr_end",
"strand", "external_gene_name", "gene_biotype"]
res = query_biomart(
attributes=attrs,
species=organisms[organism]['species'],
ensembl_version=ensembl_genome_assembly)
if chr_prefix:
res['chromosome_name'] = "chr" + res['chromosome_name']
# Keep only desired gene types
res = res.loc[res['gene_biotype'].isin(gene_types), :]
# Get strand
res.loc[:, 'strand'] = res['strand'].replace("1", "+").replace("-1", "-")
# remove exons which are also UTRs
bed_cols = ['chromosome_name', 'start_position', 'end_position', 'strand']
# # Promoters = genebody:start +/- N
promoter = res[bed_cols].drop_duplicates()
for col in promoter.columns:
if ("start" in col) or ("end" in col):
promoter.loc[:, col] = promoter.loc[:, col].astype(int)
promoter.loc[promoter['strand'] == "+", 'start_position'] = (
promoter.loc[promoter['strand'] == "+", 'start_position'] - (promoter_width / 2))
promoter.loc[promoter['strand'] == "-", 'start_position'] = (
promoter.loc[promoter['strand'] == "-", 'end_position'] - (promoter_width / 2)) # + 1
promoter.loc[promoter['strand'] == "+", 'end_position'] = (
promoter.loc[:, 'start_position'] + (promoter_width / 2))
for col in bed_cols[1:3]:
promoter.loc[:, col] = promoter.loc[:, col].astype(int)
# # genebody = start->end + promoter
gb = res[bed_cols].drop_duplicates()
for col in gb.columns:
if ("start" in col) or ("end" in col):
gb.loc[:, col] = gb.loc[:, col].astype(int)
gb = gb.append(promoter)
for col in bed_cols[1:3]:
gb.loc[:, col] = gb.loc[:, col].astype(int)
gb = gb.sort_values(gb.columns.tolist())
genebody_bed = BedTool.from_dataframe(gb)
genebody_bed = genebody_bed.sort().merge()
genebody = genebody_bed.to_dataframe()
# # Exon
exon = (
res[["chromosome_name", "exon_chrom_start", "exon_chrom_end"]]
.drop_duplicates().dropna())
# # 5utr
utr5 = (
res[["chromosome_name", "5_utr_start", "5_utr_end"]]
.drop_duplicates().dropna())
# # 3utr
utr3 = (
res[["chromosome_name", "3_utr_start", "3_utr_end"]]
.drop_duplicates().dropna())
for d in [exon, utr5, utr3]:
for col in d.columns:
if ("start" in col) or ("end" in col):
d.loc[:, col] = d.loc[:, col].astype(int)
# # Introns = genebody - (promoter + exon + 5utr + 3utr)
promoter = promoter.drop(['strand'], axis=1)
exon.columns = utr3.columns = utr5.columns = promoter.columns = bed_cols[:3]
others = promoter.append(exon).append(utr5).append(utr3)
for col in others.columns[1:]:
others.loc[:, col] = others.loc[:, col].astype(int)
intron = genebody_bed.subtract(BedTool.from_dataframe(others))
intron = intron.to_dataframe()
# # Intergenic = ChromSizes - genebody
cs = pybedtools.get_chromsizes_from_ucsc(genome_assembly)
cs = pd.DataFrame(cs).T.reset_index()
if not chr_prefix:
cs['index'] = cs['index'].str.replace("chr", "")
cs = BedTool.from_dataframe(cs)
intergenic = cs.subtract(genebody_bed)
intergenic = intergenic.to_dataframe()
# join all
features = [
"promoter", "genebody", "exon", "intron",
"utr5", "utr3", "intergenic"]
annot = list()
for name in features:
cur = locals()[name]
cur = cur.iloc[:, list(range(3))]
cur.columns = bed_cols[:3]
cur.loc[:, "region_type"] = name
if save:
cur.sort_values(cur.columns.tolist()).drop('region_type', axis=1).to_csv(
os.path.join(output_dir, "{}.{}.genomic_context.{}.bed"
.format(organism, ensembl_genome_assembly, name)),
index=False, header=False, sep="\t")
annot.append(cur)
annot = pd.concat(annot)
annot = annot.sort_values(annot.columns.tolist())
# save
if save:
annot.to_csv(
os.path.join(output_dir, "{}.{}.genomic_context.bed"
.format(organism, ensembl_genome_assembly)),
index=False, header=False, sep="\t")
return annot
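# Example usage (a hedged sketch; queries BioMart and UCSC, so it can take a few minutes):
#
#   from ngs_toolkit.general import get_genomic_context
#   context = get_genomic_context("human", genome_assembly="hg19", save=False)
#   # dataframe of regions labelled with one of:
#   # promoter, genebody, exon, intron, utr5, utr3, intergenic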
def count_reads_in_intervals(bam, intervals):
"""
Count the total number of reads in an iterable holding strings
representing genomic intervals of the form ``"chrom:start-end"``.
:param bam: BAM file.
:type bam: str
:param intervals: List of strings with genomic coordinates in format
``"chrom:start-end"``.
:type intervals: list
:returns: Dict of read counts for each interval.
:rtype: dict
"""
import pysam
counts = dict()
bam = pysam.AlignmentFile(bam, mode='rb')
chroms = ["chr" + str(x) for x in range(1, 23)] + ["chrX", "chrY"]
for interval in intervals:
if interval.split(":")[0] not in chroms:
continue
counts[interval] = bam.count(region=interval.split("|")[0])
bam.close()
return counts
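# Example usage (a hedged sketch; the BAM path and intervals are assumptions):
#
#   counts = count_reads_in_intervals("sample1.bam", ["chr1:1000-2000", "chr2:5000-6000"])
#   # -> dict mapping each interval string to its read count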
def normalize_quantiles_r(array):
"""
Quantile normalization with an R implementation.
Requires the Python library "rpy2" and the R package "preprocessCore" to be installed:
>>> source('http://bioconductor.org/biocLite.R')
>>> biocLite('preprocessCore')
:param array: Numeric array to normalize.
:type array: numpy.array
:returns: Normalized numeric array.
:rtype: numpy.array
"""
import numpy as np
import rpy2.robjects as robjects
import rpy2.robjects.numpy2ri
import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore", category=RRuntimeWarning)
rpy2.robjects.numpy2ri.activate()
robjects.r('require("preprocessCore")')
normq = robjects.r('normalize.quantiles')
return np.array(normq(array))
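# Example usage (a hedged sketch; requires rpy2 and the R package "preprocessCore"):
#
#   arr = df.values                      # `df` is a numeric dataframe (assumed)
#   norm = normalize_quantiles_r(arr)    # array of the same shape, quantile-normalized
#   norm_df = pd.DataFrame(norm, index=df.index, columns=df.columns)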
def normalize_quantiles_p(df_input):
"""
Quantile normalization with a pure Python implementation.
Code from https://github.com/ShawnLYU/Quantile_Normalize.
:param df_input: Dataframe to normalize.
:type df_input: pandas.DataFrame
:returns: Quantile-normalized dataframe.
:rtype: pandas.DataFrame
"""
df = df_input.copy()
# compute rank
dic = {}
for col in df:
dic.update({col: sorted(df[col])})
sorted_df = pd.DataFrame(dic)
rank = sorted_df.mean(axis=1).tolist()
# sort
for col in df:
t = np.searchsorted(np.sort(df[col]), df[col])
df[col] = [rank[i] for i in t]
return df
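# Example usage (a hedged sketch of the pure Python version; columns are treated as samples):
#
#   df = pd.DataFrame({"s1": [5, 2, 3], "s2": [4, 1, 9]})
#   normalize_quantiles_p(df)
#   # every column now holds the same set of values (the per-rank means across columns)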
def unsupervised_analysis(
analysis,
steps=['correlation', 'manifold', 'pca', 'pca_association'],
data_type=None,
quant_matrix=None,
samples=None,
attributes_to_plot=None,
plot_prefix=None,
standardize_matrix=False,
manifold_algorithms=['MDS', "Isomap", "LocallyLinearEmbedding", "SpectralEmbedding", "TSNE"],
test_pc_association=True,
display_corr_values=False,
plot_max_attr=20,
plot_max_pcs=8,
plot_group_centroids=True,
axis_ticklabels=False,
axis_lines=True,
legends=False,
always_legend=False,
prettier_sample_names=True,
pallete="tab20",
cmap="RdBu_r",
rasterized=False,
dpi=300,
output_dir="{results_dir}/unsupervised_analysis_{data_type}"):
"""
Unsupervised analysis for several data types.
Apply unsupervised clustering, manifold learning and dimensionality reduction
methods on numeric matrix.
Colours and labels samples by their attributes as given in `attributes_to_plot`.
For PCA analysis, if `test_pc_association` is `True`, will compute association of PCs
with sample attributes given in `attributes_to_plot`. For numeric attributes,
the Pearson correlation will be computed and for categorical, a pairwise
Kruskal-Wallis H-test (ANOVA).
:param analysis: Analysis object to perform analysis for.
:type analysis: ngs_toolkit.general.Analysis
:param data_type: Data type. One of "ATAC-seq" or "RNA-seq". Defaults to "ATAC-seq".
:type data_type: str, optional
:param quant_matrix: Name of analysis attribute containing the numeric dataframe
to perform analysis on.
Defaults to "accessibility" if data_type is ATAC-seq and
"expression_annotated" if data_type is RNA-seq.
This matrix should have a pandas.MultiIndex as column index.
:type quant_matrix: str, optional
:param samples: List of sample objects to restrict analysis to. Defaults to None.
:type samples: list, optional
:param attributes_to_plot: List of attributes shared between sample groups that should be plotted.
Defaults to ["cell_type"].
:type attributes_to_plot: list, optional
:param plot_prefix: Prefix for output files.
Defaults to "all_sites" if data_type is ATAC-seq and "all_genes" if
data_type is RNA-seq.
:type plot_prefix: str, optional
:param standardize_matrix: Whether to standardize variables in `quant_matrix` by removing
the mean and scaling to unit variance.
:type standardize_matrix: bool, optional
:param manifold_algorithms: List of manifold algorithms to use. See available algorithms here:
http://scikit-learn.org/stable/modules/classes.html#module-sklearn.manifold
:type manifold_algorithms: list, optional
:param test_pc_association: Whether a test of association of principal components and variables
in `attributes_to_plot` should be conducted.
Defaults to True.
:type test_pc_association: bool, optional
:param display_corr_values: Whether values in heatmap of sample correlations should be
displayed overlaid on top of colours. Defaults to False.
:type display_corr_values: bool, optional
:param plot_max_attr: Maximum number of sample attributes to plot for each factor in plot legend.
Defaults to 20.
:type plot_max_attr: int, optional
:param plot_max_pcs: Maximum number of principal components to plot. This only affects plotting.
All PCs will be calculated.
Defaults to 8.
:type plot_max_pcs: number, optional
:param plot_group_centroids: Whether centroids of each sample group should be plotted alongside
samples. Will be square shaped.
Defaults to True.
:type plot_group_centroids: bool, optional
:param axis_ticklabels: Whether MDS and PCA axis ticks and ticklabels should be plotted.
Defaults to False.
:type axis_ticklabels: bool, optional
:param axis_lines: Whether (0, 0) dashed lines should be plotted in MDS and PCA.
Defaults to True.
:type axis_lines: bool, optional
:param legends: Whether legends for group colours should be plotted in MDS and PCA.
Defaults to False.
:type legends: bool, optional
:param always_legend: Whether legends for group colours should be plotted in every figure
panel in MDS and PCA.
If False, will plot just on first/last figure panel.
Defaults to False.
:type always_legend: bool, optional
:param prettier_sample_names: Whether it should attempt to prettify sample names by
removing the data type from plots.
Defaults to True.
:type prettier_sample_names: bool, optional
:param pallete: Color palette to use for levels of `attributes_to_plot`. Will be passed to
`analysis.get_level_colors`.
:type pallete: str
:param cmap: Color map to use in numerical levels of `attributes_to_plot`. Will be passed
to `analysis.get_level_colors`.
:type cmap: str
:param rasterized: Whether elements with many objects should be rasterized.
Defaults to False.
:type rasterized: bool, optional
:param dpi: Resolution of rasterized images in dots per inch (DPI).
Defaults to 300.
:type dpi: number, optional
:param output_dir: Directory for generated files and plots.
Defaults to "{results_dir}/unsupervised_analysis_{data_type}".
:type output_dir: str, optional
:returns: None
"""
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import manifold
from collections import OrderedDict
import itertools
from scipy.stats import kruskal
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.stats.multicomp import multipletests
if data_type is None:
msg = "Data type not defined and Analysis object does not have a `data_type` attribute."
try:
data_type = analysis.data_type
except AttributeError as e:
_LOGGER.error(msg)
raise e
if data_type is None:
_LOGGER.error(msg)
raise ValueError
if data_type == "ATAC-seq":
if plot_prefix is None:
plot_prefix = "all_sites"
if quant_matrix is None:
quant_matrix = "accessibility"
elif data_type == "ChIP-seq":
if plot_prefix is None:
plot_prefix = "all_sites"
if quant_matrix is None:
quant_matrix = "binding"
elif data_type == "CNV":
if plot_prefix is None:
plot_prefix = "all_bins"
if quant_matrix is None:
quant_matrix = "cnv"
elif data_type == "RNA-seq":
if plot_prefix is None:
plot_prefix = "all_genes"
if quant_matrix is None:
quant_matrix = "expression"
else:
raise ValueError("Data types can only be 'ATAC-seq', 'RNA-seq' or 'CNV'.")
if ("{results_dir}" in output_dir) and ("{data_type}" in output_dir):
output_dir = output_dir.format(results_dir=analysis.results_dir, data_type=data_type)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
matrix = getattr(analysis, quant_matrix)
if not isinstance(matrix.columns, pd.MultiIndex):
msg = "Provided quantification matrix must have columns with MultiIndex."
hint = " Use ngs_toolkit.general.annotate_with_sample_metadata to do that."
_LOGGER.error(msg + hint)
raise TypeError(msg)
if samples is None:
samples = [s for s in analysis.samples
if s.name in matrix.columns.get_level_values("sample_name")]
else:
samples = [s for s in samples
if s.name in matrix.columns.get_level_values("sample_name")]
if len(samples) == 0:
msg = "None of the samples could be found in the quantification matrix."
_LOGGER.error(msg)
raise ValueError(msg)
if len(samples) == 1:
msg = "Only one sample could be found in the quantification matrix."
hint = " Function needs more than one."
_LOGGER.error(msg + hint)
raise ValueError(msg)
msg = "`attributes_to_plot` were not specified and the analysis does not have a "
msg += " 'group_attributes' variable."
if attributes_to_plot is None:
try:
attributes_to_plot = analysis.group_attributes
except AttributeError:
_LOGGER.error(msg)
raise
# remove attributes with all NaNs
attributes_to_plot = [attr for attr in attributes_to_plot
if attr in matrix.columns.names]
attributes_to_plot = [attr for attr in attributes_to_plot
if not pd.isnull(matrix.columns.get_level_values(attr)).all()]
if len(attributes_to_plot) == 0:
msg = ("None of the factors in `attributes_to_plot` could be found in the " +
"quantification matrix index or they are all NaN.")
_LOGGER.error(msg)
raise ValueError(msg)
# This will always be a matrix for all samples
color_dataframe = pd.DataFrame(
analysis.get_level_colors(
index=matrix.columns, levels=attributes_to_plot,
pallete=pallete, cmap=cmap),
index=attributes_to_plot, columns=matrix.columns)
# will be filtered now by the requested samples if needed
color_dataframe = color_dataframe[[s.name for s in samples]]
# All regions, matching samples (provided samples in matrix)
X = matrix.loc[:, matrix.columns.get_level_values("sample_name").isin([s.name for s in samples])]
if standardize_matrix:
std = StandardScaler()
X = pd.DataFrame(std.fit_transform(X.T).T, index=X.index, columns=X.columns)
if isinstance(X.columns, pd.MultiIndex):
sample_display_names = X.columns.get_level_values("sample_name")
else:
sample_display_names = X.columns
# TODO: Re-implement to accommodate MultiIndex
# if prettier_sample_names:
# X.columns = (
# color_dataframe.columns
# .str.replace("ATAC-seq_", "")
# .str.replace("RNA-seq_", "")
# .str.replace("ChIP-seq_", ""))
if 'correlation' in steps:
# Pairwise correlations
for method in ['pearson', 'spearman']:
_LOGGER.info("Plotting pairwise correlation with '{}' metric.".format(method))
g = sns.clustermap(
X.astype(float).corr(method),
xticklabels=False, yticklabels=sample_display_names, annot=display_corr_values,
cmap="Spectral_r", figsize=(0.2 * X.shape[1], 0.2 * X.shape[1]),
cbar_kws={"label": "{} correlation".format(method.capitalize())},
row_colors=color_dataframe.T, col_colors=color_dataframe.T)
g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0, fontsize='xx-small')
g.ax_heatmap.set_xlabel(None, visible=False)
g.ax_heatmap.set_ylabel(None, visible=False)
g.fig.savefig(os.path.join(
output_dir, "{}.{}.{}_correlation.clustermap.svg"
.format(analysis.name, plot_prefix, method)), bbox_inches='tight', dpi=dpi)
if 'manifold' in steps:
# Manifolds
params = {
"MDS": {'n_jobs': -1},
"Isomap": {'n_jobs': -1},
"LocallyLinearEmbedding": {},
"SpectralEmbedding": {'n_jobs': -1},
"TSNE": {'init': 'pca'},
}
for algo in manifold_algorithms:
msg = "Learning manifold with '{}' algorithm".format(algo)
_LOGGER.info(msg + ".")
manif = getattr(manifold, algo)(**params[algo])
try:
x_new = manif.fit_transform(X.T)
except (TypeError, ValueError):
hint = " Number of samples might be too small to perform '{}'".format(algo)
_LOGGER.error(msg + " failed!" + hint)
continue
xx = pd.DataFrame(x_new, index=X.columns, columns=list(range(x_new.shape[1])))
_LOGGER.info("Plotting projection of manifold with '{}' algorithm.".format(algo))
fig, axis = plt.subplots(1, len(attributes_to_plot), figsize=(4 * len(attributes_to_plot), 4 * 1))
if len(attributes_to_plot) != 1:
axis = axis.flatten()
else:
axis = np.array([axis])
for i, attr in enumerate(attributes_to_plot):
for j, sample in enumerate(xx.index):
sample = pd.Series(sample, index=X.columns.names)
try:
label = getattr(sample, attributes_to_plot[i])
except AttributeError:
label = np.nan
axis[i].scatter(
xx.loc[sample['sample_name'], 0],
xx.loc[sample['sample_name'], 1],
s=50, color=color_dataframe.loc[attr, sample['sample_name']],
alpha=0.75, label=label, rasterized=rasterized)
# Plot groups
if plot_group_centroids:
xx2 = xx.groupby(attr).mean()
# get the color of each attribute group
cd = color_dataframe.loc[attr, :]
cd.name = None
cd.index = X.columns.get_level_values(attr)
cd = cd.apply(lambda x: tuple(x) if isinstance(x, list) else x) # fix for deduplicating lists
cd = cd.reset_index().drop_duplicates().set_index(attr)
for j, group in enumerate(xx2.index):
axis[i].scatter(
xx2.loc[group, 0],
xx2.loc[group, 1],
marker="s", s=50, color=cd.loc[group].squeeze(),
alpha=0.95, label=group, rasterized=rasterized)
axis[i].text(
xx2.loc[group, 0],
xx2.loc[group, 1], group,
color=cd.loc[group].squeeze(), alpha=0.95)
# Graphics
axis[i].set_title(attributes_to_plot[i])
axis[i].set_xlabel("{} 1".format(algo))
axis[i].set_ylabel("{} 2".format(algo))
if not axis_ticklabels:
axis[i].set_xticklabels(axis[i].get_xticklabels(), visible=False)
axis[i].set_yticklabels(axis[i].get_yticklabels(), visible=False)
if axis_lines:
axis[i].axhline(0, linestyle="--", color="black", alpha=0.3)
axis[i].axvline(0, linestyle="--", color="black", alpha=0.3)
if legends:
# Unique legend labels
handles, labels = axis[i].get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
if any([type(c) in [str, unicode] for c in by_label.keys()]) and len(by_label) <= 20:
# if not any([re.match("^\d", c) for c in by_label.keys()]):
axis[i].legend(by_label.values(), by_label.keys())
fig.savefig(os.path.join(
output_dir, "{}.{}.{}.svg"
.format(analysis.name, plot_prefix, algo.lower())), bbox_inches='tight', dpi=dpi)
if 'pca' in steps:
# PCA
pcs = min(*X.shape) - 1
_LOGGER.info("Decomposing data with 'PCA' algorithm for {} dimensions.".format(pcs))
pca = PCA(n_components=pcs, svd_solver="arpack")
x_new = pca.fit_transform(X.T)
pcs_order = range(pca.n_components_)
xx = pd.DataFrame(x_new, index=X.columns, columns=pcs_order)
xx.to_csv(
os.path.join(output_dir, "{}.{}.pca.fit.csv".format(
analysis.name, plot_prefix)))
comps = pd.DataFrame(pca.components_.T, index=X.index, columns=pcs_order)
comps.to_csv(
os.path.join(output_dir, "{}.{}.pca.loadings.csv".format(
analysis.name, plot_prefix)))
# Write % variance expained to disk
variance = pd.Series(
pca.explained_variance_ratio_ * 100,
name="percent_variance", index=pcs_order
).to_frame()
variance['log_variance'] = np.log10(pca.explained_variance_)
variance.index.name = "PC"
variance.to_csv(
os.path.join(output_dir, "{}.{}.pca.explained_variance.csv".format(
analysis.name, plot_prefix)))
# plot % explained variance per PC
_LOGGER.info("Plotting variance explained with PCA.")
fig, axis = plt.subplots(1, 3, figsize=(4 * 3, 4))
axis[0].plot(variance.index, variance['percent_variance'], 'o-')
axis[0].set_ylim((0, variance['percent_variance'].max() + variance['percent_variance'].max() * 0.1))
axis[1].plot(variance.index, variance['log_variance'], 'o-')
axis[2].plot(variance.index, variance['percent_variance'].cumsum(), 'o-')
axis[2].set_ylim((0, 100))
for ax in axis:
ax.axvline(len(attributes_to_plot), linestyle='--')
ax.set_xlabel("PC")
axis[0].set_ylabel("% variance")
axis[1].set_ylabel("log variance")
axis[2].set_ylabel("Cumulative % variance")
sns.despine(fig)
fig.savefig(os.path.join(
output_dir, "{}.{}.pca.explained_variance.svg"
.format(analysis.name, plot_prefix)), bbox_inches='tight', dpi=dpi)
# plot pca
pcs = min(xx.shape[1] - 1, plot_max_pcs)
_LOGGER.info("Plotting PCA up to {} dimensions.".format(pcs))
fig, axis = plt.subplots(pcs, len(attributes_to_plot), figsize=(
4 * len(attributes_to_plot), 4 * pcs))
if len(attributes_to_plot) == 1:
axis = axis.reshape((pcs, 1))
if pcs == 1:
axis = axis.reshape((1, len(attributes_to_plot)))
for pc in range(pcs):
for i, attr in enumerate(attributes_to_plot):
for j, sample in enumerate(xx.index):
sample = pd.Series(sample, index=X.columns.names)
try:
label = getattr(samples[j], attr)
except AttributeError:
label = np.nan
axis[pc, i].scatter(
xx.loc[sample['sample_name'], pc],
xx.loc[sample['sample_name'], pc + 1],
s=30, color=color_dataframe.loc[attr, sample['sample_name']],
alpha=0.75, label=label, rasterized=rasterized)
# Plot groups
if plot_group_centroids:
xx2 = xx.groupby(attr).mean()
# get the color of each attribute group
cd = color_dataframe.loc[attr, :]
cd.name = None
cd.index = X.columns.get_level_values(attr)
cd = cd.apply(lambda x: tuple(x) if isinstance(x, list) else x) # fix for deduplicating lists
cd = cd.reset_index().drop_duplicates().set_index(attr)
for j, group in enumerate(xx2.index):
axis[pc, i].scatter(
xx2.loc[group, pc],
xx2.loc[group, pc + 1],
marker="s", s=50, color=cd.loc[group].squeeze(),
alpha=0.95, label=group, rasterized=rasterized)
axis[pc, i].text(
xx2.loc[group, pc],
xx2.loc[group, pc + 1], group,
color=cd.loc[group].squeeze(), alpha=0.95)
# Graphics
axis[pc, i].set_title(attr)
axis[pc, i].set_xlabel("PC {}".format(pc + 1))
axis[pc, i].set_ylabel("PC {}".format(pc + 2))
if not axis_ticklabels:
axis[pc, i].set_xticklabels(axis[pc, i].get_xticklabels(), visible=False)
axis[pc, i].set_yticklabels(axis[pc, i].get_yticklabels(), visible=False)
if axis_lines:
axis[pc, i].axhline(0, linestyle="--", color="black", alpha=0.3)
axis[pc, i].axvline(0, linestyle="--", color="black", alpha=0.3)
if legends:
# Unique legend labels
handles, labels = axis[pc, i].get_legend_handles_labels()
by_label = OrderedDict(zip(labels, handles))
if any([type(c) in [str, unicode] for c in by_label.keys()]) and len(by_label) <= plot_max_attr:
# if not any([re.match("^\d", c) for c in by_label.keys()]):
if always_legend:
axis[pc, i].legend(by_label.values(), by_label.keys())
else:
if pc == (pcs - 1):
axis[pc, i].legend(
by_label.values(), by_label.keys())
fig.savefig(os.path.join(output_dir, "{}.{}.pca.svg".format(
analysis.name, plot_prefix)), bbox_inches="tight")
if ('pca' in steps) and ('pca_association' in steps):
# Test association of PCs with attributes
if not test_pc_association:
_LOGGER.info("Not testing association of attributes with principal components.")
return
_LOGGER.info("Computing association of given attributes with principal components.")
associations = list()
for pc in range(pcs):
for attr in attributes_to_plot:
_LOGGER.info("PC {}; Attribute {}.".format(pc + 1, attr))
# Get all values of samples for this attr
groups = xx.index.get_level_values(attr)
# Determine if attr is categorical or continuous
if all([type(i) in [str, bool] for i in groups]) or len(groups) == 2:
variable_type = "categorical"
elif all([type(i) in [int, float, np.int64, np.float64] for i in groups]):
variable_type = "numerical"
else:
_LOGGER.warning("attr %s cannot be tested." % attr)
associations.append([pc + 1, attr, variable_type, np.nan, np.nan, np.nan])
continue
if variable_type == "categorical":
# It categorical, test pairwise combinations of attributes
for group1, group2 in itertools.combinations(groups, 2):
g1_mask = xx.index.get_level_values(attr) == group1