-
Notifications
You must be signed in to change notification settings - Fork 0
/
score_region.smk
118 lines (104 loc) · 5.4 KB
/
score_region.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#snakemake -s score_region.smk -j 12 --cluster "qsub -l walltime={params.run_time} -l nodes=1:ppn={params.cores} -q home-yeo -e {params.error_out_file} -o {params.out_file}" --configfile config/preprocess_config/oligose_k562.yaml --use-conda --conda-prefix /home/hsher/snakeconda -np
import pandas as pd
from pathlib import Path
workdir: '/home/hsher/scratch/count_A3SS'
rootdir = Path('/home/hsher/scratch/ABC_2rep')
#cat /home/hsher/scratch/encode_kd/output/rMATs/SF3B4_K562/regions/A3SS/*/long_ss3.bed > ~/scratch/long_ss3.bed
#cat /home/hsher/scratch/encode_kd/output/rMATs/SF3B4_K562/regions/A3SS/*/long_5intron.bed > ~/scratch/long_5intron.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/PRPF8_K562/regions/A5SS/*/long_3intron.bed > ~/scratch/A5SS_long_3intron.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/PRPF8_K562/regions/A5SS/*/exon_diff_interval.bed > ~/scratch/A5SS_exon_diff_interval.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/RBFOX2_K562/regions/SE/*/casette_exon.bed > ~/scratch/casette_exon.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/RBFOX2_K562/regions/SE/*/casette_5intron.bed > ~/scratch/casette_5intron.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/RBFOX2_K562/regions/SE/*/casette_3intron.bed > ~/scratch/casette_3intron.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/RBFOX2_K562/regions/SE/*/exon5.bed > ~/scratch/exon5.bed
# cat /home/hsher/scratch/encode_kd/output/rMATs/RBFOX2_K562/regions/SE/*/exon3.bed > ~/scratch/exon3.bed
REGIONS=['/home/hsher/scratch/long_ss3.bed',
'/home/hsher/scratch/long_5intron.bed',
'/home/hsher/scratch/exon_diff_interval.bed',
'/home/hsher/scratch/A5SS_long_3intron.bed',
'/home/hsher/scratch/A5SS_exon_diff_interval.bed',
'/home/hsher/scratch/exon3.bed',
'/home/hsher/scratch/exon5.bed',
'/home/hsher/scratch/casette_exon.bed',
'/home/hsher/scratch/casette_5intron.bed',
'/home/hsher/scratch/casette_3intron.bed'
]
REGION_NAMES = [Path(i).name.replace('.bed','') for i in REGIONS]
REGIONS_DICT = dict(zip(REGION_NAMES, REGIONS))
MANIFEST=config['MANIFEST']
SCRIPT_PATH=config['SCRIPT_PATH']
UNINFORMATIVE_READ = 3 - int(config['INFORMATIVE_READ']) # whether read 1 or read 2 is informative
CHROM_SIZES = config['CHROM_SIZES']
R_EXE = config['R_EXE']
DB_FILE=config['DB_FILE']
GENOME_dir=config['GENOME_dir']
GENOMEFA=config['GENOMEFA']
manifest = pd.read_table(MANIFEST, index_col = False, sep = ',')
print(manifest)
barcode_df = pd.read_csv(config['barcode_csv'], header = None, sep = ':', names = ['barcode', 'RBP'])
# basic checking
assert not barcode_df['barcode'].duplicated().any()
assert not barcode_df['RBP'].duplicated().any() # cannot have any duplicated RBP names
assert not barcode_df['RBP'].str.contains(' ').any() # DO NOT CONTAIN white space lah
assert not manifest['fastq'].duplicated().any()
assert not manifest['libname'].str.contains(' ').any()
libnames = manifest['libname'].tolist()
config['libnames'] = ['K562_rep6']#libnames
experiments = manifest['experiment'].tolist()
config['experiments'] = experiments
rbps = barcode_df['RBP'].tolist()
config['rbps'] = rbps
print(f'RBPs: {rbps}',
f'experiments:{experiments}',
f'libnames:{libnames}')
# module normalization:
# snakefile:
# "rules/skipper.smk"
# config:
# config
# module DMN:
# snakefile:
# "rules/normalization_DMN.smk"
# config:
# config
rule all:
input:
expand("{region_name}/counts/genome/megatables/{libname}.tsv.gz",
region_name = REGION_NAMES,
libname = libnames)
rule partition_bam_reads:
input:
chrom_size = config['CHROM_SIZES'],
bam = rootdir/"{libname}/bams/{sample_label}.rmDup.Aligned.sortedByCoord.out.bam",
region_partition = lambda wildcards: REGIONS_DICT[wildcards.region_name]
output:
counts= "{region_name}/counts/genome/vectors/{libname}.{sample_label}.counts",
params:
error_out_file = "error_files/partition_bam_reads.{libname}.{sample_label}.{region_name}.err",
out_file = "stdout/partition_bam_reads.{libname}.{sample_label}.{region_name}.out",
run_time = "20:00",
cores = "1",
memory = "10000",
job_name = "partition_bam_reads",
replicate_label = "{libname}.{sample_label}",
uninformative_read = UNINFORMATIVE_READ
benchmark: "benchmarks/counts/unassigned_experiment.{libname}.{sample_label}.{region_name}.partition_bam_reads.txt"
shell:
"bedtools bamtobed -i {input.bam} | awk '($1 != \"chrEBV\") && ($4 !~ \"/{params.uninformative_read}$\")' | bedtools flank -s -l 1 -r 0 -g {input.chrom_size} -i - | bedtools shift -p -1 -m 1 -g {input.chrom_size} -i - | bedtools coverage -counts -s -a {input.region_partition} -b - | cut -f 7 | awk 'BEGIN {{print \"{params.replicate_label}\"}} {{print}}' > {output.counts};"
rule make_window_by_barcode_table:
input:
counts = expand("{region_name}/counts/genome/vectors/{libname}.{sample_label}.counts",
libname = ["{libname}"],
sample_label = rbps,
region_name = ["{region_name}"]),
output:
counts = "{region_name}/counts/genome/megatables/{libname}.tsv.gz",
params:
error_out_file = "error_files/window_by_barcode_table.{libname}.{region_name}.err",
out_file = "stdout/window_by_barcode_table.{libname}.{region_name}.out",
run_time = "20:00",
cores = 1
shell:
"""
paste -d '\t' {input.counts} | gzip > {output.counts}
"""