-
Notifications
You must be signed in to change notification settings - Fork 2
/
Snakefile_helper_Brian.py
220 lines (201 loc) · 8.38 KB
/
Snakefile_helper_Brian.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
######################################
# Functions used by rules in aseembly
######################################
def split_paired_fastq_into_subfiles(input, output, num_files):
"""
input: a list containing 2 files representing the two input paired fastq
output: a list of 2 files representing the two output paired fastq
Could be optimized better for time
"""
assert(num_files > 0)
# these files are fastq so it's 4 lines per read
r1_lines = sum(1 for l in open(input[0], 'r'))
r2_lines = sum(1 for l in open(input[1], 'r'))
print('The number of lines in '+input[0]+' is: '+str(r1_lines))
print('The number of lines in '+input[1]+' is: '+str(r2_lines))
assert(check_lines(input[0],input[1])),"Input files have different number of lines."
# the 4 here is so that each file contains complete reads (4 lines per read)
readsperfile = int(np.ceil(r1_lines/4/float(num_files)))
print('The number of reads per file is: '+str(readsperfile))
assert(readsperfile*4*num_files >= r1_lines)
# split files
# first open the 2 inputs. Go throught the inputs once. figure out which output to copy to.
with open(input[0],'r') as r1, open(input[1],'r') as r2:
linecnt = 0
# all the output files should have the same number of lines except the last file
for filenum in range(num_files-1):
print(str(filenum))
print(output[filenum]+' '+output[filenum+num_files])
file_line_cnt = 0
with open(output[filenum],'w') as f1, open(output[filenum+num_files],'w') as f2:
while file_line_cnt < readsperfile*4:
t = f1.write(r1.readline())
t = f2.write(r2.readline())
linecnt += 1
file_line_cnt += 1
assert(file_empty([output[filenum]])),"Output file read 1 is empty."
assert(file_empty([output[filenum+num_files]])),"Output file read 2 is empty."
assert(check_fastq_ids(output[filenum],output[filenum+num_files])),"Output files have different read id orders."
assert(linecnt < r1_lines),"Line count is larger than number of lines in fastq file."
# Now put the rest of the lines in the last split file for output (note the num_files-1)
with open(output[num_files-1],'w') as f1, open(output[num_files-1+num_files],'w') as f2:
while linecnt < r1_lines:
t = f1.write(r1.readline())
t = f2.write(r2.readline())
linecnt += 1
assert(file_empty([output[filenum]])),"Output file read 1 is empty."
assert(file_empty([output[filenum+num_files]])),"Output file read 2 is empty."
assert(check_fastq_ids(output[filenum],output[filenum+num_files])),"Output files have different read id orders."
def extract_subsample_fastq_segment(input, output, num_files, segment):
# This function extracts the nth segment from
# two fastq input files and puts them in 2 corresponding
# fastq output files. Checks these output files.
assert(num_files > 0),"Number of files to split into must be positive."
assert(type(segment) is int),"Segment has to be an integer."
assert(segment <= num_files),"Segment must be smaller or equal to total file numbers."
# these files are fastq so it's 4 lines per read
r1_lines = sum(1 for l in open(input[0], 'r'))
r2_lines = sum(1 for l in open(input[1], 'r'))
print('The number of lines in '+input[0]+' is: '+str(r1_lines))
print('The number of lines in '+input[1]+' is: '+str(r2_lines))
assert(check_lines(input[0],input[1])),"Input files have different number of lines."
# the 4 here is so that each file contains complete reads (4 lines per read)
readsperfile = int(np.ceil(r1_lines/4/float(num_files)))
print('The number of reads per file is: '+str(readsperfile))
assert(readsperfile*4*num_files >= r1_lines)
# split files
print('Segment is: '+str(segment))
# Split R1 files
with open(output[0],'w') as f1, open(input[0],'r') as r1:
linecnt = 0
file_line_cnt = 0
for line in r1:
# print(line)
if linecnt >= segment*readsperfile*4 and linecnt < (segment+1)*readsperfile*4:
t = f1.write(line)
linecnt += 1
file_line_cnt += 1
else:
linecnt += 1
print('The number of lines in '+output[0]+' is: '+str(file_line_cnt))
# Split R2 files
with open(output[1],'w') as f2, open(input[1],'r') as r2:
linecnt = 0
file_line_cnt = 0
for line in r2:
# print(line)
if linecnt >= segment*readsperfile*4 and linecnt < (segment+1)*readsperfile*4:
t = f2.write(line)
linecnt += 1
file_line_cnt += 1
else:
linecnt += 1
print('The number of lines in '+output[1]+' is: '+str(file_line_cnt))
assert(check_lines(output[0],output[1])),"Output files have different number of lines."
assert(file_empty(output)),"One of the output files is empty."
assert(check_fastq_ids(output[0],output[1])),"Output files have different read id orders."
def check_fastq_ids(file1, file2):
# This function checks sequence id of 2
# fastq files and sees if they are the same
with open(file1,'r') as f1:
id1 = []
linecnt = 0
for l in f1:
if linecnt % 4 == 0:
id1.append(l.split()[0])
linecnt += 1
else:
linecnt += 1
with open(file2,'r') as f2:
id2 = []
linecnt = 0
for l in f2:
if linecnt % 4 == 0:
id2.append(l.split()[0])
linecnt += 1
else:
linecnt += 1
if len(id1) == len(id2):
for i in range(len(id1)):
if id1[i] != id2[i]:
print('Different fastq ids '+id1[i]+' and '+id2[i])
return 0
else:
print('Different fastq file lengths.')
return 0
return 1
def file_empty(files):
# check if files are empty
# imput is a list of files
for f in files:
if sum(1 for l in open(f,'r')) == 0:
return 0
return 1
def check_lines(file1, file2):
"""
check that the number of lines in 2 files are the same
"""
l1 = sum(1 for l in open(file1,'r'))
l2 = sum(1 for l in open(file2,'r'))
if l1 == l2:
return 1
else:
return 0
def merge_fastq_files(input, output):
"""
both input and output are list of strings representing file names
input is of the form ['r1_0.fastq','r1_1.fastq','r2_0.fastq','r2_1.fastq'...]
"""
numfiles = len(input)/2
print('Number of files for read1 and read2 detected is: '+str(numfiles))
with open(output[0],'w') as f1, open(output[1],'w') as f2:
for i in range(int(numfiles)):
with open(input[i],'r') as in1:
linecnt1 = 0
for l in in1:
f1.write(l)
linecnt1 += 1
print('File '+input[i]+' has '+str(linecnt1)+' lines.')
with open(input[i+int(numfiles)],'r') as in2:
linecnt2 = 0
for l in in2:
f2.write(l)
linecnt2 += 1
print('File '+input[i+int(numfiles)]+' has '+str(linecnt2)+' lines.')
assert(check_lines(input[i],input[i+int(numfiles)])),"Input files have different number of lines."
assert(check_lines(output[0],output[1])),"Output files have different number of lines."
assert(file_empty(output)),"One of the output files is empty."
assert(check_fastq_ids(output[0],output[1])),"Output files have different read id orders."
def get_split_filenames(wildcards):
# This functions generates a list filesnames for
# the subsample split step. Different number of
# files for different subsamples
numfiles = int(depth_table.ix[wildcards.subsample,'subsample_clust_file_numbers'])
R1_names = []
R2_names = []
for i in range(numfiles):
R1_names.append(wildcards.subsample+'/clusterR1_'+str(i)+'.fastq')
R2_names.append(wildcards.subsample+'/clusterR2_'+str(i)+'.fastq')
return R1_names+R2_names
def unzip_files(input):
# This function takes in a list of input *.gz filenames
# and unzips them. The input files must be a list, even
# if it's just one file. The output files will not have
# the .gz ending and will replace the .gz version in place
for file in input:
if os.path.isfile(file):
subprocess.call(['gzip','-d',file])
def zip_back_files(input):
# This function takes a list of modified input filenames
# and zips files that are unzipped before back into
# the zipped versions. It can be used at the end of a rule
# This function also checks if a file exists
for file in input:
if os.path.isfile(file):
subprocess.call(['gzip',file])
def modify_zip_file_names(input):
# This function removes the .gz ending from all input file names
modified_filename = []
for file in input:
modified_filename.append(file[0:-3]) # gets rid of .gz ending
return modified_filename