-
Notifications
You must be signed in to change notification settings - Fork 1
/
Upstream_Ea.py
73 lines (55 loc) · 1.74 KB
/
Upstream_Ea.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python
import sys
import re
# Reading files
fasta_file = open(sys.argv[1], "r")
line = fasta_file.readline()
seqs = {}
# for line in fasta_file:
while line != "":
if line.startswith('>'):
name = line[1:].rstrip('\n')
seq = fasta_file.readline().rstrip('\n')
seqs[name] = seq
line = fasta_file.readline()
print "fasta file contains %i sequences" % (len(seqs.keys()))
fasta_file.close()
# Selecting specific sequences for TTTTTT nucleotides
req_seq = {}
for i in seqs:
name = i
seq = seqs[i]
pattern = re.search("T{4,8}", seq)
find_iter = re.finditer("T{4,8}", seq)
if pattern:
for seq_match in find_iter:
ind = seq_match.span()
up_seq = seq[ind[0] - 6:ind[0]]
gc_count = re.findall("[GC]", up_seq)
up_seq25 = seq[ind[0] - 25:ind[0]]
gc_count25 = re.findall("[GC]", up_seq25)
if len(gc_count) >= 3 and len(gc_count25) > 0:
gc_perc = float(len(gc_count25)) / len(up_seq25)
print gc_count25
print up_seq25
if gc_perc > 0.5:
print "yes"
final_seq = seq[ind[0] - 25:ind[1]]
req_seq[name] = final_seq
print name
print final_seq
print gc_perc
else:
continue
else:
continue
else:
continue
print "%i sequences that match '%s' value" % (len(req_seq.keys()), "T 4-8")
# Writing output fasta file
output = open("intergenic_Ea_Q.fasta", "w")
for i in req_seq:
name = i
seq = req_seq[i]
output.write(">%s\n%s\n" % (name, seq))
output.close()