/
makepseudogenome.py
executable file
·44 lines (40 loc) · 1.98 KB
/
makepseudogenome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#! /usr/bin/python
# makepseudogenome.py
# From a cdhit cluster of unique sequences, makes a fasta file of the
# most common unique sequence in each cluster (since the cdhit "seeds"
# are not always the most common sequence). This can be used as a
# pseudogenome to assemble reads against using Bowtie2 or other tools.
# usage: ./makepseudogene.py clusterfile.clstr uniquereads.fasta
import sys
import HTSeq
#Read the fastafile of unique sequences into a dictionary. This will be big (~4Gb) but should be faster than reading through the file millions of times.
unique_sequences = {}
unique_sequences_file = HTSeq.FastaReader(sys.argv[2])
for sequence in unique_sequences_file:
unique_sequences[sequence.name] = sequence.seq
print >> sys.stderr, "Indexed unique sequences"
#Step through the cluster file
with open(sys.argv[1]) as cluster_file:
cluster_name = ""
most_common_count = 0
most_common_name = ""
total_reads = 0
for line in cluster_file:
if line[0] == ">": #This is a new cluster, write the data for the last cluster
if cluster_name != "": #Print the last cluster info
if total_reads > 1: #Don't bother writing singletons to the pseudogenome
print cluster_name
print unique_sequences[most_common_name]
cluster_name = line.strip().replace(" ", "_")
most_common_count = 0
total_reads = 0
else: #this is a unique sequence. See if it is most common so far
count = int(line.split("...")[0].split(":")[-1]) # the count is between : and ...
if count > most_common_count:
most_common_count = count
most_common_name = line.split("...")[0].split(">")[-1] # name is between > and ...
total_reads += most_common_count
#when we finish reading the file we still haven't printed the info for the last cluster, so do it now
if total_reads > 1:
print cluster_name
print unique_sequences[most_common_name]