/
renameclusters.py
executable file
·48 lines (46 loc) · 2.49 KB
/
renameclusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#! /usr/bin/python
# renameclusterseeds.py
# Takes the fasta output from cd-hit-454 and replaces the name of each
# representative "seed" sequence with the name of the cluster from the
# clster output file. This way I can recluster this output at
# different stringencies and the output of those clustering runs will
# directly report which cluster numbers were combined.
# Renames the reads using a tag to identify the clustering run and the
# cluster number, and number of reads in the cluster separated by colons.
# Example: >cintricatum_1.0:cluster_2:87
# usage: ./renameclusterseeds.py fastafile.fasta clusterfile.clstr tag
import sys
import HTSeq
# Step through the fasta file, modify the description and output the record
fasta_file = HTSeq.FastaReader(sys.argv[1])
fasta_reads = iter(fasta_file) # create an iterable so we can step through it with next()
tag = sys.argv[3]
script_failed = 0 # If the cluster file and fasta file aren't in the same order, this won't work
with open(sys.argv[2]) as cluster_file:
cluster_number = ""
read_count = 0
for line in cluster_file:
if line[0] == ">": # This is a new cluster. Print the last cluster and start again
if cluster_number != "": # Print the last cluster info
if script_failed == 1:
print "Seed sequence and cluster order do not match!"
else:
new_name = "%s:%s:%s" % (tag, cluster_number, read_count)
fasta_read.name = new_name
fasta_read.descr = "" # Should be already, but just in case...
fasta_read.write_to_fasta_file(sys.stdout)
cluster_number = line[1:].strip() # Grab the name without the > or \n characters
cluster_number = cluster_number.replace(" ","_") # Replace the space between "Cluster" and the number with an "_"
elif line[0] == "0": # This is the seed for the cluster
# it looks something like this:
# 0 87nt, >Some_fasta_description... *
# and we want to make sure the fasta description matches the description of the next read in the fasta file
# if it doesn't this script won't work, but they SHOULD be in the same order
cluster_name = line.split(">")[1].split("...")[0]
fasta_read = fasta_reads.next()
if fasta_read.name != cluster_name:
# script failed
script_failed = 1
read_count = 1
else:
read_count += 1