/
plotclusterreadproportions.py
executable file
·40 lines (37 loc) · 1.77 KB
/
plotclusterreadproportions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#! /usr/bin/python
# plotclusterreadproportions.py
# Takes a clster file from cdhit and for each cluster with more than one
# unique read, returns:
# 1) The # of unique reads
# 2) The proportion of reads that match the most common unique read
# Values are csv separated so I can scatterplot them.
# This only works if cdhit was run using the output of my renameclusters.py
# script as the input.
# usage: ./plotclusterreadproportions.py clusterfile.clstr
import sys
# Step through the clstr file
with open(sys.argv[1]) as cluster_file:
read_count = -1
most_common_count = 0
total_count = 0
unique_count = 0
print "Proportion_of_most_common,Count_of_unique_sequences,Count_of_reads"
for line in cluster_file:
if line[0] == ">": # This is a new cluster. Record data for the previous cluster and start again
if read_count != -1:
proportion_most_common = most_common_count / float(total_count)
if total_count >= 10: # filter out clusters with low read counts
print "%s,%s,%s" % (proportion_most_common, unique_count, total_count)
most_common_count = 0
total_count = 0
unique_count = 0
else:
unique_count += 1
read_count = int(line.split("...")[0].split(":")[-1]) # The count is between a ":" and the "..."
total_count += read_count
if read_count > most_common_count:
most_common_count = read_count
# We counted the reads for the last cluster in the file but haven't recorded it yet
proportion_most_common = most_common_count / float(total_count)
if total_count >= 10: # filter out clusters with low read counts
print "%s,%s,%s" % (proportion_most_common, unique_count, total_count)