-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge_tables.py
100 lines (89 loc) · 2.89 KB
/
merge_tables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/python
#This code is adapted from Dave Wheeler
#To run for our data:
#make a file called Sample_10_raw_count.txt that include the following
#
#201348189-01_raw_count_0.txt cbpKO_47_sham
#201348190-01_raw_count_0.txt cbpKO_36_TAC
#201348191-01_raw_count_0.txt cbpKO_32_TAC
#201348192-01_raw_count_0.txt WT_12_sham
#201348193-01_raw_count_0.txt WT_21_sham
#201348195-01_raw_count_0.txt WT_4_TAC
#201348196-01_raw_count_0.txt p300KO_67_sham
#201348199-01_raw_count_0.txt p300KO_79_TAC
#201348200-01_raw_count_0.txt cbpKO_33_sham
# then run
#python ~/Code/merge_tables.py Sample_10_raw_count.txt
# you will get file name as "merged_counts.txt"
# this file is used as input for Run_DESeq2.R
#####################################################
# example.py - a program to .... #
# #
# Author: Dave Wheeler #
# #
# Purpose: merge count tables #
# #
# Usage: python merge_tables guide_file #
#####################################################
#looks for text guide_file
#that contains files to be merged with column headers (space separted)
#ie
#file1.counts untreated1
#file2.counts untreated2
#file3.counts treated1
#file4.counts treated2
#this will generated a tab separated table like this
#
#gene untreated1 untreated2 treated1 treated2
#gene1 0 0 0 0
#gene2 1 0 11 10
#.......
##############################################
import sys
try:
infile = open(sys.argv[1])
except IndexError:
print "No guide file provided"
sys.exit()
#make dict of genes with list of counts
#list is ordered so treatments will be preserved.
#genes = {'gene1':[1,2,3,4]}
#header keeps track of treatment order, will be as read from config
col_header = []
genes = {}
outfile = open('merged_counts.txt','w')
for line in infile:
filename,header = line.strip().split(' ')
try:
data_f = open(filename)
except IOError:
print "%s can't be found?"%filename
sys.exit()
col_header.append(header)
#read file and add gene and counts to the dict
for line in data_f:
gene,count = line.strip().split('\t')
if gene not in genes:
genes[gene] = [count]
else:
genes[gene].append(count)
#important to close file
data_f.close()
infile.close()
outfile.write('gene\t'+'\t'.join(col_header)+'\n')
for gene in genes:
data = genes[gene]
#make sure each treatment has a count for this gene
#this should catch most errors
try:
assert len(data) == len(col_header)
except AssertionError:
print "one of the treatment or genes is missing or extra"
print "data, found the problem here:"
print gene,data
print "while %s columns of treatments given" %len(col_header)
sys.exit()
out_data = gene+'\t'+'\t'.join(data)+'\n'
outfile.write(out_data)
outfile.close()
print "Merged table is 'merged_counts.txt'"