In [1]:
%autosave 15
from Bio import SeqIO
import pandas as pd
import numpy as np

Autosaving every 15 seconds


In [2]:
genes = pd.read_table('rice_go.stress.csv', sep=',')
genes.describe()

Unnamed: 0,NAME_FGENE,MODEL_LOC_ID,GOSLIM_ID,KEYWORD_CATEGORY,GOSLIM_TERM,ANNOTATED_BY,TAIR_ID
count,9127,9127,9127,9127,9127,9127,9127
unique,4537,4414,15,2,15,1,2705
top,chrChr1_mRNA_6294_40409611,LOC_Os06g50724.1,GO:0006950,P,response to stress,IEA,TAIR:AT5G24090
freq,10,20,1967,7986,1967,9127,69


In [3]:
def filter_data(l):
    return [el for el in l if el.id in genes['NAME_FGENE'].values]

In [4]:
def split_by_comma(l):
    for el in l:
        el.seq = el.seq.split(',')
    l

In [5]:
CAs = list(SeqIO.parse('FILES_4_PREDICTION/CA.s.fasta', 'fasta'))    
CAs_stress = filter_data(CAs)
len(CAs_stress)

4537

In [38]:
f = open("CAs", 'w')
for c in CAs_stress:
    f.write(str(c.seq) + '\n')
f.close()

In [41]:
CAs = np.loadtxt('CAs', delimiter=',')

In [6]:
CGs = list(SeqIO.parse('FILES_4_PREDICTION/CG.s.fasta', 'fasta'))
CGs_stress = filter_data(CGs)
len(CGs_stress)

4537

In [42]:
f = open("CGs", 'w')
for c in CGs_stress:
    f.write(str(c.seq) + '\n')
f.close()
CGs = np.loadtxt('CGs', delimiter=',')

In [7]:
SNPs = list(SeqIO.parse('FILES_4_PREDICTION/COUNT_SNP.s.fasta', 'fasta'))
SNPs_stress = filter_data(SNPs)
len(SNPs_stress)

4537

In [43]:
f = open("SNPs", 'w')
for c in SNPs_stress:
    f.write(str(c.seq) + '\n')
f.close()
SNPs = np.loadtxt('SNPs', delimiter=',')

In [8]:
MAFs = list(SeqIO.parse('FILES_4_PREDICTION/MAF_SNP.s.fasta', 'fasta'))
MAFs_stress = filter_data(MAFs)
len(MAFs_stress)

4537

In [45]:
f = open("MAFs", 'w')
for c in MAFs_stress:
    f.write(str(c.seq) + '\n')
f.close()
MAFs = np.loadtxt('MAFs', delimiter=',')

In [46]:
COVs = list(SeqIO.parse('FILES_4_PREDICTION/COV.s.fasta', 'fasta'))
for c in COVs:
    c.id = 'chr' + c.id
COVs_stress = filter_data(COVs)
len(COVs_stress)

4537

In [47]:
f = open("COVs", 'w')
for c in COVs_stress:
    f.write(str(c.seq) + '\n')
f.close()
COVs = np.loadtxt('COVs', delimiter=',')

In [10]:
meth = list(SeqIO.parse('FILES_4_PREDICTION/methylation.s.fasta', 'fasta'))
meth_stress = filter_data(meth)
len(meth_stress)

4537

In [51]:
f = open("meth", 'w')
for c in meth_stress:
    f.write(str(c.seq)[0:-1] + '\n')
f.close()
meth = np.loadtxt('meth', delimiter=',')

In [11]:
TATAs = list(SeqIO.parse('FILES_4_PREDICTION/TATA.s.fasta', 'fasta'))
TATAs_stress = filter_data(TATAs)
len(TATAs_stress)

4537

In [53]:
f = open("TATAs", 'w')
for c in TATAs_stress:
    f.write(str(c.seq) + '\n')
f.close()
TATAs = np.loadtxt('TATAs', delimiter=',')

In [12]:
TFBSinf = list(SeqIO.parse('FILES_4_PREDICTION/TFBS.inform.fasta', 'fasta'))
TFBSinf_stress = filter_data(TFBSinf)
len(TFBSinf_stress)

4537

In [54]:
f = open("TFBSinf", 'w')
for c in TFBSinf_stress:
    f.write(str(c.seq) + '\n')
f.close()
TFBSinf = np.loadtxt('TFBSinf', delimiter=',')

In [13]:
TFBSs = list(SeqIO.parse('FILES_4_PREDICTION/TFBS.s.fasta', 'fasta'))
TFBSs_stress = filter_data(TFBSs)
len(TFBSs_stress)

4537

In [55]:
f = open("TFBSs", 'w')
for c in TFBSs_stress:
    f.write(str(c.seq) + '\n')
f.close()
TFBSs = np.loadtxt('TFBSs', delimiter=',')

In [14]:
seqs = list(SeqIO.parse('FILES_4_PREDICTION/rice_prom_fgene.1000_1000.2000.fasta', 'fasta'))
seqs_stress = filter_data(seqs)
len(seqs_stress)

4537

## Compiling samples into list of matrices

In [15]:
seqs_encoded = []
for i in range(0, len(seqs_stress)):
    instead_of_letters = [
        [1, 0, 0, 0] 
        if seqs_stress[i].seq[j] == 'a' 
        else 
        [0, 1, 0, 0] 
        if seqs_stress[i].seq[j] == 't'
        else
        [0, 0, 1, 0]
        if seqs_stress[i].seq[j] == 'g'
        else
        [0, 0, 0, 1] for j in range(0, 2000)]
    seqs_encoded.append(instead_of_letters)

In [31]:
def get_elem(l, i, j):
    return float(str(l[i].seq.split(',')[j]))

def get_np_array(l, i):
    return np.array([[get_elem(l, i, j)] for j in range(2000)])

In [81]:
data = []
for i in range(0, len(seqs_encoded)):
    sample = np.append(np.array(seqs_encoded[i]), 
                       CAs[i].reshape(-1,1), axis = 1)    
    sample = np.append(sample,
                      CGs[i].reshape(-1,1), axis = 1)
    sample = np.append(sample,
                        SNPs[i].reshape(-1,1), axis = 1)
    sample = np.append(sample, 
                          MAFs[i].reshape(-1,1), axis = 1)
    sample = np.append(sample,
                          COVs[i].reshape(-1,1), axis = 1)
    sample = np.append(sample, 
                          meth[i].reshape(-1, 1), axis = 1)
    sample = np.append(sample, 
                          TATAs[i].reshape(-1, 1), axis = 1)
    sample = np.append(sample, 
                          TFBSinf[i].reshape(-1, 1), axis = 1)
    sample = np.append(sample,
                          TFBSs[i].reshape(-1, 1), axis = 1)
    data.append(np.array(sample))
    print(i, np.size(sample))

0 26000
1 26000
2 26000
3 26000
4 26000
5 26000
6 26000
7 26000
8 26000
9 26000
10 26000
11 26000
12 26000
13 26000
14 26000
15 26000
16 26000
17 26000
18 26000
19 26000
20 26000
21 26000
22 26000
23 26000
24 26000
25 26000
26 26000
27 26000
28 26000
29 26000
30 26000
31 26000
32 26000
33 26000
34 26000
35 26000
36 26000
37 26000
38 26000
39 26000
40 26000
41 26000
42 26000
43 26000
44 26000
45 26000
46 26000
47 26000
48 26000
49 26000
50 26000
51 26000
52 26000
53 26000
54 26000
55 26000
56 26000
57 26000
58 26000
59 26000
60 26000
61 26000
62 26000
63 26000
64 26000
65 26000
66 26000
67 26000
68 26000
69 26000
70 26000
71 26000
72 26000
73 26000
74 26000
75 26000
76 26000
77 26000
78 26000
79 26000
80 26000
81 26000
82 26000
83 26000
84 26000
85 26000
86 26000
87 26000
88 26000
89 26000
90 26000
91 26000
92 26000
93 26000
94 26000
95 26000
96 26000
97 26000
98 26000
99 26000
100 26000
101 26000
102 26000
103 26000
104 26000
105 26000
106 26000
107 26000
108 26000
109 26000
110 26000


838 26000
839 26000
840 26000
841 26000
842 26000
843 26000
844 26000
845 26000
846 26000
847 26000
848 26000
849 26000
850 26000
851 26000
852 26000
853 26000
854 26000
855 26000
856 26000
857 26000
858 26000
859 26000
860 26000
861 26000
862 26000
863 26000
864 26000
865 26000
866 26000
867 26000
868 26000
869 26000
870 26000
871 26000
872 26000
873 26000
874 26000
875 26000
876 26000
877 26000
878 26000
879 26000
880 26000
881 26000
882 26000
883 26000
884 26000
885 26000
886 26000
887 26000
888 26000
889 26000
890 26000
891 26000
892 26000
893 26000
894 26000
895 26000
896 26000
897 26000
898 26000
899 26000
900 26000
901 26000
902 26000
903 26000
904 26000
905 26000
906 26000
907 26000
908 26000
909 26000
910 26000
911 26000
912 26000
913 26000
914 26000
915 26000
916 26000
917 26000
918 26000
919 26000
920 26000
921 26000
922 26000
923 26000
924 26000
925 26000
926 26000
927 26000
928 26000
929 26000
930 26000
931 26000
932 26000
933 26000
934 26000
935 26000
936 26000
937 26000


1637 26000
1638 26000
1639 26000
1640 26000
1641 26000
1642 26000
1643 26000
1644 26000
1645 26000
1646 26000
1647 26000
1648 26000
1649 26000
1650 26000
1651 26000
1652 26000
1653 26000
1654 26000
1655 26000
1656 26000
1657 26000
1658 26000
1659 26000
1660 26000
1661 26000
1662 26000
1663 26000
1664 26000
1665 26000
1666 26000
1667 26000
1668 26000
1669 26000
1670 26000
1671 26000
1672 26000
1673 26000
1674 26000
1675 26000
1676 26000
1677 26000
1678 26000
1679 26000
1680 26000
1681 26000
1682 26000
1683 26000
1684 26000
1685 26000
1686 26000
1687 26000
1688 26000
1689 26000
1690 26000
1691 26000
1692 26000
1693 26000
1694 26000
1695 26000
1696 26000
1697 26000
1698 26000
1699 26000
1700 26000
1701 26000
1702 26000
1703 26000
1704 26000
1705 26000
1706 26000
1707 26000
1708 26000
1709 26000
1710 26000
1711 26000
1712 26000
1713 26000
1714 26000
1715 26000
1716 26000
1717 26000
1718 26000
1719 26000
1720 26000
1721 26000
1722 26000
1723 26000
1724 26000
1725 26000
1726 26000
1727 26000

2499 26000
2500 26000
2501 26000
2502 26000
2503 26000
2504 26000
2505 26000
2506 26000
2507 26000
2508 26000
2509 26000
2510 26000
2511 26000
2512 26000
2513 26000
2514 26000
2515 26000
2516 26000
2517 26000
2518 26000
2519 26000
2520 26000
2521 26000
2522 26000
2523 26000
2524 26000
2525 26000
2526 26000
2527 26000
2528 26000
2529 26000
2530 26000
2531 26000
2532 26000
2533 26000
2534 26000
2535 26000
2536 26000
2537 26000
2538 26000
2539 26000
2540 26000
2541 26000
2542 26000
2543 26000
2544 26000
2545 26000
2546 26000
2547 26000
2548 26000
2549 26000
2550 26000
2551 26000
2552 26000
2553 26000
2554 26000
2555 26000
2556 26000
2557 26000
2558 26000
2559 26000
2560 26000
2561 26000
2562 26000
2563 26000
2564 26000
2565 26000
2566 26000
2567 26000
2568 26000
2569 26000
2570 26000
2571 26000
2572 26000
2573 26000
2574 26000
2575 26000
2576 26000
2577 26000
2578 26000
2579 26000
2580 26000
2581 26000
2582 26000
2583 26000
2584 26000
2585 26000
2586 26000
2587 26000
2588 26000
2589 26000

3324 26000
3325 26000
3326 26000
3327 26000
3328 26000
3329 26000
3330 26000
3331 26000
3332 26000
3333 26000
3334 26000
3335 26000
3336 26000
3337 26000
3338 26000
3339 26000
3340 26000
3341 26000
3342 26000
3343 26000
3344 26000
3345 26000
3346 26000
3347 26000
3348 26000
3349 26000
3350 26000
3351 26000
3352 26000
3353 26000
3354 26000
3355 26000
3356 26000
3357 26000
3358 26000
3359 26000
3360 26000
3361 26000
3362 26000
3363 26000
3364 26000
3365 26000
3366 26000
3367 26000
3368 26000
3369 26000
3370 26000
3371 26000
3372 26000
3373 26000
3374 26000
3375 26000
3376 26000
3377 26000
3378 26000
3379 26000
3380 26000
3381 26000
3382 26000
3383 26000
3384 26000
3385 26000
3386 26000
3387 26000
3388 26000
3389 26000
3390 26000
3391 26000
3392 26000
3393 26000
3394 26000
3395 26000
3396 26000
3397 26000
3398 26000
3399 26000
3400 26000
3401 26000
3402 26000
3403 26000
3404 26000
3405 26000
3406 26000
3407 26000
3408 26000
3409 26000
3410 26000
3411 26000
3412 26000
3413 26000
3414 26000

4127 26000
4128 26000
4129 26000
4130 26000
4131 26000
4132 26000
4133 26000
4134 26000
4135 26000
4136 26000
4137 26000
4138 26000
4139 26000
4140 26000
4141 26000
4142 26000
4143 26000
4144 26000
4145 26000
4146 26000
4147 26000
4148 26000
4149 26000
4150 26000
4151 26000
4152 26000
4153 26000
4154 26000
4155 26000
4156 26000
4157 26000
4158 26000
4159 26000
4160 26000
4161 26000
4162 26000
4163 26000
4164 26000
4165 26000
4166 26000
4167 26000
4168 26000
4169 26000
4170 26000
4171 26000
4172 26000
4173 26000
4174 26000
4175 26000
4176 26000
4177 26000
4178 26000
4179 26000
4180 26000
4181 26000
4182 26000
4183 26000
4184 26000
4185 26000
4186 26000
4187 26000
4188 26000
4189 26000
4190 26000
4191 26000
4192 26000
4193 26000
4194 26000
4195 26000
4196 26000
4197 26000
4198 26000
4199 26000
4200 26000
4201 26000
4202 26000
4203 26000
4204 26000
4205 26000
4206 26000
4207 26000
4208 26000
4209 26000
4210 26000
4211 26000
4212 26000
4213 26000
4214 26000
4215 26000
4216 26000
4217 26000

In [87]:
import pickle

In [90]:
pickle.dump(data, open('data.pkl', 'wb'))

## Keras model

In [91]:
from keras.models import Sequential
from keras.layers import Dense, Activation

Using TensorFlow backend.


ModuleNotFoundError: No module named 'tensorflow'