In [1]:
import random
import os
from Bio.Seq import Seq
from Bio import SeqIO
import numpy as np
import tensorflow as tf
import math
import pickle
from tensorflow import keras

In [7]:
path_file_list=[]
for root, dirs, files in os.walk('Pathogenic E. Coli Sequences/'):
    for file in files:
        if file.endswith('.fasta'):
            path_file_list.append(('Pathogenic E. Coli Sequences/'+str(file)))

In [8]:
path_file_list

['Pathogenic E. Coli Sequences/PAI_III_536.fasta',
 'Pathogenic E. Coli Sequences/PAI_II_536.fasta',
 'Pathogenic E. Coli Sequences/PAI_I_536.fasta',
 'Pathogenic E. Coli Sequences/PAI_V_536.fasta']

In [9]:
path_seq_list=[];
for filename in path_file_list:
    for seq_record in SeqIO.parse(filename,'fasta'):
        path_seq_list.append(seq_record.seq)

In [10]:
# These are the sequences for 4 pathogenicity islands from E. coli 536
path_seq_list

[Seq('AGGGCCGATATAGCTCAGTTGGTAGAGCAGCGCATTCGTAATGCGAAGGTCGTA...CCT'),
 Seq('GCAAGCTGGCGCTTGCATGGTGGCGTGCGACAGGTATAATCCACAACGTTTTCC...TCG'),
 Seq('GGAAGATCGTCGTCTCCGGTGAGGCGGCTGGACTTCAAATCCAGTTGGGGCCGC...ATC'),
 Seq('GCCCGGATAGCTCAGTCGGTAGAGCAGGGGATTGAAAATCCCCGTGTCCTTGGT...GGA')]

In [12]:
nonpath_ecoli_fasta_file='EcoliK12_MG1655.fasta'

In [13]:
nonpath_seq_list=[];
for seq_record in SeqIO.parse(nonpath_ecoli_fasta_file,'fasta'):
    nonpath_seq_list.append(seq_record.seq)

In [14]:
# This is the sequence for E. coli K-12, MG1655
nonpath_seq_list

[Seq('AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAG...TTC')]

## Change seqs. to numbers

In [15]:
nonpath_vector_list=[];
for item in nonpath_seq_list:
    vector_rep=[]
    for letter in item:
        if letter=='A':
            number=0.25
        elif letter=='T':
            number=0.5
        elif letter=='C':
            number=0.75
        elif letter=='G':
            number=1
        vector_rep.append(number)
    nonpath_vector_list.append(vector_rep)

In [16]:
path_vector_list=[];
for item in path_seq_list:
    vector_rep=[]
    for letter in item:
        if letter=='A':
            number=0.25
        elif letter=='T':
            number=0.5
        elif letter=='C':
            number=0.75
        elif letter=='G':
            number=1
        vector_rep.append(number)
    path_vector_list.append(vector_rep)

## Make 1000-bp segments

In [26]:
cutoff=1000;
nonpath_list=[];

for item in nonpath_vector_list:
    i=0;
    while i+cutoff<=len(item):
        nonpath_list.append(item[i:i+cutoff]);
        i+=1;

In [27]:
len(nonpath_list)

4640653

In [28]:
cutoff=1000;
path_list=[];

for item in path_vector_list:
    i=0;
    while i+cutoff<=len(item):
        path_list.append(item[i:i+cutoff]);
        i+=1;

In [37]:
path_array=np.asarray(path_list)

In [39]:
len(path_list)

356863

In [40]:
short_nonpath_list=random.sample(nonpath_list,len(path_list))

In [41]:
nonpath_array=np.asarray(short_nonpath_list)

In [42]:
path_array

array([[0.25, 1.  , 1.  , ..., 0.75, 0.25, 1.  ],
       [1.  , 1.  , 1.  , ..., 0.25, 1.  , 0.25],
       [1.  , 1.  , 0.75, ..., 1.  , 0.25, 0.25],
       ...,
       [0.5 , 0.75, 1.  , ..., 1.  , 0.5 , 1.  ],
       [0.75, 1.  , 1.  , ..., 0.5 , 1.  , 1.  ],
       [1.  , 1.  , 0.75, ..., 1.  , 1.  , 0.25]])

In [43]:
path_labelled = np.hstack((path_array, np.atleast_2d(np.ones(len(path_array))).T))

In [51]:
path_labelled

array([[0.25, 1.  , 1.  , ..., 0.25, 1.  , 1.  ],
       [1.  , 1.  , 1.  , ..., 1.  , 0.25, 1.  ],
       [1.  , 1.  , 0.75, ..., 0.25, 0.25, 1.  ],
       ...,
       [0.5 , 0.75, 1.  , ..., 0.5 , 1.  , 1.  ],
       [0.75, 1.  , 1.  , ..., 1.  , 1.  , 1.  ],
       [1.  , 1.  , 0.75, ..., 1.  , 0.25, 1.  ]])

In [53]:
np.shape(path_labelled)

(356863, 1001)

In [50]:
type(path_labelled[0][0])

numpy.float64

In [54]:
nonpath_array

array([[0.25, 0.25, 0.5 , ..., 1.  , 1.  , 0.25],
       [0.5 , 0.25, 1.  , ..., 0.5 , 0.25, 0.5 ],
       [0.25, 0.75, 0.25, ..., 0.5 , 0.25, 0.25],
       ...,
       [1.  , 0.25, 0.5 , ..., 0.25, 0.75, 0.75],
       [0.5 , 0.75, 0.5 , ..., 0.25, 0.25, 0.75],
       [0.75, 0.75, 0.5 , ..., 1.  , 0.75, 0.5 ]])

In [52]:
non_path_labelled = np.hstack((nonpath_array, np.atleast_2d(np.zeros(len(nonpath_array),dtype='uint8')).T))

MemoryError: Unable to allocate 2.66 GiB for an array with shape (356863, 1001) and data type float64

In [None]:
data=np.concatenate((non_path_labelled,path_labelled),axis=0)

In [None]:
y_raw=data[:,-1]
X_raw=np.delete(data,-1,axis=1)

In [55]:
np.savez('Thousand_bp_data',nonpath_array,path_labelled,'Nonpath','Path')