# Data Collection and Preprocessing

This script downloads and preprocesses the FASTA files. For each species, the goal is to either write a single representative sequence that is ~500,000 bp long, or store multiple sequences that together add up to 500,000 bp.

In [1]:
import os
import utilities as utils
import numpy as np

In [2]:
# Load one label per line of non_labs.txt; every run of whitespace inside a
# line is collapsed to a single underscore.
with open('non_labs.txt', 'r') as f:
    raw_lines = f.readlines()
labels = ['_'.join(raw.split()) for raw in raw_lines]

In [3]:
# Directory holding the numbered `gtdb-adv-search-genomes (N).sh` scripts
# for the "non" set; the following cell reads them from here.
curls = 'non_curls'

In [4]:
# Extract one genome id from each numbered download script in `curls`.
#
# Scripts are opened by their number (1..N) instead of in os.listdir order,
# so ids[k] always comes from script number k+1.
# Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) are excluded from the
# count; otherwise the loop would try to open a script number that does not
# exist. This matches how the other cells skip names starting with '.'.
listing = [name for name in os.listdir(curls) if not name.startswith('.')]
ids = []
for number in range(1, len(listing) + 1):
    path = os.path.join(curls, f'gtdb-adv-search-genomes ({number}).sh')
    with open(path, 'r') as f:
        next(f)  # skip the first line of the script (presumably a shebang -- confirm)
        body = f.read()
    # The first double-quoted string in the remaining text is taken as the URL.
    url = body.split('"')[1]
    # The id is the text after the last '=', cut at the first '&', with any
    # trailing '.zip...' removed.
    genome_id = url.split('=')[-1].split('&')[0].split('.zip')[0]
    ids.append(genome_id)

In [5]:
# Root directory for the "non" set: each genome is downloaded into
# <dataset>/<label> by the following cell.
dataset = 'non_dataset'

In [None]:
# Fetch every genome into its own label-named folder under `dataset`.
# ids and labels are paired positionally; zip stops at the shorter list.
for genome_id, species in zip(ids, labels):
    destination = os.path.join(dataset, species)
    utils.download_fasta(genome_id, destination)

In [7]:
# Read every downloaded FASTA file, pass it through utils.clean_fasta and
# hand the result to utils.write_fastas under 'non_target', keyed by the
# species folder name. Names starting with '.' are skipped at both levels.
for species in os.listdir(dataset):
    if species.startswith('.'):
        continue
    species_dir = os.path.join(dataset, species)
    visible = (name for name in os.listdir(species_dir) if not name.startswith('.'))
    for name in visible:
        records = utils.read_fasta(os.path.join(species_dir, name))
        utils.write_fastas(utils.clean_fasta(records), species, 'non_target')

In [8]:
# `data` is the directory whose per-species sub-folders are read when
# building representatives ('non_target' is also the destination passed to
# utils.write_fastas by the cleaning cell earlier in this file).
data = 'non_target'
# `repr` is the directory the representatives are written under.
# NOTE(review): this name shadows the builtin repr() for the rest of the
# session; it is kept because the following cell reads it.
repr = 'non_repr'

In [9]:
# Build one MAX_LEN-long representative per species folder in `data` and
# write it under `repr`.
#
# For each species folder, files are visited in os.listdir order:
#   * If a file's first sequence is longer than MAX_LEN, a random window of
#     exactly MAX_LEN bases is written as '<folder> representative_0' and the
#     species is done (anything accumulated so far is discarded).
#   * Otherwise sequences are accumulated (the last one truncated) until
#     they add up to MAX_LEN, then written as '<folder> representative'.
#   * If all files together hold fewer than MAX_LEN bases, nothing is written
#     for that species.
# Only the first sequence of each file (seq[0]) is used.
MAX_LEN = 500000

os.makedirs(data, exist_ok=True)
os.makedirs(repr, exist_ok=True)

for folder in os.listdir(data):
    if folder.startswith('.'):
        continue
    count = 0  # bases accumulated so far for this species
    seqs = []
    pwd = os.path.join(data, folder)
    out_dir = os.path.join(repr, folder)
    for file in os.listdir(pwd):
        # Skip hidden entries (.DS_Store, checkpoints), as the cleaning cells do.
        if file.startswith('.'):
            continue
        seq = utils.clean_fasta(utils.read_fasta(os.path.join(pwd, file)))
        ln = len(seq[0])
        os.makedirs(out_dir, exist_ok=True)
        if ln > MAX_LEN:
            # randint's upper bound is exclusive, so +1 makes the last valid
            # start (ln - MAX_LEN) reachable.
            idx = np.random.randint(0, ln - MAX_LEN + 1)
            # Slice exactly MAX_LEN bases (the end index is exclusive).
            window = [seq[0][idx:idx + MAX_LEN]]
            utils.write_fastas(window, folder, repr, f'{folder} representative_0')
            break
        # Take only as much of this sequence as is still missing.
        take = min(ln, MAX_LEN - count)
        seqs.append(seq[0][:take])
        count += take
        if count >= MAX_LEN:
            # Write as soon as the target is reached; waiting for a further
            # file would lose the result when this was the last file.
            utils.write_fastas(seqs, folder, repr, f'{folder} representative')
            break

In [10]:
# Load one label per line of patho_labs.txt: the text that follows the
# ': s__' marker, with surrounding whitespace removed.
with open('patho_labs.txt', 'r') as f:
    raw_lines = f.readlines()
labels = [raw.split(': s__')[1].strip() for raw in raw_lines]

In [11]:
# Directory holding the numbered `gtdb-adv-search-genomes (N).sh` scripts
# for the "patho" set; the following cell reads them from here.
curls = 'patho_curls'

In [12]:
# Extract one genome id from each numbered download script in `curls`.
#
# Scripts are opened by their number (1..N) instead of in os.listdir order,
# so ids[k] always comes from script number k+1.
# Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) are excluded from the
# count; otherwise the loop would try to open a script number that does not
# exist. This matches how the other cells skip names starting with '.'.
listing = [name for name in os.listdir(curls) if not name.startswith('.')]
ids = []
for number in range(1, len(listing) + 1):
    path = os.path.join(curls, f'gtdb-adv-search-genomes ({number}).sh')
    with open(path, 'r') as f:
        next(f)  # skip the first line of the script (presumably a shebang -- confirm)
        body = f.read()
    # The first double-quoted string in the remaining text is taken as the URL.
    url = body.split('"')[1]
    # The id is the text after the last '=', cut at the first '&', with any
    # trailing '.zip...' removed.
    genome_id = url.split('=')[-1].split('&')[0].split('.zip')[0]
    ids.append(genome_id)

In [13]:
# Root directory for the "patho" set: each genome is downloaded into
# <dataset>/<label> by the following cell.
dataset = 'patho_dataset'

In [None]:
# Fetch every genome into its own label-named folder under `dataset`.
# ids and labels are paired positionally; zip stops at the shorter list.
for genome_id, species in zip(ids, labels):
    destination = os.path.join(dataset, species)
    utils.download_fasta(genome_id, destination)

In [15]:
# Read every downloaded FASTA file, pass it through utils.clean_fasta and
# hand the result to utils.write_fastas under 'patho_target', keyed by the
# species folder name. Names starting with '.' are skipped at both levels.
for species in os.listdir(dataset):
    if species.startswith('.'):
        continue
    species_dir = os.path.join(dataset, species)
    visible = (name for name in os.listdir(species_dir) if not name.startswith('.'))
    for name in visible:
        records = utils.read_fasta(os.path.join(species_dir, name))
        utils.write_fastas(utils.clean_fasta(records), species, 'patho_target')

In [16]:
# `data` is the directory whose per-species sub-folders are read when
# building representatives ('patho_target' is also the destination passed to
# utils.write_fastas by the cleaning cell earlier in this file).
data = 'patho_target'
# `repr` is the directory the representatives are written under.
# NOTE(review): this name shadows the builtin repr() for the rest of the
# session; it is kept because the following cell reads it.
repr = 'patho_repr'

In [17]:
# Build one MAX_LEN-long representative per species folder in `data` and
# write it under `repr`.
#
# For each species folder, files are visited in os.listdir order:
#   * If a file's first sequence is longer than MAX_LEN, a random window of
#     exactly MAX_LEN bases is written as '<folder> representative_0' and the
#     species is done (anything accumulated so far is discarded).
#   * Otherwise sequences are accumulated (the last one truncated) until
#     they add up to MAX_LEN, then written as '<folder> representative'.
#   * If all files together hold fewer than MAX_LEN bases, nothing is written
#     for that species.
# Only the first sequence of each file (seq[0]) is used.
MAX_LEN = 500000

os.makedirs(data, exist_ok=True)
os.makedirs(repr, exist_ok=True)

for folder in os.listdir(data):
    if folder.startswith('.'):
        continue
    count = 0  # bases accumulated so far for this species
    seqs = []
    pwd = os.path.join(data, folder)
    out_dir = os.path.join(repr, folder)
    for file in os.listdir(pwd):
        # Skip hidden entries (.DS_Store, checkpoints), as the cleaning cells do.
        if file.startswith('.'):
            continue
        seq = utils.clean_fasta(utils.read_fasta(os.path.join(pwd, file)))
        ln = len(seq[0])
        os.makedirs(out_dir, exist_ok=True)
        if ln > MAX_LEN:
            # randint's upper bound is exclusive, so +1 makes the last valid
            # start (ln - MAX_LEN) reachable.
            idx = np.random.randint(0, ln - MAX_LEN + 1)
            # Slice exactly MAX_LEN bases (the end index is exclusive).
            window = [seq[0][idx:idx + MAX_LEN]]
            utils.write_fastas(window, folder, repr, f'{folder} representative_0')
            break
        # Take only as much of this sequence as is still missing.
        take = min(ln, MAX_LEN - count)
        seqs.append(seq[0][:take])
        count += take
        if count >= MAX_LEN:
            # Write as soon as the target is reached; waiting for a further
            # file would lose the result when this was the last file.
            utils.write_fastas(seqs, folder, repr, f'{folder} representative')
            break
