In [None]:
import pandas as pd
import scipy as sp
import numpy as np

In [None]:
# Replicon metadata table (tab-separated). Later cells read the columns
# genome_id, replicon_id, replicon_type, genbank_id and size from it.
replicon = pd.read_csv('data/replicon.tsv', sep='\t')

In [None]:
# --- Community composition -------------------------------------------------
# genome_ids of the organisms to include in the simulated community.
genome_list = [
    'Escherichia_coli_K_12_MG1655',
    'Escherichia_coli_S88',
    'Faecalibacterium_prausnitzii_A2165',
    'Bacteroides_vulgatus_ATCC_8482',
]
# Per-organism abundance scale. log(means) is used as the log-space mean of
# the lognormal draw, so each value is the *median* of the sampled abundance.
means = [4, 4, 10, 10]
# Per-organism standard deviation of the log-abundance.
stdev = [0.2, 0.2, 0.2, 0.2]

# --- Sequencing parameters -------------------------------------------------
# Plasmids are simulated at this many copies per chromosome copy.
PLASMID_MULTI = 10
# Total number of read pairs distributed across all replicons.
nreads = 100_000
# Scale applied to the read probabilities to form the Dirichlet
# concentration vector when read counts are drawn.
alpha = 50

# Every genome needs exactly one mean and one stdev entry.
assert len(genome_list) == len(means) == len(stdev)

In [None]:
# Seed NumPy's global RNG so the simulated community is reproducible.
# `scipy.random` was only a deprecated alias of `numpy.random` and is not
# available in recent SciPy releases, so NumPy is called directly; the random
# stream (shared with the multinomial/dirichlet draws below) is unchanged.
np.random.seed(1)
# One lognormal abundance per genome; log(means) is the log-space mean.
genome_abund = np.random.lognormal(mean=np.log(means), sigma=stdev)
print(genome_abund)

# Work on an independent copy restricted to the selected genomes.
sim_replicon = replicon[replicon.genome_id.isin(genome_list)].copy()
sim_replicon['copies'] = np.nan
sim_replicon['org_abund'] = np.nan

for i, genome_id in enumerate(genome_list):
    # Build the masks from sim_replicon itself rather than from the unfiltered
    # `replicon` frame, so the indexer never depends on index alignment.
    in_genome = sim_replicon.genome_id == genome_id
    is_chromosome = sim_replicon.replicon_type == 'chromosome'
    is_plasmid = sim_replicon.replicon_type == 'plasmid'

    sim_replicon.loc[in_genome, 'org_abund'] = genome_abund[i]
    sim_replicon.loc[in_genome & is_chromosome, 'copies'] = genome_abund[i]
    # Plasmids are present at PLASMID_MULTI copies per chromosome copy.
    sim_replicon.loc[in_genome & is_plasmid,
                     'copies'] = genome_abund[i] * PLASMID_MULTI

# Every requested genome must be present in the replicon table.
assert set(sim_replicon.genome_id) == set(genome_list)
# A replicon whose type is neither 'chromosome' nor 'plasmid' would keep
# copies == NaN, which would silently propagate into the read probabilities.
assert sim_replicon['copies'].notna().all(), \
    "replicon_type other than 'chromosome'/'plasmid' left 'copies' unset"

# NOTE(review): this normalises over replicon rows, so a genome with several
# replicons contributes its abundance to the denominator more than once -
# confirm whether a per-genome normalisation was intended.
sim_replicon['rabund'] = sim_replicon['org_abund'] / sim_replicon['org_abund'].sum()
# Nucleotide abundance: replicon length times copy number.
sim_replicon['nucabund'] = (sim_replicon['size'] * sim_replicon['copies'])
# Expected fraction of reads originating from each replicon.
sim_replicon['pread'] = sim_replicon['nucabund'] / sim_replicon['nucabund'].sum()
# Draw noisy read proportions from a Dirichlet centred on pread (larger alpha
# means less noise), then split the nreads total with a multinomial draw.
sim_replicon['nreads'] = np.random.multinomial(nreads, np.random.dirichlet(sim_replicon['pread'] * alpha))
sim_replicon[['genome_id', 'replicon_id', 'genbank_id', 'size', 'rabund', 'pread', 'nreads']]

In [None]:
from iss.generator import simulate_read
from iss.error_models.kde import KDErrorModel
import iss
import os.path
from os import listdir
import sys
from tqdm import trange

# Load the HiSeq error profile that ships inside the installed `iss` package.
iss_package_dir = iss.__path__[0]
hiseq_profile_path = os.path.join(iss_package_dir, 'profiles/HiSeq')
err_model = KDErrorModel(hiseq_profile_path)

from Bio.SeqIO import index as seq_file_index
from Bio.SeqIO import write as write_seq

# Output FASTQ files: the first mate of every simulated pair goes to path1,
# the second mate to path2, written in the same order.
path1 = 'test_1.fq'
path2 = 'test_2.fq'
print(f'Simulating reads to {path1} and {path2}', file=sys.stderr)

with open(path1, 'w') as handle1, open(path2, 'w') as handle2:
    for genome_id, g in sim_replicon.groupby('genome_id'):
        print(f'Simulating reads for genome {genome_id}', file=sys.stderr)
        # Random-access index over the genome FASTA. It keeps a file handle
        # open, so it is closed explicitly once the genome is finished.
        seqs = seq_file_index('ref/genome/' + genome_id + '.fn', 'fasta')
        try:
            for _, r in g.iterrows():
                print(f'Simulating reads for replicon {r.replicon_id}', file=sys.stderr)
                record = seqs[r.genbank_id]
                # iterrows() may upcast the row's dtype; make sure the read
                # count is a plain int before handing it to trange.
                for i in trange(int(r.nreads)):
                    paired_reads = simulate_read(record, err_model, i)
                    write_seq(paired_reads[0], handle1, 'fastq-sanger')
                    write_seq(paired_reads[1], handle2, 'fastq-sanger')
        finally:
            seqs.close()