# Data Generation Pipline 

In [5]:
import os
import re
import gzip
import random
import time
import pandas as pd
import numpy as np
from Bio import SeqIO
from pyfaidx import Fasta

In [2]:
data_folder='E:/refseq/'
genome_folder='E:/masters/virnet/data/1-genomes'
train_test_folder='E:/masters/virnet/data/2-train_test'
fragments_folder='E:/masters/virnet/data/3-fragments/fna'
metagenome_folder='E:/masters/virnet/data/4-metagenome'

## 1- Generate Genomes

In [50]:
def get_taxid(features):
    for feature in features:
        if feature.type == 'source':
            taxid = feature.qualifiers["db_xref"][0].strip('taxon:')
            return (taxid)
    return 0

def load_file(gb_file,file_type,sample,sample_ratio=20):
    random.seed(42)
    genomes=[]
    #if(sample==True and random.randint(0,2)!=0):
    #    return genomes
    with gzip.open(gb_file,"rt") as handle:
        for record in SeqIO.parse(handle, file_type):
            if(sample==True and random.randint(0,sample_ratio)!=0):
                continue
            else:
                #print(record.annotations['source'])
                genome={'id':record.id,'len':len(record),'seq':record.seq,'taxid':'','date':''}
                if(file_type=='genbank'):
                    genome['date']=record.annotations['date']
                    genome['taxid']=get_taxid(record.features)
                genomes.append(genome)
    return genomes  

def load_data(genometype,file_type,output_folder,sample=False,sample_ratio=20):
    start=time.time()
    genome_folder=os.path.join(data_folder,genometype)
    files_list=[]
    if(file_type=='fasta'):
        files_list=[file for file in os.listdir(genome_folder) if file.endswith(".genomic.fna.gz")]
    else:
        files_list=[file for file in os.listdir(genome_folder) if file.endswith(".genomic.gbff.gz")]
    
    count=0
    output_file_name=os.path.join(output_folder,"{0}.fna".format(genometype))
    with open(output_file_name,'w+') as f:
        for i in range(len(files_list)):
            file=files_list[i]
            print('Prasing {1}/{2} file: {0}'.format(file,i+1,len(files_list)))
            gb_file=os.path.join(genome_folder,file)
            file_genomes=load_file(gb_file,file_type,sample,sample_ratio)
            print('Saving {0} genomes from file : {1}'.format(len(file_genomes),file))
            for genome in file_genomes:
                genome_header='> ref_{0}|'.format(genometype)
                for key in genome:
                    if(key!='seq'):
                        genome_header+='{0}={1}|'.format(key,genome[key])
                f.write('{0}\n{1}\n'.format(genome_header,genome['seq']))
            count+=len(file_genomes)
    print('Saved {0} genomes in {1}'.format(count,output_file_name))
    end=time.time()
    print('Time elapased {0:.3f} Minutes'.format((end-start)/60))

In [30]:
load_data('viral','genbank',genome_folder,sample=False)

Prasing 1/2 file: viral.1.genomic.gbff.gz
Saving 8557 genomes from file : viral.1.genomic.gbff.gz
Prasing 2/2 file: viral.2.genomic.gbff.gz
Saving 999 genomes from file : viral.2.genomic.gbff.gz
Saved 9556 genomes in E:/masters/virnet/data/1-genomes\viral.fna
Time elapased 0.936 Minutes


In [51]:
load_data('archaea','fasta',genome_folder,sample=True,sample_ratio=2)

Prasing 1/6 file: archaea.1.1.genomic.fna.gz
Saving 1349 genomes from file : archaea.1.1.genomic.fna.gz
Prasing 2/6 file: archaea.2.1.genomic.fna.gz
Saving 4351 genomes from file : archaea.2.1.genomic.fna.gz
Prasing 3/6 file: archaea.3.1.genomic.fna.gz
Saving 3563 genomes from file : archaea.3.1.genomic.fna.gz
Prasing 4/6 file: archaea.4.1.genomic.fna.gz
Saving 2739 genomes from file : archaea.4.1.genomic.fna.gz
Prasing 5/6 file: archaea.5.1.genomic.fna.gz
Saving 122 genomes from file : archaea.5.1.genomic.fna.gz
Prasing 6/6 file: archaea.6.1.genomic.fna.gz
Saving 80 genomes from file : archaea.6.1.genomic.fna.gz
Saved 12204 genomes in E:/masters/virnet/data/1-genomes\archaea.fna
Time elapased 0.633 Minutes


In [32]:
load_data('bacteria','fasta',genome_folder,sample=True)

Prasing 1/1212 file: bacteria.1.1.genomic.fna.gz
Saving 13504 genomes from file : bacteria.1.1.genomic.fna.gz
Prasing 2/1212 file: bacteria.10.1.genomic.fna.gz
Saving 95 genomes from file : bacteria.10.1.genomic.fna.gz
Prasing 3/1212 file: bacteria.100.1.genomic.fna.gz
Saving 137 genomes from file : bacteria.100.1.genomic.fna.gz
Prasing 4/1212 file: bacteria.1000.1.genomic.fna.gz
Saving 51 genomes from file : bacteria.1000.1.genomic.fna.gz
Prasing 5/1212 file: bacteria.1001.1.genomic.fna.gz
Saving 53 genomes from file : bacteria.1001.1.genomic.fna.gz
Prasing 6/1212 file: bacteria.1002.1.genomic.fna.gz
Saving 640 genomes from file : bacteria.1002.1.genomic.fna.gz
Prasing 7/1212 file: bacteria.1003.1.genomic.fna.gz
Saving 998 genomes from file : bacteria.1003.1.genomic.fna.gz
Prasing 8/1212 file: bacteria.1004.1.genomic.fna.gz
Saving 715 genomes from file : bacteria.1004.1.genomic.fna.gz
Prasing 9/1212 file: bacteria.1005.1.genomic.fna.gz
Saving 34 genomes from file : bacteria.1005.1.gen

Prasing 73/1212 file: bacteria.1063.1.genomic.fna.gz
Saving 178 genomes from file : bacteria.1063.1.genomic.fna.gz
Prasing 74/1212 file: bacteria.1064.1.genomic.fna.gz
Saving 73 genomes from file : bacteria.1064.1.genomic.fna.gz
Prasing 75/1212 file: bacteria.1065.1.genomic.fna.gz
Saving 1 genomes from file : bacteria.1065.1.genomic.fna.gz
Prasing 76/1212 file: bacteria.1066.1.genomic.fna.gz
Saving 10 genomes from file : bacteria.1066.1.genomic.fna.gz
Prasing 77/1212 file: bacteria.1067.1.genomic.fna.gz
Saving 54 genomes from file : bacteria.1067.1.genomic.fna.gz
Prasing 78/1212 file: bacteria.1068.1.genomic.fna.gz
Saving 39 genomes from file : bacteria.1068.1.genomic.fna.gz
Prasing 79/1212 file: bacteria.1069.1.genomic.fna.gz
Saving 189 genomes from file : bacteria.1069.1.genomic.fna.gz
Prasing 80/1212 file: bacteria.107.1.genomic.fna.gz
Saving 374 genomes from file : bacteria.107.1.genomic.fna.gz
Prasing 81/1212 file: bacteria.1070.1.genomic.fna.gz
Saving 313 genomes from file : bact

Saving 34 genomes from file : bacteria.1127.1.genomic.fna.gz
Prasing 145/1212 file: bacteria.1128.1.genomic.fna.gz
Saving 40 genomes from file : bacteria.1128.1.genomic.fna.gz
Prasing 146/1212 file: bacteria.1129.1.genomic.fna.gz
Saving 74 genomes from file : bacteria.1129.1.genomic.fna.gz
Prasing 147/1212 file: bacteria.113.1.genomic.fna.gz
Saving 221 genomes from file : bacteria.113.1.genomic.fna.gz
Prasing 148/1212 file: bacteria.1130.1.genomic.fna.gz
Saving 82 genomes from file : bacteria.1130.1.genomic.fna.gz
Prasing 149/1212 file: bacteria.1131.1.genomic.fna.gz
Saving 77 genomes from file : bacteria.1131.1.genomic.fna.gz
Prasing 150/1212 file: bacteria.1132.1.genomic.fna.gz
Saving 135 genomes from file : bacteria.1132.1.genomic.fna.gz
Prasing 151/1212 file: bacteria.1133.1.genomic.fna.gz
Saving 67 genomes from file : bacteria.1133.1.genomic.fna.gz
Prasing 152/1212 file: bacteria.1134.1.genomic.fna.gz
Saving 165 genomes from file : bacteria.1134.1.genomic.fna.gz
Prasing 153/1212 f

Saving 10 genomes from file : bacteria.1192.1.genomic.fna.gz
Prasing 217/1212 file: bacteria.1193.1.genomic.fna.gz
Saving 13 genomes from file : bacteria.1193.1.genomic.fna.gz
Prasing 218/1212 file: bacteria.1194.1.genomic.fna.gz
Saving 8 genomes from file : bacteria.1194.1.genomic.fna.gz
Prasing 219/1212 file: bacteria.1195.1.genomic.fna.gz
Saving 0 genomes from file : bacteria.1195.1.genomic.fna.gz
Prasing 220/1212 file: bacteria.1196.1.genomic.fna.gz
Saving 14 genomes from file : bacteria.1196.1.genomic.fna.gz
Prasing 221/1212 file: bacteria.1197.1.genomic.fna.gz
Saving 6 genomes from file : bacteria.1197.1.genomic.fna.gz
Prasing 222/1212 file: bacteria.1198.1.genomic.fna.gz
Saving 16 genomes from file : bacteria.1198.1.genomic.fna.gz
Prasing 223/1212 file: bacteria.1199.1.genomic.fna.gz
Saving 11 genomes from file : bacteria.1199.1.genomic.fna.gz
Prasing 224/1212 file: bacteria.12.1.genomic.fna.gz
Saving 426 genomes from file : bacteria.12.1.genomic.fna.gz
Prasing 225/1212 file: ba

Saving 614 genomes from file : bacteria.166.1.genomic.fna.gz
Prasing 289/1212 file: bacteria.167.1.genomic.fna.gz
Saving 242 genomes from file : bacteria.167.1.genomic.fna.gz
Prasing 290/1212 file: bacteria.168.1.genomic.fna.gz
Saving 262 genomes from file : bacteria.168.1.genomic.fna.gz
Prasing 291/1212 file: bacteria.169.1.genomic.fna.gz
Saving 317 genomes from file : bacteria.169.1.genomic.fna.gz
Prasing 292/1212 file: bacteria.17.1.genomic.fna.gz
Saving 190 genomes from file : bacteria.17.1.genomic.fna.gz
Prasing 293/1212 file: bacteria.170.1.genomic.fna.gz
Saving 83 genomes from file : bacteria.170.1.genomic.fna.gz
Prasing 294/1212 file: bacteria.171.1.genomic.fna.gz
Saving 225 genomes from file : bacteria.171.1.genomic.fna.gz
Prasing 295/1212 file: bacteria.172.1.genomic.fna.gz
Saving 257 genomes from file : bacteria.172.1.genomic.fna.gz
Prasing 296/1212 file: bacteria.173.1.genomic.fna.gz
Saving 296 genomes from file : bacteria.173.1.genomic.fna.gz
Prasing 297/1212 file: bacteri

Saving 781 genomes from file : bacteria.231.1.genomic.fna.gz
Prasing 362/1212 file: bacteria.232.1.genomic.fna.gz
Saving 614 genomes from file : bacteria.232.1.genomic.fna.gz
Prasing 363/1212 file: bacteria.233.1.genomic.fna.gz
Saving 680 genomes from file : bacteria.233.1.genomic.fna.gz
Prasing 364/1212 file: bacteria.234.1.genomic.fna.gz
Saving 788 genomes from file : bacteria.234.1.genomic.fna.gz
Prasing 365/1212 file: bacteria.235.1.genomic.fna.gz
Saving 641 genomes from file : bacteria.235.1.genomic.fna.gz
Prasing 366/1212 file: bacteria.236.1.genomic.fna.gz
Saving 647 genomes from file : bacteria.236.1.genomic.fna.gz
Prasing 367/1212 file: bacteria.237.1.genomic.fna.gz
Saving 4 genomes from file : bacteria.237.1.genomic.fna.gz
Prasing 368/1212 file: bacteria.238.1.genomic.fna.gz
Saving 503 genomes from file : bacteria.238.1.genomic.fna.gz
Prasing 369/1212 file: bacteria.239.1.genomic.fna.gz
Saving 571 genomes from file : bacteria.239.1.genomic.fna.gz
Prasing 370/1212 file: bacter

Prasing 434/1212 file: bacteria.298.1.genomic.fna.gz
Saving 52 genomes from file : bacteria.298.1.genomic.fna.gz
Prasing 435/1212 file: bacteria.299.1.genomic.fna.gz
Saving 299 genomes from file : bacteria.299.1.genomic.fna.gz
Prasing 436/1212 file: bacteria.3.1.genomic.fna.gz
Saving 649 genomes from file : bacteria.3.1.genomic.fna.gz
Prasing 437/1212 file: bacteria.30.1.genomic.fna.gz
Saving 519 genomes from file : bacteria.30.1.genomic.fna.gz
Prasing 438/1212 file: bacteria.300.1.genomic.fna.gz
Saving 6 genomes from file : bacteria.300.1.genomic.fna.gz
Prasing 439/1212 file: bacteria.301.1.genomic.fna.gz
Saving 200 genomes from file : bacteria.301.1.genomic.fna.gz
Prasing 440/1212 file: bacteria.302.1.genomic.fna.gz
Saving 479 genomes from file : bacteria.302.1.genomic.fna.gz
Prasing 441/1212 file: bacteria.303.1.genomic.fna.gz
Saving 84 genomes from file : bacteria.303.1.genomic.fna.gz
Prasing 442/1212 file: bacteria.304.1.genomic.fna.gz
Saving 119 genomes from file : bacteria.304.1

Saving 1076 genomes from file : bacteria.362.1.genomic.fna.gz
Prasing 507/1212 file: bacteria.363.1.genomic.fna.gz
Saving 677 genomes from file : bacteria.363.1.genomic.fna.gz
Prasing 508/1212 file: bacteria.364.1.genomic.fna.gz
Saving 564 genomes from file : bacteria.364.1.genomic.fna.gz
Prasing 509/1212 file: bacteria.365.1.genomic.fna.gz
Saving 545 genomes from file : bacteria.365.1.genomic.fna.gz
Prasing 510/1212 file: bacteria.366.1.genomic.fna.gz
Saving 388 genomes from file : bacteria.366.1.genomic.fna.gz
Prasing 511/1212 file: bacteria.367.1.genomic.fna.gz
Saving 680 genomes from file : bacteria.367.1.genomic.fna.gz
Prasing 512/1212 file: bacteria.368.1.genomic.fna.gz
Saving 393 genomes from file : bacteria.368.1.genomic.fna.gz
Prasing 513/1212 file: bacteria.369.1.genomic.fna.gz
Saving 458 genomes from file : bacteria.369.1.genomic.fna.gz
Prasing 514/1212 file: bacteria.37.1.genomic.fna.gz
Saving 341 genomes from file : bacteria.37.1.genomic.fna.gz
Prasing 515/1212 file: bacte

Prasing 579/1212 file: bacteria.428.1.genomic.fna.gz
Saving 702 genomes from file : bacteria.428.1.genomic.fna.gz
Prasing 580/1212 file: bacteria.429.1.genomic.fna.gz
Saving 756 genomes from file : bacteria.429.1.genomic.fna.gz
Prasing 581/1212 file: bacteria.43.1.genomic.fna.gz
Saving 766 genomes from file : bacteria.43.1.genomic.fna.gz
Prasing 582/1212 file: bacteria.430.1.genomic.fna.gz
Saving 811 genomes from file : bacteria.430.1.genomic.fna.gz
Prasing 583/1212 file: bacteria.431.1.genomic.fna.gz
Saving 462 genomes from file : bacteria.431.1.genomic.fna.gz
Prasing 584/1212 file: bacteria.432.1.genomic.fna.gz
Saving 449 genomes from file : bacteria.432.1.genomic.fna.gz
Prasing 585/1212 file: bacteria.433.1.genomic.fna.gz
Saving 625 genomes from file : bacteria.433.1.genomic.fna.gz
Prasing 586/1212 file: bacteria.434.1.genomic.fna.gz
Saving 426 genomes from file : bacteria.434.1.genomic.fna.gz
Prasing 587/1212 file: bacteria.435.1.genomic.fna.gz
Saving 658 genomes from file : bacter

Saving 541 genomes from file : bacteria.493.1.genomic.fna.gz
Prasing 652/1212 file: bacteria.494.1.genomic.fna.gz
Saving 1804 genomes from file : bacteria.494.1.genomic.fna.gz
Prasing 653/1212 file: bacteria.495.1.genomic.fna.gz
Saving 864 genomes from file : bacteria.495.1.genomic.fna.gz
Prasing 654/1212 file: bacteria.496.1.genomic.fna.gz
Saving 977 genomes from file : bacteria.496.1.genomic.fna.gz
Prasing 655/1212 file: bacteria.497.1.genomic.fna.gz
Saving 900 genomes from file : bacteria.497.1.genomic.fna.gz
Prasing 656/1212 file: bacteria.498.1.genomic.fna.gz
Saving 876 genomes from file : bacteria.498.1.genomic.fna.gz
Prasing 657/1212 file: bacteria.499.1.genomic.fna.gz
Saving 128 genomes from file : bacteria.499.1.genomic.fna.gz
Prasing 658/1212 file: bacteria.5.1.genomic.fna.gz
Saving 348 genomes from file : bacteria.5.1.genomic.fna.gz
Prasing 659/1212 file: bacteria.50.1.genomic.fna.gz
Saving 1069 genomes from file : bacteria.50.1.genomic.fna.gz
Prasing 660/1212 file: bacteria

Saving 488 genomes from file : bacteria.558.1.genomic.fna.gz
Prasing 724/1212 file: bacteria.559.1.genomic.fna.gz
Saving 649 genomes from file : bacteria.559.1.genomic.fna.gz
Prasing 725/1212 file: bacteria.56.1.genomic.fna.gz
Saving 354 genomes from file : bacteria.56.1.genomic.fna.gz
Prasing 726/1212 file: bacteria.560.1.genomic.fna.gz
Saving 610 genomes from file : bacteria.560.1.genomic.fna.gz
Prasing 727/1212 file: bacteria.561.1.genomic.fna.gz
Saving 606 genomes from file : bacteria.561.1.genomic.fna.gz
Prasing 728/1212 file: bacteria.562.1.genomic.fna.gz
Saving 572 genomes from file : bacteria.562.1.genomic.fna.gz
Prasing 729/1212 file: bacteria.563.1.genomic.fna.gz
Saving 380 genomes from file : bacteria.563.1.genomic.fna.gz
Prasing 730/1212 file: bacteria.564.1.genomic.fna.gz
Saving 316 genomes from file : bacteria.564.1.genomic.fna.gz
Prasing 731/1212 file: bacteria.565.1.genomic.fna.gz
Saving 376 genomes from file : bacteria.565.1.genomic.fna.gz
Prasing 732/1212 file: bacter

Prasing 796/1212 file: bacteria.623.1.genomic.fna.gz
Saving 428 genomes from file : bacteria.623.1.genomic.fna.gz
Prasing 797/1212 file: bacteria.624.1.genomic.fna.gz
Saving 345 genomes from file : bacteria.624.1.genomic.fna.gz
Prasing 798/1212 file: bacteria.625.1.genomic.fna.gz
Saving 428 genomes from file : bacteria.625.1.genomic.fna.gz
Prasing 799/1212 file: bacteria.626.1.genomic.fna.gz
Saving 402 genomes from file : bacteria.626.1.genomic.fna.gz
Prasing 800/1212 file: bacteria.627.1.genomic.fna.gz
Saving 645 genomes from file : bacteria.627.1.genomic.fna.gz
Prasing 801/1212 file: bacteria.628.1.genomic.fna.gz
Saving 835 genomes from file : bacteria.628.1.genomic.fna.gz
Prasing 802/1212 file: bacteria.629.1.genomic.fna.gz
Saving 575 genomes from file : bacteria.629.1.genomic.fna.gz
Prasing 803/1212 file: bacteria.63.1.genomic.fna.gz
Saving 715 genomes from file : bacteria.63.1.genomic.fna.gz
Prasing 804/1212 file: bacteria.630.1.genomic.fna.gz
Saving 1308 genomes from file : bacte

Saving 377 genomes from file : bacteria.689.1.genomic.fna.gz
Prasing 869/1212 file: bacteria.69.1.genomic.fna.gz
Saving 569 genomes from file : bacteria.69.1.genomic.fna.gz
Prasing 870/1212 file: bacteria.690.1.genomic.fna.gz
Saving 379 genomes from file : bacteria.690.1.genomic.fna.gz
Prasing 871/1212 file: bacteria.691.1.genomic.fna.gz
Saving 338 genomes from file : bacteria.691.1.genomic.fna.gz
Prasing 872/1212 file: bacteria.692.1.genomic.fna.gz
Saving 532 genomes from file : bacteria.692.1.genomic.fna.gz
Prasing 873/1212 file: bacteria.693.1.genomic.fna.gz
Saving 391 genomes from file : bacteria.693.1.genomic.fna.gz
Prasing 874/1212 file: bacteria.694.1.genomic.fna.gz
Saving 617 genomes from file : bacteria.694.1.genomic.fna.gz
Prasing 875/1212 file: bacteria.695.1.genomic.fna.gz
Saving 519 genomes from file : bacteria.695.1.genomic.fna.gz
Prasing 876/1212 file: bacteria.696.1.genomic.fna.gz
Saving 460 genomes from file : bacteria.696.1.genomic.fna.gz
Prasing 877/1212 file: bacter

Saving 351 genomes from file : bacteria.754.1.genomic.fna.gz
Prasing 942/1212 file: bacteria.755.1.genomic.fna.gz
Saving 290 genomes from file : bacteria.755.1.genomic.fna.gz
Prasing 943/1212 file: bacteria.756.1.genomic.fna.gz
Saving 191 genomes from file : bacteria.756.1.genomic.fna.gz
Prasing 944/1212 file: bacteria.757.1.genomic.fna.gz
Saving 523 genomes from file : bacteria.757.1.genomic.fna.gz
Prasing 945/1212 file: bacteria.758.1.genomic.fna.gz
Saving 451 genomes from file : bacteria.758.1.genomic.fna.gz
Prasing 946/1212 file: bacteria.759.1.genomic.fna.gz
Saving 267 genomes from file : bacteria.759.1.genomic.fna.gz
Prasing 947/1212 file: bacteria.76.1.genomic.fna.gz
Saving 485 genomes from file : bacteria.76.1.genomic.fna.gz
Prasing 948/1212 file: bacteria.760.1.genomic.fna.gz
Saving 314 genomes from file : bacteria.760.1.genomic.fna.gz
Prasing 949/1212 file: bacteria.761.1.genomic.fna.gz
Saving 392 genomes from file : bacteria.761.1.genomic.fna.gz
Prasing 950/1212 file: bacter

Saving 463 genomes from file : bacteria.819.1.genomic.fna.gz
Prasing 1014/1212 file: bacteria.82.1.genomic.fna.gz
Saving 520 genomes from file : bacteria.82.1.genomic.fna.gz
Prasing 1015/1212 file: bacteria.820.1.genomic.fna.gz
Saving 1025 genomes from file : bacteria.820.1.genomic.fna.gz
Prasing 1016/1212 file: bacteria.821.1.genomic.fna.gz
Saving 460 genomes from file : bacteria.821.1.genomic.fna.gz
Prasing 1017/1212 file: bacteria.822.1.genomic.fna.gz
Saving 161 genomes from file : bacteria.822.1.genomic.fna.gz
Prasing 1018/1212 file: bacteria.823.1.genomic.fna.gz
Saving 561 genomes from file : bacteria.823.1.genomic.fna.gz
Prasing 1019/1212 file: bacteria.824.1.genomic.fna.gz
Saving 640 genomes from file : bacteria.824.1.genomic.fna.gz
Prasing 1020/1212 file: bacteria.825.1.genomic.fna.gz
Saving 525 genomes from file : bacteria.825.1.genomic.fna.gz
Prasing 1021/1212 file: bacteria.826.1.genomic.fna.gz
Saving 1026 genomes from file : bacteria.826.1.genomic.fna.gz
Prasing 1022/1212 f

Saving 809 genomes from file : bacteria.884.1.genomic.fna.gz
Prasing 1086/1212 file: bacteria.885.1.genomic.fna.gz
Saving 478 genomes from file : bacteria.885.1.genomic.fna.gz
Prasing 1087/1212 file: bacteria.886.1.genomic.fna.gz
Saving 540 genomes from file : bacteria.886.1.genomic.fna.gz
Prasing 1088/1212 file: bacteria.887.1.genomic.fna.gz
Saving 463 genomes from file : bacteria.887.1.genomic.fna.gz
Prasing 1089/1212 file: bacteria.888.1.genomic.fna.gz
Saving 490 genomes from file : bacteria.888.1.genomic.fna.gz
Prasing 1090/1212 file: bacteria.889.1.genomic.fna.gz
Saving 452 genomes from file : bacteria.889.1.genomic.fna.gz
Prasing 1091/1212 file: bacteria.89.1.genomic.fna.gz
Saving 526 genomes from file : bacteria.89.1.genomic.fna.gz
Prasing 1092/1212 file: bacteria.890.1.genomic.fna.gz
Saving 97 genomes from file : bacteria.890.1.genomic.fna.gz
Prasing 1093/1212 file: bacteria.891.1.genomic.fna.gz
Saving 391 genomes from file : bacteria.891.1.genomic.fna.gz
Prasing 1094/1212 file

Saving 72 genomes from file : bacteria.949.1.genomic.fna.gz
Prasing 1158/1212 file: bacteria.95.1.genomic.fna.gz
Saving 168 genomes from file : bacteria.95.1.genomic.fna.gz
Prasing 1159/1212 file: bacteria.950.1.genomic.fna.gz
Saving 2028 genomes from file : bacteria.950.1.genomic.fna.gz
Prasing 1160/1212 file: bacteria.951.1.genomic.fna.gz
Saving 2288 genomes from file : bacteria.951.1.genomic.fna.gz
Prasing 1161/1212 file: bacteria.952.1.genomic.fna.gz
Saving 338 genomes from file : bacteria.952.1.genomic.fna.gz
Prasing 1162/1212 file: bacteria.953.1.genomic.fna.gz
Saving 1293 genomes from file : bacteria.953.1.genomic.fna.gz
Prasing 1163/1212 file: bacteria.954.1.genomic.fna.gz
Saving 424 genomes from file : bacteria.954.1.genomic.fna.gz
Prasing 1164/1212 file: bacteria.955.1.genomic.fna.gz
Saving 855 genomes from file : bacteria.955.1.genomic.fna.gz
Prasing 1165/1212 file: bacteria.956.1.genomic.fna.gz
Saving 872 genomes from file : bacteria.956.1.genomic.fna.gz
Prasing 1166/1212 f

Merge Archea and Bacteria

In [54]:
!copy E:\masters\virnet\data\1-genomes\archaea.fna + E:\masters\virnet\data\1-genomes\bacteria.fna  E:\masters\virnet\data\1-genomes\non_viral.fna

E:\masters\virnet\data\1-genomes\archaea.fna
E:\masters\virnet\data\1-genomes\bacteria.fna
        1 file(s) copied.


## 2- Split Genomes

In [70]:
def split_sample_genomes(genometype,input_folder,output_folder,sample=False):
    random.seed(42)
    start=time.time()
    fasta_file=os.path.join(input_folder,"{0}.fna".format(genometype))
    train_file=os.path.join(output_folder,"{0}_train.fna".format(genometype))
    test_file=os.path.join(output_folder,"{0}_test.fna".format(genometype))
    count=0
    pb_count=0
    t_count=0
    t_pb_count=0
    print('Reading and Splitting {0} genomes'.format(genometype))
    with open(fasta_file,"rt") as handle , open(train_file,'w+') as train_f, open(test_file,'w+') as test_f:
        for record in SeqIO.parse(handle, "fasta"):
            if(sample==True and random.randint(0,2)!=0):
                continue
            toss=random.randint(1,10)
            if(toss==4 or toss==7):
                test_f.write('>{0}\n{1}\n'.format(record.id,record.seq))
                t_count+=1
                t_pb_count+=len(record.seq)
            else:
                train_f.write('>{0}\n{1}\n'.format(record.id,record.seq))
                count+=1
                pb_count+=len(record.seq)
    end=time.time()
    print('Training {0} genomes'.format(count))
    print('Testing {0} genomes'.format(t_count))
    print('Training bp ratio {0:.3f}%'.format(pb_count*100.0/(pb_count+t_pb_count)))
    print('Total bp {0}'.format(pb_count+t_pb_count))
    print('Time elapased {0:.3f} Sec'.format(end-start))

In [74]:
split_sample_genomes('viral',genome_folder,train_test_folder)

Reading and Splitting viral genomes
Training 7686 genomes
Testing 1870 genomes
Training bp ratio 79.852%
Total bp 261109346
Time elapased 3.313 Sec


In [58]:
split_sample_genomes('non_viral',genome_folder,train_test_folder,sample=True)

Reading and Splitting non_viral genomes
Training 143241 genomes
Testing 35543 genomes
Training bp ratio 80.345%
Time elapased 432.700 Sec


## 3- Generate Fragments

In [86]:
def rand_parts(seq, n_gen, contigs_len):
    indices = range(len(seq) - (contigs_len - 1) * n_gen)
    result = []
    offset = 0
    for i in sorted(random.sample(indices, n_gen)):
        i += offset
        result.append(str(seq[i:i+contigs_len]))        
        offset += contigs_len - 1
    return result

def get_sequences(seq,contigs_len=500,is_random=False):
    if is_random:
        n_gen=random.randint(0,50)
        if (len(seq)>contigs_len):
            if (len(seq)<contigs_len*n_gen):
                start=random.randint(0,len(seq) - (contigs_len - 1))
                return [str(seq[start:start+contigs_len])]
            else:
                return rand_parts(seq,n_gen,contigs_len)
        else:
            return []
    else:
        seq_list=[]
        n_gen=len(seq)//contigs_len
        for i in range(n_gen):
            start=i*contigs_len   
            end=start+contigs_len
            seq_list.append(str(seq[start:end]))    
        ## Adding the last contigs with the perivous (The last contigs is overlapping TODO fix)
        seq_list.append(str(seq[-contigs_len:]))
        return seq_list
    
def generate_fragment(file,input_folder,output_folder,n,sample=False):
    random.seed(42)
    start=time.time()
    file_path=os.path.join(input_folder,file)
    output_path=os.path.join(output_folder,"{0}_{1}.fna".format(file,n))
    n_reads=0
    with open(file_path,'rt') as handle, open(output_path,'w+') as out_f:
        for record in SeqIO.parse(handle, "fasta"):
            generated_sequences=get_sequences(record.seq,contigs_len=n,is_random=sample)
            n_reads+=len(generated_sequences)
            for item in generated_sequences:
                out_f.write('>{0}\n{1}\n'.format(record.id,item))
    end=time.time()
    print('Generated reads {0}'.format(n_reads))
    print('Time elapased {0:.3f} Sec'.format(end-start))

In [80]:
generate_fragment('viral_train.fna',train_test_folder,fragments_folder,n=100)
generate_fragment('viral_test.fna',train_test_folder,fragments_folder,n=100)

Generated reads 2088863
Time elapased 8.974 Sec
Generated reads 527020
Time elapased 2.376 Sec


In [87]:
generate_fragment('non_viral_test.fna',train_test_folder,fragments_folder,n=100,sample=True)
generate_fragment('non_viral_train.fna',train_test_folder,fragments_folder,n=100,sample=True)

Generated reads 517673
Time elapased 10.981 Sec
Generated reads 2094125
Time elapased 80.865 Sec


## 4- Generate Metagenomes

In [3]:
def create_metagenome_genomes(file,reads,read_len,viral_ratio):
    random.seed(42)
    print("Starting creating genomes for {0}".format(file))
    start=time.time()
    total_length=reads*read_len
    output_file=os.path.join(metagenome_folder,"{0}.fna".format(file))


    viral_path=os.path.join(train_test_folder,'viral_test.fna')
    non_viral_path=os.path.join(train_test_folder,'non_viral_test.fna')

    vfaa = list(Fasta(viral_path))
    bfaa = list(Fasta(non_viral_path))

    random.shuffle(vfaa)
    random.shuffle(bfaa)

    v_len=0
    v_count=0
    b_len=0
    b_count=0
    with open(output_file,'w+') as fout:
        print("Sample from Viruses")
        for sample in vfaa:
            if(v_len>=int(total_length*viral_ratio)):
                break
            fout.write('>{0}\n{1}\n'.format(sample.name,sample))
            v_len+=len(sample)
            v_count+=1
        print('Viruses Length {0} bp'.format(v_len))
        print('# Viruses Genomes {0}'.format(v_count))

        print("Sample from Bacteria")
        for sample in bfaa:
            if(v_len*1.0/(v_len+b_len)<=viral_ratio):
                break
            fout.write('>{0}\n{1}\n'.format(sample.name,sample))
            b_len+=len(sample)
            b_count+=1
    print('Bacteria Length {0} bp'.format(b_len))
    print('# Bacteria Genomes {0}'.format(b_count))
    end=time.time()
    print('Time elapased {0:3f} Secs'.format(end-start))
    print('Total Length {0} bp'.format(v_len+b_len))
    print('Bacteria ratio {0:.2f}%'.format(b_len*1.0/(v_len+b_len)*100))

In [6]:
create_metagenome_genomes('microbiome',reads=10**6,read_len=100,viral_ratio=0.25)
create_metagenome_genomes('virome',reads=10**6,read_len=100,viral_ratio=0.75)

Starting creating genomes for microbiome
Sample from Viruses
Viruses Length 25133078 bp
# Viruses Genomes 845
Sample from Bacteria
Bacteria Length 75450367 bp
# Bacteria Genomes 1488
Time elapased 11.989594 Secs
Total Length 100583445 bp
Bacteria ratio 75.01%
Starting creating genomes for virome
Sample from Viruses
Viruses Length 52609236 bp
# Viruses Genomes 1870
Sample from Bacteria
Bacteria Length 17551396 bp
# Bacteria Genomes 422
Time elapased 1.445022 Secs
Total Length 70160632 bp
Bacteria ratio 25.02%


## 5- Real case Study