# Download bacteriophage genomes from the NCBI FTP site using assembly accessions from NCBI Virus

In [None]:
## IMPORT ##
import pandas as pd
from pandas import read_csv
import urllib.request
import time
import os
from datetime import date
from pyfaidx import Fasta

First, download the RefSeq genome records from NCBI Virus in CSV format from https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&VirusLineage_ss=Bacteriophage,%20all%20taxids and then read the file into a DataFrame.

In [None]:
# Load the CSV exported from NCBI Virus; its 'Assembly' column supplies the
# assembly accessions that are filtered and downloaded further below.
seq = pd.read_csv('sequences.csv')

Filter the genomes and create a list of the genomes you want to download.

In [None]:
# Keep only the metadata columns needed for filtering, ordered by Host and Family.
# sort_values returns a new DataFrame instead of sorting in place, so its result
# must be assigned for the ordering to take effect.
new_seq = (
    seq[['Assembly', 'Species', 'Molecule_type', 'Family', 'Host', 'GenBank_Title']]
    .sort_values(by=['Host', 'Family'])
)

# Siphoviridae phages whose host is Lactococcus lactis
siphoviridae = new_seq.loc[
    (new_seq['Family'] == 'Siphoviridae') & (new_seq['Host'] == 'Lactococcus lactis')
]

# Assembly accessions of those phages, as a plain list
asb = siphoviridae["Assembly"].tolist()

In [None]:
# All Siphoviridae records, whatever their host, and their assembly accessions.
siphovir_loc = seq[seq['Family'].eq('Siphoviridae')]
sipho_list = siphovir_loc['Assembly'].to_list()

In [None]:
# Preview the first 25 Siphoviridae accessions.
sipho_list[:25]

In [None]:
# Number of entries in the Siphoviridae accession list.
len(sipho_list)

In [None]:
# Preview the first 25 accessions of the Lactococcus lactis subset.
asb[:25]

Download the `assembly_summary_refseq.txt` file, which lists the FTP path of every RefSeq assembly, to look up the paths of the selected assemblies.

In [None]:
# The RefSeq assembly summary is a very large file, so reuse the copy already on
# disk instead of fetching it again on every run. Delete the local file to force
# a fresh download.
if not os.path.isfile("assembly_summary_refseq.txt"):
    urllib.request.urlretrieve("https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt", "assembly_summary_refseq.txt")

In [None]:
def get_assemblies(phages_list, path, summary_path='/home/amanda/assembly_summary_refseq.txt'):
    '''
    Download the genomic FASTA archive of every assembly in a filtered list.

    Each accession is looked up in the RefSeq assembly summary table and its
    ``*_genomic.fna.gz`` file is saved as ``<accession>.fna.gz`` inside ``path``.
    Files that are already present are not downloaded again.

    Parameters
    ----------
    phages_list : iterable of str
        Assembly accessions to download. Entries that are not strings
        (for example NaN for records without an assembly) are ignored.
    path : str
        Directory that receives the downloaded files.
    summary_path : str, optional
        Location of ``assembly_summary_refseq.txt``. The default is the
        location this function has always read from.

    Returns
    -------
    None
        Progress is reported with ``print``.

    Raises
    ------
    ValueError
        If the summary table lacks the columns needed to locate the files.
    '''
    phages_list = list(phages_list)  # may be iterated twice below

    # The first line of the summary is a free-text comment and the second line is
    # the header, whose first name carries a leading '#'. Taking the names from
    # the file keeps this working when columns are appended to the summary.
    assembly_sum = pd.read_csv(summary_path, sep='\t', skiprows=1, dtype=str)
    assembly_sum.columns = [str(col).lstrip('#').strip() for col in assembly_sum.columns]
    missing = {'assembly_accession', 'ftp_path'} - set(assembly_sum.columns)
    if missing:
        raise ValueError("assembly summary is missing column(s): " + ", ".join(sorted(missing)))

    # Filter the large table once instead of scanning it for every accession.
    wanted = [acc for acc in phages_list if isinstance(acc, str)]
    hits = assembly_sum[assembly_sum['assembly_accession'].isin(wanted)]
    ftp_by_accession = {}
    for accession, ftp_dir in zip(hits['assembly_accession'], hits['ftp_path']):
        ftp_dirs = ftp_by_accession.setdefault(accession, [])
        # 'na' (or a missing value) marks an assembly without an FTP directory.
        if isinstance(ftp_dir, str) and ftp_dir != 'na':
            ftp_dirs.append(ftp_dir.rstrip('/'))

    for assembly in phages_list:
        if not isinstance(assembly, str):
            continue

        file_in = assembly + '.fna.gz'  # gzip format
        fullfilename = os.path.join(path, file_in)  # directory and file name
        if os.path.isfile(fullfilename):
            # genome is already in the directory: skip it and continue with the rest
            print(file_in, " already exist")
            continue

        ftp_dirs = ftp_by_accession.get(assembly)
        if ftp_dirs is None:
            print("Skipping", file_in, " - not listed in the assembly summary.")
            continue
        if not ftp_dirs:
            print("Skipping", file_in, " - doesn't exist.")
            continue

        for ftp_dir in ftp_dirs:
            # Files inside an assembly directory are prefixed with the directory
            # name itself, which may differ from the raw asm_name (e.g. spaces
            # replaced by underscores), so build the URL from the directory name.
            url = ftp_dir + '/' + ftp_dir.rsplit('/', 1)[-1] + '_genomic.fna.gz'
            # Download under a temporary name so an interrupted transfer is never
            # mistaken for a finished genome on the next run.
            partial = fullfilename + '.part'
            try:
                urllib.request.urlretrieve(url, partial)
                os.replace(partial, fullfilename)
            except Exception as err:
                # Best effort: report the problem and carry on with the next file,
                # but let KeyboardInterrupt / SystemExit propagate.
                if os.path.exists(partial):
                    os.remove(partial)
                print("Skipping", file_in, " - doesn't exist.", "(" + repr(err) + ")")
                continue
            print("Download:", file_in)
            time.sleep(1)  # short pause between requests to the server
            break


In [None]:
def folder(parent_dir, folder_name):
    '''
    Create the folder ``folder_name`` inside ``parent_dir`` if it is missing.

    Missing parent directories are created as well, and an existing folder is
    left untouched, so the call can be repeated when the notebook is re-run.

    Parameters
    ----------
    parent_dir : str
        Directory in which the folder is created.
    folder_name : str
        Name of the folder to create.
    '''
    path = os.path.join(parent_dir, folder_name)
    if os.path.isdir(path):
        print("Directory '% s' already exists" % folder_name)
        return
    os.makedirs(path, exist_ok=True)
    print("Directory '% s' created" % folder_name)
    

In [None]:
# Create the directory that will receive the Siphoviridae genomes.
folder('/home/amanda/Bacteriophages','Siphoviridae')

In [None]:
# Download directory used by the download, inventory and indexing steps below.
path = '/home/amanda/Bacteriophages/Siphoviridae' 

In [None]:
# Download every assembly in the Siphoviridae list into the download directory.
get_assemblies(sipho_list, path)

In [None]:
def inventory(path):
    '''
    Write ``inventory.txt`` in ``path`` listing the files found under ``path``.

    Sub-directories are walked as well. Each line holds a file name, a tab and
    today's date; files named ``inventory.txt`` are left out of the listing.
    '''
    stamp = str(date.today())
    inventory_file = os.path.join(path, "inventory.txt")
    with open(inventory_file, "w") as out:
        for _dirpath, _dirnames, names in os.walk(path):
            out.writelines(
                name + '\t' + stamp + '\n'
                for name in names
                if name != 'inventory.txt'
            )

            


### Create one file for all downloaded genomes with samtools

Change to the download directory, then create one compressed FASTA file and a `*.fai` index file, which lets samtools quickly access any region of a genome.

In [None]:
# Work inside the download directory so the shell cells below can refer to the
# genome files by bare file name.
os.chdir(path)
print("Current Working Directory: " , os.getcwd())

In [None]:
%%bash
# Stop at the first failing command instead of indexing a partial file.
set -euo pipefail

# Decompress to stdout so the downloaded archives stay in place: the cell can
# then be re-run, and the download step still recognises the genomes it already
# fetched. '>' (not '>>') rebuilds the outputs from scratch, so a re-run does
# not append a second copy of every sequence.
gunzip -c GCF*.fna.gz > genomy.fasta
bgzip -c genomy.fasta > genomy.fasta.bgzf

# Index both files: genomy.fasta.fai sits next to the plain FASTA, and
# genomy.fasta.bgzf.fai is the index read by the shell cells below.
samtools faidx genomy.fasta
samtools faidx genomy.fasta.bgzf

In [None]:
# Record the files currently in the download directory, stamped with today's date.
inventory(path)

In [None]:
%%bash
# Extract region 1-1234 of record NC_001706.1 from the compressed FASTA.
samtools faidx genomy.fasta.bgzf NC_001706.1:1-1234


**For each row in the `*.fai` file:**

- Column 1: Accession

- Column 2: The number of bases in the genome

- Column 3: The byte index of the file where the genome sequence begins. (Notice how it constantly increases by roughly the amount in column 2?)

- Column 4: bases per line in the FASTA file

- Column 5: bytes per line in the FASTA file


In [None]:
%%bash
# Print the first five columns of the index, separated by tabs.
awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5}' genomy.fasta.bgzf.fai

In [None]:
%%bash
# Keep only the first two columns of the index: accession and number of bases.
cut -f1-2 genomy.fasta.bgzf.fai

### Use pyfaidx to access any subsequence of the file

More about pyfaidx: https://pypi.org/project/pyfaidx/

In [None]:
# Open the uncompressed multi-FASTA with pyfaidx so records can be addressed by name.
genes = Fasta('/home/amanda/Bacteriophages/Siphoviridae/genomy.fasta')

In [None]:
# Names of the records available in the file.
genes.keys()

In [None]:
# Slice of a single record.
# NOTE(review): if pyfaidx slicing is 0-based like Python sequences, [1:2341]
# starts at the second base — confirm this is the intended range.
genes['NC_013153.1'][1:2341]

In [None]:
# Print the long name of each record in the file.
for record in genes:
    print(record.long_name)

In [None]:
# Print record NC_002166.1 piece by piece, as yielded by iterating over it.
for line in genes['NC_002166.1']:
    print(line)

In [None]:
# Length of record NC_002166.1.
len(genes['NC_002166.1'])