## Sourcing data from NCBI

In [13]:
# ./datasets summary taxonomy taxon "Pectobacteriaceae"
P_taxon_id = 1903410

## Importing data

### Phage accession lookup
Phage name, ID, NCBI accession 

In [14]:
import pandas as pd
import sys
os_system = sys.platform #recognize operationg system.
if os_system == "win32":
    datasets = "./datasets.exe"
    dataformat = "./dataformat.exe"
else:
    datasets = "./datasets_macOS"
    dataformat = "./dataformat_macOS"

PhagesDB_all = pd.read_csv("PhagesDB_accession_numbers.txt", sep="\t", names=["phage_name", "id", "accession"], header=0)
PhagesDB_all

Unnamed: 0,phage_name,id,accession
0,244,DQ398041,NC_008194
1,32HC,KJ028219,NC_023602
2,39HC,KJ433973,NC_023603
3,40AC,KJ192196,NC_023607
4,40BC,KJ433975,
...,...,...,...
5073,Zorro,MK279896,
5074,Zucker,ON645346,
5075,Zuko,MN204493,
5076,Zulu,MH779517,


Only phages with accession numbers

In [15]:
PDB_acc = PhagesDB_all[PhagesDB_all['accession'].notna()]
PDB_acc

Unnamed: 0,phage_name,id,accession
0,244,DQ398041,NC_008194
1,32HC,KJ028219,NC_023602
2,39HC,KJ433973,NC_023603
3,40AC,KJ192196,NC_023607
24,Acadian,JN699007,NC_023701
...,...,...,...
4917,WIVsmall,KC736071,NC_021334
4945,Xeno,KU935728,NC_031243
5019,Zaka,KF560334,NC_022985
5041,Zemlya,KC700558,NC_021339


In [16]:
display(PDB_acc.head(10))

Unnamed: 0,phage_name,id,accession
0,244,DQ398041,NC_008194
1,32HC,KJ028219,NC_023602
2,39HC,KJ433973,NC_023603
3,40AC,KJ192196,NC_023607
24,Acadian,JN699007,NC_023701
36,Adawi,KF279411,NC_022328
42,Adjutor,EU676000,NC_010763
51,Adzzy,KF416344,NC_022058
54,Aeneas,JQ809703,NC_023723
77,Akoma,JN699006,NC_023742


In [17]:
### Search on NCBI for extra info
import subprocess
import json
from tqdm import tqdm
df_reports = pd.DataFrame()

failed_accessions = []

for acc in tqdm(PDB_acc['accession'], desc="Fetching data from NCBI", unit="acc"):
    #print(acc+".1")
    res = subprocess.run([datasets, "summary", "virus", "genome", "accession", acc+".1"],
                         capture_output=True,   # capture stdout and stderr
                         text=True, 
                         check=True)          # raise an error if the command fails
    # Parse the JSON string
    data = json.loads(res.stdout)
    
    # Check empty data
    if "reports" not in data:
        #print("No reports found in the JSON data.")
        failed_accessions.append(acc)
        continue

    # Convert the 'reports' list to a pandas DataFrame
    if df_reports.empty:
        df_reports = pd.DataFrame(data['reports'])
    else:
        df_reports = pd.concat([df_reports, pd.DataFrame(data['reports'])], ignore_index=True)
display(df_reports)


Fetching data from NCBI: 100%|██████████| 287/287 [02:48<00:00,  1.70acc/s]


Unnamed: 0,accession,bioprojects,completeness,host,is_annotated,is_lab_host,lab_host,length,nucleotide,protein_count,release_date,source_database,submitter,update_date,virus,isolate,location
0,NC_008194.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Mycobacterium smegmatis,74483,{'sequence_hash': '2974DF27'},142,2006-06-20T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-11T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",,
1,NC_023602.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,,,50781,{'sequence_hash': 'CFEA6EA'},86,2014-03-04T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",,
2,NC_023603.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,,,71565,{'sequence_hash': '52C4C232'},100,2014-03-04T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",,
3,NC_023607.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,,,53396,{'sequence_hash': '4EC4E167'},90,2014-03-04T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",,
4,NC_023701.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Mycobacterium smegmatis mc2 155,69864,{'sequence_hash': '4E62B368'},97,2014-03-11T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-11T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",{'collection_date': '2009-07-01'},"{'geographic_location': 'USA: Lafayette, LA', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258,NC_021334.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Mycobacterium smegmatis mc2 155,53359,{'sequence_hash': 'A8A0EF4E'},83,2013-06-03T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",,"{'geographic_location': 'China', 'geographic_r..."
259,NC_031243.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Mycobacterium smegmatis mc2 155,42395,{'sequence_hash': '5E5F2E14'},69,2016-09-27T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-09T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",{'collection_date': '2014-10-01'},"{'geographic_location': 'USA: New Haven, CT', ..."
260,NC_022985.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Mycobacterium smegmatis mc2 155,52122,{'sequence_hash': '910ADD35'},101,2013-12-04T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",{'collection_date': '2011-09-29'},"{'geographic_location': 'USA: Los Angeles, CA'..."
261,NC_021339.1,[PRJNA485481],COMPLETE,"{'lineage': [{'name': 'cellular organisms', 't...",True,True,Streptomyces lividans,51077,{'sequence_hash': '7D95B565'},76,2013-06-03T00:00:00Z,RefSeq,{'affiliation': 'National Center for Biotechno...,2023-01-08T00:00:00Z,"{'lineage': [{'name': 'Viruses', 'tax_id': 102...",{'collection_date': '2008-10-30'},"{'geographic_location': 'USA: Houston, Texas',..."


### Obtaining fasta sequences for each phage
Using NCBI CLI:

In [18]:
import zipfile
import os

failed_genomes = []

for acc in tqdm(PDB_acc['accession'], desc="Downloading phage genomes from NCBI", unit="acc"):
    try:
        res = subprocess.run([datasets
        , "download", "virus", "genome", "accession", acc+".1"],
                            capture_output=True,   # capture stdout and stderr
                            text=True, 
                            check=True)          # raise an error if the command fails
    except subprocess.CalledProcessError as e:
        #print(f"Error fetching accession {acc}: {e.stderr}")
        failed_genomes.append(acc)
        continue

    phage_name = PDB_acc.loc[PDB_acc['accession'] == acc, 'phage_name'].values[0]

    if not os.path.exists("ncbi_phage_genomes"):
        os.mkdir("ncbi_phage_genomes")

    with zipfile.ZipFile("ncbi_dataset.zip", "r") as zip_ref:
        file = zip_ref.extract("ncbi_dataset/data/genomic.fna", path=".")
        with open(file, 'r') as f:
            fasta_content = f.read()
            with open(f"ncbi_phage_genomes/{phage_name}.fasta", "w") as output_file:
                output_file.write(fasta_content)

Downloading phage genomes from NCBI: 100%|██████████| 287/287 [03:56<00:00,  1.21acc/s]


### Excluding failed accessions

In [19]:
for acc in failed_genomes:
    print(f"Failed to download genome for accession: {acc}")

for acc in failed_accessions:
    print(f"Failed to fetch summary for accession: {acc}")

print("Filtering")

PDB_acc_success = PDB_acc[~PDB_acc['accession'].isin(failed_accessions + failed_genomes)]
df_reports_success = df_reports[~df_reports['accession'].isin(failed_accessions + failed_genomes)]

PDB_acc_success.to_csv("data_prod/PDB_acc.csv", sep="\t", index=False)
df_reports_success.to_csv("data_prod/NCBI_virus_genome_summary_PDB_phages.csv", index=False)

Failed to download genome for accession: NC_010763
Failed to download genome for accession: NC_029018
Failed to download genome for accession: NC_022972
Failed to download genome for accession: NC_022327
Failed to download genome for accession: NC_010762
Failed to download genome for accession: NC_021296
Failed to download genome for accession: NC_011286
Failed to download genome for accession: NC_004682
Failed to download genome for accession: NC_011284
Failed to download genome for accession: NC_004680
Failed to download genome for accession: NC_004686
Failed to download genome for accession: NC_014459
Failed to download genome for accession: NC_001900
Failed to download genome for accession: NC_021306
Failed to download genome for accession: NC_009993
Failed to download genome for accession: NC_008202
Failed to download genome for accession: NC_022054
Failed to download genome for accession: NC_004664
Failed to download genome for accession: NC_001978
Failed to download genome for a

OSError: Cannot save file into a non-existent directory: 'data_prod'

### Phages against clusters
Clusters identify which bacteria each phage can infect

In [None]:
PDB_phage_cluster = pd.read_csv("PhagesDB_Data.txt", sep="\t", header=0)
PDB_phage_cluster

Unnamed: 0,Phage Name,Cluster,Subcluster
0,20ES,A,A2
1,244,E,
2,32HC,Z,
3,39HC,B,B6
4,40AC,A,A17
...,...,...,...
5573,Zorro,AK,
5574,Zucker,FN,
5575,Zuko,BR,
5576,Zulu,A,A6
