In [1]:
import sys 
import glob
from time import sleep
from io import StringIO
from collections import defaultdict
from typing import Union
import entrezpy.conduit



# vseek imports
sys.path.append("../")
import vseek.common.vseek_paths as vsp
from vseek.common.loader import load_bat_virus_data
from vseek.apis.ncbi import get_viral_genomes, get_viral_genes
from vseek.common.checks import genome_db_exist
from vseek.common.io_files import get_genome_genes_paths, get_viral_genome_fasta_paths, save_genes, clean_all_genes


In [2]:
# Load the bat-virus table and discard the "Unnamed: 0" column in one step,
# so re-running this cell always starts from a freshly loaded frame.
bat_vir_df = load_bat_virus_data().drop(columns="Unnamed: 0")

# Every value of the "Representative" column, as a plain Python list.
all_accessions = bat_vir_df["Representative"].tolist()

bat_vir_df.head()


Unnamed: 0,Representative,Host,Taxonomy name,Segment name,family,genus
0,NC_001348,human,Human alphaherpesvirus 3,segment,Herpesviridae,Varicellovirus
1,NC_001352,human,Human papillomavirus type 2,segment,Papillomaviridae,Alphapapillomavirus
2,NC_001354,human,Human papillomavirus type 41,segment,Papillomaviridae,Nupapillomavirus
3,NC_001356,human,Human papillomavirus type 1a,segment,Papillomaviridae,Mupapillomavirus
4,NC_001430,human,Enterovirus D68,segment,Picornaviridae,Enterovirus


In [6]:
# Request gene metadata for every accession in `all_accessions`.
# Judging by the log this call prints, it first checks that the genome
# database exists and then requests genes for accessions whose gene data
# files are missing -- NOTE(review): confirm against vseek.apis.ncbi.
get_viral_genes(email="erikishere3@gmail.com", accession=all_accessions)


Downloading viral genes metadata
Checking if genome database exists ...
Database exists! Checking for missing gene data files
> Requesting NC_035211 genes ... 
> Requesting NC_001943 genes ... 
> Requesting NC_012776 genes ... 
> Requesting NC_017995 genes ... 
> Requesting NC_009539 genes ... 
> Requesting NC_026161 genes ... 
> Requesting NC_038351 genes ... 
> Requesting NC_022788 genes ... 
> Requesting NC_001526 genes ... 
> Requesting NC_001676 genes ... 
> Requesting NC_014086 genes ... 
> Requesting NC_014361 genes ... 
> Requesting NC_020106 genes ... 
> Requesting NC_038728 genes ... 
> Requesting NC_055331 genes ... 
> Requesting NC_006317 genes ... 
> Requesting NC_038605 genes ... 
> Requesting NC_038600 genes ... 
> Requesting NC_001586 genes ... 
> Requesting NC_038602 genes ... 
> Requesting NC_038350 genes ... 
> Requesting NC_039061 genes ... 
> Requesting NC_055339 genes ... 
> Requesting NC_010562 genes ... 
> Requesting NC_001722 genes ... 
> Requesting NC_038415 

In [4]:
email = "erikishere3@gmail.com"
# accession = "NC_001693"
accession = "NC_001348"
buffer = 0.5  # seconds to pause after the request

# The pipeline's result is read back from stdout, so stdout is temporarily
# swapped for an in-memory buffer while the request runs.
old_stdout = sys.stdout
gene_result = StringIO()
sys.stdout = gene_result
try:
    # calling ncbi gene database: search by accession, then fetch the hits
    c = entrezpy.conduit.Conduit(email)
    fetch_genes = c.new_pipeline()
    sid = fetch_genes.add_search(
        {
            "db": "gene",
            "term": f"{accession}",
            "rettype": "count",
            "datetype": "pdat",
        }
    )
    fid = fetch_genes.add_fetch(
        {"retmax": 10, "retmode": "text", "rettype": "fasta"}, dependency=sid
    )
    c.run(fetch_genes)
    sleep(buffer)
finally:
    # Always put the real stdout back, even if the request raises; otherwise
    # every later cell in this kernel would print into the buffer unseen.
    sys.stdout = old_stdout

# store the captured text in a variable for the parsing cells below
gene_conts = gene_result.getvalue()

print(gene_conts)

# parsing gene response contents
# gene_info = _parse_ncbi_genes_response(gene_conts)


1. ORF68
envelope glycoprotein E [Human alphaherpesvirus 3 (Varicella-zoster virus)]
Other Aliases: HHV3_gp69
Other Designations: envelope glycoprotein E
Annotation:  NC_001348.1 (115808..117721)
ID: 1487709

2. ORF61
ubiquitin E3 ligase ICP0 [Human alphaherpesvirus 3 (Varicella-zoster virus)]
Other Aliases: HHV3_gp62
Other Designations: ubiquitin E3 ligase ICP0
Annotation:  NC_001348.1 (103001..104485, complement)
ID: 1487698

3. ORF9
tegument protein VP22 [Human alphaherpesvirus 3 (Varicella-zoster virus)]
Other Aliases: HHV3_gp11
Other Designations: tegument protein VP22
Annotation:  NC_001348.1 (11009..11964)
ID: 1487674

4. ORF31
envelope glycoprotein B [Human alphaherpesvirus 3 (Varicella-zoster virus)]
Other Aliases: HHV3_gp33
Other Designations: envelope glycoprotein B
Annotation:  NC_001348.1 (56819..59648)
ID: 1487662

5. ORF54
capsid portal protein [Human alphaherpesvirus 3 (Varicella-zoster virus)]
Other Aliases: HHV3_gp55
Other Designations: capsid portal protein
Annotati

In [5]:
# Keep only the non-empty lines of the captured response.
split_contents = [row for row in gene_conts.splitlines() if row]

# Slice the remaining lines into consecutive groups of six
# (assumes each gene record spans exactly six non-blank lines -- TODO confirm).
chunked_contents = [
    split_contents[start : start + 6]
    for start in range(0, len(split_contents), 6)
]

In [None]:
for chunk in chunked_contents:
    # records mentioning "discontinued" anywhere are ignored entirely
    if any("discontinued" in entry for entry in chunk):
        continue

    # header line looks like "1. ORF68"; the gene name follows the last dot
    name = chunk[0].rsplit(".", 1)[-1].strip()

    # the remaining lines are dispatched on their field prefix
    for line in chunk[1:]:
        if line.startswith("Annotation"):
            # Isolate the "start..end" token. With a "complement" qualifier the
            # coordinates are the second whitespace token after the colon;
            # otherwise they are the last token of the line.
            raw_span = (
                line.split(":")[-1].replace("(", "").replace(")", "").split()[1]
                if "complement" in line
                else line.split()[-1].replace("(", "").replace(")", "")
            )
            annotation = raw_span.replace(",", "").split("..")
            annotation_range = tuple(int(bound) for bound in annotation)

        elif line.startswith("Other Designations:"):
            description = line.rsplit(":", 1)[-1].strip()

        elif line.startswith("ID"):
            # whitespace-split tokens carry no surrounding blanks
            gene_id = line.split()[-1]

In [19]:
def _parse_annotation_range(line):
    """Return the integer coordinates found in an "Annotation:" line.

    The coordinates are read from the last parenthesised group of the line,
    so the result does not depend on how many tokens precede it. Anything
    after the first comma inside the parentheses (e.g. ", complement") is
    ignored.

    Example: "Annotation:  NC_001348.1 (103001..104485, complement)"
    yields (103001, 104485).

    Raises:
        ValueError: if the line has no parenthesised span or the span does
            not consist of integers separated by "..".
    """
    open_idx = line.rfind("(")
    close_idx = line.rfind(")")
    if open_idx == -1 or close_idx < open_idx:
        raise ValueError(f"no coordinate span in annotation line: {line!r}")
    # keep only the "start..end" part; qualifiers follow the comma
    coords = line[open_idx + 1 : close_idx].split(",")[0].strip()
    return tuple(int(pos) for pos in coords.split(".."))


# Drop blank lines, then slice the response into six-line gene records.
split_contents = gene_conts.splitlines()
split_contents = [line_data for line_data in split_contents if len(line_data) != 0]
chunked_contents = [
    split_contents[i : i + 6] for i in range(0, len(split_contents), 6)
]

# NOTE: defaultdict(None) has no default factory, so a missing key still raises
# KeyError exactly like a plain dict; the type is kept so the resulting object
# is unchanged for anything that inspects or displays it.
gene_dict = defaultdict(None)
for chunked_content in chunked_contents:
    subdict = {}

    # discontinued gene records are skipped
    if "discontinued" in " ".join(chunked_content):
        continue

    # a chunk that is not six lines long (e.g. a truncated tail) is skipped
    if len(chunked_content) != 6:
        continue

    # first line looks like "1. ORF68"; last line looks like "ID: 1487709"
    name = chunked_content[0].split(".")[-1].strip()
    gene_id = chunked_content[-1].split(":")[-1].strip()
    for line in chunked_content:
        if line.startswith("Annotation"):
            annotation_range = _parse_annotation_range(line)
            subdict["annotation"] = annotation_range

        elif line.startswith("Other Designations:"):
            description = line.split(":")[-1].strip()
            subdict["name"] = f"{name}: {description}"

    # gene id -> {"name": "<gene>: <designation>", "annotation": (start, end)}
    gene_dict[gene_id] = subdict

In [4]:
# NOTE(review): clean_all_genes is imported from vseek.common.io_files and its
# effect is not visible in this notebook -- presumably it cleans up the stored
# gene files; confirm what it touches before re-running this cell.
clean_all_genes()

In [20]:
# Show the parsed genes. Only the last expression of a cell is rendered, so the
# former bare `all_accessions[:5]` line above this one never displayed anything
# and has been removed.
gene_dict

defaultdict(None,
            {'1487709': {'name': 'ORF68: envelope glycoprotein E',
              'annotation': (115808, 117721)},
             '1487698': {'name': 'ORF61: ubiquitin E3 ligase ICP0',
              'annotation': (103001, 104485)},
             '1487674': {'name': 'ORF9: tegument protein VP22',
              'annotation': (11009, 11964)},
             '1487662': {'name': 'ORF31: envelope glycoprotein B',
              'annotation': (56819, 59648)},
             '1487723': {'name': 'ORF54: capsid portal protein',
              'annotation': (92850, 95984)},
             '1487689': {'name': 'ORF67: envelope glycoprotein I',
              'annotation': (114496, 115592)},
             '1487695': {'name': 'ORF62: transcriptional regulator ICP4',
              'annotation': (120764, 124756)},
             '1487711': {'name': 'ORF63: regulatory protein ICP22',
              'annotation': (118466, 119316)},
             '1487678': {'name': 'ORF47: tegument serine/threonine prote