In [1]:
from Bio import Entrez, SeqIO
from tqdm import tqdm

import pandas as pd
import time
import os
import re

In [2]:
# Contact e-mail set on the Bio.Entrez module before any Entrez request is
# issued (the efetch call in a later cell relies on this module-level setting).
Entrez.email = 'akishirsath@gmail.com'

In [3]:
# Wall-clock reference point; later cells subtract it from time.time()
# to report the download time and the total processing time.
start = time.time()

# Reference accessions noted by the author, with what each one represents:
# Complete Genome (nuo absent)
# NC_002163.1

# Plasmid
# NZ_CP071455.1

# Multiple copies
# NZ_CP072608.1

# Complete Genome (nuo present)
# NC_002695.2

# Accession ID to search (change this value to query a different record)
acc_id = "NZ_CP072608.1"

In [4]:
# Download the coding sequences (nucleotide FASTA, one entry per CDS) of the
# selected accession from the NCBI nucleotide database as plain text.
handle = Entrez.efetch(db='nucleotide',
                       id=acc_id,
                       rettype='fasta_cds_na',
                       retmode='text')
try:
    # The whole response is kept in memory as a single string; later cells
    # split it into lines to scan the FASTA headers.
    record = handle.read()
finally:
    # Always release the underlying network connection, even if the read
    # fails part-way through.
    handle.close()

# Elapsed time since `start` was set in the previous cell.
end = time.time()
print(f"Record downloading: {end-start}")

Record downloading: 32.06046462059021


In [5]:
# Substrings used to recognise NADH-quinone oxidoreductase entries:
# two spellings of the full protein name and two gene-symbol prefixes.
full_name_1 = "NADH-quinone oxidoreductase"
full_name_2 = "NADH:quinone oxidoreductase"
gene_sym_1 = 'nuo'
gene_sym_2 = 'nqo'

_nuo_markers = (full_name_1, full_name_2, gene_sym_1, gene_sym_2)

# Keep every line of the downloaded text that contains at least one marker.
# NOTE: this is a plain, case-sensitive substring test applied to every line
# of the record, not only to lines that start with '>'.
nuo_headers = [
    entry
    for entry in record.split('\n')
    if any(marker in entry for marker in _nuo_markers)
]

# Number of matching lines.
count = len(nuo_headers)

print(f"\nnuo Genes: {count}\n")


nuo Genes: 30



In [6]:
# Show the raw matching lines collected in the previous cell.
nuo_headers

['>lcl|NZ_CP072608.1_cds_WP_022657683.1_388 [locus_tag=J8J02_RS01995] [db_xref=GeneID:72381465] [protein=NADH-quinone oxidoreductase subunit C] [protein_id=WP_022657683.1] [location=complement(468170..468709)] [gbkey=CDS]',
 '>lcl|NZ_CP072608.1_cds_WP_034604940.1_390 [gene=nuoB] [locus_tag=J8J02_RS02005] [db_xref=GeneID:72381467] [protein=NADH-quinone oxidoreductase subunit NuoB] [protein_id=WP_034604940.1] [location=complement(469313..469747)] [gbkey=CDS]',
 '>lcl|NZ_CP072608.1_cds_WP_209818609.1_391 [locus_tag=J8J02_RS02010] [db_xref=GeneID:72381468] [protein=NADH-quinone oxidoreductase subunit H] [protein_id=WP_209818609.1] [location=complement(469773..470744)] [gbkey=CDS]',
 '>lcl|NZ_CP072608.1_cds_WP_022658250.1_917 [locus_tag=J8J02_RS04670] [db_xref=GeneID:72382000] [protein=NADH-quinone oxidoreductase subunit L] [protein_id=WP_022658250.1] [location=1065136..1067091] [gbkey=CDS]',
 '>lcl|NZ_CP072608.1_cds_WP_022658251.1_918 [locus_tag=J8J02_RS04675] [db_xref=GeneID:72382001] [pr

# Method 01
Extract only the annotation fields from the matching headers (no sequence data).

In [7]:
# Rows of the final table: one list per matching header, laid out as
# [accession, gene, locus_tag, protein_id, protein, GeneID, location].
main = list()

# Bracketed "[key=value]" fields to pull out of each header, in column order.
lookfor = ['gene=', 'locus_tag=', 'protein_id=', 'protein=', 'db_xref=', 'location=']

for header in nuo_headers:
    temp = [acc_id]
    for tag in lookfor:
        # Raw string so that "\[" and "\]" reach the regex engine untouched
        # (the non-raw form relies on invalid string escapes).
        found = re.search(rf'\[{re.escape(tag)}([^\]]+)\]', header)
        if found is None:
            # Field absent from this header: record None for THIS column
            # instead of re-using the value captured for the previous tag.
            temp.append(None)
            continue

        info = found.group(1)
        if info.startswith('GeneID'):
            # "GeneID:12345" -> "12345"; fall back to the full text when
            # there is no ':' separator.
            parts = info.split(':')
            temp.append(parts[1] if len(parts) > 1 else info)
        else:
            temp.append(info)
    main.append(temp)

In [8]:
# Tabulate the parsed rows. Column order follows how each row is built:
# the accession first, then one value per tag in `lookfor`.
data = pd.DataFrame(
    main,
    columns=[
        'Accession',
        'Gene',
        'LocusTag',
        'ProteinID',
        'ProteinName',
        'GeneID',
        'Location',
    ],
)

In [9]:
# Display the assembled table.
data

Unnamed: 0,Accession,Gene,LocusTag,ProteinID,ProteinName,GeneID,Location
0,NZ_CP072608.1,,J8J02_RS01995,WP_022657683.1,NADH-quinone oxidoreductase subunit C,72381465,complement(468170..468709)
1,NZ_CP072608.1,nuoB,J8J02_RS02005,WP_034604940.1,NADH-quinone oxidoreductase subunit NuoB,72381467,complement(469313..469747)
2,NZ_CP072608.1,,J8J02_RS02010,WP_209818609.1,NADH-quinone oxidoreductase subunit H,72381468,complement(469773..470744)
3,NZ_CP072608.1,,J8J02_RS04670,WP_022658250.1,NADH-quinone oxidoreductase subunit L,72382000,1065136..1067091
4,NZ_CP072608.1,,J8J02_RS04675,WP_022658251.1,NADH-quinone oxidoreductase subunit H,72382001,1067091..1067942
5,NZ_CP072608.1,,J8J02_RS04680,WP_022658252.1,NADH-quinone oxidoreductase subunit B family p...,72382002,1067955..1068515
6,NZ_CP072608.1,,J8J02_RS04685,WP_209818940.1,NADH-quinone oxidoreductase subunit C,72382003,1068534..1069010
7,NZ_CP072608.1,,J8J02_RS08240,WP_209817444.1,NADH-quinone oxidoreductase subunit N,72382709,complement(1926853..1928277)
8,NZ_CP072608.1,,J8J02_RS08245,WP_245170739.1,NADH-quinone oxidoreductase subunit M,72382710,complement(1928309..1929832)
9,NZ_CP072608.1,nuoK,J8J02_RS08265,WP_022658764.1,NADH-quinone oxidoreductase subunit NuoK,72382714,complement(1933394..1933720)


In [10]:
# Total wall-clock time since `start` was set, i.e. download plus parsing
# (and any idle time between cell executions).
end = time.time()
print(f"Processed in {end-start} seconds")

Processed in 32.20840573310852 seconds
