In [None]:
# Import the required libraries
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)  # None means no limit
import time

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path of the CSV file that contains the organism names.
# NOTE(review): the value below looks like a placeholder — point it at the real file.
file_path = "/content/drive/MyDrive/.csv file_name"
# na_filter=False turns off missing-value detection, so empty cells are read as
# empty strings rather than NaN.
data_f = pd.read_csv(file_path, encoding="ISO-8859-1", na_filter=False)

In [None]:
# Preview the loaded DataFrame
data_f

Unnamed: 0,organism,Unnamed: 1,Unnamed: 2
0,Clostridium hylemonae DSM 15053,,
1,Acaryochloris marina,,
2,Acetoanaerobium sticklandii,,
3,Acetoanaerobium sticklandii,,
4,Acidiphilium cryptum,,
...,...,...,...
1323,Zea mays subsp. huehuetenangensis,,
1324,Zea mays subsp. mexicana,,
1325,Zea perennis,,
1326,Zymomonas mobilis subsp. mobilis,,


In [None]:
# install Biopython
pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
# Access the NCBI Taxonomy database through Biopython
from Bio import Entrez

def get_taxonomy(organism_name, delay=0.34):
    """Fetch taxonomy details and NCBI link for a given organism name.

    Parameters
    ----------
    organism_name : str
        Name to search for in the NCBI ``taxonomy`` database.
    delay : float, optional
        Seconds to pause after each E-utilities request. One lookup issues up
        to two requests (esearch, then efetch); the default keeps the request
        rate at or below roughly 3 per second. Pass 0 to disable.

    Returns
    -------
    dict
        Always contains the keys ``"Taxonomy_Link"`` and ``"Taxonomy_Info"``:

        * match found: the taxonomy browser URL of the first hit, and the
          scientific name, rank and lineage joined by the separator ``';'``;
        * blank name or no match: ``"Not available"`` / ``"Not found"``;
        * any exception: ``"Error"`` / ``"Error: <message>"``.

    Notes
    -----
    ``Entrez.email`` should be set by the caller before the first lookup;
    this function does not set it.
    """
    # A blank name cannot match anything, so answer without a network round-trip
    # (the input CSV is read with na_filter=False, so blanks arrive as "").
    if organism_name is None or not str(organism_name).strip():
        return {"Taxonomy_Link": "Not available", "Taxonomy_Info": "Not found"}

    try:
        # Search organism in NCBI taxonomy
        handle = Entrez.esearch(db="taxonomy", term=organism_name)
        try:
            record = Entrez.read(handle)
        finally:
            # Close the handle even when parsing fails.
            handle.close()
        if delay and delay > 0:
            time.sleep(delay)

        if not record['IdList']:
            return {"Taxonomy_Link": "Not available", "Taxonomy_Info": "Not found"}

        # Only the first hit is used.
        tax_id = record['IdList'][0]

        # Fetch the full taxonomy record
        handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()
        if delay and delay > 0:
            time.sleep(delay)

        # An id that returns no record is reported as "not found" rather than
        # as an IndexError message.
        if not records:
            return {"Taxonomy_Link": "Not available", "Taxonomy_Info": "Not found"}

        entry = records[0]
        lineage = entry["Lineage"]
        rank = entry["Rank"]
        scientific_name = entry["ScientificName"]

        # Construct informative taxonomy info.
        # NOTE(review): the separator is the three characters ';' (quotes included);
        # kept as-is because existing output uses it — confirm it is intended.
        taxonomy_info = f"{scientific_name}';'{rank}';'{lineage}"
        # Construct NCBI taxonomy browser link
        ncbi_link = f"https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={tax_id}"

        # Return both info and link as a dictionary
        return {"Taxonomy_Link": ncbi_link, "Taxonomy_Info": taxonomy_info}

    except Exception as e:
        # Best-effort: a failed lookup becomes an "Error" row instead of
        # aborting the whole batch.
        return {"Taxonomy_Link": "Error", "Taxonomy_Info": f"Error: {e}"}

# NCBI asks for a contact email with every E-utilities request, and Biopython
# warns when none is configured. Fill in your own address before running.
ENTREZ_EMAIL = ""  # TODO: set to your email address
if ENTREZ_EMAIL:
    Entrez.email = ENTREZ_EMAIL

# Look each distinct organism name up once: the list contains repeated names and
# every lookup costs network round-trips.
taxonomy_by_name = {name: get_taxonomy(name) for name in data_f['organism'].unique()}

# Map the per-name results back onto every row (one dict per row)
results = data_f['organism'].map(taxonomy_by_name.get)

# Expand the dictionary output into separate columns
data_f[['Taxonomy_Link','Taxonomy_Info',]] = pd.DataFrame(results.tolist(), index=data_f.index)

# NOTE: a time.sleep() placed here would run only once, after all lookups have
# finished; any pause meant to respect NCBI's rate limit (≈3 requests/sec) has to
# happen between the individual requests instead.

            Email address is not specified.

            To make use of NCBI's E-utilities, NCBI requires you to specify your
            email address with each request.  As an example, if your email address
            is A.N.Other@example.com, you can specify it as follows:
               from Bio import Entrez
               Entrez.email = 'A.N.Other@example.com'
            In case of excessive usage of the E-utilities, NCBI will attempt to contact
            a user at the email address provided before blocking access to the
            E-utilities.


In [None]:
# Show the organism names next to the fetched taxonomy link and info
data_f[['organism','Taxonomy_Link','Taxonomy_Info']]

Unnamed: 0,organism,Taxonomy_Link,Taxonomy_Info
0,Clostridium hylemonae DSM 15053,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=553973,[Clostridium] hylemonae DSM 15053';'strain';'cellular organisms; Bacteria; Bacillati; Bacillota; Clostridia; Lachnospirales; Lachnospiraceae; Lachnoclostridium; [Clostridium] hylemonae
1,Acaryochloris marina,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=155978,Acaryochloris marina';'species';'cellular organisms; Bacteria; Bacillati; Cyanobacteriota/Melainabacteria group; Cyanobacteriota; Cyanophyceae; Acaryochloridales; Acaryochloridaceae; Acaryochloris
2,Acetoanaerobium sticklandii,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1511,Acetoanaerobium sticklandii';'species';'cellular organisms; Bacteria; Bacillati; Bacillota; Clostridia; Peptostreptococcales; Filifactoraceae; Acetoanaerobium
3,Acetoanaerobium sticklandii,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1511,Acetoanaerobium sticklandii';'species';'cellular organisms; Bacteria; Bacillati; Bacillota; Clostridia; Peptostreptococcales; Filifactoraceae; Acetoanaerobium
4,Acidiphilium cryptum,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=524,Acidiphilium cryptum';'species';'cellular organisms; Bacteria; Pseudomonadati; Pseudomonadota; Alphaproteobacteria; Acetobacterales; Acidocellaceae; Acidiphilium
...,...,...,...
1323,Zea mays subsp. huehuetenangensis,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=112001,Zea mays subsp. huehuetenangensis';'subspecies';'cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; Liliopsida; Petrosaviidae; commelinids; Poales; Poaceae; PACMAD clade; Panicoideae; Andropogonodae; Andropogoneae; Tripsacinae; Zea; Zea mays
1324,Zea mays subsp. mexicana,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=4579,Zea mays subsp. mexicana';'subspecies';'cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; Liliopsida; Petrosaviidae; commelinids; Poales; Poaceae; PACMAD clade; Panicoideae; Andropogonodae; Andropogoneae; Tripsacinae; Zea; Zea mays
1325,Zea perennis,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=4580,Zea perennis';'species';'cellular organisms; Eukaryota; Viridiplantae; Streptophyta; Streptophytina; Embryophyta; Tracheophyta; Euphyllophyta; Spermatophyta; Magnoliopsida; Mesangiospermae; Liliopsida; Petrosaviidae; commelinids; Poales; Poaceae; PACMAD clade; Panicoideae; Andropogonodae; Andropogoneae; Tripsacinae; Zea
1326,Zymomonas mobilis subsp. mobilis,https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=120045,Zymomonas mobilis subsp. mobilis';'subspecies';'cellular organisms; Bacteria; Pseudomonadati; Pseudomonadota; Alphaproteobacteria; Sphingomonadales; Zymomonadaceae; Zymomonas; Zymomonas mobilis


In [None]:
# Write the result columns to a CSV file and download it to the local device
from google.colab import files
# index=False keeps the DataFrame index out of the written file
data_f[['organism','Taxonomy_Link','Taxonomy_Info']].to_csv('my_data.csv', index=False)

# Download the file through the browser (Colab helper)
files.download('my_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>