### We have to show classification_accuracy on the website

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### Classification for labelling known species
### Clustering for identifying novel taxa (new species)

In [2]:
# Load the plasmid table (an accession `id` and a nucleotide `sequence` per row)
# and preview the first rows.
df=pd.read_csv('plasmids.csv')
df.head()

Unnamed: 0,id,sequence
0,NC_001988.2,GATTGTATAATGCAATTATAGCATATTTAGGAGGTATTTAAAATGA...
1,NC_009131.1,CAGATCAACAAAGGCCACAAACGCCGCTTCGTTTCCTTCCTCGCGG...
2,NC_019098.1,GTAGAGTTTCAGGGTAACAGGGGATGTTTATGTCGGTTTTCCACAA...
3,NC_020278.2,CTTCGCGTTGCTCAGTTGTCCAACCCCGGAAACGGGAAAAAGCAAG...
4,NC_022374.1,ATGGATAAGTTGCTGAACAAAAAGATAAAAGTTAAGCAGTCTAACG...


In [3]:
# Table dimensions as (rows, columns).
df.shape

(1609, 2)

In [4]:
# Preview the raw sequence strings on their own.
df['sequence'].head()

0    GATTGTATAATGCAATTATAGCATATTTAGGAGGTATTTAAAATGA...
1    CAGATCAACAAAGGCCACAAACGCCGCTTCGTTTCCTTCCTCGCGG...
2    GTAGAGTTTCAGGGTAACAGGGGATGTTTATGTCGGTTTTCCACAA...
3    CTTCGCGTTGCTCAGTTGTCCAACCCCGGAAACGGGAAAAAGCAAG...
4    ATGGATAAGTTGCTGAACAAAAAGATAAAAGTTAAGCAGTCTAACG...
Name: sequence, dtype: object

In [39]:
from Bio import Entrez
import time

# Contact address sent along with Entrez requests.
# NOTE(review): this is a hardcoded personal address — replace it with your own
# (ideally read from configuration or an environment variable) before sharing
# or re-running this notebook.
Entrez.email = "prateekyash0610@gmail.com"  # Replace with your email for NCBI compliance

def fetch_species_batch(accessions):
    """Look up the source organism for a batch of nucleotide accessions.

    Parameters
    ----------
    accessions : iterable
        Accession identifiers such as ``"NC_001988.2"``. Every item is
        converted with ``str``; nested lists/tuples are flattened one level.
        The version suffix (everything after the first ``.``) is dropped
        before querying.

    Returns
    -------
    dict
        Maps the un-versioned accession taken from each record's ``VERSION``
        line to the text of its ``ORGANISM`` line. Records missing either
        line are omitted. An empty dict is returned when there is nothing to
        query or when the Entrez request fails (the error is printed, not
        raised).
    """
    # Flatten one level of nesting and coerce every item to a string.
    flat_accessions = []
    for acc in accessions:
        if isinstance(acc, (list, tuple)):
            flat_accessions.extend(str(a) for a in acc)
        else:
            flat_accessions.append(str(acc))

    # De-duplicate AFTER trimming the version suffix so that e.g. NC_x.1 and
    # NC_x.2 produce a single id. dict.fromkeys keeps first-seen order, which
    # makes the request reproducible (set iteration order varies between runs).
    base_ids = [
        base
        for base in dict.fromkeys(a.split('.')[0].strip() for a in flat_accessions)
        if base
    ]
    if not base_ids:
        # Nothing to look up: skip the network round trip entirely.
        return {}
    ids = ",".join(base_ids)

    try:
        handle = Entrez.efetch(db="nucleotide", id=ids, rettype="gb", retmode="text")
        try:
            records = handle.read()
        finally:
            # Release the connection even when read() fails.
            handle.close()
    except Exception as e:
        # Best effort: report the failure and let the caller fall back to its default.
        print(f"Entrez fetch error: {e}")
        return {}

    result = {}
    # GenBank flat-file records are terminated by a line containing only "//".
    for rec in records.split('\n//\n'):
        org = None
        acc = None
        for line in rec.split('\n'):
            if line.startswith('VERSION'):
                fields = line.split()
                # Tolerate a VERSION line without a value instead of
                # raising IndexError and losing the whole batch.
                if len(fields) > 1:
                    acc = fields[1].split('.')[0]
            elif line.startswith('  ORGANISM'):
                org = line.strip().replace('ORGANISM  ', '')
                break  # ORGANISM is the last field needed from this record
        if acc and org:
            result[acc] = org
    return result

def map_species(df, accession_col='id', batch_size=20, delay=0.4):
    """Add a ``species`` column to ``df`` by resolving its accessions in batches.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the accession identifiers. It is modified in place.
    accession_col : str
        Name of the column containing the accessions.
    batch_size : int
        Number of accessions handed to ``fetch_species_batch`` per call.
    delay : float
        Seconds to sleep after each batch.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying a ``species`` column; accessions
        with no match are labelled ``'Unknown'``.
    """
    accession_list = df[accession_col].tolist()
    total = len(accession_list)
    lookup = {}

    for start in range(0, total, batch_size):
        chunk = accession_list[start:start + batch_size]
        print(f"Processing batch {start} to {start + len(chunk)} ...")
        fetched = fetch_species_batch(chunk)
        lookup.update(fetched)
        # Pause between requests to stay within NCBI's rate guidance.
        time.sleep(delay)

    def _species_for(accession):
        # Lookup keys carry no version suffix, so strip it before matching.
        base = str(accession).split('.')[0]
        return lookup.get(base, 'Unknown')

    df['species'] = df[accession_col].apply(_species_for)
    return df

# Usage example: reload the plasmid table and annotate every row with its species.
# NOTE: map_species calls Entrez once per batch of accessions, so this cell
# needs network access.
import pandas as pd

df = pd.read_csv('plasmids.csv')
df = map_species(df)  # adds the 'species' column and returns the same frame
print(df[['id', 'species']].head())


Processing batch 0 to 20 ...
Processing batch 20 to 40 ...
Processing batch 40 to 60 ...
Processing batch 60 to 80 ...
Processing batch 80 to 100 ...
Processing batch 100 to 120 ...
Processing batch 120 to 140 ...
Processing batch 140 to 160 ...
Processing batch 160 to 180 ...
Processing batch 180 to 200 ...
Processing batch 200 to 220 ...
Processing batch 220 to 240 ...
Processing batch 240 to 260 ...
Processing batch 260 to 280 ...
Processing batch 280 to 300 ...
Processing batch 300 to 320 ...
Processing batch 320 to 340 ...
Processing batch 340 to 360 ...
Processing batch 360 to 380 ...
Processing batch 380 to 400 ...
Processing batch 400 to 420 ...
Processing batch 420 to 440 ...
Processing batch 440 to 460 ...
Processing batch 460 to 480 ...
Processing batch 480 to 500 ...
Processing batch 500 to 520 ...
Processing batch 520 to 540 ...
Processing batch 540 to 560 ...
Processing batch 560 to 580 ...
Processing batch 580 to 600 ...
Processing batch 600 to 620 ...
Processing batch 6

In [41]:
# Check that the species column is now present next to id and sequence.
df.head()

Unnamed: 0,id,sequence,species
0,NC_001988.2,GATTGTATAATGCAATTATAGCATATTTAGGAGGTATTTAAAATGA...,Clostridium acetobutylicum ATCC 824
1,NC_009131.1,CAGATCAACAAAGGCCACAAACGCCGCTTCGTTTCCTTCCTCGCGG...,Escherichia coli
2,NC_019098.1,GTAGAGTTTCAGGGTAACAGGGGATGTTTATGTCGGTTTTCCACAA...,Escherichia coli
3,NC_020278.2,CTTCGCGTTGCTCAGTTGTCCAACCCCGGAAACGGGAAAAAGCAAG...,Escherichia coli
4,NC_022374.1,ATGGATAAGTTGCTGAACAAAAAGATAAAAGTTAAGCAGTCTAACG...,Escherichia coli


In [42]:
# Persist the annotated table. DataFrame.to_csv returns None when it is given a
# path, so its result must not be assigned back to `df` (that would discard the
# frame and break any later cell that uses it).
# The row index is written as the first, unnamed column (pandas default).
df.to_csv('updated_plasmids.csv')