# Bacterial Genome Analysis — *Vibrio natriegens*

**Author:** Bhanvi Paliwal  
**Project:** Computational Analysis of the Vibrio natriegens Genome  

This project analyzes the genome of *Vibrio natriegens*, a marine bacterium known for its exceptionally fast doubling time (~10 minutes) and growing importance as a chassis organism in synthetic biology.  

The Python-based workflow performs:  
- Scanning of both DNA strands to identify open reading frames (ORFs ≥ 50 codons)  
- Translation of ORFs into protein sequences  
- Calculation of protein properties, including:  
  - Molecular weight (kDa)  
  - Isoelectric point (pI)  
  - Estimated solubility score  

The pipeline provides a computational framework for exploring bacterial genomes and predicting protein features relevant to experimental or synthetic biology applications.

In [17]:
import os
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.Seq import Seq
import pandas as pd
import time

# ORF Finder

def find_orfs(seq, min_length=150):  # 150 nt = 50 codons
    start_codon = "ATG"
    stop_codons = ["TAA", "TAG", "TGA"]
    orfs = []

    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            trans = nuc[frame:].translate(to_stop=False)
            start = 0
            while True:
                start = trans.find("M", start)
                if start == -1:
                    break
                stop = start
                while stop < len(trans):
                    if trans[stop] == "*":
                        break
                    stop += 1
                length = stop - start
                if length >= 50:
                    protein_seq = str(trans[start:stop])
                    orfs.append({
                        "strand": strand,
                        "frame": frame,
                        "nt_start": frame + start * 3,
                        "nt_end": frame + stop * 3,
                        "protein": protein_seq
                    })
                start += 1
    return orfs


#  Solubility Score

AA_WEIGHTS = {
    'A': 0.8, 'C': 0.3, 'D': 1.2, 'E': 1.3,
    'F': 0.2, 'G': 0.9, 'H': 1.0, 'I': 0.4,
    'K': 1.4, 'L': 0.5, 'M': 0.6, 'N': 1.1,
    'P': 0.7, 'Q': 1.0, 'R': 1.5, 'S': 1.2,
    'T': 1.1, 'V': 0.6, 'W': 0.2, 'Y': 0.3
}

def simple_solubility_score(protein_seq):
    total = sum(AA_WEIGHTS.get(aa, 0.0) for aa in protein_seq)
    return total / len(protein_seq) if len(protein_seq) > 0 else 0


# Analyze ORFs

def analyze_orfs(orfs):
    results = []
    for i, orf in enumerate(orfs, start=1):
        seq = orf["protein"]
        analysis = ProteinAnalysis(seq)
        results.append({
            "ORF_ID": f"ORF_{i}",
            "Strand": orf["strand"],
            "Frame": orf["frame"],
            "Start_nt": orf["nt_start"],
            "End_nt": orf["nt_end"],
            "Length_AA": len(seq),
            "MolWeight_kDa": round(analysis.molecular_weight() / 1000, 3),
            "pI": round(analysis.isoelectric_point(), 2),
            "ProteinSeq": seq,
            "SolubilityScore": round(simple_solubility_score(seq), 3)
        })
    return results

#BLAST top hits

def blast_top_hits(orfs, max_hits=5):
    blast_results = []
    for i, orf in enumerate(orfs[:max_hits], start=1):
        seq = orf["protein"]
        if len(seq) > 1000:
            print(f"[SKIP] ORF_{i} is too long ({len(seq)} aa)")
            continue

        print(f"[BLAST] Running BLAST for ORF_{i} ({len(seq)} aa)...")
        try:
            result_handle = NCBIWWW.qblast("blastp", "nr", seq)
            blast_record = NCBIXML.read(result_handle)
            if blast_record.alignments:
                top_hit = blast_record.alignments[0]
                blast_results.append({
                    "ORF_ID": f"ORF_{i}",
                    "Top_Hit": top_hit.hit_def,
                    "E_value": blast_record.descriptions[0].e
                })
            else:
                blast_results.append({
                    "ORF_ID": f"ORF_{i}",
                    "Top_Hit": "No hit found",
                    "E_value": None
                })
        except Exception as e:
            print(f"[ERROR] BLAST failed for ORF_{i}: {e}")
        time.sleep(3)  # polite delay between requests

    return pd.DataFrame(blast_results)

# Main pipeline

def main():
    
    genome_fasta = "vibrionatriegens_genome.fasta"

    print("Reading genome FASTA...")
    genome_record = SeqIO.read(genome_fasta, "fasta")
    seq = genome_record.seq

    print("Finding ORFs...")
    orfs = find_orfs(seq)
    print(f"[INFO] Found {len(orfs)} ORFs >= 50 codons")

    print("Analyzing ORFs...")
    analyzed = analyze_orfs(orfs)
    df = pd.DataFrame(analyzed)

    print("Running BLAST for up to 5 proteins (<1000 aa)...")
    blast_df = blast_top_hits(orfs, max_hits=5)

    # Handle empty BLAST 
    if blast_df is None or blast_df.empty:
        print("[WARNING] No BLAST results found — creating placeholder columns.")
        df["Top_Hit"] = "No hit"
        df["E_value"] = None
        final_df = df
    elif "ORF_ID" not in blast_df.columns:
        print("[WARNING] BLAST dataframe missing ORF_ID — skipping merge.")
        df["Top_Hit"] = "No hit"
        df["E_value"] = None
        final_df = df
    else:
        print("Merging BLAST results...")
        final_df = pd.merge(df, blast_df, on="ORF_ID", how="left")

    output_csv = "final_results_with_blast.csv"
    final_df.to_csv(output_csv, index=False)
    print(f"All results saved to: {output_csv}")
    print(final_df.head())


if __name__ == "__main__":
    main()


Reading genome FASTA...
Finding ORFs...




[INFO] Found 28334 ORFs >= 50 codons
Analyzing ORFs...
Running BLAST for up to 5 proteins (<1000 aa)...
[BLAST] Running BLAST for ORF_1 (321 aa)...
[ERROR] BLAST failed for ORF_1: name 'NCBIWWW' is not defined
[BLAST] Running BLAST for ORF_2 (257 aa)...
[ERROR] BLAST failed for ORF_2: name 'NCBIWWW' is not defined
[BLAST] Running BLAST for ORF_3 (187 aa)...
[ERROR] BLAST failed for ORF_3: name 'NCBIWWW' is not defined
[BLAST] Running BLAST for ORF_4 (179 aa)...
[ERROR] BLAST failed for ORF_4: name 'NCBIWWW' is not defined
[BLAST] Running BLAST for ORF_5 (175 aa)...
[ERROR] BLAST failed for ORF_5: name 'NCBIWWW' is not defined
All results saved to: final_results_with_blast.csv
  ORF_ID  Strand  Frame  Start_nt  End_nt  Length_AA  MolWeight_kDa    pI  \
0  ORF_1       1      0      8997    9960        321         35.543  5.72   
1  ORF_2       1      0      9189    9960        257         28.445  4.88   
2  ORF_3       1      0      9399    9960        187         20.476  4.75   
3  ORF_