In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.2 MB[0m [31m8.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m1.9/3.2 MB[0m [31m27.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [None]:
"""
FILE: pre_simulated_pipeline.ipynb
DESCRIPTION: An automated bioinformatics pipeline for DNA sequence validation,
             transcription, translation (ORF detection), and protein property analysis
             (Molecular Weight, Isoelectric Point, and Mutation Detection).
AUTHOR: Alfian Valentino F. P. S.
DATE: January 18, 2026
LICENSE: MIT
CONTACT: https://github.com/alfnihilus
"""

In [4]:
# ==========================================
# 1. SETUP & CONFIG
# ==========================================
from Bio import SeqIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import gc_fraction, molecular_weight
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio.SeqRecord import SeqRecord
import pandas as pd
import re

Entrez.email = "email_kamu@example.com"

# ==========================================
# AMINO ACID DATABASE
# ==========================================
# Reformat as Dictionary {Symbol: {Details}} to enable instant lookup
AA_DATABASE = {
    "A": {"name": "Alanine", "property": "Non-polar"},
    "R": {"name": "Arginine", "property": "Basic (+)"},
    "N": {"name": "Asparagine", "property": "Polar"},
    "D": {"name": "Aspartic Acid", "property": "Acidic (-)"},
    "C": {"name": "Cysteine", "property": "Sulfur-containing"},
    "Q": {"name": "Glutamine", "property": "Polar"},
    "E": {"name": "Glutamic Acid", "property": "Acidic (-)"},
    "G": {"name": "Glycine", "property": "Small"},
    "H": {"name": "Histidine", "property": "Basic (+)"},
    "I": {"name": "Isoleucine", "property": "Non-polar"},
    "L": {"name": "Leucine", "property": "Non-polar"},
    "K": {"name": "Lysine", "property": "Basic (+)"},
    "M": {"name": "Methionine", "property": "Start / Sulfur-containing"},
    "F": {"name": "Phenylalanine", "property": "Aromatic"},
    "P": {"name": "Proline", "property": "Rigid"},
    "S": {"name": "Serine", "property": "Polar"},
    "T": {"name": "Threonine", "property": "Polar"},
    "W": {"name": "Tryptophan", "property": "Aromatic"},
    "Y": {"name": "Tyrosine", "property": "Aromatic"},
    "V": {"name": "Valine", "property": "Non-polar"},
}

# ==========================================
# 2. ANALYSIS FUNCTIONS
# ==========================================

def find_all_codons(dna_string):
    """
    Finds the starting positions of all start (ATG) and stop (TAA, TAG, TGA) codons.
    """
    starts = [m.start() for m in re.finditer('ATG', str(dna_string))]
    stops = [m.start() for m in re.finditer('TAA|TAG|TGA', str(dna_string))]
    return starts, stops

def detect_mutations(reference, sample):
    """
    Compares the reference sequence with a sample sequence and identifies point mutations.
    """
    reference_str, sample_str = str(reference).upper(), str(sample).upper()
    mutations = []
    # Compare character by character
    # b1 = base_reference b2 = base_sample
    for i, (b1, b2) in enumerate(zip(reference_str, sample_str)):
        if b1 != b2:
            mutations.append(f"Pos {i+1}: {b1}->{b2}")
    return mutations if mutations else ["No mutations found"]

def find_longest_orf(dna_obj, table_id=1):
    """
    Identifies and returns the longest Open Reading Frame (ORF)
    from both the forward and reverse strands across all 6 reading frames.
    """
    target_sequences = [dna_obj, dna_obj.reverse_complement()]
    found_orfs = []
    for strand in target_sequences:
        for frame in range(3):
            n = len(strand)
            # Ensure the sequence length is a multiple of 3 for translation
            dna_frame = strand[frame : n - (n - frame) % 3]
            protein_seq = dna_frame.translate(table=table_id)

            # Split protein sequence by stop codons (*)
            for fragmen in protein_seq.split('*'):
                # ORF must start with Methionine (M)
                if 'M' in fragmen:
                    found_orfs.append(fragmen[fragmen.find('M'):])

    # Return the longest ORF found, or an empty sequence if none exist
    return max(found_orfs, key=len) if found_orfs else Seq("")

# ==========================================
# 3. MAIN PIPELINE
# ==========================================

def run_full_pipeline():
    print("=== AUTOMATED ANALYSIS PIPELINE ===")

    # --- SELECT TRANSLATION TABLE ---
    print("\n[1] ORGANISM TYPE")
    print("1. Standard (Eukaryote/General)")
    print("11. Bacteria, Archaea, & Plastids")
    try:
        table_id = int(input("Select table number (1/11): "))
    except ValueError:
        print("❌ Invalid input. Defaulting to Standard Table (1).")
        table_id = 1 # Default to Standard if input is invalid

    # --- INPUT METHOD ---
    print("\n[2] INPUT METHOD")
    print("1. Manual Entry | 2. FASTA File | 3. NCBI Fetch")
    choice = input("Select method (1/2/3): ")

    if choice == '1':
        id_sequence = "Manual_Input"
        dna_raw = input("Enter DNA sequence: ")
    elif choice == '2':
        filename = input("Enter .fasta filename: ")
        record = SeqIO.read(filename, "fasta")
        id_sequence = record.id
        dna_raw = str(record.seq)
    elif choice == '3':
        accession_id = input("Enter NCBI Accession ID: ")
        # Fetching data from NCBI using Entrez
        handle = Entrez.efetch(db="nucleotide", id=accession_id, rettype="fasta", retmode="text")
        record = SeqIO.read(handle, "fasta")
        id_sequence = record.id
        dna_raw = str(record.seq)
    else:
      print("[!] Invalid selection. Exiting pipeline.")
      return

    # --- VALIDATION & REVERSE COMPLEMENT ---
    clean_dna = dna_raw.upper().strip().replace(" ", "")

    # Check for non-standard nucleotides (only A, T, C, G allowed)
    if not set(clean_dna).issubset(set("ATCG")):
        print("❌ Error: Illegal characters detected in sequence!"); return

    dna_seq = Seq(clean_dna)
    rev_comp = dna_seq.reverse_complement()

    # --- TRANSCRIPTION  ---
    rna_seq = dna_seq.transcribe()

    # --- TRANSLATION ---
    clean_dna = dna_raw.upper().strip().replace(" ", "")
    dna_seq = Seq(clean_dna)
    rna_seq = dna_seq.transcribe()

    # Full Translation
    # Translates the entire sequence based on the selected genetic code
    full_protein = dna_seq.translate(table=table_id)

    # Longest Protein (ORF)
    # Finds the longest Open Reading Frame (ORF) in the sequence
    protein_orf = find_longest_orf(dna_seq, table_id=table_id)

    # --- CHEMICAL & CODON ANALYSIS ---
    gc = gc_fraction(dna_seq) * 100
    starts, stops = find_all_codons(dna_seq)

    # Protein Analysis (pI & Composition)
    if len(protein_orf) > 0:
        protein_analyzer = ProteinAnalysis(str(protein_orf))
        isoelectric_point = protein_analyzer.isoelectric_point(); mol_weight = molecular_weight(protein_orf, "protein")
        mol_weight = molecular_weight(protein_orf, "protein")
        aa_composition = protein_analyzer.count_amino_acids()
    else:
        print("❌ Error: No ORF detected!"); return

    # --- Mutation Analysis (Optional) ---
    check_mutan = input("\nWould you like to check for mutations against a reference? (y/n): ").lower()
    mutation_report = "Not checked"
    if check_mutan == 'y':
        reference_input = input("Enter Reference DNA sequence: ")
        mutation_report = detect_mutations(reference_input, clean_dna)

    # ==========================================
    # 4. RESULTS
    # ==========================================
    print("\n" + "="*50)
    print(f"SEQUENCE ANALYSIS REPORT: {id_sequence}")
    print("="*50)

    # Displaying first 60 characters for readability
    print(f"Original DNA      : {str(clean_dna)[:60]}...")
    print(f"Reverse Complement: {str(rev_comp)[:60]}...")
    print(f"RNA (Transcription): {str(rna_seq)[:60]}...")
    print(f"Full Protein      : {str(full_protein)[:60]}...")
    print(f"Longest ORF       : {str(protein_orf)[:60]}...")

    # Statistics and Chemical Properties
    print(f"\nGC Content      : {gc*100:.2f}%")
    print(f"Molecular Weight   : {mol_weight:.2f} Da")
    print(f"Isoelectric Point : {isoelectric_point:.2f} (pH)")

    # Codon Locations
    print(f"\nStart Indices (ATG): {(starts)}")
    print(f"Stop Indices      : {(stops)}")
    print("\nAmino Acid Composition (Count):")

    # Only include amino acids that are present to keep the output concise
    for symbol, count in aa_composition.items():
        if count > 0:
            # Retrieve details from AA_DATABASE; use default values if not found
            detail = AA_DATABASE.get(symbol, {"name": "Unknown", "property": "Unknown"})
            name_aa = detail["name"]
            property_aa = detail["property"]
            print(f"    - {name_aa} ({symbol}) [{property_aa}]: {count}")

    print(f"\nMutation Report     : {mutation_report}")
    print("="*50)

    # --- SAVE OPTIONS ---
    save_report = input("\nSave full report to a .txt file? (y/n): ").lower()

    if save_report == 'y':
        report_filename = f"Report_{id_sequence}.txt"

        with open(report_filename, "w") as f:
            # Writing the exact same content shown on the screen
            f.write("="*60 + "\n")
            f.write(f"FULL SEQUENCE ANALYSIS REPORT: {id_sequence}\n")
            f.write("="*60 + "\n\n")

            f.write(f"Original DNA         : {str(dna_seq)}\n")
            f.write(f"Reverse Complement   : {str(dna_seq.reverse_complement())}\n")
            f.write(f"RNA (Transcription)  : {str(rna_seq)}\n")
            f.write(f"Full Protein         : {str(full_protein)}\n")
            f.write(f"Longest Protein (ORF): {str(protein_orf)}\n\n")

            f.write(f"GC Content          : {gc_fraction(dna_seq)*100:.2f}%\n")
            f.write(f"Protein Mol. Weight : {molecular_weight(protein_orf, 'protein'):.2f} Da\n")
            f.write(f"Isoelectric Point   : {protein_analyzer.isoelectric_point():.2f} (pH)\n\n")

            f.write(f"Start Indices (ATG) : {starts}\n")
            f.write(f"Stop Indices        : {stops}\n\n")

            f.write("Amino Acid Composition (Count):\n")

            for symbol, count in protein_analyzer.count_amino_acids().items():
                if count > 0:
                    detail = AA_DATABASE.get(symbol, {"name": "Unknown", "property": "Unknown"})
                    f.write(f"  - {detail['name']} ({symbol}) [{detail['property']}]: {int(count)} unit\n")

            f.write("\n" + "="*60 + "\n")
            f.write("This report was automatically generated by Biopython.\n")

        print(f"✅ Success! The full report has been saved as: {report_filename}")
        print("Check the folder icon on the left sidebar in Google Colab to download it.")

    # SAVE TO FASTA
    if input("\nSave results to a FASTA file? (y/n): ").lower() == 'y':
        fasta_filename = f"{id_sequence}_protein.fasta"
        SeqIO.write(SeqRecord(protein_orf, id=id_sequence, description="ORF_Protein"), fasta_filename, "fasta")
        print(f"✅ File {fasta_filename} saved successfully!")

# Execute Program
if __name__ == "__main__":
    run_full_pipeline()

=== AUTOMATED ANALYSIS PIPELINE ===

[1] ORGANISM TYPE
1. Standard (Eukaryote/General)
11. Bacteria, Archaea, & Plastids
Select table number (1/11): 1

[2] INPUT METHOD
1. Manual Entry | 2. FASTA File | 3. NCBI Fetch
Select method (1/2/3): 3
Enter NCBI Accession ID: PI195879.1

Would you like to check for mutations against a reference? (y/n): n

SEQUENCE ANALYSIS REPORT: PI195879.1
Original DNA      : ATGTCAAGCTCTTCCTGGCTCCTTCTCAGCCTTGTTGCTGTAACTGCTGCTCAGTCCACC...
Reverse Complement: TTAAAAGGAGGTCTGAACATCATCAGTGTTTTGGAATCCTGGATTATTTTCTCCTTTGCT...
RNA (Transcription): AUGUCAAGCUCUUCCUGGCUCCUUCUCAGCCUUGUUGCUGUAACUGCUGCUCAGUCCACC...
Full Protein      : MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQ...
Longest ORF       : MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQ...

GC Content      : 4292.80%
Molecular Weight   : 92461.95 Da
Isoelectric Point : 5.36 (pH)

Start Indices (ATG): [0, 172, 183, 190, 203, 243, 307, 366, 419, 445, 453, 475, 538, 547, 562, 567, 58