In [1]:
#import the SeqIO module from the Biopython library

In [2]:
# this module is designed for reading and writing various sequence file formats

In [3]:
from Bio import SeqIO

In [4]:
#import a function to calculate GC content from Biopython's SeqUtils module

In [7]:
from Bio.SeqUtils import GC

In [8]:
#store the name of our FASTA file in a variable 

In [10]:
#because this is a bare file name (a relative path), Python looks for the file in the kernel's current working directory - normally the folder the notebook is running from

In [14]:
# FASTA input consumed by the cells below; given as a bare file name,
# i.e. a relative path.
fasta_file_name = "sequences.fasta"

In [15]:
#print the file name we're targeting

In [16]:
# Echo the configured input file so the reader can confirm it before parsing.
print("target file set to:{}".format(fasta_file_name))

target file set to:sequences.fasta


In [17]:
#use a try...except block. 'try' runs the code that might cause an error.

In [19]:
#'except' catches specific errors (like file not found) and runs different code

In [38]:
try:
    # SeqIO.parse() returns an iterator of sequence records, so this cell
    # handles a FASTA file holding any number of sequences. (SeqIO.read() is
    # the alternative for a file that contains ONLY ONE sequence.)
    # arguments: file name, file format ('fasta')
    fasta_records = SeqIO.parse(fasta_file_name, 'fasta')

    print(f'Processing sequences from file: {fasta_file_name}')
    print('-------------------')

    # Replacement description text, keyed by sequence ID. Records whose ID is
    # not listed here keep the description that was read from the file.
    description_overrides = {
        'SampleSeq1': '-This sequence is a long one, starting with ATG.',
        'SampleSeq2': '-This sequence is a short one, starting with AGC.',
    }

    # enumerate() counts the records as they are consumed; the counter stays
    # at 0 when the iterator yields nothing, which lets us report an empty
    # file instead of silently claiming success.
    record_count = 0
    for record_count, sequence_record in enumerate(fasta_records, start=1):
        # 'sequence_record' is rebound to the next record on every pass.
        sequence_id = sequence_record.id  # the identifier (e.g. SampleSeq1)
        dna_sequence_object = sequence_record.seq  # sequence data as a Biopython Seq object

        print(f'Analyzing Sequence; {sequence_id}')
        print(f'Sequence ID: {sequence_id}')

        # Modify the description in memory only; the FASTA file on disk is
        # never written to by this cell.
        if sequence_id in description_overrides:
            sequence_record.description = description_overrides[sequence_id]
        sequence_description = sequence_record.description
        print(f'Description: {sequence_description}')

        # Comment this line out to hide the raw sequence for long records.
        print(f'Original Sequence: {dna_sequence_object}')

        # --- calculations for the current record ---

        # 1. Length: Seq objects work directly with Python's built-in len().
        sequence_length = len(dna_sequence_object)

        # 2. GC content as a percentage, using the GC function imported
        #    earlier from Bio.SeqUtils.
        #    NOTE(review): recent Biopython releases are reported to replace
        #    GC with gc_fraction (a 0-1 fraction rather than a percentage) -
        #    confirm against the installed version before upgrading.
        gc_percentage = GC(dna_sequence_object)

        # 3. Reverse complement, via the Seq object's own method.
        reverse_complement_seq = dna_sequence_object.reverse_complement()

        # --- display the results ---
        print('\n--- Sequence Analysis Results ---')  # header for clarity
        print(f'Length: {sequence_length} base pairs')
        # :.2f formats the GC percentage to 2 decimal places
        print(f'GC Content: {gc_percentage:.2f}')
        print(f'Reverse Complement: {reverse_complement_seq}')
        print('---')  # separator between sequences

    if record_count == 0:
        # The file opened fine but contained no parsable FASTA records.
        print(f"Warning: no FASTA records were found in '{fasta_file_name}'.")
    else:
        print('\nAnalysis Complete!')

# error handling code
except FileNotFoundError:
    # runs only if the file named by fasta_file_name could not be found
    print(f"Error: The File '{fasta_file_name}' was not found in the current directory.")
except Exception as e:
    # general catch-all for any other error raised while parsing or analysing;
    # the message is printed rather than re-raised so the notebook keeps running
    print(f'An unexpected error occured: {e}')
    


Processing sequences from file: sequences.fasta
-------------------
Analyzing Sequence; SampleSeq1
Sequence ID: SampleSeq1
Description: -This sequence is a long one, starting with ATG.
Original Sequence: ATGCGTACGTACGTATGCATGCGTACGTACGTATGCGCGTACGTACGTATGCATGCGTACGTACGTATGCGCATGCGTACGTACGTATGC

--- Sequence Analysis Results ---
Length: 90 base pairs
GC Content: 52.22
Reverse Complement: GCATACGTACGTACGCATGCGCATACGTACGTACGCATGCATACGTACGTACGCGCATACGTACGTACGCATGCATACGTACGTACGCAT
---
Analyzing Sequence; SampleSeq2
Sequence ID: SampleSeq2
Description: -This sequence is a short one, starting with AGC.
Original Sequence: AGCTAGCTAGCTAGCT

--- Sequence Analysis Results ---
Length: 16 base pairs
GC Content: 50.00
Reverse Complement: AGCTAGCTAGCTAGCT
---

Analysis Complete!
