In [3]:
import urllib.request
import os
import tkinter as tk
from tkinter import filedialog

def reverseComplement(s):
    """
    Returns the reverse complement of the DNA sequence s.

    Each base is replaced by its complement (A<->T, C<->G, N stays N) and the
    order of the bases is reversed.

    Raises KeyError if s contains any character other than the uppercase
    letters A, C, G, T or N.
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
    # Complement in forward order (so an invalid base is reported at its first
    # occurrence), then reverse once and join. This is linear in len(s),
    # whereas prepending to a string one character at a time is quadratic.
    complemented = [complement[base] for base in s]
    complemented.reverse()
    return ''.join(complemented)

def naive_with_rc(p, T):
    """
    Naive exact matching of p and of its reverse complement against T.

    Every alignment of each candidate pattern is checked character by
    character. The result is the ascending list of start offsets in T at
    which p or its reverse complement occurs; an offset is listed once even
    if both strands match there. When p is its own reverse complement, only
    a single scan is performed.
    """
    hits = set()  # A set collapses offsets matched by both strands.

    # Collect the distinct patterns to scan for: p, plus its reverse
    # complement when that is a different string.
    candidates = [p]
    rc = reverseComplement(p)
    if rc != p:
        candidates.append(rc)

    for pat in candidates:
        width = len(pat)
        # Slide the pattern across every alignment that fits inside T.
        for start in range(len(T) - width + 1):
            # all() stops at the first mismatching position.
            if all(T[start + k] == pat[k] for k in range(width)):
                hits.add(start)

    return sorted(hits)

# Example usage of naive_with_rc:
# The reverse complement of "ACT" is "AGT", so the offsets of both strings in T
# are reported together in one sorted list.
T = "ACTAGTACACTAGT"
p = "ACT"
matches = naive_with_rc(p, T)
print("Matches for pattern and its reverse complement:", matches)

def readGenome(filename):
    """
    Loads the sequence stored in a FASTA file as one string.

    Header lines (those beginning with '>') are dropped; every other line
    has its trailing whitespace removed and the pieces are concatenated in
    file order.
    """
    with open(filename, 'r') as fasta:
        return ''.join(
            row.rstrip()
            for row in fasta
            if not row.startswith('>')
        )

# Set up the Tk root that the file dialog needs and hide its empty main window.
root = tk.Tk()
root.withdraw()

# Construct a default directory path pointing to the Desktop.
desktop_dir = os.path.join(os.path.expanduser("~"), "Desktop")

try:
    # Open a file dialog starting in the Desktop folder.
    # The result is falsy when the user cancels the dialog.
    file_path = filedialog.askopenfilename(
        initialdir=desktop_dir,
        title="Select FASTA file",
        filetypes=[("FASTA files", "*.fa *.fasta"), ("All files", "*.*")]
    )
finally:
    # Destroy the hidden root once the dialog is done; otherwise the Tk
    # interpreter and its invisible window stay alive in the kernel, and a
    # new one is leaked each time this cell is re-run.
    root.destroy()

if file_path:
    genome = readGenome(file_path)
    print("Genome read from FASTA file (first 300 bases shown):")
    # Only a prefix is printed so the cell output stays readable.
    print(genome[:300])
else:
    print("No file selected.")



Matches for pattern and its reverse complement: [0, 3, 8, 11]
Genome read from FASTA file (first 300 bases shown):
GGGCGGCGACCTCGCGGGTTTTCGCTATTTATGAAAATTTTCCGGTTTAAGGCGTTTCCGTTCTTCTTCGTCATAACTTAATGTTTTTATTTAAAATACCCTCTGAAAAGAAAGGAAACGACAGGTGCTGAAAGCGAGGCTTTTTGGCCTCTGTCGTTTCCTTTCTCTGTTTTTGTCCGTGGAATGAACAATGGAAGTCAACAAAAAGCAGCTGGCTGACATTTTCGGTGCGAGTATCCGTACCATTCAGAACTGGCAGGAACAGGGAATGCCCGTTCTGCGAGGCGGTGGCAAGGGTAA


# An alternative method: download the FASTA file with wget instead of selecting it in a file dialog

In [7]:
# Lambda Virus genome: download the FASTA file into /tmp
# (-P sets the directory wget saves into).
!wget -P /tmp https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa

--2025-03-26 14:12:02--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/lambda_virus.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 18.155.188.60, 18.155.188.170, 18.155.188.167, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|18.155.188.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49270 (48K) [application/octet-stream]
Saving to: ‘/tmp/lambda_virus.fa’


2025-03-26 14:12:03 (3.39 MB/s) - ‘/tmp/lambda_virus.fa’ saved [49270/49270]



In [8]:
def readGenome(filename):
    """
    Reads a genome from a FASTA file and returns it as a single string.

    Lines starting with '>' are headers and are skipped. Every other line is
    stripped of trailing whitespace and appended in file order.

    NOTE: this cell redefines the readGenome from the first cell of the
    notebook; the behaviour is the same.
    """
    pieces = []
    with open(filename, 'r') as f:
        for line in f:
            # ignore header line with genome information
            if line.startswith('>'):
                continue
            pieces.append(line.rstrip())
    # Join once at the end instead of growing a string with repeated +=.
    return ''.join(pieces)


In [10]:
# Load the FASTA file that the wget cell saved under /tmp into one sequence string.
lambda_virus_genome = readGenome('/tmp/lambda_virus.fa')

In [12]:
# Offsets in the genome where 'ATTA' or its reverse complement ('TAAT') occurs.
occurrences = naive_with_rc('ATTA', lambda_virus_genome)
# naive_with_rc returns the offsets sorted in ascending order.
# Note that min() raises ValueError if no occurrence was found.
print('offset of leftmost occurrence: %d' % min(occurrences))
print('# occurrences: %d' % len(occurrences))



offset of leftmost occurrence: 78
# occurrences: 383
