# Simplified BLASTn Simulation in Python
This Jupyter Notebook implements a simplified version of BLASTn based on a Perl script. It finds word-size matches and extends alignments using a basic scoring system.
It takes two FASTA files as input (a full path is needed when running in Visual Studio).
The output is BLAST-style and includes the actual alignment.
The score calculation is unusual and should not be trusted.

In [10]:
from Bio import SeqIO

# Function to read a FASTA file
def read_fasta(file_path):
    """Parse a FASTA file into a mapping of record ID to sequence string.

    Each record yielded by ``SeqIO.parse`` contributes one entry keyed by
    its ``id``; if two records share an ID, the later one replaces the
    earlier one.
    """
    return {
        entry.id: str(entry.seq)
        for entry in SeqIO.parse(file_path, "fasta")
    }

## Function to Extend Alignments

In [11]:
def extend_alignment(query, subject, q_pos, s_pos, *, match_score=5, mismatch_score=-2):
    """Extend an ungapped alignment rightward from a seed hit.

    Starting at ``query[q_pos]`` and ``subject[s_pos]``, the sequences are
    compared one position at a time until either one runs out. There is no
    score drop-off cut-off and no leftward extension, so the alignment
    always reaches the end of whichever sequence is exhausted first.

    Args:
        query: Query sequence.
        subject: Subject (database) sequence.
        q_pos: Index in ``query`` where the extension starts.
        s_pos: Index in ``subject`` where the extension starts.
        match_score: Score added for each identical pair (default 5).
        mismatch_score: Score added for each differing pair (default -2).

    Returns:
        A tuple ``(alignment_length, score, aligned_query, aligned_subject)``.
        In the aligned strings, matching positions keep their original
        characters and mismatching positions are lower-cased.
    """
    # Number of columns available before one of the sequences ends.
    span = max(0, min(len(query) - q_pos, len(subject) - s_pos))

    score = 0
    # Collect characters in lists and join once: repeated ``+=`` on a
    # string can be quadratic for long sequences.
    query_chars = []
    subject_chars = []
    for offset in range(span):
        q_base = query[q_pos + offset]
        s_base = subject[s_pos + offset]
        if q_base == s_base:
            score += match_score
        else:
            score += mismatch_score
            # Lower-case marks a mismatching column in the printed output.
            q_base = q_base.lower()
            s_base = s_base.lower()
        query_chars.append(q_base)
        subject_chars.append(s_base)

    return span, score, "".join(query_chars), "".join(subject_chars)

## Function to Perform BLASTn Simulation

In [13]:
def blast_simulation(query_file, db_file, word_size, output_file):
    """Simulate a BLASTn search using word-size seeds and alignment extension.

    Every window of ``word_size`` characters in each query sequence is
    searched for in each subject sequence. Only the first occurrence of the
    word in the subject (``str.find``) is used as the seed. Each seed is
    passed to ``extend_alignment`` and the result is written to
    ``output_file`` as a text record.

    Args:
        query_file: Path to the FASTA file holding the query sequences.
        db_file: Path to the FASTA file holding the subject sequences.
        word_size: Length of the seed word; must be at least 1.
        output_file: Path of the text report; overwritten if it exists.

    Raises:
        ValueError: If ``word_size`` is smaller than 1.
    """
    # A zero-length word is "found" at position 0 of every subject, which
    # would report a hit for every query position. Reject it up front,
    # before the output file is opened and truncated.
    if word_size < 1:
        raise ValueError(f"word_size must be a positive integer, got {word_size!r}")

    queries = read_fasta(query_file)
    subjects = read_fasta(db_file)  # The database is a plain FASTA file

    with open(output_file, "w") as out:
        for q_id, q_seq in queries.items():
            for s_id, s_seq in subjects.items():
                # The range is empty when the query is shorter than word_size.
                for i in range(len(q_seq) - word_size + 1):
                    word = q_seq[i:i + word_size]
                    pos = s_seq.find(word)
                    if pos == -1:
                        continue
                    alignment_length, score, aligned_query, aligned_subject = extend_alignment(q_seq, s_seq, i, pos)
                    out.write(f"Query: {q_id}\nSubject: {s_id}\nAlignment Length: {alignment_length}\nScore: {score}\n")
                    out.write(f"Query  : {aligned_query}\n")
                    out.write(f"Subject: {aligned_subject}\n\n")

## Set Input Files and Run BLAST Simulation

In [15]:
# Run settings: point the two paths at real FASTA files before executing.
output_file = "blast_output.txt"  # Report written by the simulation
word_size = 11  # Seed word length
query_file = "/Users/.../Chlp_gene_nt.fa"  # Replace with actual path
db_file = "/Users/.../database.fasta"  # Database, also in FASTA format

# Execute the search and write the report to output_file.
blast_simulation(
    query_file=query_file,
    db_file=db_file,
    word_size=word_size,
    output_file=output_file,
)

## Print the Output Results

In [16]:
# Read the whole report back and echo it to the cell output.
with open(output_file, "r") as blast_output:
    report_text = blast_output.read()
    print(report_text)

Query: NC_014267.1:c1890-1774
Subject: NC_014267
Alignment Length: 109
Score: 20
Query  : TCCAAATCCAAaTaaacAagcagtcgAAttaaaTAgaAcgtcaCtttActggggaCttttAttaattTtTGttTTagcAgtattatttTcaagttaCtTtTtcAaTtaa
Subject: TCCAAATCCAAtTtctgAgttttgatAAggttcTAacAacgagCggcAtctttacCagagAcagtaaTcTGagTTccgAcccagtgacTtttcgatCcTgTatAtTgcc

Query: NC_014267.1:c1890-1774
Subject: NC_014267
Alignment Length: 107
Score: 59
Query  : CAAATCCAAATaaaCAAgcagtcgaattaAatAgaAcgtcAcTTtActggGgActtTtatTaAtttTTgttTTAGcagtattAttttcAAgttActTtTTcaattaa
Subject: CAAATCCAAATcttCAAtgtaaacttcatAccAttAgtgaAtTTaAaaatGaAtcaTccaTtAaaaTTacaTTAGattatcaAcgggtAAaacAagTcTTaggaccg

Query: NC_014267.1:c1890-1774
Subject: NC_014267
Alignment Length: 106
Score: 89
Query  : AAATCCAAATAaaCaagcAgtcgaattaaatAgAAcGtcaCttTaCtggggACttTTaTtAattTttgtTTtaGcagtaTtatttTcaAGtTacTTtttcAATTAA
Subject: AAATCCAAATAttCttaaAccatttacctccAaAAgGcatCacTcCaataaACagTTtTgAgaaTgataTTatGgctatTacaccTatAGaTgaTTcacaAATTAA

Query: NC_014267.1:c1890-1774
Subject: NC_014267
A