In [1]:
# Import Biopython's SeqIO to read the FASTA and FASTQ files, and fuzzywuzzy's fuzz to compute a similarity ratio between each read and the genome.
from Bio import SeqIO
from fuzzywuzzy import fuzz

In [2]:
# Opens the FASTA file (genome sequence) and returns its sequence as an upper-case string.
def fasta_file(file_name):
    """Return the sequence stored in a FASTA file as an upper-case string.

    If the file holds more than one record, only the last record's sequence
    is returned (earlier records are read and discarded).

    Args:
        file_name: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        The upper-cased sequence of the last record, as a ``str``.

    Raises:
        ValueError: If the file contains no FASTA records.
    """
    last_record = None
    # Walk the whole file, remembering only the most recent record.
    for last_record in SeqIO.parse(file_name, "fasta"):
        pass
    if last_record is None:
        # Fail with a clear message instead of an UnboundLocalError.
        raise ValueError(f"No FASTA records found in {file_name!r}")
    # Upper-case once, for the record that is actually returned.
    return str(last_record.seq.upper())

# Opens the FASTQ file (sequence reads) and returns every read sequence as a list of strings.
def fastq_file(file_name):
    """Return the sequence of every record in a FASTQ file.

    Args:
        file_name: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        A list with one ``str`` per record, in file order. The list is empty
        when the file contains no records.
    """
    return [str(entry.seq) for entry in SeqIO.parse(file_name, "fastq")]

In [3]:
# Load the genome sequence from 'chr2L.fa' once and keep it in a variable;
# fasta_file returns it as a single upper-case string.
genome = fasta_file('chr2L.fa')

In [4]:
# This function finds every k-mer of length seed_size in the given sequence and returns a dictionary mapping each k-mer to the list of its start positions.
def kmers(sequence,seed_size):
    """Index every window of ``seed_size`` characters in ``sequence``.

    Args:
        sequence: The string to index.
        seed_size: Length of each window (k-mer).

    Returns:
        A dict mapping each k-mer to the list of 0-based start positions at
        which it occurs, in ascending order. The dict is empty when
        ``sequence`` is shorter than ``seed_size``.
    """
    positions_by_kmer = {}
    last_start = len(sequence) - seed_size
    for start in range(last_start + 1):
        window = sequence[start:start + seed_size]
        # setdefault creates the position list the first time a k-mer is seen.
        positions_by_kmer.setdefault(window, []).append(start)
    return positions_by_kmer

In [5]:
# Build the genome hash table once: every 10-base k-mer in the genome mapped
# to the list of positions where it starts. The seed size (10) matches the
# length of the read prefix looked up during read mapping.
genome_table = kmers(genome,10)

In [5]:
# Read the FASTQ file of sequence reads and keep the read sequences
# (a list of strings, one per record) in a variable.
read_seq = fastq_file("10k_reads.fastq")

In [None]:
# Read mapping with the seed-and-extend approach: the first SEED_SIZE bases of
# each read are looked up in the genome hash table, and for every position
# where that seed occurs the genome window starting there is scored against
# the whole read with fuzz.ratio. One line is printed per candidate position.
SEED_SIZE = 10     # must equal the k-mer length used to build genome_table
WINDOW_SIZE = 36   # genome bases compared per hit; presumably the read length - confirm

for read in read_seq:
    seed = read[:SEED_SIZE]
    # A seed that never occurs in the genome yields no candidates, so the
    # read is skipped without printing anything.
    for i in genome_table.get(seed, ()):
        reference = genome[i:i+WINDOW_SIZE]
        score = fuzz.ratio(reference, read)
        print(f'\n The read sequence is {read} is at the index {i} to {i+WINDOW_SIZE} in the genome sequence with a matching score of {score}')
