In [None]:
# Read in the merged FASTQ reads and parse them to extract the target regions for downstream editing quantification.

In [None]:
import sys

In [None]:
# Path to the reference FASTA file
# It is passed to read_ref_fasta() further down in this cell to build the reference table.
# NOTE(review): this is a machine-specific absolute path, and the name `ref_fasta` is
# re-assigned to a different value in a later cell — consider a single configurable
# data directory so that Restart & Run All is reproducible on another machine.
ref_fasta = '/home/ec2-user/ngs_data/MiSeq_PE150_20240412/refs/2024Apr13_mismatch_lib.fa'
from collections import defaultdict
from Bio import SeqIO
import gzip
import re
from tqdm.notebook import tqdm
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os

def read_ref_fasta(file_path):
    """Read a reference FASTA file into a ``{record id: sequence string}`` mapping.

    Parameters
    ----------
    file_path : str or file handle
        Anything accepted by ``Bio.SeqIO.parse`` for the ``"fasta"`` format.

    Returns
    -------
    dict
        One entry per record id, holding the sequence as a plain ``str``.
        If the same id occurs more than once, the last record wins.

    Notes
    -----
    A plain ``dict`` is returned on purpose. The values are strings, so a
    ``defaultdict(list)`` would hand back (and silently insert) an empty list
    for any unknown id, polluting the id set that callers iterate over.
    """
    sequences = {}
    for record in SeqIO.parse(file_path, "fasta"):
        sequences[record.id] = str(record.seq)
    return sequences

# Reference table: record id -> sequence string, loaded from `ref_fasta`.
# parse_fq() in the next cell reads this module-level name (via read_1.keys()) to look
# up barcodes, so this cell must be executed before parse_fq() is called.
# NOTE(review): despite its name, `read_1` holds reference records, not sequencing reads.
read_1 = read_ref_fasta(ref_fasta)



In [None]:
# Loop through the files in the provided in_dir, parse the merged FASTQ files,
# and write one split FASTQ file per error-corrected barcode ID.

from Bio import SeqIO
import gzip
import re
from tqdm.notebook import tqdm
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import os
import Levenshtein
import editdistance

def fuzzy_match_id(query_id, ref_ids, max_mismatches=0):
    """Return the reference ID closest to ``query_id`` within an edit-distance tolerance.

    Parameters
    ----------
    query_id : str
        Identifier (e.g. an observed barcode) to look up.
    ref_ids : iterable of str
        Candidate reference identifiers; iterated once.
    max_mismatches : int, optional
        Largest edit distance (as computed by ``editdistance.eval``) that still
        counts as a match. The default of 0 accepts exact matches only.

    Returns
    -------
    str or None
        The candidate with the smallest edit distance not exceeding
        ``max_mismatches``; the first such candidate in iteration order on ties.
        ``None`` when no candidate is within tolerance.

    Notes
    -----
    The closest candidate is returned, not merely the first one inside the
    tolerance. Otherwise, with ``max_mismatches >= 1``, an earlier near-match
    would win over a later exact match and the result would depend on the order
    of ``ref_ids``.
    """
    best_id, best_dist = None, None
    for ref_id in ref_ids:
        dist = editdistance.eval(query_id, ref_id)
        if dist > max_mismatches:
            continue
        if best_dist is None or dist < best_dist:
            best_id, best_dist = ref_id, dist
            if dist == 0:
                # An exact match cannot be beaten; stop scanning.
                break
    return best_id

def parse_fq(in_dir, grep_str, out_dir, bc_start=9, bc_end=17, ref_ids=None):
    """Split merged FASTQ files into one FASTQ file per matched reference barcode.

    Every file in ``in_dir`` whose name ends with ``extendedFrags.fastq.gz``
    is read. For each read, the first hit of ``grep_str`` in the read sequence
    is taken as the target region, the slice ``[bc_start:bc_end]`` of that
    region is taken as the barcode, and the barcode is resolved to a reference
    ID with ``fuzzy_match_id``. Reads whose sequence does not match
    ``grep_str`` or whose barcode resolves to no reference ID are dropped.

    Parameters
    ----------
    in_dir : str
        Directory containing the gzip-compressed merged FASTQ files.
    grep_str : str
        Regular expression locating the target region inside a read.
    out_dir : str
        Directory receiving the split files; created if it does not exist.
    bc_start, bc_end : int, optional
        Slice bounds of the barcode inside the target region (default 9 and 17).
    ref_ids : iterable of str, optional
        Reference IDs to match barcodes against. Defaults to the keys of the
        module-level ``read_1`` table.

    Returns
    -------
    None
        Output is written to ``<out_dir>/<sample>_<ref_id>.fastq``, where
        ``<sample>`` is the input file name up to its first ``.``.

    Notes
    -----
    All matched records of one input file are held in memory until that file
    has been read completely.
    """
    if ref_ids is None:
        ref_ids = read_1.keys()
    # Materialise once so a one-shot iterable can be reused for every barcode.
    ref_ids = list(ref_ids)

    pattern = re.compile(grep_str)

    # Fail early (or create the directory) instead of after parsing a whole file.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for filename in os.listdir(in_dir):

        if not filename.endswith("extendedFrags.fastq.gz"):  # only consider merged reads
            continue

        file_path = os.path.join(in_dir, filename)
        print(file_path)

        records_by_ref = defaultdict(list)
        # barcode -> resolved reference ID (or None); barcodes repeat heavily,
        # so each distinct barcode is resolved against the references only once.
        resolved = {}

        with gzip.open(file_path, "rt") as handle:
            for record in tqdm(SeqIO.parse(handle, "fastq")):
                hits = pattern.findall(str(record.seq))
                if not hits:
                    continue

                mm_bc = hits[0][bc_start:bc_end]
                if mm_bc not in resolved:
                    resolved[mm_bc] = fuzzy_match_id(mm_bc, ref_ids)
                ref_id = resolved[mm_bc]

                if ref_id is not None:
                    records_by_ref[ref_id].append(record)

        sample = filename.split(".")[0]
        for ref_id, records in records_by_ref.items():
            out_fp = os.path.join(out_dir, sample + "_" + ref_id + ".fastq")
            with open(out_fp, "w") as output_handle:
                SeqIO.write(records, output_handle, "fastq")
                    
                
                        
# NOTE(review): parse_fq is defined above with three required positional parameters
# (in_dir, grep_str, out_dir). This call passes a single placeholder string and raises
# TypeError as written; supply the input directory, the target-region regular expression
# and the output directory before running. parse_fq has no return statement, so
# `parse_1` is always None — the useful result is the set of files written to out_dir.
parse_1 = parse_fq("provided input files")



In [None]:
# Now read in the parsed query FASTA files and the reference FASTA.
# For each mismatch barcode (mismatch-bc), compute the Levenshtein distance (LD) to the reference and use it as a proxy for editing efficiency.

from Bio import SeqIO
import Levenshtein
from tqdm.notebook import tqdm
import os
from Bio import SeqIO
import Levenshtein
import editdistance
from collections import defaultdict
import re

# Module-level accumulator filled by calculate_levenshtein() below:
# one entry per processed query file, mapping to {reference id: [distance, ...]}.
# Re-running this cell resets it to empty.
out_dict = {}

def read_fasta(file_path):
    """Load a FASTA file, grouping sequence strings by record id.

    Returns a ``defaultdict(list)`` mapping each record id to the list of
    sequences (as ``str``) carrying that id, in file order.
    """
    by_id = defaultdict(list)
    parsed = SeqIO.parse(file_path, "fasta")
    for identifier, sequence in ((rec.id, str(rec.seq)) for rec in parsed):
        by_id[identifier].append(sequence)
    return by_id

def fuzzy_match_id(query_id, ref_ids, max_mismatches=2):
    """Return the reference ID closest to ``query_id`` within an edit-distance tolerance.

    Parameters
    ----------
    query_id : str
        Identifier to look up.
    ref_ids : iterable of str
        Candidate reference identifiers; iterated once.
    max_mismatches : int, optional
        Largest edit distance (as computed by ``editdistance.eval``) that still
        counts as a match. Defaults to 2.

    Returns
    -------
    str or None
        The candidate with the smallest edit distance not exceeding
        ``max_mismatches``; the first such candidate in iteration order on ties.
        ``None`` when no candidate is within tolerance.

    Notes
    -----
    The closest candidate is returned, not merely the first one inside the
    tolerance. With a tolerance of 2, reference IDs that lie within two edits
    of each other would otherwise shadow one another: an exact match listed
    later would lose to an earlier near-match.
    """
    best_id, best_dist = None, None
    for ref_id in ref_ids:
        dist = editdistance.eval(query_id, ref_id)
        if dist > max_mismatches:
            continue
        if best_dist is None or dist < best_dist:
            best_id, best_dist = ref_id, dist
            if dist == 0:
                # An exact match cannot be beaten; stop scanning.
                break
    return best_id

def calculate_levenshtein(ref_sequences, query_sequences, sample_name=None):
    """Compute per-read Levenshtein distances between query and reference targets.

    For every query id that ``fuzzy_match_id`` resolves to a reference id, the
    reference target is the first ``CTTATGC[ATCG]+ACCGGT`` hit in the first
    reference sequence, with the 7-nt leading and 6-nt trailing flanks removed.
    The query target is ``query_seq[24:-6]``. Only reads whose target length
    differs from the reference target (i.e. reads with indels) get a real
    distance; equal-length reads are recorded as 0.

    Parameters
    ----------
    ref_sequences : mapping of str -> list of str
        Reference id -> sequences; only the first sequence per id is used.
    query_sequences : mapping of str -> list of str
        Query id -> read sequences.
    sample_name : str, optional
        Key under which the result is stored in the module-level ``out_dict``.
        When omitted, the module-level variable ``filename`` is used, which
        is what the driver loop relies on.

    Returns
    -------
    collections.defaultdict
        Reference id -> list of distances, one per processed read. The same
        object is stored in ``out_dict``.

    Raises
    ------
    IndexError
        If a matched reference sequence contains no target-region hit.
    """
    grep_str = r"CTTATGC[ATCG]+ACCGGT"
    results = defaultdict(list)
    for query_id, query_seq_lst in tqdm(query_sequences.items()):
        ref_id = fuzzy_match_id(query_id, ref_sequences.keys())
        if not ref_id or not query_seq_lst:
            continue

        # The reference target does not depend on the read: compute it once per id.
        ref_seq = ref_sequences[ref_id][0]
        ref_hit = re.search(grep_str, ref_seq)
        if ref_hit is None:
            raise IndexError(
                "reference %r contains no region matching %r" % (ref_id, grep_str)
            )
        # Strip the CTTATGC (7 nt) and ACCGGT (6 nt) flanks.
        tar_ref_seq = ref_hit.group(0)[7:-6].upper()

        for query_seq in query_seq_lst:
            # NOTE(review): assumes each query read carries a fixed 24-nt prefix
            # and 6-nt suffix around the target — confirm against the read layout.
            tar_query_seq = query_seq[24:-6]
            if len(tar_ref_seq) != len(tar_query_seq):  # only consider reads with indels
                ld = Levenshtein.distance(tar_ref_seq, tar_query_seq.upper())
            else:
                ld = 0
            results[ref_id].append(ld)

    out_dict[filename if sample_name is None else sample_name] = results
    return results

# Input locations: the reference FASTA and the directory holding the query FASTA files.
ref_fasta = 'ref.fa'
query_directory = 'query_dir'

# Load the reference sequences once, up front.
ref_sequences = read_fasta(ref_fasta)

# Walk the query directory and score every file whose name carries the expected suffix.
# The loop variable has to be called `filename`: calculate_levenshtein reads that
# module-level name to key its entry in out_dict.
for filename in tqdm(os.listdir(query_directory)):
    if not filename.endswith("grep_str"):  # skip anything that is not a query FASTA file
        continue
    file_path = os.path.join(query_directory, filename)
    query_sequences = read_fasta(file_path)
    levenshtein_distances = calculate_levenshtein(ref_sequences, query_sequences)

print(out_dict)


In [None]:
import pandas as pd

# sample -> Series (indexed by mismatch barcode) of the editing-efficiency proxy:
# the fraction of reads whose recorded distance is non-zero.
ee_dict = {}

for sample, ld_dict in out_dict.items():

    # Samples without any matched barcode contribute no column.
    if not ld_dict:
        continue

    # Build each Series in one step with an explicit dtype, instead of growing a
    # dtype-less empty Series one element at a time.
    ee_dict[sample] = pd.Series(
        {
            mm_bc: 1 - (ld_lst.count(0) / len(ld_lst))
            for mm_bc, ld_lst in ld_dict.items()
        },
        dtype=float,
    )

# Columns are samples, rows are mismatch barcodes; barcodes missing from a sample are NaN.
ee_df = pd.DataFrame(ee_dict)

ee_df.to_csv("out-file")
        
        
        