PATRICIA TREE (COMPRESSED TRIE)

In [2]:
#compressedtrie

class PatriciaTrieNode:
    """A node of the compressed trie.

    Edge labels live on the parent: ``children`` maps a label string to the
    child node reached by consuming that label.
    """

    def __init__(self):
        self.children = {}           # edge label -> PatriciaTrieNode
        self.is_end_of_word = False  # True when an inserted word ends here


class PatriciaTrie:
    """Compressed (Patricia) trie whose edges carry whole substrings."""

    def __init__(self):
        self.root = PatriciaTrieNode()

    def insert(self, word):
        """Insert ``word``, splitting an existing edge when only part of it matches.

        Each pass of the outer loop consumes one edge worth of ``word``:
          * an edge fully matched by the front of ``word`` is followed;
          * an edge sharing only a proper prefix is split at the shared prefix
            and the new intermediate node is followed;
          * if no edge shares a prefix, the remainder becomes a new leaf edge.

        The scan over the current node's edges stops as soon as one edge has
        been handled. Continuing the scan after descending would compare the
        shortened word against edges of the *previous* node, which could raise
        ``KeyError`` or add an empty-string edge.
        """
        node = self.root
        while word:
            # Snapshot the labels: the split branch mutates ``node.children``.
            for key in list(node.children):
                common_prefix = self._common_prefix(word, key)
                if not common_prefix:
                    continue
                if common_prefix == key:
                    # Whole edge matched: descend along it.
                    node = node.children[key]
                else:
                    # Partial match: split the edge at the shared prefix.
                    existing_node = node.children.pop(key)
                    new_node = PatriciaTrieNode()
                    new_node.children[key[len(common_prefix):]] = existing_node
                    node.children[common_prefix] = new_node
                    node = new_node
                word = word[len(common_prefix):]
                break
            else:
                # No edge shares a prefix: hang the remainder as a new leaf.
                leaf = PatriciaTrieNode()
                node.children[word] = leaf
                node = leaf
                word = ''
        node.is_end_of_word = True

    def count_frequency(self, sequence, pattern):
        """Count (possibly overlapping) occurrences of ``pattern`` in ``sequence``.

        This is a direct window-by-window comparison; it does not consult the
        nodes stored in the trie.
        """
        pattern_len = len(pattern)
        count = 0
        for i in range(len(sequence) - pattern_len + 1):
            if sequence[i:i + pattern_len] == pattern:
                count += 1
        return count

    def _common_prefix(self, str1, str2):
        """Return the longest common prefix of ``str1`` and ``str2``."""
        length = 0
        for left_char, right_char in zip(str1, str2):
            if left_char != right_char:
                break
            length += 1
        return str1[:length]

def load_fasta(file_path):
    """Return the sequence held in a FASTA file as a single string.

    The first line of the file is discarded and every newline character is
    removed from the remaining text.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    # Everything after the first newline is treated as sequence data.
    _first_line, _newline, body = contents.partition('\n')
    return body.replace('\n', '')

def count_pattern_in_exons_with_patricia_trie(sequence, exon_ranges, pattern):
    """Sum the occurrences of ``pattern`` over every exon segment.

    Each ``(start, end)`` pair is sliced as ``sequence[start-1:end]``, i.e. the
    pair is shifted down by one at the start before slicing. A ``PatriciaTrie``
    is created only to call its ``count_frequency`` method; nothing is inserted
    into it here.
    """
    counter = PatriciaTrie()
    return sum(
        counter.count_frequency(sequence[begin - 1:stop], pattern)  # Adjusting for 0-based indexing
        for begin, stop in exon_ranges
    )


TRIE

In [3]:
#trie
class TrieNode:
    """One character position in the trie."""

    def __init__(self):
        self.frequency = 0           # how many times a word ending here was inserted
        self.is_end_of_word = False  # True once some inserted word ends here
        self.children = {}           # next character -> TrieNode


class Trie:
    """Character-level trie that records how often each word was inserted."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word``, creating missing nodes, and bump its insertion count."""
        current = self.root
        for symbol in word:
            child = current.children.get(symbol)
            if child is None:
                child = TrieNode()
                current.children[symbol] = child
            current = child
        current.frequency += 1
        current.is_end_of_word = True

    def search(self, word):
        """Return how many times ``word`` was inserted (0 if it never was)."""
        current = self.root
        for symbol in word:
            current = current.children.get(symbol)
            if current is None:
                return 0
        if not current.is_end_of_word:
            return 0
        return current.frequency

def load_fasta(file_path):
    """Read a FASTA file and return its sequence as one string.

    The first line is skipped; all newline characters in the rest of the file
    are dropped.
    """
    with open(file_path, 'r') as handle:
        handle.readline()          # discard the first line
        remainder = handle.read()  # everything that follows
    return ''.join(remainder.split('\n'))

def count_pattern_in_exons(sequence, exon_ranges, pattern):
    """Count ``pattern`` inside the exon segments using a ``Trie``.

    Every window of ``len(pattern)`` characters from each segment
    ``sequence[start-1:end]`` is inserted into a fresh trie; the stored
    insertion count for ``pattern`` is then returned.
    """
    index = Trie()
    width = len(pattern)

    for begin, stop in exon_ranges:
        exon = sequence[begin - 1:stop]  # Adjusting for 0-based indexing
        last_offset = len(exon) - width
        offset = 0
        # Slide a fixed-width window across the exon, one position at a time.
        while offset <= last_offset:
            index.insert(exon[offset:offset + width])
            offset += 1

    return index.search(pattern)


BWT (BURROWS-WHEELER TRANSFORM)

In [4]:
import os

def read_fasta(file_path):
    """Return the sequence of a FASTA file: all lines after the first, stripped and joined."""
    with open(file_path, 'r') as handle:
        next(handle, None)  # skip the first line if there is one
        return "".join(row.strip() for row in handle)

def bwt_transform(sequence):
    """Return the Burrows-Wheeler Transform of ``sequence``.

    A ``$`` terminator is appended, every rotation of the terminated string is
    sorted, and the last character of each sorted rotation is collected.
    """
    terminated = sequence + "$"  # Add end of string marker
    size = len(terminated)
    # Rotation k of a string is the length-``size`` window starting at k in
    # the string concatenated with itself.
    doubled = terminated + terminated
    rotations = [doubled[shift:shift + size] for shift in range(size)]
    rotations.sort()
    return ''.join(rotation[-1] for rotation in rotations)

def bwt_search(bwt, pattern):
    """Count occurrences of ``pattern`` using backward search over a BWT string.

    Args:
        bwt: the transformed text (as produced by ``bwt_transform``).
        pattern: the string to count.

    Returns:
        The number of (possibly overlapping) occurrences. 0 is returned as
        soon as a pattern character is absent from ``bwt`` or the candidate
        interval becomes empty. An empty pattern yields ``len(bwt)``.

    Only the per-character totals and prefix counts of ``bwt`` are needed, so
    no suffix array or sorted first column is built.
    """
    # first_occurrence[c] = number of characters in ``bwt`` that sort before c,
    # i.e. the row where the block of c's starts in the sorted first column.
    first_occurrence = {}
    total = 0
    for char in sorted(set(bwt)):
        first_occurrence[char] = total
        total += bwt.count(char)

    # [low, high] is the inclusive interval of sorted rows whose prefix matches
    # the part of the pattern processed so far (initially every row).
    low, high = 0, len(bwt) - 1
    for char in reversed(pattern):
        if char not in first_occurrence:
            return 0
        low = first_occurrence[char] + bwt[:low].count(char)
        high = first_occurrence[char] + bwt[:high + 1].count(char) - 1
        if low > high:
            # Interval is empty; it cannot grow again.
            return 0

    return high - low + 1

def main():
    """Report how often a fixed pattern occurs in a FASTA sequence via the BWT.

    The file is read with ``read_fasta``, transformed with ``bwt_transform``
    and queried with ``bwt_search``; the count is printed.
    """
    pattern = "CCG"
    fasta_file_path = r"C:\users\yaaju\Downloads\NM_002025.4 (1).fa"  # Path to your FASTA file

    # Read the mRNA sequence, transform it, then run the backward search.
    transformed = bwt_transform(read_fasta(fasta_file_path))
    frequency = bwt_search(transformed, pattern)

    print(f"The pattern '{pattern}' appears {frequency} times in the mRNA sequence.")


if __name__ == "__main__":
    main()


The pattern 'CCG' appears 84 times in the mRNA sequence.


BLOOM FILTER

In [5]:
pip install pybloom_live

Note: you may need to restart the kernel to use updated packages.


In [6]:
import hashlib

class CountingBloomFilter:
    """Counting Bloom filter backed by a list of integer counters.

    ``size`` is the number of counters and ``num_hashes`` the number of
    counter positions derived for each item.
    """

    def __init__(self, size, num_hashes):
        self.bloom = [0 for _ in range(size)]
        self.num_hashes = num_hashes
        self.size = size

    def _hashes(self, item):
        """Return the ``num_hashes`` counter positions for ``item``.

        Position ``k`` is the MD5 digest of ``item`` with ``str(k)`` appended,
        read as a big-endian integer and reduced modulo ``size``.
        """
        return [
            int.from_bytes(hashlib.md5((item + str(salt)).encode()).digest(), "big") % self.size
            for salt in range(self.num_hashes)
        ]

    def add(self, item):
        """Increment every counter that ``item`` maps to."""
        for slot in self._hashes(item):
            self.bloom[slot] = self.bloom[slot] + 1

    def count(self, item):
        """Return the smallest counter among the positions of ``item``."""
        return min(map(self.bloom.__getitem__, self._hashes(item)))

def read_fasta(file_path):
    """Load a FASTA file and return its sequence, skipping the first line.

    Each remaining line is stripped of surrounding whitespace before the
    pieces are concatenated.
    """
    with open(file_path, 'r') as handle:
        rows = handle.readlines()
    pieces = [row.strip() for row in rows[1:]]
    return "".join(pieces)

def find_pattern_frequency(sequence, pattern, bloom_size, num_hashes):
    """Estimate how often ``pattern`` occurs in ``sequence``.

    Every window of ``len(pattern)`` characters is added to a new
    ``CountingBloomFilter(bloom_size, num_hashes)``; the filter's count for
    ``pattern`` is returned.
    """
    counter = CountingBloomFilter(bloom_size, num_hashes)
    width = len(pattern)
    windows = (
        sequence[pos:pos + width]
        for pos in range(len(sequence) - width + 1)
    )
    for window in windows:
        counter.add(window)
    return counter.count(pattern)

def main():
    """Print the Bloom-filter estimate of a fixed pattern's frequency in a FASTA file."""
    num_hashes = 3      # Number of hash functions
    bloom_size = 10000  # Adjust the size based on your needs
    pattern = "CCG"
    fasta_file_path = r"C:\users\yaaju\Downloads\NM_002025.4 (1).fa" # Path to your FASTA file

    # Read the mRNA sequence and query the Counting Bloom Filter in one step.
    frequency = find_pattern_frequency(
        read_fasta(fasta_file_path), pattern, bloom_size, num_hashes
    )

    print(f"The pattern '{pattern}' appears approximately {frequency} times in the mRNA sequence.")


if __name__ == "__main__":
    main()


The pattern 'CCG' appears approximately 84 times in the mRNA sequence.


SUFFIX ARRAY

In [7]:
def build_suffix_array(text):
    """Return the start indices of all suffixes of ``text`` in sorted suffix order."""
    # Sorting the start positions by the suffix they begin gives the array
    # directly; suffixes have distinct lengths, so there are no ties.
    return sorted(range(len(text)), key=lambda start: text[start:])

def suffix_array_search(sequence, pattern, suffix_array):
    """Count the suffixes of ``sequence`` that begin with ``pattern``.

    A binary search over ``suffix_array`` locates one matching suffix; the
    neighbours on both sides are then walked while they still match.
    Returns 0 when the binary search finds no match.
    """
    width = len(pattern)

    def window(rank):
        # First ``width`` characters of the suffix at position ``rank``.
        begin = suffix_array[rank]
        return sequence[begin:begin + width]

    lo, hi = 0, len(suffix_array) - 1
    while lo <= hi:
        pivot = (lo + hi) // 2
        candidate = window(pivot)

        if candidate != pattern:
            if pattern < candidate:
                hi = pivot - 1
            else:
                lo = pivot + 1
            continue

        # Look for multiple occurrences: extend the hit in both directions.
        below = pivot - 1
        while below >= 0 and window(below) == pattern:
            below -= 1
        above = pivot + 1
        while above < len(suffix_array) and window(above) == pattern:
            above += 1
        # Matching ranks are below+1 .. above-1 inclusive.
        return above - below - 1

    return 0

def main():
    """Print how often a fixed pattern occurs in a FASTA sequence, via a suffix array."""
    pattern = "CCG"
    fasta_file_path = r"C:\users\yaaju\Downloads\NM_002025.4 (1).fa"  # Path to your FASTA file

    # Read the mRNA sequence
    sequence = read_fasta(fasta_file_path)

    # Build the suffix array and count the matching suffixes in one expression.
    frequency = suffix_array_search(sequence, pattern, build_suffix_array(sequence))

    print(f"The pattern '{pattern}' appears approximately {frequency} times in the mRNA sequence.")


if __name__ == "__main__":
    main()


The pattern 'CCG' appears approximately 84 times in the mRNA sequence.


WAVELET TREE

In [None]:
class WaveletTree:
    """Wavelet tree over a sequence, supporting ``rank`` and single-symbol range counts.

    Each internal node splits its sorted alphabet in half. ``bit_vector[i]``
    is 0 when the i-th symbol of the node's data belongs to the lower half
    (and is forwarded to ``left``) and 1 otherwise (forwarded to ``right``).
    A node whose alphabet has at most one symbol is a leaf.

    Args:
        data: iterable of hashable symbols (e.g. a string).
        alphabet: optional sorted list of symbols; derived from ``data`` when
            omitted.
    """

    def __init__(self, data, alphabet=None):
        if alphabet is None:
            alphabet = sorted(set(data))
        self.alphabet = alphabet
        self.mid = len(alphabet) // 2
        self.left = self.right = None
        self.bit_vector = []

        # Set views of the alphabet for O(1) membership tests.
        self._symbols = set(alphabet)
        self._left_symbols = set(alphabet[:self.mid])
        # _ones_before[k] = number of 1 bits in bit_vector[:k]; lets rank()
        # avoid rescanning the bit vector on every call.
        self._ones_before = [0]

        # ``<= 1`` (not ``== 1``) so that empty input yields a leaf instead of
        # recursing forever on two empty halves.
        if len(alphabet) <= 1:
            self._size = sum(1 for _ in data)
            return

        left_alphabet = alphabet[:self.mid]
        right_alphabet = alphabet[self.mid:]

        left_data = []
        right_data = []

        for char in data:
            bit = 0 if char in self._left_symbols else 1
            self.bit_vector.append(bit)
            self._ones_before.append(self._ones_before[-1] + bit)
            (right_data if bit else left_data).append(char)

        self._size = len(self.bit_vector)
        self.left = WaveletTree(left_data, left_alphabet)
        self.right = WaveletTree(right_data, right_alphabet)

    def rank(self, char, index):
        """Return how many times ``char`` occurs in positions ``0..index`` inclusive.

        Returns 0 when ``char`` is not in this node's alphabet or when
        ``index`` is negative. ``index`` values past the end are clamped to
        the last position.
        """
        if char not in self._symbols:
            return 0
        index = min(index, self._size - 1)
        if index < 0:
            return 0
        if len(self.alphabet) == 1:
            return index + 1

        ones = self._ones_before[index + 1]
        if char in self._left_symbols:
            zeros = (index + 1) - ones
            # The child index is "count - 1"; -1 means no symbol went left yet.
            return self.left.rank(char, zeros - 1)
        return self.right.rank(char, ones - 1)

    def range_query(self, char, start, end):
        """Return how many times ``char`` occurs in positions ``start..end`` inclusive."""
        return self.rank(char, end) - self.rank(char, start - 1)

def read_fasta(file_path):
    """Return a FASTA file's sequence: every line but the first, stripped and concatenated."""
    chunks = []
    with open(file_path, 'r') as handle:
        for line_number, row in enumerate(handle):
            if line_number == 0:
                continue  # the first line is ignored
            chunks.append(row.strip())
    return "".join(chunks)

def wavelet_tree_search(sequence, pattern):
    """Count the positions of ``sequence`` where ``pattern`` matches.

    A ``WaveletTree`` is built over ``sequence``; a start position counts as a
    match when, for every pattern character, the single-position range query
    at the corresponding offset is positive.
    """
    tree = WaveletTree(sequence)
    width = len(pattern)
    hits = 0
    for begin in range(len(sequence) - width + 1):
        for offset, symbol in enumerate(pattern):
            where = begin + offset
            if not tree.range_query(symbol, where, where) > 0:
                break  # mismatch: skip the remaining pattern characters
        else:
            hits += 1
    return hits

def main():
    """Print how often a fixed pattern occurs in a FASTA sequence, via a wavelet tree."""
    pattern = "CCG"
    fasta_file_path = r"C:\users\yaaju\Downloads\NM_002025.4 (1).fa"  # Path to your FASTA file

    # Read the mRNA sequence and count the pattern with the Wavelet Tree search.
    frequency = wavelet_tree_search(read_fasta(fasta_file_path), pattern)

    print(f"The pattern '{pattern}' appears approximately {frequency} times in the mRNA sequence.")


if __name__ == "__main__":
    main()


DETERMINISTIC FINITE AUTOMATON

In [None]:
class DFA:
    """String-matching automaton for a single pattern.

    State ``k`` means "the last ``k`` characters read equal ``pattern[:k]``";
    state ``len(pattern)`` is the accepting state. Transitions are stored in
    ``transition_table`` keyed by ``(state, char)`` for characters that occur
    in the pattern; any other character sends the automaton back to state 0.
    """

    def __init__(self, pattern):
        self.pattern = pattern
        self.states = len(pattern) + 1
        self.alphabet = set(pattern)
        self.transition_table = self.build_transition_table()

    def build_transition_table(self):
        """Build the ``(state, char) -> next_state`` table.

        On a mismatch in state ``s`` the automaton must behave like the state
        reached by feeding ``pattern[1:s]`` through itself (the "fallback"
        state). Copying the fallback row and then overriding the matching
        character guarantees that the target state's prefix really is a suffix
        of the text read so far; checking only the last character is not
        sufficient. The accepting state gets a row too, so matching can
        continue after a hit.
        """
        transition_table = {}
        length = len(self.pattern)
        if length == 0:
            return transition_table

        for char in self.alphabet:
            transition_table[(0, char)] = 0
        transition_table[(0, self.pattern[0])] = 1

        fallback = 0  # state reached after reading pattern[1:state]
        for state in range(1, self.states):
            for char in self.alphabet:
                transition_table[(state, char)] = transition_table[(fallback, char)]
            if state < length:
                expected = self.pattern[state]
                transition_table[(state, expected)] = state + 1
                fallback = transition_table[(fallback, expected)]
        return transition_table

    def search(self, text):
        """Return the 0-based start index of every occurrence of the pattern in ``text``.

        Overlapping occurrences are all reported, because the automaton
        continues from the accepting state instead of being reset. For an
        empty pattern every position ``0..len(text)`` is returned.
        """
        if not self.pattern:
            return list(range(len(text) + 1))

        accepting = self.states - 1
        state = 0
        occurrences = []
        for i, char in enumerate(text):
            # Characters outside the pattern alphabet have no entry: restart.
            state = self.transition_table.get((state, char), 0)
            if state == accepting:
                occurrences.append(i - len(self.pattern) + 1)
        return occurrences

def read_fasta(file_path):
    """Return the sequence of a FASTA file, ignoring every line that starts with '>'.

    The remaining lines are stripped of surrounding whitespace and joined.
    """
    chunks = []
    with open(file_path, 'r') as handle:
        for row in handle.readlines():
            if row.startswith('>'):
                continue  # header line
            chunks.append(row.strip())
    return ''.join(chunks)

# Driver for the DFA matcher. Everything here runs at cell level, so
# `pattern`, `fasta_file`, `sequence`, `dfa` and `occurrences` become
# notebook-global names.

# Define the pattern to search for
pattern = "CCG"

# Read the sequence from the FASTA file
# NOTE(review): '/content/...' is a different location from the Windows path
# used by the other cells -- confirm which file this cell should read.
fasta_file ='/content/NM_002025.4.fasta'
sequence = read_fasta(fasta_file)

# Create the DFA and search for the pattern
dfa = DFA(pattern)
occurrences = dfa.search(sequence)

# Print the positions where the pattern is found
# (indices are whatever DFA.search returns; the frequency is their count)
print("Pattern found at positions:", occurrences)
print("Frequency:", len(occurrences))




KMP

In [8]:
def read_fasta(file_path):
    """Read a FASTA file and return the sequence found after its first line.

    Lines are stripped of surrounding whitespace and concatenated in order.
    """
    with open(file_path, 'r') as handle:
        handle.readline()  # the first line is ignored
        body_rows = list(handle)
    return "".join(map(str.strip, body_rows))

def kmp_preprocess(pattern):
    """Return the longest-prefix-suffix (LPS) array of ``pattern``.

    ``lps[i]`` is the length of the longest proper prefix of
    ``pattern[:i + 1]`` that is also a suffix of it.
    """
    lps = [0] * len(pattern)
    length = 0  # length of the previous longest prefix suffix
    i = 1

    while i < len(pattern):
        if pattern[i] == pattern[length]:
            length += 1
            lps[i] = length
            i += 1
        elif length != 0:
            # Retry with the next shorter border; ``i`` is not advanced.
            length = lps[length - 1]
        else:
            lps[i] = 0
            i += 1

    return lps

def kmp_search_in_ranges(sequence, pattern, ranges):
    """Find ``pattern`` inside each range of ``sequence`` with the KMP algorithm.

    Args:
        sequence: the text to scan.
        pattern: the string to look for; an empty pattern yields no matches.
        ranges: iterable of ``(start, end)`` pairs, used directly as 0-based
            indices into ``sequence``: scanning begins at ``start`` and stops
            before ``end``. Each range is clipped to the bounds of
            ``sequence``, so an ``end`` past the last character or a negative
            ``start`` is safe.

    Returns:
        List of 0-based start indices of every occurrence lying entirely
        inside a range; overlapping occurrences are included.
    """
    positions = []
    if not pattern:
        return positions

    lps = kmp_preprocess(pattern)
    pattern_length = len(pattern)
    sequence_length = len(sequence)

    for start, end in ranges:
        i = max(start, 0)                 # index for sequence
        stop = min(end, sequence_length)  # never read past the sequence
        j = 0                             # index for pattern
        while i < stop:
            if pattern[j] == sequence[i]:
                i += 1
                j += 1
                if j == pattern_length:
                    positions.append(i - j)
                    # Keep the longest border so overlapping hits are found.
                    j = lps[j - 1]
            elif j != 0:
                j = lps[j - 1]
            else:
                i += 1

    return positions

# Driver for the KMP search restricted to exon ranges. These assignments run
# at cell level and (re)bind the notebook-global names `fasta_file_path`,
# `pattern`, `exon_ranges`, `sequence`, `positions` and `frequency`.
fasta_file_path = r"C:\users\yaaju\Downloads\NM_002025.4 (1).fa"  # Path to your FASTA file
pattern = "CCG"
# NOTE(review): kmp_search_in_ranges starts scanning at index `start` itself,
# whereas the trie helpers in this notebook slice with `start-1`. Confirm
# whether these coordinates are meant to be 0-based or 1-based.
exon_ranges =  [(89, 284), (577, 4489)]

# Read the mRNA sequence
sequence = read_fasta(fasta_file_path)

# Search for the pattern using the KMP algorithm within exon ranges
positions = kmp_search_in_ranges(sequence, pattern, exon_ranges)
frequency = len(positions)

print(f"The pattern '{pattern}' appears {frequency} times in the exon ranges of the mRNA sequence at positions {positions}.")


The pattern 'CCG' appears 34 times in the exon ranges of the mRNA sequence at positions [92, 106, 110, 116, 119, 122, 128, 131, 134, 139, 167, 177, 189, 192, 224, 231, 241, 265, 281, 687, 1195, 1214, 1375, 2268, 2487, 2555, 2683, 3258, 3512, 3654, 3785, 3960, 4268, 4362].


In [7]:
import pandas as pd
import re

# Sample GENSCAN prediction table, embedded as a string literal so the parsing
# code that follows can run without an input file. The column names are given
# by the header row inside the string.
genscan_output = """


Gn.Ex Type S .Begin ...End .Len Fr Ph I/Ac Do/T CodRg P.... Tscr..

----- ---- - ------ ------ ---- -- -- ---- ---- ----- ----- ------



 1.01 Intr +     89    284  196  0  1   29   53   231 0.364  12.17

 1.02 Term +    577   4489 3913  1  1   54   40  2749 0.364 252.14

 1.03 PlyA +   5053   5058    6                               1.05


"""

# Regular expression to parse the GENSCAN output
# One capture group per column (13 in total). Fr and Ph need at least one
# digit, so a row with those fields blank -- such as the PlyA line in the
# sample -- is not matched and never reaches the table.
# NOTE: this rebinds the notebook-global name `pattern`, which other cells use
# for the search string.
pattern = re.compile(r'\s+(\d+\.\d+)\s+(\w+)\s+(\+|\-)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d*)\s+(\d*)\s+(\d*)\s+(\d*\.\d*)\s+(\d*\.\d*)')

# Find all matches in the output
# (findall returns one tuple of 13 strings per matched row)
matches = pattern.findall(genscan_output)

# Convert matches to a DataFrame
columns = ['Gn.Ex', 'Type', 'Strand', 'Begin', 'End', 'Len', 'Fr', 'Ph', 'I/Ac', 'Do/T', 'CodRg', 'P', 'Tscr']
data = pd.DataFrame(matches, columns=columns)

# Convert numeric columns to appropriate data types
# errors='coerce' turns anything unparsable (e.g. an empty capture) into NaN
numeric_cols = ['Begin', 'End', 'Len', 'Fr', 'Ph', 'I/Ac', 'Do/T', 'CodRg', 'P', 'Tscr']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Calculate exon ranges
exon_ranges = []

# Only rows typed 'Term' or 'Intr' are kept.
# NOTE(review): other exon types that GENSCAN may emit would be skipped by
# this filter -- confirm that is intended.
for idx, row in data.iterrows():
    if row['Type'] in ['Term', 'Intr']:
        exon_ranges.append((row['Begin'], row['End']))

# Print the exon ranges
print("exon range =", exon_ranges)


exon range = [(89, 284), (577, 4489)]


In [8]:
# Exon ranges used for this run: a single range (1, 10000).
# NOTE(review): the original comment said these come from "the paper" --
# confirm the source; this overrides any `exon_ranges` set by earlier cells.
exon_ranges =   [(1,10000)]


# Load the sequence from the provided FASTA file
fasta_file_path =r"C:\Users\yaaju\Downloads\mrnas_by_gene\klhl1 - Copy\XM_054374773.1.fna"
sequence = load_fasta(fasta_file_path)

# Calculate the frequency of the pattern in the specified exon segments.
# The variable names below say "cag", but the pattern counted is whatever
# `pattern` holds (here "CCG").
pattern="CCG"
cag_count_exons_correct_with_trie = count_pattern_in_exons(sequence, exon_ranges, pattern)
print(f"Count of {pattern} using Trie: {cag_count_exons_correct_with_trie}")
cag_count_exons_correct_with_patricia_trie = count_pattern_in_exons_with_patricia_trie(sequence, exon_ranges, pattern)
print(f"Count of {pattern} using Patricia Trie: {cag_count_exons_correct_with_patricia_trie}")

Count of CCG using Trie: 7
Count of CCG using Patricia Trie: 7
