In [18]:
# Import the Counter class
from collections import Counter


In [19]:
# Reverse transcribe RNA to DNA
def Reverse_Transcribe(rna_sequence):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'U': 'A'}
    return ''.join(complement.get(base, base) for base in rna_sequence)

In [20]:

# occurrences of each nucleotide
def count_nucleotides(sequence):
    return Counter(sequence)

In [21]:
#occurrences of n-grams
def count_n_grams(sequence, n):
    return Counter(sequence[i:i+n] for i in range(len(sequence)-n+1))


In [22]:
# Calculate percentage frequencies
def calculate_percentage_frequencies(counter, total):
    return {key: (value / total) * 100 for key, value in counter.items()}


In [23]:
# Compare frequencies and print differences above threshold
def compare_frequencies(frequencies1, frequencies2, threshold=3):
    for key in frequencies1:
        if key in frequencies2:
            diff = abs(frequencies1[key] - frequencies2[key])
            if diff >= threshold:
                print(f"{key}: {frequencies1[key]:.2f}% in Sequence 1, {frequencies2[key]:.2f}% in Sequence 2")


In [24]:
# Process sequence from file
def process_sequence_from_file(file_path):
    with open(file_path, 'r') as file:
        header = file.readline().strip()
        sequence = ''.join(line.strip() for line in file.readlines())
    dna_sequence = Reverse_Transcribe(sequence)
    nucleotide_counts = count_nucleotides(sequence)
    dinucleotide_counts = count_n_grams(sequence, 2)
    trinucleotide_counts = count_n_grams(sequence, 3)

    total_nucleotides = len(sequence)
    total_dinucleotides = total_nucleotides - 1
    total_trinucleotides = total_nucleotides - 2

    nucleotide_freq = calculate_percentage_frequencies(nucleotide_counts, total_nucleotides)
    dinucleotide_freq = calculate_percentage_frequencies(dinucleotide_counts, total_dinucleotides)
    trinucleotide_freq = calculate_percentage_frequencies(trinucleotide_counts, total_trinucleotides)

    return dna_sequence, nucleotide_freq, dinucleotide_freq, trinucleotide_freq

# Print frequencies
def print_frequencies(name, nucleotide_freq, dinucleotide_freq, trinucleotide_freq):
    print(f"\n{name} Frequencies:")
    print("Nucleotide\tPercentage")
    for nucleotide, frequency in nucleotide_freq.items():
        print(f"{nucleotide}\t{frequency:.2f}%")

    print("\nDinucleotide Frequencies:")
    print("Dinucleotide\tPercentage")
    for dinucleotide, frequency in dinucleotide_freq.items():
        print(f"{dinucleotide}\t{frequency:.2f}%")

    print("\nTrinucleotide Frequencies:")
    print("Trinucleotide\tPercentage")
    for trinucleotide, frequency in trinucleotide_freq.items():
        print(f"{trinucleotide}\t{frequency:.2f}%")


In [25]:
# Main function
def main():
    file1 = "/content/RNA-sequence1.fna"
    file2 = "/content/RNA-sequence2.fna"

    dna_seq1, nucleotide_freq1, dinucleotide_freq1, trinucleotide_freq1 = process_sequence_from_file(file1)
    dna_seq2, nucleotide_freq2, dinucleotide_freq2, trinucleotide_freq2 = process_sequence_from_file(file2)

    print_frequencies("Sequence 1", nucleotide_freq1, dinucleotide_freq1, trinucleotide_freq1)
    print_frequencies("Sequence 2", nucleotide_freq2, dinucleotide_freq2, trinucleotide_freq2)

    compare_frequencies(dinucleotide_freq1, dinucleotide_freq2)
    compare_frequencies(trinucleotide_freq1, trinucleotide_freq2)

if __name__ == "__main__":
    main()


Sequence 1 Frequencies:
Nucleotide	Percentage
C	23.16%
U	24.00%
A	22.72%
 	7.69%
G	22.44%

Dinucleotide Frequencies:
Dinucleotide	Percentage
CU	5.25%
UA	4.22%
AC	6.30%
CC	6.46%
AA	6.03%
C 	1.91%
 C	1.92%
CA	4.30%
AG	4.90%
GG	6.16%
G 	1.99%
 G	1.82%
GA	6.29%
GU	6.12%
UU	6.86%
UC	6.59%
UG	4.34%
CG	5.22%
GC	1.88%
AU	3.70%
 A	1.89%
U 	1.99%
 U	2.06%
A 	1.80%

Trinucleotide Frequencies:
Trinucleotide	Percentage
CUA	0.86%
UAC	0.90%
ACC	1.83%
CCC	1.62%
CCU	1.46%
UAA	1.06%
AAC	1.49%
AC 	0.53%
C C	0.66%
 CC	0.61%
CCA	1.16%
CAA	0.93%
AAA	2.11%
AAG	1.00%
AGG	1.33%
GGG	1.51%
GG 	0.63%
G G	0.58%
 GA	0.55%
GAG	1.66%
GGU	1.70%
GUA	1.07%
ACA	1.23%
CAC	1.41%
C G	0.45%
AGU	1.30%
GUU	1.13%
UUC	1.20%
UCU	1.53%
CUG	0.76%
UGA	1.21%
GAC	1.91%
 CG	0.47%
CGC	0.48%
GCG	0.40%
CGA	1.40%
GAU	0.90%
AUU	0.94%
UUU	2.93%
UC 	0.64%
C A	0.36%
 AA	0.58%
ACU	1.42%
CUC	1.87%
UCG	1.44%
GAA	1.35%
AGA	1.38%
UCA	1.13%
CAG	0.82%
AG 	0.51%
G A	0.62%
 AU	0.38%
AUC	0.82%
CGG	1.57%
GUG	1.30%
UGG	1.18%
G C	0.17%
 CA	0.39%
GUC	2.04%