## Sequence Features


In [49]:
from Bio import SeqIO
import pandas as pd

# Function to remove letter 'O' from a sequence
def remove_letter_O(sequence):
    """Return ``sequence`` with every uppercase 'O' character deleted.

    Lowercase 'o' and all other characters are left untouched.
    """
    # Splitting on 'O' and re-joining the pieces drops each occurrence.
    pieces = sequence.split('O')
    return ''.join(pieces)

# Function to extract k-mer frequencies
def kmer_frequencies(seq, k):
    """Return the relative frequency of every overlapping k-mer in ``seq``.

    Args:
        seq: Sequence to scan; windows are taken with ``seq[i:i + k]``.
        k: Window length.

    Returns:
        A dict mapping each observed k-mer to ``count / total_windows``, in
        order of first occurrence. Empty when ``seq`` is shorter than ``k``.
    """
    tallies = {}
    window_count = len(seq) - k + 1
    for start in range(window_count):
        word = seq[start:start + k]
        tallies[word] = tallies.get(word, 0) + 1

    total = sum(tallies.values())
    # With no windows the comprehension is empty, so ``total == 0`` is never divided by.
    return {word: hits / total for word, hits in tallies.items()}

# Input FASTA file
fasta_file = "all_lncrna_location_1057.fasta"

# k-mer lengths to profile: dimers, trimers, tetramers, pentamers and hexamers
kmer_lengths = range(2, 7)

# Initialize a list to store feature data (one feature dict per FASTA record)
feature_data = []


# Loop through each sequence in the FASTA file
for record in SeqIO.parse(fasta_file, "fasta"):
    sequence_id = record.id

    # Remove 'O' from the sequence before computing any feature
    sequence = remove_letter_O(str(record.seq))
    seq_length = len(sequence)

    # Initialize a dictionary to store features for this sequence
    features = {}

    # Composition; an empty sequence yields 0.0 instead of dividing by zero
    for base in ['A', 'T', 'C', 'G']:
        features[f"{base}_freq"] = (sequence.count(base) / seq_length) if seq_length else 0.0

    # GC content (same empty-sequence guard as above)
    features["GC_content"] = (
        ((sequence.count('G') + sequence.count('C')) / seq_length) if seq_length else 0.0
    )

    # k-mer frequencies for every configured length. Only k-mers that occur in
    # this sequence get a key, so different records can have different key sets.
    for k in kmer_lengths:
        kmers = kmer_frequencies(sequence, k)
        for kmer, freq in kmers.items():
            features[f"{kmer}_freq"] = freq

    # Add sequence ID and class (you may need to modify this part)
    # For example, if your FASTA header contains class information:
    # class_info = sequence_id.split("|")[1]
    # features["Class"] = class_info

    feature_data.append(features)


## Structure Features

In [7]:
import pandas as pd
import re

# Heuristic dot-bracket patterns, compiled once. Raw strings keep the
# backslash escapes intact without relying on Python passing unknown escapes
# through unchanged.
_HAIRPIN_RE = re.compile(r'\(\.+?\)')
_BULGE_RE = re.compile(r'\(\.+\)|\)\.+?\(')
_INTERNAL_LOOP_RE = re.compile(r'\(\..+?\.\)')


def parse_rnafold_output_enhanced(filename):
    """Extract structure features from a file of dot-bracket structure lines.

    A line is treated as a structure line when it is not the first line of the
    file, does not start with '>', contains both '(' and ')', and splits into
    at least two whitespace-separated fields: the structure string followed by
    the energy in parentheses (presumably RNAfold output).

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one dict per structure line, with the keys
        ``Total_length``, ``MFE``, ``Loop_percentage``, ``Stem_percentage``,
        ``Hairpin_loops``, ``Bulges``, ``Internal_loops`` and a
        ``*_percentage`` companion for each of the three motif counts. The
        "percentage" values are fractions of the structure length (0-1), not
        values scaled to 100.

    Raises:
        ValueError: If the energy field of a structure line is not a number.
    """
    with open(filename, 'r') as f:
        lines = f.readlines()

    features_list = []
    # A structure line always follows another line (its sequence), so the
    # first line of the file is never a candidate.
    for line in lines[1:]:
        # FASTA headers may contain parentheses, e.g. ">id (note)"; they are
        # never structure lines.
        if line.lstrip().startswith('>'):
            continue
        if '(' not in line or ')' not in line:
            continue

        struct_info = line.strip().split()
        if len(struct_info) < 2:
            continue
        struct = struct_info[0]

        # The energy is printed width-padded, e.g. "( -1.20)", which splits
        # into "(" and "-1.20)". If the first field is only a parenthesis,
        # the number is in the next field.
        energy_text = struct_info[1].replace('(', '').replace(')', '')
        if not energy_text and len(struct_info) > 2:
            energy_text = struct_info[2].replace('(', '').replace(')', '')
        mfe = float(energy_text)

        # Compute features
        total_bases = len(struct)
        loop_percentage = struct.count('.') / total_bases
        paired_percentage = (struct.count('(') + struct.count(')')) / total_bases

        # Count occurrences of hairpins, bulges, and internal loops
        # (regex heuristics on the dot-bracket string, each evaluated once).
        hairpin_loops = len(_HAIRPIN_RE.findall(struct))
        bulges = len(_BULGE_RE.findall(struct))
        internal_loops = len(_INTERNAL_LOOP_RE.findall(struct))

        features_list.append({
            'Total_length': total_bases,
            'MFE': mfe,
            'Loop_percentage': loop_percentage,
            'Stem_percentage': paired_percentage,
            'Hairpin_loops': hairpin_loops,
            'Hairpin_loops_percentage': hairpin_loops / total_bases,
            'Bulges': bulges,
            'Bulges_percentage': bulges / total_bases,
            'Internal_loops': internal_loops,
            'Internal_loops_percentage': internal_loops / total_bases,
        })

    return features_list


## Triple helix motif

In [3]:
import re
from Bio import SeqIO
import pandas as pd

def has_triple_helix_motif(sequence):
    """Flag sequences whose end is a run of five or more 'A'/'G' characters.

    Returns:
        1 when the pattern ``[AG]{5,}$`` matches ``sequence``, otherwise 0.
        Matching is case-sensitive.
    """
    # Look for the purine run anchored at the end of the string
    tail_match = re.search('[AG]{5,}$', sequence)
    return int(tail_match is not None)

def extract_triple_helix_feature_from_fasta(input_file):
    """Map every record ID in a FASTA file to its triple-helix motif flag.

    Args:
        input_file: Path of the FASTA file to open and parse.

    Returns:
        A dict of ``record.id -> has_triple_helix_motif(sequence)``. If an ID
        appears more than once, the flag of its last record is kept.
    """
    with open(input_file, 'r') as handle:
        return {
            entry.id: has_triple_helix_motif(str(entry.seq))
            for entry in SeqIO.parse(handle, "fasta")
        }

# Path to the provided multi-FASTA file
input_file_path = "all_lncrna_location_1057.fasta"

# Extract features from the provided file: a dict of record ID -> 1 (motif found) or 0
triple_helix_features = extract_triple_helix_feature_from_fasta(input_file_path)

# Convert the features dictionary to a two-column DataFrame, one row per record ID
df_triple_helix = pd.DataFrame(list(triple_helix_features.items()), columns=["Sequence_ID", "Triple_Helix_Motif"])

In [1]:
from Bio import SeqIO
import pandas as pd
import re

def count_palindromes_by_length(sequence, min_length=12, max_length=20):
    """Count windows that read the same forwards and backwards, per length.

    The sequence is converted to an uppercase string, then every window of
    each length from ``min_length`` to ``max_length`` (inclusive) is compared
    with its own reversal. This is a plain string reversal; no complementing
    is applied.

    Returns:
        A dict mapping each window length to the number of such windows.
    """
    text = str(sequence).upper()
    counts = {}
    for size in range(min_length, max_length + 1):
        windows = (text[pos:pos + size] for pos in range(len(text) - size + 1))
        counts[size] = sum(1 for window in windows if window == window[::-1])
    return counts

def extract_palindrome_counts_from_fasta(input_file, min_length=12, max_length=20):
    """Build a table of per-length palindrome counts for every FASTA record.

    Args:
        input_file: FASTA source handed to ``SeqIO.parse``.
        min_length: Smallest window length to count (inclusive).
        max_length: Largest window length to count (inclusive).

    Returns:
        A DataFrame with a ``Sequence_ID`` column followed by one column per
        window length, one row per record.
    """
    rows = []
    for entry in SeqIO.parse(input_file, "fasta"):
        # ID first, so it is the leading column, then the per-length counts.
        row = {"Sequence_ID": entry.id}
        row.update(count_palindromes_by_length(entry.seq, min_length, max_length))
        rows.append(row)
    return pd.DataFrame(rows)

# FASTA file path for the palindrome features; no call using it is visible in this section
input_file_path = "all_lncrna_location_1057.fasta"


### ORF Features

In [2]:
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd

# Define the find_longest_orf function again
def find_longest_orf(sequence):
    """Find the longest ORF in a sequence and count all ORFs.

    An ORF starts at an 'ATG' and ends at the first in-frame 'TAA', 'TAG' or
    'TGA' that follows it. Only the three forward frames are scanned, and
    matching is case-sensitive. Every 'ATG' that is followed by an in-frame
    stop counts as one ORF, so several starts sharing a stop are counted
    separately.

    Args:
        sequence: Nucleotide string to scan.

    Returns:
        A tuple ``(longest_orf, orf_count)``. ``longest_orf`` is
        ``(orf_sequence, start, end, frame)`` with ``end`` exclusive and the
        stop codon included, or ``None`` when no ORF exists. On equal length
        the ORF found first (lowest frame, then lowest start) is kept.
    """
    start_codon = 'ATG'
    stop_codons = ('TAA', 'TAG', 'TGA')
    longest_orf = None
    orf_count = 0
    seq_len = len(sequence)

    for frame in range(3):
        # One pass per frame: remember the ATGs still waiting for a stop
        # rather than rescanning to the next stop from every ATG.
        first_start = None  # earliest ATG not yet closed by a stop
        open_starts = 0     # how many ATGs are waiting for a stop
        for pos in range(frame, seq_len, 3):
            codon = sequence[pos:pos + 3]
            if codon == start_codon:
                if first_start is None:
                    first_start = pos
                open_starts += 1
            elif open_starts and codon in stop_codons:
                # This stop closes every waiting ATG; each one is an ORF.
                orf_count += open_starts
                end = pos + 3
                # Only the earliest start can give the longest ORF for this stop.
                if longest_orf is None or (end - first_start) > len(longest_orf[0]):
                    longest_orf = (sequence[first_start:end], first_start, end, frame)
                first_start = None
                open_starts = 0
    return longest_orf, orf_count

# Collect ORF features per record; sequences without any ORF get all-zero values
fasta_file = "all_lncrna_location_1057.fasta"
feature_data = []

for record in SeqIO.parse(fasta_file, "fasta"):
    # Drop 'O' characters before searching for ORFs
    sequence = str(record.seq).replace('O', '')

    longest_orf, orf_count = find_longest_orf(sequence)

    # Start from the defaults used for sequences without ORFs
    features = {
        'Sequence_ID': record.id,
        'ORF_Length': 0,
        'ORF_GC_Content': 0,
        'ORF_Count': 0,
    }

    if longest_orf is not None:
        orf_seq, start, end, frame = longest_orf
        orf_length = len(orf_seq)
        gc_content = (orf_seq.count('G') + orf_seq.count('C')) / orf_length
        # Translation is computed but not stored in the feature row
        amino_acid_seq = Seq(orf_seq).translate()

        features['ORF_Length'] = orf_length
        features['ORF_GC_Content'] = gc_content
        features['ORF_Count'] = orf_count

    feature_data.append(features)

## Physico-chemical Features

In [26]:
from Bio import SeqIO
import pandas as pd
import re

from Bio import SeqIO

def calculate_properties(sequence, na_molar=0.05):
    """Calculate molecular weight, GC content, and Tm for an RNA sequence.

    Args:
        sequence: Uppercase nucleotide string. 'T' is treated as 'U', so
            transcripts written in the DNA alphabet are weighed correctly.
            Any other character contributes 0 to the molecular weight.
        na_molar: Na+ concentration in mol/L for the salt term of the Tm
            formula. Defaults to 0.05 (50 mM).

    Returns:
        A tuple ``(molecular_weight, gc_percent, tm)``. An empty sequence
        yields ``(0.0, 0.0, 0.0)`` instead of dividing by zero.

    Raises:
        ValueError: If ``na_molar`` is not positive (raised by ``math.log10``).
    """
    import math  # local import keeps this block self-contained

    # Molecular weights of individual nucleotides in g/mol
    mw = {
        'A': 347.2212,
        'U': 324.1813,
        'T': 324.1813,  # T in a DNA-alphabet transcript stands for U
        'G': 363.2206,
        'C': 323.1965
    }

    length = len(sequence)
    if length == 0:
        return 0.0, 0.0, 0.0

    # Calculate molecular weight of the sequence
    # NOTE(review): 79.0 per linkage is kept from the original code; confirm it
    # against the intended mass model (water loss per bond would be ~18).
    sequence_mw = sum(mw.get(nucleotide, 0) for nucleotide in sequence) - (length - 1) * 79.0
    # Calculate GC content
    gc_content = (sequence.count('G') + sequence.count('C')) / length
    # Calculate melting temperature (Tm) with the GC-based formula
    #   Tm = 81.5 + 16.6 * log10([Na+]) + 0.41 * %GC - 600 / N
    # The salt term is logarithmic in the Na+ concentration.
    tm = 81.5 + 16.6 * math.log10(na_molar) + 0.41 * gc_content * 100 - 600 / length
    return sequence_mw, gc_content * 100, tm


## Physico-chemical Features

In [28]:
# Main function to extract properties from the FASTA file and save to CSV
def calculate_properties_for_fasta(input_fasta, output_csv):
    """Compute per-record properties for a FASTA file and write them to CSV.

    Args:
        input_fasta: FASTA source handed to ``SeqIO.parse``.
        output_csv: Destination passed to ``DataFrame.to_csv`` (written
            without the index column).

    Returns:
        The DataFrame that was written, one row per record.
    """
    rows = []
    for entry in SeqIO.parse(input_fasta, "fasta"):
        # Sequences are uppercased before the properties are computed.
        weight, gc_percent, melting_temp = calculate_properties(str(entry.seq).upper())
        row = {
            "Sequence_ID": entry.id,
            "Molecular_Weight_g/mol": weight,
            "GC_Content_%": gc_percent,
            "Melting_Temperature_C": melting_temp,
        }
        rows.append(row)

    table = pd.DataFrame(rows)
    table.to_csv(output_csv, index=False)
    return table
# FASTA file path, presumably the input for calculate_properties_for_fasta
# (the extraction call itself is not visible in this section)
input_fasta_path = "all_lncrna_location_1057.fasta"