# Calculate MFE on Cuperus Data

In [None]:
import pandas as pd
import numpy as np
from multiprocessing import Pool

# The ViennaRNA bindings are imported defensively: HAS_VIENNA records whether
# the import succeeded, and main_calculation() checks it before doing any work.
try:
    import RNA
    HAS_VIENNA = True
    print("ViennaRNA Loaded")
except ImportError:
    HAS_VIENNA = False
    RNA = None  # keep the name bound even when the package is unavailable

INPUT_FILE = 'Cuperus_InputData_TRIM.with_pred.csv'  # raw input CSV read by main_calculation()
OUTPUT_FILE = 'TRIM_data_with_MFE_processed.csv'  # CSV written with the added 'MFE' column
N_CORES = 16  # number of worker processes given to multiprocessing.Pool

# ==========================================
# Calculation Function
# ==========================================
def calculate_single_mfe(args):
    """
    Compute the minimum free energy (MFE) of a single UTR + CDS sequence.

    Args:
        args: A ``(utr, cds)`` pair, packed into one tuple so the function can
            be handed directly to ``Pool.map``. Either element may be missing
            (``None`` / NaN); a missing element contributes nothing to the
            folded sequence.

    Returns:
        The MFE value returned by ``RNA.fold`` for the upper-cased,
        whitespace-stripped concatenation ``utr + cds``. Returns ``0.0`` when
        there is no sequence to fold or when folding raises.
    """
    utr, cds = args

    # Missing values have to be discarded *before* str(): str(nan) is "nan",
    # which would otherwise be folded as if it were three real nucleotides
    # whenever the other field is present.
    parts = []
    for raw in (utr, cds):
        if pd.isna(raw):
            continue
        text = str(raw).upper().strip()
        # A textual "nan" is treated the same as a missing value.
        if text != 'NAN':
            parts.append(text)
    full_seq = "".join(parts)

    if not full_seq:
        return 0.0

    try:
        _structure, mfe = RNA.fold(full_seq)
    except Exception:
        # Deliberate best-effort: one sequence that cannot be folded must not
        # abort the whole parallel run, so it falls back to 0.0.
        return 0.0
    return mfe

def main_calculation():
    """
    Fold every sequence in INPUT_FILE and save the table with an 'MFE' column.

    Reads INPUT_FILE, builds one (utr, cds) pair per row from the
    'utr5_sequence' column and, when present, the 'cds_sequence' column
    (empty strings otherwise), computes the MFE of each pair in a pool of
    N_CORES worker processes, stores the values in a new 'MFE' column and
    writes the table to OUTPUT_FILE.

    Returns:
        None. If ViennaRNA is unavailable or the input file does not exist,
        a message is printed and nothing is written.
    """
    if not HAS_VIENNA:
        # Say why nothing happens instead of returning silently.
        print("ViennaRNA (RNA module) is not available; skipping MFE calculation.")
        return

    # 1. Load data
    print(f"Loading raw file: {INPUT_FILE}")
    try:
        df = pd.read_csv(INPUT_FILE)
    except FileNotFoundError:
        print("Input file not found.")
        return

    # 2. MFE calculation, one task per row, spread over the worker pool
    sequences_pairs = list(zip(df['utr5_sequence'], df.get('cds_sequence', [''] * len(df))))
    with Pool(processes=N_CORES) as pool:
        mfe_results = pool.map(calculate_single_mfe, sequences_pairs)

    # 3. Attach the results and save; pool.map preserves input order, so the
    # values line up with the dataframe rows.
    df['MFE'] = mfe_results
    df.to_csv(OUTPUT_FILE, index=False)

    # Report success only after the file has actually been written.
    print(f"Results written to {OUTPUT_FILE}")

# Run the MFE calculation only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main_calculation()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np

# CSV loaded by plot_mfe_analysis(); it must provide the 'MFE' and
# 'pred_TE_5U' columns used there.
DATA_FILE = 'TRIM_data_with_MFE_processed.csv' 

def plot_mfe_analysis():
    """
    Plot predicted TE against binned MFE and report their Pearson correlation.

    Loads DATA_FILE, splits the 'MFE' column into up to 10 quantile bins,
    draws a box plot of 'pred_TE_5U' per bin with the per-bin mean overlaid
    as a trend line, saves the figure to "MFE_TE_Analysis.png" and shows it.
    Finally prints the Pearson correlation between 'MFE' and 'pred_TE_5U'.

    Returns:
        None. If DATA_FILE does not exist, or fewer than two complete rows
        remain after dropping missing values, a message is printed and no
        figure is produced.
    """
    print(f"Loading data: {DATA_FILE}")
    try:
        df = pd.read_csv(DATA_FILE)
    except FileNotFoundError:
        print("Error: No file found.")
        return

    print(f"Data loaded with {len(df)} lines")

    # Rows lacking either value cannot be binned or correlated; a single NaN
    # would make the Pearson statistic undefined, so drop them up front.
    n_loaded = len(df)
    df = df.dropna(subset=['MFE', 'pred_TE_5U']).copy()
    n_dropped = n_loaded - len(df)
    if n_dropped:
        print(f"Dropped {n_dropped} lines with missing MFE or pred_TE_5U")
    if len(df) < 2:
        print("Error: Not enough data to analyse.")
        return

    # Quantile binning; duplicates="drop" merges bins whose edges coincide,
    # so fewer than 10 bins may result.
    df["MFE_Bin"] = pd.qcut(df["MFE"], q=10, duplicates="drop")
    original_categories = df['MFE_Bin'].cat.categories
    # Shorten the interval labels to one decimal for readable tick labels.
    new_labels = [f"({i.left:.1f}, {i.right:.1f}]" for i in original_categories]
    df['MFE_Bin'] = df['MFE_Bin'].cat.rename_categories(new_labels)

    # Plot Box-plot
    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 5))
    sns.boxplot(x='MFE_Bin', y='pred_TE_5U', data=df, palette="viridis", showfliers=False)
    plt.xlabel("Minimum Free Energy", fontsize=18, fontweight="bold")
    plt.ylabel("TRIM Predicted TE", fontsize=18, fontweight="bold")
    plt.xticks(rotation=45, ha='right', fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.5)

    # Plot mean trend line; boxes sit at integer x positions 0..n-1 in
    # category order, which is also the order of the grouped means.
    bin_means = df.groupby('MFE_Bin', observed=True)['pred_TE_5U'].mean()
    plt.plot(range(len(bin_means)), bin_means.values, color='#d62728', marker='o',
             linewidth=2.5, label='Mean Trend')
    plt.legend(loc='upper left', fontsize=16)
    plt.tight_layout()
    plt.savefig("MFE_TE_Analysis.png", dpi=600)  # Save figure
    plt.show()

    r, p = stats.pearsonr(df['MFE'], df['pred_TE_5U'])
    print(f"Pearson Correlation (R): {r:.4f}")
    print(f"Pearson p-value: {p:.3e}")

# Produce the MFE-vs-TE plot only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    plot_mfe_analysis()