# Load raw data

In [None]:
import pandas as pd

# Base name (without extension) of the raw CSV to load.
file_name = 'train1'
raw_csv_path = f'../data/raw/{file_name}.csv'
df = pd.read_csv(raw_csv_path)

# Show the first rows as a quick sanity check on the load.
preview = df.head()
print(preview)

# Set up suffix array for ChimeraARS

In [None]:
from chimera import calc_cARS, build_suffix_array

# Reference index built from the promoter column of every gene in df.
suffix_array = build_suffix_array(df['promoter'])

# Second reference index, restricted to the top 15% of rows ranked by mrna_level.
n = int(0.15 * len(df))
# reset_index() keeps the original row labels in an 'index' column and
# renumbers the selected rows from 0.
highly_expressed_genes = df.nlargest(n, 'mrna_level').reset_index()
highly_expressed_suffix_array = build_suffix_array(highly_expressed_genes['promoter'])

# Compute ChimeraARS with varying parameters

In [None]:
# Parameter grid for the sweep; every (max_len, max_pos) pair is evaluated.
max_len_options = [50, 80, 110, 150, 200]
max_pos_options = [0.5, 1, 1.5, 2]

promoters = df['promoter']
param_grid = [(length, pos) for length in max_len_options for pos in max_pos_options]

for max_len, max_pos in param_grid:
    sweep_kwargs = {'max_len': max_len, 'max_pos': max_pos}

    # Chimera ARS (cARS) scored against the index built from all genes
    df[f'chimeraARS_len{max_len}_pos{max_pos}'] = calc_cARS(
        promoters, suffix_array, **sweep_kwargs
    )

    # Chimera ARS (cARS) scored against the index built from highly expressed genes
    df[f'highly_expressed_chimeraARS_len{max_len}_pos{max_pos}'] = calc_cARS(
        promoters, highly_expressed_suffix_array, **sweep_kwargs
    )

    # The sweep is slow, so report each finished parameter pair.
    print(f'Completed max_len={max_len}, max_pos={max_pos}')

# Calculate Position-Specific ChimeraARS scores with varying parameters

In [None]:
# Window settings forwarded unchanged to calc_cARS for the position-specific score.
win_params = {'size': 150, 'center': 0, 'by_start': True, 'by_stop': True}

promoters = df['promoter']
param_grid = [(length, pos) for length in max_len_options for pos in max_pos_options]

for max_len, max_pos in param_grid:
    sweep_kwargs = {'max_len': max_len, 'max_pos': max_pos}

    # Position-Specific Chimera ARS (PScARS) scored against the all-genes index
    df[f'PS_chimeraARS_len{max_len}_pos{max_pos}'] = calc_cARS(
        promoters, suffix_array, win_params=win_params, **sweep_kwargs
    )

    # Position-Specific Chimera ARS (PScARS) scored against the highly-expressed-genes index
    df[f'highly_expressed_PS_chimeraARS_len{max_len}_pos{max_pos}'] = calc_cARS(
        promoters, highly_expressed_suffix_array, win_params=win_params, **sweep_kwargs
    )

    # The sweep is slow, so report each finished parameter pair.
    print(f'Completed max_len={max_len}, max_pos={max_pos}')

# Plot results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import linregress, spearmanr

# Scatter every numeric feature column against mrna_level, one panel per feature,
# annotated with the Spearman correlation and a least-squares trend line.
# Restricting to numeric dtypes keeps text/ID columns out of the correlation calls.
feature_cols = (
    df.select_dtypes(include='number')
    .columns.difference(['promoter', 'mrna_level'])
    .to_list()
)
if not feature_cols:
    raise ValueError('No numeric feature columns found to plot against mrna_level')

# Subplot layout: 3 columns, as many rows as needed (ceiling division).
cols = 3
rows = (len(feature_cols) + cols - 1) // cols
# squeeze=False always returns a 2-D array of axes, so flatten() is safe for any grid size.
fig, axes = plt.subplots(rows, cols, figsize=(18, 6 * rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, feature_cols):
    # Drop rows where either value is missing so the statistics are not NaN.
    pair = df[[feature, 'mrna_level']].dropna()
    corr, p_value = spearmanr(pair[feature], pair['mrna_level'])

    sns.scatterplot(data=pair, x=feature, y='mrna_level', ax=ax, alpha=0.6, s=30)

    # Linear trend line. The fit result is kept in its own object so its p-value
    # never overwrites the Spearman p-value reported in the title.
    try:
        fit = linregress(pair[feature], pair['mrna_level'])
    except ValueError:
        # linregress rejects degenerate input (e.g. all x values identical);
        # the scatter plot is still shown, just without a trend line.
        fit = None

    if fit is not None:
        x_range = [pair[feature].min(), pair[feature].max()]
        y_range = [fit.slope * x + fit.intercept for x in x_range]
        ax.plot(x_range, y_range, color='red', linestyle='--', linewidth=2,
                label=f'Linear fit (Pearson r = {fit.rvalue:.3f})')
        ax.legend()

    # Titles and labels
    ax.set_title(f'{feature}\n(Spearman r = {corr:.3f}, p = {p_value:.2e})',
                 fontsize=10, fontweight='bold')
    ax.set_xlabel(feature.replace('_', ' ').title(), fontsize=9)
    ax.set_ylabel('mRNA Level', fontsize=9)

    # Rotate x tick labels so long numeric labels do not overlap.
    ax.tick_params(axis='x', labelrotation=45, labelsize=8)
    ax.tick_params(axis='y', labelsize=8)

# Hide the unused panels in the last row (no dependence on a leaked loop index).
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

fig.suptitle('All Chimera Features vs mRNA Expression Level', fontsize=16, fontweight='bold', y=0.98)
# Reserve a strip at the top of the figure so the suptitle does not overlap the first row.
fig.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()

NameError: name 'feature_cols' is not defined

# Save results to file

In [None]:
from datetime import datetime
from pathlib import Path

# Month-day_hour-minute stamp so exports written in different minutes get distinct file names.
timestamp = datetime.now().strftime("%m%d_%H%M")

output_path = Path(f'../data/processed/{file_name}_chimera_features_{timestamp}.csv')
# Create the target folder up front: a missing directory would otherwise make
# to_csv fail and lose the results of the long feature computation.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)