# Codon Readthrough Analyses

This notebook analyzes the processed readthrough data in `drosophila_readthrough_analysis.csv`, comparing ORF2 lengths between readthrough and non-readthrough genes.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn's 'whitegrid' style and a 12x6
# default figure size (individual cells may override figsize).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load the readthrough table from disk, then report how many rows it has
# (each row is counted as one gene in the message) and which columns exist.
csv_path = 'drosophila_readthrough_analysis.csv'
df = pd.read_csv(csv_path)
print(
    f"Loaded {len(df)} genes",
    f"\nColumns: {list(df.columns)}",
    sep="\n",
)
# Bare last expression so the notebook renders a preview of the first rows.
df.head()

## Analysis 1: Genes with Second Stop Codon

In [None]:
# Create subset of transcripts that have a second stop codon (in 3' UTR).
#
# pandas.read_csv (default settings) loads empty CSV fields as NaN, and
# `NaN != ''` evaluates to True, so comparing against '' alone keeps every
# row. Treat both NaN and blank/whitespace-only values as "no second stop".
second_stop = df['first_stop_after_canonical']
has_second_stop = second_stop.notna() & second_stop.astype(str).str.strip().ne('')
df_with_second_stop = df[has_second_stop].copy()

n_total = len(df)
n_second_stop = len(df_with_second_stop)
# Guard against ZeroDivisionError when the table is empty; report NaN instead.
pct_second_stop = n_second_stop / n_total * 100 if n_total else float('nan')

print(f"Total genes: {n_total}")
print(f"Genes with second stop codon: {n_second_stop}")
print(f"Percentage with second stop: {pct_second_stop:.2f}%")

# Breakdown by readthrough status
# NOTE(review): assumes 'has_readthrough' was parsed as a boolean column
# (needed for boolean-mask indexing and `~`) — confirm against the CSV.
is_readthrough = df_with_second_stop['has_readthrough']
print("\nBreakdown by readthrough status:")
print(f"  Readthrough genes with second stop: {len(df_with_second_stop[is_readthrough])}")
print(f"  Non-readthrough genes with second stop: {len(df_with_second_stop[~is_readthrough])}")

In [None]:
# Calculate ORF2 length (in nucleotides and amino acids).
# Rows with a missing sequence get NaN lengths (.str.len() propagates NaN).
df_with_second_stop['orf2_length_nt'] = df_with_second_stop['orf2_nucleotide'].str.len()

# Strip '*' characters (presumably stop-codon markers in the translation —
# confirm) before counting residues. regex=False is explicit because '*' is
# a regex metacharacter and the default of `regex` differs between pandas
# versions; a literal match is what is intended here.
df_with_second_stop['orf2_length_aa'] = (
    df_with_second_stop['orf2_translation']
    .str.replace('*', '', regex=False)
    .str.len()
)

print("ORF2 length statistics (amino acids):")
print(df_with_second_stop.groupby('has_readthrough')['orf2_length_aa'].describe())

In [None]:
# Two-panel figure comparing ORF2 length (amino acids) between readthrough
# and non-readthrough genes: overlaid density histograms (left) and box
# plots (right). The figure is also written to 'orf2_length_distribution.png'.
max_length_aa = 250  # upper display limit shared by both panels
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Separate data by readthrough status
is_readthrough = df_with_second_stop['has_readthrough']
orf2_readthrough = df_with_second_stop.loc[is_readthrough, 'orf2_length_aa']
orf2_non_readthrough = df_with_second_stop.loc[~is_readthrough, 'orf2_length_aa']

# Panel 1: Overlaid histograms (normalized by frequency).
# The n in each legend entry counts every value in the group, including
# values that fall outside the plotted 0..max_length_aa range.
hist_groups = (
    (orf2_readthrough, 'Readthrough', 'red'),
    (orf2_non_readthrough, 'Non-readthrough', 'blue'),
)
for lengths, group_name, color in hist_groups:
    hist_ax.hist(lengths, bins=50, range=(0, max_length_aa), alpha=0.6,
                 label=f'{group_name} (n={len(lengths)})',
                 color=color, edgecolor='black', density=True)
hist_ax.set_xlabel('ORF2 Length (amino acids)')
hist_ax.set_ylabel('Density')
hist_ax.set_title('ORF2 Length Distribution: Readthrough vs Non-readthrough')
hist_ax.set_xlim(0, max_length_aa)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Panel 2: Side-by-side box plots.
# Values above max_length_aa are removed BEFORE the box statistics are
# computed, so the quartiles describe the truncated data only.
box_data = [
    orf2_readthrough[orf2_readthrough <= max_length_aa],
    orf2_non_readthrough[orf2_non_readthrough <= max_length_aa],
]
# Label the boxes via set_xticklabels rather than boxplot(labels=...):
# that keyword is deprecated in newer matplotlib releases (renamed
# tick_labels), while set_xticklabels works on old and new versions alike.
box_ax.boxplot(box_data)
box_ax.set_xticklabels(['Readthrough', 'Non-readthrough'])
box_ax.set_ylabel('ORF2 Length (amino acids)')
box_ax.set_title('ORF2 Length Distribution (Box Plot)')
box_ax.set_ylim(0, max_length_aa)
box_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('orf2_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nPlot saved as 'orf2_length_distribution.png'")

In [None]:
# Statistical test to compare ORF2 lengths
from scipy import stats

# Drop missing lengths before testing: a NaN in either sample propagates
# through mannwhitneyu (default NaN handling) and yields a NaN p-value,
# which the significance line below would then report as "No". The
# median/mean calls already skip NaN, so this keeps the test and the
# descriptive statistics on the same set of values.
rt_lengths = orf2_readthrough.dropna()
non_rt_lengths = orf2_non_readthrough.dropna()

# Mann-Whitney U test (non-parametric)
statistic, pvalue = stats.mannwhitneyu(rt_lengths, non_rt_lengths, alternative='two-sided')

print("Statistical comparison of ORF2 lengths:")
print("Mann-Whitney U test:")
print(f"  U statistic: {statistic:.2f}")
print(f"  P-value: {pvalue:.4e}")
print(f"  Significant at α=0.05: {'Yes' if pvalue < 0.05 else 'No'}")

# Median and mean ORF2 length for each group
length_groups = (
    ("Readthrough genes", rt_lengths),
    ("Non-readthrough genes", non_rt_lengths),
)
for group_title, lengths in length_groups:
    print(f"\n{group_title}:")
    print(f"  Median ORF2 length: {lengths.median():.1f} aa")
    print(f"  Mean ORF2 length: {lengths.mean():.1f} aa")