In [233]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [234]:
# Location of the input table for Actinokineospora; later cells read its
# "Codon" and "Relative_Position" columns.
file_path = '/results/Actinokineospora/codon_positions_normalized.csv'
# Load the whole CSV into memory with pandas' default parsing options.
position_df = pd.read_csv(filepath_or_buffer=file_path)

In [235]:
# Folder that receives the CSV files written by this notebook.
# The folder is exposed under both names so that cells referring to either
# `output_dir` or `output_path` resolve to the same directory instead of
# raising NameError on a fresh kernel.
output_dir = '/results/Actinokineospora/'
output_path = output_dir

In [236]:
# Assign each row to a positional bin by floor-dividing Relative_Position by 10.
# NOTE(review): this yields bins 0-9 only if Relative_Position lies in [0, 100);
# a value of exactly 100 would land in an extra bin 10 — confirm the input range.
relative_position = position_df["Relative_Position"]
position_df["Position_Bin"] = relative_position.floordiv(10).astype(int)

# Build a Codon x Position_Bin table of occurrence counts; codon/bin pairs that
# never occur are filled with 0.
codon_bin_counts = (
    position_df
    .groupby(["Codon", "Position_Bin"])
    .size()
    .unstack(fill_value=0)
)

# Order the rows from the most frequent codon to the least frequent one.
codon_totals = codon_bin_counts.sum(axis=1).sort_values(ascending=False)
codon_bin_counts = codon_bin_counts.loc[codon_totals.index, :]

In [237]:
# Per-codon total: sum of the counts over every position bin (one value per row).
total_counts = codon_bin_counts.sum(axis=1)

# Turn counts into within-codon proportions: each row is divided by that
# codon's own total (division is aligned on the Codon index), so every row
# sums to 1.
df_normalized = codon_bin_counts.div(total_counts, axis=0)

In [238]:
# Rarity score: square root of the inverse total count, so codons with fewer
# occurrences receive larger values.
inverse_counts = 1 / total_counts
rarity = np.sqrt(inverse_counts)

# Min-max scale the scores: the smallest rarity maps to 0, the largest to 1.
# NOTE(review): if every codon has the same total the denominator is 0 and the
# result is NaN — confirm this cannot happen for the input data.
rarity_low = rarity.min()
rarity_high = rarity.max()
normalized_rarity = (rarity - rarity_low) / (rarity_high - rarity_low)

In [243]:
# Build the destination file path inside the results folder held in
# `output_path`. The folder variable is deliberately not reassigned, so
# re-running this cell still joins the file name onto the directory.
normalized_csv_path = os.path.join(output_path, 'df_bins_normalized_Actinokineospora.csv')
# Write the per-codon bin proportions; index=True keeps the Codon index as the
# first CSV column.
df_normalized.to_csv(normalized_csv_path, index=True)

In [245]:
# Labels "0".."9" for the ten position bins, as strings.
# NOTE(review): the bin columns of df_normalized built in this notebook carry
# integer labels (Position_Bin is cast to int); string labels would only match
# after a CSV round trip — confirm which form downstream cells expect.
bin_cols = list(map(str, range(10)))

# Attach the rarity score to the proportion table. Both objects are indexed by
# Codon, so an index-on-index left join keeps every row of df_normalized and
# appends one "Normalized_Rarity" column.
rarity_column = normalized_rarity.rename("Normalized_Rarity")
df_plot = df_normalized.join(rarity_column, how="left")
# df_plot: index = Codon, columns = position bins + Normalized_Rarity


In [246]:
# Build the destination file path inside the results folder held in
# `output_path`; the folder variable itself is left untouched so the cell can
# be re-run safely.
plot_csv_path = os.path.join(output_path, 'df_bins_normalized_plot_Actinokineospora.csv')
# Save the plot-ready table (bin proportions plus Normalized_Rarity). The
# previous version wrote df_normalized here, so the rarity column never reached
# the "_plot_" file. index=True keeps the Codon index as the first CSV column.
df_plot.to_csv(plot_csv_path, index=True)