## Make the bar plot associated with Fig. 5

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick # Import ticker for percentage formatting
import seaborn as sns
import io
import os

# Results table (tab-separated), taken from the results folder of the previous
# analysis step. Each row holds the metrics of one "<size>-<variation>" model.
data = """
Classification	Model	Accuracy	AUC	AUPR	MCC	F1 Score
Native vs. shuffled chain pairing 	8M-F	0.6341	0.6628	0.7067	0.3083	0.5143
Native vs. shuffled chain pairing 	8M-H	0.6272	0.65	0.6704	0.2737	0.5445
Native vs. shuffled chain pairing 	8M-Q	0.6319	0.6584	0.6872	0.2856	0.5448
Native vs. shuffled chain pairing 	35M-F	0.6531	0.6903	0.733	0.3541	0.5367
Native vs. shuffled chain pairing 	35M-H	0.6377	0.6665	0.7095	0.3168	0.5186
Native vs. shuffled chain pairing 	35M-Q	0.6256	0.6603	0.6951	0.2659	0.552
Native vs. shuffled chain pairing 	150M-F	0.6435	0.6751	0.711	0.3176	0.5467
Native vs. shuffled chain pairing 	150M-H	0.6378	0.6685	0.7036	0.3059	0.5374
Native vs. shuffled chain pairing 	150M-Q	0.6381	0.6682	0.6994	0.3118	0.529
Native vs. shuffled chain pairing 	350M-F	0.6698	0.7173	0.7643	0.3878	0.5646
Native vs. shuffled chain pairing 	350M-H	0.6486	0.6891	0.7184	0.3355	0.5426
Native vs. shuffled chain pairing 	350M-Q	0.6191	0.6499	0.6732	0.2618	0.5196
Native vs. shuffled chain pairing 	650M-F	0.6609	0.7019	0.7464	0.364	0.5574
Native vs. shuffled chain pairing 	650M-H	0.595	0.6228	0.6245	0.1955	0.5418
Native vs. shuffled chain pairing 	650M-Q	0.5801	0.6023	0.59	0.1697	0.4975
"""

# Plotting order of the model sizes (x-axis) and of the data variations (hue).
model_size_order = ['8M', '35M', '150M', '350M', '650M']
variation_order = ['F', 'H', 'Q']

# Parse the table, trim whitespace from the header names and discard the
# 'Classification' column, which is not needed for the plot.
df = (
    pd.read_csv(io.StringIO(data), sep='\t')
    .rename(columns=str.strip)
    .drop(columns=['Classification'])
)

# A model name such as "350M-H" is "<size>-<variation>"; store both parts
# in their own columns.
model_parts = df['Model'].str.split('-', expand=True)
df[['Size', 'Variation']] = model_parts

# Turn both columns into ordered categoricals that follow the plotting order.
for column, ordering in (('Size', model_size_order), ('Variation', variation_order)):
    df[column] = pd.Categorical(df[column], categories=ordering, ordered=True)

# --- Grouped bar plot of a single metric (Accuracy) ---

# Column of `df` drawn on the y-axis.
metric_to_plot = 'Accuracy'

# Global seaborn look first, then the canvas the plot is drawn on.
sns.set_style("whitegrid")
sns.set_context("talk")
plt.figure(figsize=(14, 8))

# Fixed variation -> colour mapping, kept consistent with the rest of the
# analysis: F (full data) = blue, H (half data) = orange, Q (quarter data) = green.
colors = sns.color_palette("colorblind", n_colors=3)
palette_dict = dict(zip(('F', 'H', 'Q'), colors))

# One group of bars per model size, one bar per data variation.
barplot_options = dict(
    x='Size',
    y=metric_to_plot,
    hue='Variation',
    order=model_size_order,
    hue_order=variation_order,
    palette=palette_dict,
    edgecolor='black',
)
ax = sns.barplot(data=df, **barplot_options)

# Annotate every bar with its height written as a percentage (one decimal).
for bar_group in ax.containers:
    ax.bar_label(
        bar_group,
        labels=[f'{v*100:.1f}%' for v in bar_group.datavalues],
        fontsize=10,
        padding=3,
    )

# --- Axis styling, legend and layout ---

# Axis titles.
ax.set_ylabel('Accuracy (%)', fontsize=20)
ax.set_xlabel('Model Size', fontsize=20)

# Font size of the tick labels on both axes.
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# The plotted values are fractions, so xmax=1.0 makes 1.0 read as "100%".
percent_ticks = mtick.PercentFormatter(xmax=1.0)
ax.yaxis.set_major_formatter(percent_ticks)

# Dashed horizontal grid lines only.
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Fixed y-range of the plot.
ax.set_ylim(bottom=0.0, top=0.75)

# --- Legend with descriptive labels ---
# Reuse the handles of the bars but replace the labels with readable names
# (listed in the same order as the handles).
handles = ax.get_legend_handles_labels()[0]
new_labels = ['Full data', 'Half data', 'Quarter data']
legend_options = dict(
    title='Data Amount',
    title_fontsize=17,
    fontsize=16,
    loc='upper left',
    bbox_to_anchor=(1, 1),  # anchor the legend just outside the right edge of the axes
)
ax.legend(handles=handles, labels=new_labels, **legend_options)

# Keep the right 15% of the figure free for the legend placed outside the axes.
plt.tight_layout(rect=[0, 0, 0.85, 1])

# --- Save the figure, then display it ---
output_dir = './output_plots/'
# savefig does not create missing directories; without this a fresh checkout
# fails with FileNotFoundError before the plot is ever written.
os.makedirs(output_dir, exist_ok=True)
# The file name has no extension, so matplotlib picks the output format itself
# (its default savefig format, normally PNG).
plt.savefig(os.path.join(output_dir, "paired_classification"), dpi=300, bbox_inches='tight')
plt.show()