In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import matplotlib.ticker as ticker
import matplotlib as mpl
from scipy.stats import gaussian_kde, ttest_ind

# Use a serif family for all text; 'font.serif' lists the candidate typefaces
# with Times New Roman first.
mpl.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'Times', 'DejaVu Serif', 'serif'],
})

# Continuous descriptors: plot_density draws one KDE panel per entry.
DENSITY_FEATURES = [
    'Amino Acid based volume Score', 'Pocket volume (Monte Carlo)',
    'Local hydrophobic density Score', 'Hydrophobicity Score',
    'Polarity Score',
]

# Discrete descriptors: plot_discrete_distributions draws one bar panel per
# entry.
HISTOGRAM_FEATURES = [
    'Aromatic', 'Tryptophan - W', 'Phenylalanine - F', 'Tyrosine - Y',
    'Histidine - H',
]

# Column holding the numeric class code.
CLASS_COL = 'Cyclical'

# Class code -> display name stored in the derived 'Group' column.
LABEL_MAP = {
    0: 'Linalool',
    1: 'Limonene',
}

# Display name -> fill colour shared by both plot types.
HISTOGRAM_COLORS = {
    'Linalool': '#F46920',
    'Limonene': '#1f77b4',
}

# Helper to make tick labels bold
def _bold_ticklabels(ax, size=13):
    """Render the tick labels of both axes of ``ax`` in bold at font size ``size``.

    Args:
        ax: Object exposing ``get_xticklabels()`` and ``get_yticklabels()``,
            whose returned labels provide ``set_fontweight`` and
            ``set_fontsize``.
        size: Font size applied to every tick label.
    """
    per_axis_labels = (ax.get_xticklabels(), ax.get_yticklabels())
    for labels in per_axis_labels:
        for tick_label in labels:
            tick_label.set_fontweight('bold')
            tick_label.set_fontsize(size)

# Plot KDE density plots for continuous features
def plot_density(data, out_path, show_titles=False, show_legend=False):
    """Save a row of filled KDE panels, one per entry of DENSITY_FEATURES.

    Each panel overlays one density curve per value of ``data['Group']``,
    coloured through HISTOGRAM_COLORS. A group with fewer than two distinct
    non-null values for a feature is left out of that panel, because a
    Gaussian KDE cannot be fitted to it.

    Args:
        data: DataFrame with a 'Group' column (its values must be keys of
            HISTOGRAM_COLORS) and one column per DENSITY_FEATURES entry.
        out_path: Destination handed to ``Figure.savefig``; written at 600 dpi.
        show_titles: If true, add a figure title and a title on every panel.
        show_legend: If true, draw a frameless legend with bold text on every
            panel; otherwise any existing legend is removed.
    """
    n_panels = len(DENSITY_FEATURES)
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6))
    # plt.subplots hands back a bare Axes (not an array) for a single panel.
    panels = axes if n_panels > 1 else [axes]

    # try/finally guarantees the figure is released even when plotting or
    # saving fails, so repeated runs do not accumulate open figures.
    try:
        if show_titles:
            fig.suptitle('Density Plots by Group', fontsize=22, fontweight='bold')

        for ax, feature in zip(panels, DENSITY_FEATURES):
            for group_name, group_data in data.groupby('Group'):
                x = group_data[feature].dropna()
                # gaussian_kde raises on constant data (singular covariance),
                # so require two distinct values rather than two rows.
                if x.nunique() < 2:
                    continue
                kde = gaussian_kde(x)
                lo, hi = x.min(), x.max()
                # Extend the evaluation grid 10% beyond the data range so the
                # curve tails are not cut off at the extremes.
                padding = (hi - lo) * 0.1
                x_vals = np.linspace(lo - padding, hi + padding, 200)
                ax.fill_between(
                    x_vals, kde(x_vals), alpha=0.5,
                    label=group_name, color=HISTOGRAM_COLORS[group_name],
                )

            if show_titles:
                ax.set_title(f'Density: {feature}', fontsize=18, fontweight='bold')
            ax.set_xlabel(feature, fontsize=20, fontweight='bold')
            ax.set_ylabel('', fontsize=0)
            ax.tick_params(axis='both', which='major', width=2.0)
            for spine in ax.spines.values():
                spine.set_linewidth(1.6)
            _bold_ticklabels(ax, size=13)

            if show_legend:
                leg = ax.legend(fontsize=13, frameon=False)
                for t in leg.get_texts():
                    t.set_fontweight('bold')
            else:
                existing_legend = ax.get_legend()
                if existing_legend is not None:
                    existing_legend.remove()

        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure". The rect leaves the top 5% free for the suptitle.
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        fig.savefig(out_path, dpi=600, bbox_inches='tight')
    finally:
        plt.close(fig)

# Plot bar charts for discrete amino acid features
def plot_discrete_distributions(data, out_path, show_titles=False, show_legend=False):
    """Save a row of grouped bar panels, one per entry of HISTOGRAM_FEATURES.

    For each feature, the bar heights are the proportion of rows within each
    'Group' that take each observed feature value; combinations that never
    occur are drawn with proportion 0.

    Args:
        data: DataFrame with a 'Group' column (its values must be keys of
            HISTOGRAM_COLORS) and one column per HISTOGRAM_FEATURES entry.
        out_path: Destination handed to ``Figure.savefig``; written at 600 dpi.
        show_titles: If true, add a figure title and a title on every panel.
        show_legend: If true, draw a frameless, untitled legend with bold text
            on every panel; otherwise the legend seaborn created is removed.
    """
    n_panels = len(HISTOGRAM_FEATURES)
    fig, axes = plt.subplots(1, n_panels, figsize=(6 * n_panels, 6))
    # plt.subplots hands back a bare Axes (not an array) for a single panel.
    panels = axes if n_panels > 1 else [axes]

    # try/finally guarantees the figure is released even when plotting or
    # saving fails, so repeated runs do not accumulate open figures.
    try:
        if show_titles:
            fig.suptitle('Discrete Distributions by Group', fontsize=22, fontweight='bold')

        for ax, feature in zip(panels, HISTOGRAM_FEATURES):
            # Rows: groups; columns: observed feature values; cells: row counts.
            counts = data.groupby(['Group', feature]).size().unstack(fill_value=0)
            # Normalise each group's row to sum to 1 so groups of different
            # sizes can be compared on the same axis.
            proportions = counts.div(counts.sum(axis=1), axis=0)
            # Long format (feature value, Group, Proportion) for seaborn.
            freq_long = proportions.T.reset_index().melt(
                id_vars=feature, var_name='Group', value_name='Proportion'
            )

            sns.barplot(
                data=freq_long, x=feature, y='Proportion', hue='Group', ax=ax,
                palette=HISTOGRAM_COLORS, edgecolor='black'
            )

            if show_titles:
                ax.set_title(feature, fontsize=18, fontweight='bold')
            ax.set_xlabel(f'Count of {feature}', fontsize=20, fontweight='bold')
            ax.set_ylabel('', fontsize=0)
            ax.tick_params(axis='both', which='major', width=2.0)
            for spine in ax.spines.values():
                spine.set_linewidth(1.6)
            _bold_ticklabels(ax, size=13)

            if show_legend:
                leg = ax.legend(fontsize=13, frameon=False, title=None)
                for t in leg.get_texts():
                    t.set_fontweight('bold')
            else:
                existing_legend = ax.get_legend()
                if existing_legend is not None:
                    existing_legend.remove()

        # Operate on this figure explicitly instead of pyplot's implicit
        # "current figure". The rect leaves the top 5% free for the suptitle.
        fig.tight_layout(rect=[0, 0, 1, 0.95])
        fig.savefig(out_path, dpi=600, bbox_inches='tight')
    finally:
        plt.close(fig)

# Main function: load data, create output folder, plot, and run t-tests
def main():
    """Read FullDataset.csv, save both figures under PlotsOut/, print statistics.

    Derives a 'Group' column from CLASS_COL via LABEL_MAP and an
    'Aromatic_per_residue' column, then prints the mean, SEM and count of each
    group plus the p-value of an unequal-variance t-test for every continuous
    feature, 'Aromatic' and 'Aromatic_per_residue'.
    """
    file_path = 'FullDataset.csv'
    out_dir = 'PlotsOut'
    # The output folder is created before the CSV is read.
    os.makedirs(out_dir, exist_ok=True)

    data = pd.read_csv(file_path)
    data['Group'] = data[CLASS_COL].map(LABEL_MAP)
    data['Aromatic_per_residue'] = data['Aromatic'] / data['Number of residues']

    # Figures go into the output folder.
    density_png = os.path.join(out_dir, 'density_plots.png')
    discrete_png = os.path.join(out_dir, 'discrete_distributions.png')
    plot_density(data, density_png, show_titles=False, show_legend=False)
    plot_discrete_distributions(data, discrete_png, show_titles=False, show_legend=False)

    # Per-feature summary: the row masks do not depend on the feature, so they
    # are built once up front.
    print("\n=== Summary Statistics and T-Tests ===")
    is_linalool = data['Group'] == 'Linalool'
    is_limonene = data['Group'] == 'Limonene'
    for feature in [*DENSITY_FEATURES, 'Aromatic', 'Aromatic_per_residue']:
        linalool = data.loc[is_linalool, feature].dropna()
        limonene = data.loc[is_limonene, feature].dropna()
        # equal_var=False: the two groups are not assumed to share a variance.
        welch = ttest_ind(linalool, limonene, equal_var=False)
        print(f"{feature}:")
        print(f"  Linalool: Mean = {linalool.mean():.3f}, SEM = {linalool.sem():.3f}, Count = {len(linalool)}")
        print(f"  Limonene: Mean = {limonene.mean():.3f}, SEM = {limonene.sem():.3f}, Count = {len(limonene)}")
        print(f"  p-value = {welch.pvalue:.4f}\n")

# Run main only when this file is executed as a script; importing it as a
# module defines the functions and constants without running main.
if __name__ == '__main__':
    main()
