In [2]:
import pandas as pd

# Load the main data from Excel file
print("Loading main dataset from Excel...")
df_main = pd.read_excel('/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_1/data/processed/cleaned_data.xlsx')
print(f"Main dataset shape: {df_main.shape}")
print(f"Main dataset columns: {len(df_main.columns)}")

# Load the phenotype labels from CSV
print("\nLoading phenotype labels from CSV...")
df_phenotype = pd.read_csv('/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_3/phase3_outputs/phenotype_labels.csv')
print(f"Phenotype dataset shape: {df_phenotype.shape}")

# Display first few rows of each dataset
print("\n" + "="*80)
print("MAIN DATASET - First 5 rows:")
print("="*80)
print(df_main.head())

print("\n" + "="*80)
print("PHENOTYPE DATASET - First 5 rows:")
print("="*80)
print(df_phenotype.head())

# Join the datasets on patient_id
print("\n" + "="*80)
print("JOINING DATASETS...")
print("="*80)
df_merged = df_main.merge(df_phenotype, on='patient_id', how='left')

print(f"\nMerged dataset shape: {df_merged.shape}")
print(f"Number of columns after merge: {len(df_merged.columns)}")

# Check for any unmatched records
unmatched = df_merged['Phenotype_K'].isna().sum()
print(f"\nRecords without phenotype match: {unmatched}")

# Display first few rows of merged dataset
print("\n" + "="*80)
print("MERGED DATASET - First 5 rows (showing key columns):")
print("="*80)
key_columns = ['patient_id', 'age', 'gender', 'tumor_type', 'Phenotype_K']
print(df_merged[key_columns].head(10))

# Display phenotype distribution
print("\n" + "="*80)
print("PHENOTYPE DISTRIBUTION:")
print("="*80)
print(df_merged['Phenotype_K'].value_counts().sort_index())
print(f"\nTotal records: {len(df_merged)}")
print(f"Phenotype_K = 0: {(df_merged['Phenotype_K'] == 0).sum()}")
print(f"Phenotype_K = 1: {(df_merged['Phenotype_K'] == 1).sum()}")

# Save the merged dataset
output_file = '/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs/merged_data_with_phenotype.xlsx'
print(f"\n" + "="*80)
print(f"SAVING MERGED DATASET to {output_file}")
print("="*80)
df_merged.to_excel(output_file, index=False)
print("✓ File saved successfully!")

# Also save as CSV for easier viewing
output_csv = '/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs/merged_data_with_phenotype.csv'
df_merged.to_csv(output_csv, index=False)
print(f"✓ CSV version saved to {output_csv}")

print("\n" + "="*80)
print("PROCESS COMPLETED SUCCESSFULLY!")
print("="*80)

Loading main dataset from Excel...
Main dataset shape: (403, 109)
Main dataset columns: 109

Loading phenotype labels from CSV...
Phenotype dataset shape: (403, 2)

MAIN DATASET - First 5 rows:
                           patient_id  birth_date  age    age_group  gender  \
0                       10_AO San Pio  1950-01-02   73   > 65 years    Male   
1               10_AORN A. Cardarelli  1964-08-15   58  <= 65 years  Female   
2  10_AORN Monaldi – Cotugno - C.T.O.  1939-04-03   84   > 65 years  Female   
3        10_AORN San Giuseppe Moscati  1947-12-09   76   > 65 years    Male   
4  10_AORN Sant’Anna e San Sebastiano  1952-11-04   70   > 65 years  Female   

   ethnicity      education_level  bmi_value             bmi_category  \
0  Caucasian  Not Known / Missing      24.17  18.5-24.9 Normal Weight   
1  Caucasian          High School      19.53  18.5-24.9 Normal Weight   
2  Caucasian        Middle School      19.53  18.5-24.9 Normal Weight   
3  Caucasian  Not Known / Missing      

In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

# Set style for better-looking plots
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (15, 10)
plt.rcParams['font.size'] = 10

# Load the main data from Excel file
print("Loading main dataset from Excel...")
df_main = pd.read_excel('/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_1/data/processed/cleaned_data.xlsx')
print(f"Main dataset shape: {df_main.shape}")
print(f"Main dataset columns: {len(df_main.columns)}")

# Load the phenotype labels from CSV
print("\nLoading phenotype labels from CSV...")
df_phenotype = pd.read_csv('/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_3/phase3_outputs/phenotype_labels.csv')
print(f"Phenotype dataset shape: {df_phenotype.shape}")

# Display first few rows of each dataset
print("\n" + "="*80)
print("MAIN DATASET - First 5 rows:")
print("="*80)
print(df_main.head())

print("\n" + "="*80)
print("PHENOTYPE DATASET - First 5 rows:")
print("="*80)
print(df_phenotype.head())

# Join the datasets on patient_id
print("\n" + "="*80)
print("JOINING DATASETS...")
print("="*80)
df_merged = df_main.merge(df_phenotype, on='patient_id', how='left')

print(f"\nMerged dataset shape: {df_merged.shape}")
print(f"Number of columns after merge: {len(df_merged.columns)}")

# Check for any unmatched records
unmatched = df_merged['Phenotype_K'].isna().sum()
print(f"\nRecords without phenotype match: {unmatched}")

# Display first few rows of merged dataset
print("\n" + "="*80)
print("MERGED DATASET - First 5 rows (showing key columns):")
print("="*80)
key_columns = ['patient_id', 'age', 'gender', 'tumor_type', 'Phenotype_K']
print(df_merged[key_columns].head(10))

# Display phenotype distribution
print("\n" + "="*80)
print("PHENOTYPE DISTRIBUTION:")
print("="*80)
print(df_merged['Phenotype_K'].value_counts().sort_index())
print(f"\nTotal records: {len(df_merged)}")
print(f"Phenotype_K = 0: {(df_merged['Phenotype_K'] == 0).sum()}")
print(f"Phenotype_K = 1: {(df_merged['Phenotype_K'] == 1).sum()}")

# Save the merged dataset
output_file = '/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs/merged_data_with_phenotype.xlsx'
print(f"\n" + "="*80)
print(f"SAVING MERGED DATASET to {output_file}")
print("="*80)
# Create output directory if it doesn't exist
Path(output_file).parent.mkdir(parents=True, exist_ok=True)
df_merged.to_excel(output_file, index=False)
print("✓ File saved successfully!")

# Also save as CSV for easier viewing
output_csv = '/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs/merged_data_with_phenotype.csv'
df_merged.to_csv(output_csv, index=False)
print(f"✓ CSV version saved to {output_csv}")

print("\n" + "="*80)
print("PROCESS COMPLETED SUCCESSFULLY!")
print("="*80)

# ============================================================================
# START PLOTTING SOCIAL SIGNATURE
# ============================================================================

print("\n\n" + "="*80)
print("CREATING SOCIAL SIGNATURE PLOTS")
print("="*80)

# Create output directory for plots
output_dir = Path('/Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# Remove rows without phenotype
df = df_merged.dropna(subset=['Phenotype_K'])
print(f"\nTotal records with phenotype: {len(df)}")

# Create age group binary variable (<=65 or >65)
df['age_group_binary'] = df['age'].apply(lambda x: '≤65 years' if x <= 65 else '>65 years')

# Create phenotype descriptive names
phenotype_names = {
    0: "Aging-Resilient, Treatment-Compatible",
    1: "Frailty-Linked, Treatment-Vulnerable"
}

df['phenotype_name'] = df['Phenotype_K'].astype(int).map(phenotype_names)

# Create combined group variable
df['group'] = df['Phenotype_K'].astype(int).astype(str) + '_' + df['age_group_binary']
df['group_label'] = df.apply(
    lambda row: f"{phenotype_names[int(row['Phenotype_K'])]}\n{row['age_group_binary']}",
    axis=1
)

# Define the social signature variables
social_vars = [
    'ethnicity',
    'gender',
    'education_level',
    'bmi_value',
    'employment_status',
    'alcohol_consumption',
    'smoking_status_binary',
    'smoking_status_detail',
    'smoking_years'
]

# Print group distribution
print("\n" + "="*80)
print("GROUP DISTRIBUTION:")
print("="*80)
group_dist = df.groupby(['phenotype_name', 'age_group_binary']).size().reset_index(name='count')
print(group_dist)
print("\nDetailed breakdown:")
for _, row in group_dist.iterrows():
    print(f"  {row['phenotype_name']}, {row['age_group_binary']}: {row['count']} patients")
print()

# Function to plot categorical variables
def plot_categorical(data, var_name, output_dir):
    """Create stacked bar plot for categorical variables"""
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Remove NaN values for this variable
    data_clean = data.dropna(subset=[var_name])

    if len(data_clean) == 0:
        print(f"  ⚠ Warning: No data for {var_name}")
        plt.close()
        return

    # Sort group labels for consistent ordering
    group_order = sorted(data_clean['group_label'].unique())

    # Left plot: Stacked bar chart (proportions within each group)
    ct = pd.crosstab(data_clean['group_label'], data_clean[var_name], normalize='index')
    ct_sorted = ct.reindex(group_order)
    ct_sorted.plot(kind='bar', stacked=True, ax=axes[0], colormap='Set3')
    axes[0].set_title(f'{var_name.replace("_", " ").title()}\nProportions within Each Group',
                      fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Group', fontsize=12)
    axes[0].set_ylabel('Proportion', fontsize=12)
    axes[0].legend(title=var_name.replace("_", " ").title(), bbox_to_anchor=(1.05, 1),
                   loc='upper left', fontsize=9)
    axes[0].tick_params(axis='x', rotation=45, labelsize=9)
    axes[0].set_ylim([0, 1])

    # Right plot: Grouped bar chart (counts)
    ct_counts = pd.crosstab(data_clean['group_label'], data_clean[var_name])
    ct_counts_sorted = ct_counts.reindex(group_order)
    ct_counts_sorted.plot(kind='bar', ax=axes[1], colormap='Set3')
    axes[1].set_title(f'{var_name.replace("_", " ").title()}\nAbsolute Counts',
                      fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Group', fontsize=12)
    axes[1].set_ylabel('Count', fontsize=12)
    axes[1].legend(title=var_name.replace("_", " ").title(), bbox_to_anchor=(1.05, 1),
                   loc='upper left', fontsize=9)
    axes[1].tick_params(axis='x', rotation=45, labelsize=9)

    plt.tight_layout()
    plt.savefig(output_dir / f'{var_name}_by_group.png', dpi=300, bbox_inches='tight')
    print(f"  ✓ Saved: {var_name}_by_group.png")
    plt.close()


# Function to plot continuous variables
def plot_continuous(data, var_name, output_dir):
    """Create box plots and violin plots for continuous variables"""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    # Remove NaN values for this variable
    data_clean = data.dropna(subset=[var_name])

    if len(data_clean) == 0:
        print(f"  ⚠ Warning: No data for {var_name}")
        plt.close()
        return

    # Sort groups for consistent ordering
    group_order = sorted(data_clean['group_label'].unique())

    # Box plot
    sns.boxplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 0],
                order=group_order, palette='Set2')
    axes[0, 0].set_title(f'{var_name.replace("_", " ").title()}\nBox Plot by Group',
                         fontsize=14, fontweight='bold')
    axes[0, 0].set_xlabel('Group', fontsize=12)
    axes[0, 0].set_ylabel(var_name.replace("_", " ").title(), fontsize=12)
    axes[0, 0].tick_params(axis='x', rotation=45, labelsize=9)

    # Violin plot
    sns.violinplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 1],
                   order=group_order, palette='Set2')
    axes[0, 1].set_title(f'{var_name.replace("_", " ").title()}\nViolin Plot by Group',
                         fontsize=14, fontweight='bold')
    axes[0, 1].set_xlabel('Group', fontsize=12)
    axes[0, 1].set_ylabel(var_name.replace("_", " ").title(), fontsize=12)
    axes[0, 1].tick_params(axis='x', rotation=45, labelsize=9)

    # Histogram with overlapping distributions
    colors = plt.cm.Set2(np.linspace(0, 1, len(group_order)))
    for i, group in enumerate(group_order):
        group_data = data_clean[data_clean['group_label'] == group][var_name]
        axes[1, 0].hist(group_data, alpha=0.5, label=group, bins=20, color=colors[i])
    axes[1, 0].set_title(f'{var_name.replace("_", " ").title()}\nDistribution Overlay',
                         fontsize=14, fontweight='bold')
    axes[1, 0].set_xlabel(var_name.replace("_", " ").title(), fontsize=12)
    axes[1, 0].set_ylabel('Frequency', fontsize=12)
    axes[1, 0].legend(fontsize=8)

    # Statistical summary table
    summary = data_clean.groupby('group_label')[var_name].describe()
    summary_sorted = summary.reindex(group_order)

    # Create text summary
    axes[1, 1].axis('off')
    summary_text = f'{var_name.replace("_", " ").title()}\n\nStatistical Summary:\n\n'
    for idx, row in summary_sorted.iterrows():
        summary_text += f"{idx}:\n"
        summary_text += f"  N: {int(row['count'])}\n"
        summary_text += f"  Mean: {row['mean']:.2f}\n"
        summary_text += f"  Median: {row['50%']:.2f}\n"
        summary_text += f"  SD: {row['std']:.2f}\n"
        summary_text += f"  Range: {row['min']:.2f} - {row['max']:.2f}\n\n"

    axes[1, 1].text(0.1, 0.9, summary_text, fontsize=10, verticalalignment='top',
                    family='monospace')

    plt.tight_layout()
    plt.savefig(output_dir / f'{var_name}_by_group.png', dpi=300, bbox_inches='tight')
    print(f"  ✓ Saved: {var_name}_by_group.png")
    plt.close()


# Generate plots for each variable
print("\n" + "="*80)
print("GENERATING INDIVIDUAL PLOTS FOR EACH VARIABLE...")
print("="*80)

for var in social_vars:
    print(f"\nProcessing: {var}")

    if var not in df.columns:
        print(f"  ⚠ Warning: {var} not found in dataset, skipping...")
        continue

    # Check if variable is numeric or categorical
    if df[var].dtype in ['float64', 'int64']:
        plot_continuous(df, var, output_dir)
    else:
        plot_categorical(df, var, output_dir)

# Create a comprehensive summary plot
print("\n" + "="*80)
print("CREATING COMPREHENSIVE SUMMARY PLOT...")
print("="*80)

fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 3, hspace=0.4, wspace=0.3)

plot_idx = 0
for i, var in enumerate(social_vars):
    if var not in df.columns:
        continue

    row = plot_idx // 3
    col = plot_idx % 3

    if row >= 3:  # We only have space for 9 plots
        break

    ax = fig.add_subplot(gs[row, col])

    data_clean = df.dropna(subset=[var])

    if len(data_clean) == 0:
        continue

    group_order = sorted(data_clean['group_label'].unique())

    if df[var].dtype in ['float64', 'int64']:
        # Box plot for continuous
        sns.boxplot(data=data_clean, x='group_label', y=var, ax=ax,
                   order=group_order, palette='Set2')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45, labelsize=8)
    else:
        # Stacked bar for categorical
        ct = pd.crosstab(data_clean['group_label'], data_clean[var], normalize='index')
        ct_sorted = ct.reindex(group_order)
        ct_sorted.plot(kind='bar', stacked=True, ax=ax, legend=False, colormap='Set3')
        ax.set_xlabel('')
        ax.tick_params(axis='x', rotation=45, labelsize=8)
        ax.set_ylim([0, 1])

    ax.set_title(var.replace("_", " ").title(), fontsize=11, fontweight='bold')
    ax.set_ylabel('')

    plot_idx += 1

plt.suptitle('Social Signature Across Groups\nAging-Resilient vs Frailty-Linked Phenotypes × Age Group',
             fontsize=18, fontweight='bold', y=0.995)
plt.savefig(output_dir / 'social_signature_summary.png', dpi=300, bbox_inches='tight')
print(f"✓ Saved: social_signature_summary.png")
plt.close()

print("\n" + "="*80)
print("ALL PLOTS GENERATED SUCCESSFULLY!")
print("="*80)
print(f"Output location: {output_dir}")
print(f"\nIndividual plots created for {len(social_vars)} variables")
print("Plus 1 comprehensive summary plot")
print("\nFiles created:")
for var in social_vars:
    if var in df.columns:
        print(f"  - {var}_by_group.png")
print(f"  - social_signature_summary.png")

Loading main dataset from Excel...
Main dataset shape: (403, 109)
Main dataset columns: 109

Loading phenotype labels from CSV...
Phenotype dataset shape: (403, 2)

MAIN DATASET - First 5 rows:
                           patient_id  birth_date  age    age_group  gender  \
0                       10_AO San Pio  1950-01-02   73   > 65 years    Male   
1               10_AORN A. Cardarelli  1964-08-15   58  <= 65 years  Female   
2  10_AORN Monaldi – Cotugno - C.T.O.  1939-04-03   84   > 65 years  Female   
3        10_AORN San Giuseppe Moscati  1947-12-09   76   > 65 years    Male   
4  10_AORN Sant’Anna e San Sebastiano  1952-11-04   70   > 65 years  Female   

   ethnicity      education_level  bmi_value             bmi_category  \
0  Caucasian  Not Known / Missing      24.17  18.5-24.9 Normal Weight   
1  Caucasian          High School      19.53  18.5-24.9 Normal Weight   
2  Caucasian        Middle School      19.53  18.5-24.9 Normal Weight   
3  Caucasian  Not Known / Missing      


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 0],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 1],


  ✓ Saved: bmi_value_by_group.png

Processing: employment_status
  ✓ Saved: employment_status_by_group.png

Processing: alcohol_consumption
  ✓ Saved: alcohol_consumption_by_group.png

Processing: smoking_status_binary



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 0],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 1],


  ✓ Saved: smoking_status_binary_by_group.png

Processing: smoking_status_detail
  ✓ Saved: smoking_status_detail_by_group.png

Processing: smoking_years



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 0],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(data=data_clean, x='group_label', y=var_name, ax=axes[0, 1],


  ✓ Saved: smoking_years_by_group.png

CREATING COMPREHENSIVE SUMMARY PLOT...



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var, ax=ax,

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var, ax=ax,

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=data_clean, x='group_label', y=var, ax=ax,


✓ Saved: social_signature_summary.png

ALL PLOTS GENERATED SUCCESSFULLY!
Output location: /Users/guidoputignano/PycharmProjects/VERO-Code-Salerno/Phase_4/outputs

Individual plots created for 9 variables
Plus 1 comprehensive summary plot

Files created:
  - ethnicity_by_group.png
  - gender_by_group.png
  - education_level_by_group.png
  - bmi_value_by_group.png
  - employment_status_by_group.png
  - alcohol_consumption_by_group.png
  - smoking_status_binary_by_group.png
  - smoking_status_detail_by_group.png
  - smoking_years_by_group.png
  - social_signature_summary.png
