In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from matplotlib.ticker import MaxNLocator

# Input and output locations. All paths are resolved relative to the current
# working directory of the interpreter / notebook kernel.
input_file = "Classification-SampleID.csv"  # Update with your specific file path
output_heatmap = "Classification_SampleID_heatmap.png"
output_matrix_csv = "Classification_SampleID_Matrix_Data.csv"

# Data acquisition
print(f"Reading data from {input_file}...")

# Fail early with an actionable message: a missing input is the most common
# way this script dies, and the bare pandas error does not say which
# directory was searched. The exception type is unchanged.
if not os.path.isfile(input_file):
    raise FileNotFoundError(
        f"Input file '{input_file}' was not found (working directory: "
        f"'{os.getcwd()}'). Update `input_file` to point at the CSV to analyse."
    )

data = pd.read_csv(input_file)

# Show the first rows so the column layout can be checked by eye before the
# structural assumptions further down are applied.
print("Data preview:")
print(data.head())

# Promote the classification labels to the row index so that only sample
# columns remain as data.
#
# pandas names a header-less first column 'Unnamed: 0'; give it a meaningful
# name first.
if 'Unnamed: 0' in data.columns:
    data = data.rename(columns={'Unnamed: 0': 'Classification'})

# Index by the label column whether it was just renamed or was already
# called 'Classification' in the file. Leaving it among the data columns
# would let the numeric coercion below turn every label into NaN (then 0)
# and the rows would be labelled 0..n-1 instead of by classification.
if 'Classification' in data.columns:
    data = data.set_index('Classification')

# Force every data column to a numeric dtype; anything that cannot be parsed
# becomes NaN and is then replaced by 0.
print("Converting data to numeric format...")
index_name = data.index.name
# Leave out any column that carries the same name as the index.
columns_to_convert = [name for name in data.columns if name != index_name]
for column_name in columns_to_convert:
    data[column_name] = pd.to_numeric(data[column_name], errors='coerce')

# Missing / unparseable cells count as zero from here on.
data = data.fillna(0)

# Replace over-long row / column labels with short positional identifiers
# and remember the original text so a legend can be produced later.
#
# The new labels are collected in a single pass and assigned once. This
# avoids one DataFrame.rename call per label and guarantees that every
# mapping entry corresponds to exactly one row / column, even when the same
# long label occurs more than once.

# Rows: labels longer than 25 characters become "Class <1-based position>".
row_label_mapping = {}
new_row_labels = []
for idx, label in enumerate(data.index):
    label_text = str(label)
    if len(label_text) > 25:  # Threshold for truncation
        shortened = f"Class {idx+1}"  # Numeric identifier
        row_label_mapping[shortened] = label_text
        new_row_labels.append(shortened)
    else:
        new_row_labels.append(label)
if row_label_mapping:
    # Keep the index name (e.g. 'Classification') on the rebuilt index.
    data.index = pd.Index(new_row_labels, name=data.index.name)

# Columns: labels longer than 15 characters become "S<1-based position>".
column_label_mapping = {}
new_column_labels = []
for idx, col in enumerate(data.columns):
    col_text = str(col)
    if len(col_text) > 15:  # Column truncation threshold
        shortened = f"S{idx+1}"  # Sample identifier
        column_label_mapping[shortened] = col_text
        new_column_labels.append(shortened)
    else:
        new_column_labels.append(col)
if column_label_mapping:
    data.columns = pd.Index(new_column_labels, name=data.columns.name)

print(f"Created {len(row_label_mapping)} row label mappings and {len(column_label_mapping)} column label mappings")

# Persist the cleaned matrix (numeric values, shortened labels).
data.to_csv(output_matrix_csv)
print(f"Processed matrix data saved to {output_matrix_csv}")

# Row-wise mean across all sample columns, highest first. It is printed as
# a percentage (mean * 100).
prevalence = data.mean(axis=1).sort_values(ascending=False)

prevalence_reports = (
    ("\nTop 5 prevalent classifications:", prevalence.head(5)),
    ("\nLeast 5 prevalent classifications:", prevalence.tail(5)),
)
for heading, subset in prevalence_reports:
    print(heading)
    for class_label, mean_value in subset.items():
        print(f"  - {class_label}: {mean_value*100:.1f}%")

# Pairwise Jaccard similarity between classification rows.
#
# Each row is reduced to presence/absence (value > 0) over the sample
# columns. For rows A and B the index is |A ∩ B| / |A ∪ B|; when both rows
# are empty the union is empty and the similarity is defined as 0.
#
# The computation is vectorised instead of looping over all row pairs:
#   * intersection sizes are the dot products of the 0/1 presence rows,
#   * union sizes follow from |A| + |B| - |A ∩ B|.
# Rows are addressed by position, so repeated row labels are handled too.
print("\nCalculating similarity matrix between classification classes...")
presence = (data.to_numpy() > 0).astype(np.int64)
intersection_sizes = presence @ presence.T
row_sizes = presence.sum(axis=1)
union_sizes = row_sizes[:, None] + row_sizes[None, :] - intersection_sizes

# Divide only where the union is non-empty; remaining cells keep the 0.0
# they were initialised with.
similarity_values = np.divide(
    intersection_sizes,
    union_sizes,
    out=np.zeros(intersection_sizes.shape, dtype=float),
    where=union_sizes > 0,
)

similarity_matrix = pd.DataFrame(
    similarity_values,
    index=data.index,
    columns=data.index
)

# Save similarity matrix
similarity_matrix.to_csv("Classification_Similarity_Matrix.csv")
print("Similarity matrix saved to Classification_Similarity_Matrix.csv")

# Size the canvas from the matrix shape: 0.5 in per column and 0.4 in per
# row, never smaller than 20 x 14 in.
n_rows, n_cols = data.shape
fig_width = max(20, n_cols * 0.5)
fig_height = max(14, n_rows * 0.4)

print("\nGenerating heatmap visualization...")
plt.figure(figsize=(fig_width, fig_height))

# Three-stop gradient (black -> green -> red) sampled at 256 levels.
gradient_stops = ["black", "green", "red"]
cmap = plt.cm.colors.LinearSegmentedColormap.from_list(
    "custom", gradient_stops, N=256
)

# Annotated heatmap of the value matrix: every tick label is shown, cells
# are separated by thin white lines, and robust=True is passed so seaborn
# derives the colour limits from quantiles of the data.
heatmap_options = dict(
    cmap=cmap,
    cbar=True,
    square=False,
    xticklabels=True,
    yticklabels=True,
    linewidths=0.5,
    linecolor='white',
    annot=True,
    fmt='g',
    annot_kws={"size": 9},
    robust=True,
    cbar_kws={"shrink": 0.5, "label": "Value Magnitude"},
)
heatmap = sns.heatmap(data, **heatmap_options)

# Tick label typography: x labels slanted and right-aligned.
plt.xticks(fontsize=10, rotation=45, ha='right')
plt.yticks(fontsize=10)

# Title and axis captions.
plt.title("Classification vs Sample ID Heatmap", fontsize=16, pad=20)
plt.xlabel('Sample ID', fontsize=14, labelpad=15)
plt.ylabel('Classification', fontsize=14, labelpad=15)

# Let matplotlib fit the axes with some padding.
plt.tight_layout(pad=2.0)

# Add a legend that maps shortened labels back to their full text.
#
# Entries are listed in the order the mappings were filled, i.e. by row /
# column position. Sorting the keys as strings would place "Class 10"
# before "Class 2". A section header is only emitted when that section
# actually has entries.
if row_label_mapping or column_label_mapping:
    legend_sections = (
        ("CLASS LABELS:", row_label_mapping),
        ("SAMPLE ID LABELS:", column_label_mapping),
    )
    legend_lines = []
    for section_title, mapping in legend_sections:
        if not mapping:
            continue
        if legend_lines:
            legend_lines.append("")  # blank line between sections
        legend_lines.append(section_title)
        legend_lines.extend(f"{short}: {full}" for short, full in mapping.items())
    legend_text = "\n".join(legend_lines) + "\n"

    # Position legend at center bottom of figure
    plt.figtext(0.5, 0.1, legend_text, ha='center', va='bottom', fontsize=11,
                bbox={"facecolor":"white", "alpha":0.8, "pad":5})

    # Adjust bottom margin to accommodate legend
    plt.subplots_adjust(bottom=0.35)

# Save the heatmap
plt.savefig(output_heatmap, dpi=300, bbox_inches='tight', pad_inches=0.5)
print(f"Heatmap saved to {output_heatmap} with resolution 300 DPI")

# Write a plain-text reference that maps shortened labels to full labels.
#
# The file is written as UTF-8 explicitly so labels with non-ASCII
# characters do not depend on the platform's default encoding. Entries
# follow the order the mappings were filled (row / column position);
# sorting the keys as strings would put "Class 10" ahead of "Class 2".
if row_label_mapping or column_label_mapping:
    with open('classification_label_reference.txt', 'w', encoding='utf-8') as f:
        f.write("CLASSIFICATION SAMPLE ID LABEL REFERENCE\n")
        f.write("=======================================\n\n")
        if row_label_mapping:
            f.write("CLASS LABELS:\n")
            f.writelines(
                f"{short}: {full}\n" for short, full in row_label_mapping.items()
            )
        if column_label_mapping:
            f.write("\nSAMPLE ID LABELS:\n")
            f.writelines(
                f"{short}: {full}\n" for short, full in column_label_mapping.items()
            )
    print("Label reference file generated: classification_label_reference.txt")

# Generate similarity heatmap
print("\nGenerating similarity matrix heatmap...")

# Scale the canvas with the number of classes, using the same 0.4 in per row
# as the main heatmap, so the per-cell annotations stay legible for large
# matrices. The previous fixed 16 x 14 in size is kept as the minimum; the
# extra 2 in of width leaves room for the colour bar next to the square grid.
n_classes = len(similarity_matrix.index)
similarity_fig_height = max(14, n_classes * 0.4)
similarity_fig_width = similarity_fig_height + 2
plt.figure(figsize=(similarity_fig_width, similarity_fig_height))

# Sequential yellow-green-blue palette for the similarity values.
similarity_cmap = sns.color_palette("YlGnBu", as_cmap=True)

# Square, annotated heatmap of the class-by-class similarity matrix.
similarity_heatmap = sns.heatmap(
    similarity_matrix,
    cmap=similarity_cmap,
    cbar=True,
    square=True,
    xticklabels=True,
    yticklabels=True,
    linewidths=0.5,
    annot=True,
    fmt='.2f',
    annot_kws={"size": 8},
    cbar_kws={"shrink": 0.5, "label": "Jaccard Similarity Index"}
)

# Title and tick typography for the similarity matrix
plt.title("Classification Similarity Matrix (Jaccard Index)", fontsize=16, pad=20)
plt.xticks(fontsize=9, rotation=45, ha='right')
plt.yticks(fontsize=9)

# Adjust layout
plt.tight_layout(pad=2.0)

# Save similarity heatmap
plt.savefig("Classification_Similarity_Heatmap.png", dpi=300, bbox_inches='tight')
print("Similarity heatmap saved to Classification_Similarity_Heatmap.png")

# Display completion message
print("\nAnalysis complete.")

Reading data from Classification-SampleID.csv...


FileNotFoundError: [Errno 2] No such file or directory: 'Classification-SampleID.csv'