In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import MaxNLocator

# Define file paths
input_file = "datasets/Antibiotics-SampleID.csv"
output_heatmap = "Antibiotics_SampleID_heatmap.png"
output_matrix_csv = "Antibiotics_SampleID_Matrix_Data.csv"

# File verification with alternative path detection.
# The preferred path is tried first, followed by the known alternative
# locations/spellings; the first one that exists becomes `input_file`.
candidate_paths = [
    input_file,
    "data/AntibioticsSampleID.csv",
    "datasets/AntibioticsSampleID.csv",
    "input/AntibioticsSampleID.csv",
]

for candidate in candidate_paths:
    if os.path.exists(candidate):
        if candidate != input_file:
            input_file = candidate
            print(f"Located alternative file path: {input_file}")
        break
else:
    # Report the paths that were actually probed (and the directory they are
    # relative to) so the message matches what the code looked for.
    raise FileNotFoundError(
        "Unable to locate target file. Searched (relative to "
        f"{os.getcwd()!r}): {', '.join(candidate_paths)}"
    )

# Data acquisition and preprocessing
print(f"Reading data from {input_file}...")
data = pd.read_csv(input_file)

# A frame without rows or columns cannot be inspected below (`.iloc[0, 0]`
# would raise a bare IndexError), so fail early with an explicit message.
if data.empty:
    raise ValueError(f"No data rows found in {input_file}")

# Data structure identification and processing
if data.iloc[0, 0] == 'Antibiotics':
    # Layout 1: the real header (label 'Antibiotics' followed by the sample
    # IDs) was parsed as the first data row. Promote it to the column names.
    sample_ids = data.iloc[0, 1:].tolist()
    new_headers = ['Antibiotics Class'] + sample_ids
    data.columns = new_headers

    # Drop the promoted row and index the frame by antibiotic class
    data = data.iloc[1:].reset_index(drop=True)
    data = data.set_index('Antibiotics Class')
else:
    # Layout 2: the header row was parsed normally. The first column holds the
    # class labels when it is unnamed or contains non-numeric values; without
    # indexing by it, the numeric coercion further down would turn every label
    # into NaN/0 and the rows would lose their names.
    label_column = data.columns[0]
    if label_column in ('', 'Unnamed: 0') or not pd.api.types.is_numeric_dtype(data[label_column]):
        data = data.rename(columns={label_column: 'Antibiotics Class'})
        data = data.set_index('Antibiotics Class')

# Coerce every column to numbers (unparseable cells become NaN), replace the
# resulting gaps with 0 and store the whole matrix as integers.
print("Converting data to numeric format...")
data = (
    data
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)

# Persist the cleaned matrix next to the notebook
data.to_csv(output_matrix_csv)
print(f"Processed matrix data saved to {output_matrix_csv}")

# ENHANCED ANALYTICAL PREPROCESSING: per-class prevalence summary
# Row-wise mean across all samples, ordered from most to least prevalent.
prevalence = data.mean(axis=1).sort_values(ascending=False)

# Print the five highest and the five lowest entries of the ranking,
# each value shown as a percentage with one decimal place.
for heading, subset in (
    ("\nTop 5 prevalent antibiotic classes:", prevalence.head(5)),
    ("\nLeast 5 prevalent antibiotic classes:", prevalence.tail(5)),
):
    print(heading)
    for antibiotic, value in subset.items():
        print(f"  - {antibiotic}: {value*100:.1f}%")

# CRITICAL IMPLEMENTATION: Label truncation with mapping preservation
# Long labels are replaced by short positional aliases; the alias -> original
# text mapping is kept so the legend and reference file can spell them out.
# The new labels are built in a single pass and assigned once, which avoids a
# full-frame rename per label and keeps alias/row correspondence exact even
# when the same long label occurs on more than one row.

# Rows: labels longer than 25 characters become "Class <1-based position>"
row_label_mapping = {}
shortened_index = []
for idx, label in enumerate(data.index):
    if len(str(label)) > 25:  # Threshold for truncation
        shortened = f"Class {idx+1}"  # Numeric identifier
        row_label_mapping[shortened] = str(label)
        shortened_index.append(shortened)
    else:
        shortened_index.append(label)
data.index = pd.Index(shortened_index, name=data.index.name)

# Columns: labels longer than 15 characters become "S<1-based position>"
column_label_mapping = {}
shortened_columns = []
for idx, col in enumerate(data.columns):
    if len(str(col)) > 15:  # Column truncation threshold
        shortened = f"S{idx+1}"  # Sample identifier
        column_label_mapping[shortened] = str(col)
        shortened_columns.append(shortened)
    else:
        shortened_columns.append(col)
data.columns = pd.Index(shortened_columns, name=data.columns.name)

print(f"\nCreated {len(row_label_mapping)} row label mappings and {len(column_label_mapping)} column label mappings")

# ANALYTICAL ENHANCEMENT: Compute similarity matrix between antibiotics classes
# Jaccard similarity index for every pair of rows: the number of samples in
# which both classes have the value 1, divided by the number of samples in
# which at least one of them has the value 1 (0 when neither has any).
print("\nCalculating similarity matrix between antibiotic classes...")

# Binary presence matrix (rows = classes, columns = samples); only cells equal
# to 1 count as "present".
presence = (data == 1).to_numpy().astype(int)

# presence @ presence.T counts, for each pair of rows, the samples where both
# are present; the union follows from |A| + |B| - |A ∩ B|.
intersection = presence @ presence.T
class_sizes = presence.sum(axis=1)
union = class_sizes[:, None] + class_sizes[None, :] - intersection

# Divide only where the union is non-empty; remaining cells keep the 0.0 fill.
similarity_values = np.divide(
    intersection,
    union,
    out=np.zeros(intersection.shape, dtype=float),
    where=union > 0,
)

similarity_matrix = pd.DataFrame(
    similarity_values,
    index=data.index,
    columns=data.index
)

# Save similarity matrix
similarity_matrix.to_csv("Antibiotics_Similarity_Matrix.csv")
print("Similarity matrix saved to Antibiotics_Similarity_Matrix.csv")

# Figure dimensioning with optimization for data volume:
# 0.5 in per sample column and 0.4 in per class row, never below 20 x 14 in.
fig_width = max(20, data.shape[1] * 0.5)
fig_height = max(14, data.shape[0] * 0.4)

# VISUALIZATION CONFIGURATION: Primary heatmap
print("\nGenerating heatmap visualization...")
plt.figure(figsize=(fig_width, fig_height))

# Custom color gradient specification (black -> green -> red, 256 steps).
# The colormap class is taken from its public home, `matplotlib.colors`,
# instead of being reached through `plt.cm.colors`, which is not a documented
# access path.
cmap = LinearSegmentedColormap.from_list(
    "custom", ["black", "green", "red"], N=256
)

# Heatmap generation with enhanced visual parameters: every tick label shown,
# white cell borders, and each cell annotated with its value.
# NOTE(review): robust=True derives the color limits from percentiles of the
# data; for a very sparse 0/1 matrix that may collapse the range — confirm
# the rendering on real data.
heatmap = sns.heatmap(
    data,
    cmap=cmap,
    cbar=True,
    square=False,
    xticklabels=True,
    yticklabels=True,
    linewidths=0.5,
    linecolor='white',
    annot=True,
    fmt='g',
    annot_kws={"size": 9},
    robust=True,
    cbar_kws={"shrink": 0.5, "label": "Value Magnitude"}
)

# Typography configuration
plt.xticks(fontsize=10, rotation=45, ha='right')
plt.yticks(fontsize=10)

# Title positioning with adequate clearance
plt.title("Antibiotics Class vs Sample ID Heatmap", fontsize=16, pad=20)

# Axis labeling with optimized positioning
plt.xlabel('Sample ID', fontsize=14, labelpad=15)
plt.ylabel('Antibiotics Class', fontsize=14, labelpad=15)

# Layout optimization
plt.tight_layout(pad=2.0)

# MODIFIED LEGEND POSITIONING: Center the legend box directly under x-axis label
# The legend spells out the original text behind every shortened row/column
# label. Entries are ordered by (label length, label) so that aliases sharing a
# prefix sort numerically ("Class 2" before "Class 10") rather than
# lexicographically.
if row_label_mapping or column_label_mapping:
    legend_lines = []

    # Only emit a section header when that section has entries.
    if row_label_mapping:
        legend_lines.append("CLASS LABELS:")
        legend_lines.extend(
            f"{short}: {full}"
            for short, full in sorted(
                row_label_mapping.items(), key=lambda item: (len(item[0]), item[0])
            )
        )

    if column_label_mapping:
        if legend_lines:
            legend_lines.append("")  # blank line between the two sections
        legend_lines.append("SAMPLE ID LABELS:")
        legend_lines.extend(
            f"{short}: {full}"
            for short, full in sorted(
                column_label_mapping.items(), key=lambda item: (len(item[0]), item[0])
            )
        )

    legend_text = "\n".join(legend_lines) + "\n"

    # Position legend at center bottom of figure: x=0.5 centers it
    # horizontally, y=0.1 anchors the bottom edge of the text box.
    # NOTE(review): the box grows upward from y=0.1, so a long legend may reach
    # into the axes area reserved by subplots_adjust below — verify on real data.
    plt.figtext(
        0.5, 0.1, legend_text,
        ha='center', va='bottom', fontsize=15,
        multialignment='center',
        bbox={"facecolor": "white", "alpha": 0.8, "pad": 5},
    )

    # Increase bottom margin to create more space for the legend
    plt.subplots_adjust(bottom=0.35)

# Output generation with high resolution preservation
plt.savefig(output_heatmap, dpi=300, bbox_inches='tight', pad_inches=0.5)
print(f"Heatmap saved to {output_heatmap} with resolution 300 DPI")

# Generate auxiliary legend file for reference
# Writes the alias -> original label mappings to a plain-text file. Entries are
# ordered by (label length, label) so aliases with a common prefix appear in
# numeric order ("Class 2" before "Class 10"). The file is written as UTF-8 so
# labels containing non-ASCII characters do not depend on the platform's
# default encoding.
if row_label_mapping or column_label_mapping:
    with open('antibiotics_label_reference.txt', 'w', encoding='utf-8') as f:
        f.write("ANTIBIOTICS SAMPLE ID LABEL REFERENCE\n")
        f.write("=====================================\n\n")
        if row_label_mapping:
            f.write("CLASS LABELS:\n")
            for short, full in sorted(
                row_label_mapping.items(), key=lambda item: (len(item[0]), item[0])
            ):
                f.write(f"{short}: {full}\n")
        if column_label_mapping:
            f.write("\nSAMPLE ID LABELS:\n")
            for short, full in sorted(
                column_label_mapping.items(), key=lambda item: (len(item[0]), item[0])
            ):
                f.write(f"{short}: {full}\n")
    print("Label reference file generated: antibiotics_label_reference.txt")

# ADVANCED ANALYSIS: Generate secondary heatmap for similarity matrix
# Draws the class-by-class Jaccard similarity matrix on its own figure and
# saves it, so the announced heatmap is actually produced instead of leaving
# an empty figure behind.
print("\nGenerating similarity matrix heatmap...")
plt.figure(figsize=(16, 14))

# Custom color gradient for similarity (utilizing matching scheme).
# Taken from the public `matplotlib.colors` class rather than through
# `plt.cm.colors`, which is not a documented access path.
similarity_cmap = LinearSegmentedColormap.from_list(
    "similarity", ["black", "green", "red"], N=256
)

output_similarity_heatmap = "Antibiotics_Similarity_heatmap.png"

# Jaccard values lie in [0, 1]; fixing the color limits keeps the scale
# comparable between runs regardless of the observed minimum/maximum.
sns.heatmap(
    similarity_matrix,
    cmap=similarity_cmap,
    vmin=0,
    vmax=1,
    square=True,
    xticklabels=True,
    yticklabels=True,
    cbar_kws={"shrink": 0.5, "label": "Jaccard Similarity"},
)
plt.xticks(fontsize=10, rotation=45, ha='right')
plt.yticks(fontsize=10)
plt.title("Antibiotics Class Similarity (Jaccard Index)", fontsize=16, pad=20)
plt.tight_layout(pad=2.0)

plt.savefig(output_similarity_heatmap, dpi=300, bbox_inches='tight', pad_inches=0.5)
print(f"Similarity heatmap saved to {output_similarity_heatmap} with resolution 300 DPI")

# Notify user of optional clustering analysis
print("\nNOTE: For enhanced pattern detection, consider adding hierarchical clustering:")
print("  - Add 'from scipy.cluster.hierarchy import linkage, dendrogram'")
print("  - Add 'method=\"ward\"' to sns.clustermap() for optimal grouping")
print("  - Generated hierarchical clusters would reveal related antibiotic resistance patterns")

FileNotFoundError: Unable to locate target file. Please ensure 'AntibioticsSampleID.csv' exists in the current directory.

In [1]:
# Re-export of the processed matrix to CSV.
# NOTE(review): this repeats the export step of the main cell and only works
# when `data` and `output_matrix_csv` already exist in the kernel; executed
# before that cell it fails with NameError (see the recorded output below).
data.to_csv(output_matrix_csv)
print(f"Processed matrix data saved to {output_matrix_csv}")

NameError: name 'data' is not defined