# Chemical space comparison between AM-I and the other 6 datasets

# In [3]:
"""
PCA Comparison Analysis Script
Usage: python pca-comparison.py
"""

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

def load_fingerprints_from_csv(file_path, n_bits=1024):
    """Load the fingerprint columns of a CSV file as a NumPy array.

    The CSV is expected to contain one column per fingerprint bit, named
    ``fp_0`` ... ``fp_{n_bits - 1}``. Any other columns are ignored.

    Args:
        file_path: Path of the CSV file to read.
        n_bits: Number of fingerprint columns to extract. Defaults to 1024,
            the width this script was originally written for.

    Returns:
        A 2-D array with one row per CSV row and ``n_bits`` columns (in
        ``fp_0`` ... order), or ``None`` if the file cannot be read or parsed
        or if any of the expected columns is missing. The failure reason is
        printed.
    """
    fp_columns = [f'fp_{i}' for i in range(n_bits)]
    try:
        df = pd.read_csv(file_path)
        return df[fp_columns].values
    # OSError: missing/unreadable file; KeyError: absent fp_* columns;
    # ValueError: pandas parser/empty-data errors and decoding failures.
    # Anything else is a programming error and should surface.
    except (OSError, KeyError, ValueError) as e:
        print(f"Error loading file {file_path}: {e}")
        return None

def perform_pca(fps1, fps2, label1, label2, n_components=2):
    """Project two fingerprint sets into a shared PCA space.

    The two arrays are stacked (``fps1`` rows first) and a single PCA model
    is fitted on the combined data, so both sets share the same axes.

    Args:
        fps1: Fingerprint rows of the first dataset.
        fps2: Fingerprint rows of the second dataset.
        label1: Value written to the ``Label`` column for ``fps1`` rows.
        label2: Value written to the ``Label`` column for ``fps2`` rows.
        n_components: Number of principal components to keep.

    Returns:
        A DataFrame with columns ``PCA1`` ... ``PCA{n_components}`` plus a
        ``Label`` column identifying the dataset each row came from.
    """
    stacked = np.vstack([fps1, fps2])
    projection = PCA(n_components=n_components).fit_transform(stacked)

    component_names = [f'PCA{k + 1}' for k in range(n_components)]
    result = pd.DataFrame(projection, columns=component_names)

    # One label per row, in the same order the rows were stacked.
    result['Label'] = np.concatenate(
        [np.full(len(fps), label)
         for fps, label in ((fps1, label1), (fps2, label2))]
    )
    return result

def _draw_placeholder_panel(ax, message, panel_label):
    """Show a centred message in a panel that has no data to plot."""
    ax.text(0.5, 0.5, message, ha='center', va='center', fontsize=16)
    ax.set_title(f"{panel_label}",
                 fontsize=22, pad=20, loc='left', fontweight='bold')


def main():
    """Compare the reference dataset with each target dataset via PCA.

    Reads the reference and target CSV files from ``./processed_results``,
    runs a two-component PCA on each (reference, target) pair, draws the six
    scatter plots on a 2x3 grid and saves the figure to
    ``./pca-comparison/merged_6_panels-1.png``.

    Exits with status 1 if the reference file is missing or cannot be
    loaded. A missing or unloadable target file only produces a placeholder
    panel.
    """
    print("Starting PCA comparison analysis...")

    # Input location and file names
    folder_path = "./processed_results"
    reference_file = "AM-I-filtered.csv"
    reference_path = os.path.join(folder_path, reference_file)
    target_files = [
        "AM-II-filtered.csv",
        "AM-III-filtered.csv",
        "AM-IV-filtered.csv",
        "AM-V-filtered.csv",
        "AM-VI-filtered.csv",
        "AM-VII-filtered.csv",
    ]

    # One subplot label per target file
    subplot_labels = ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)']

    # Create output folder
    output_folder = "./pca-comparison"
    os.makedirs(output_folder, exist_ok=True)

    # The reference dataset is mandatory: abort early without it.
    if not os.path.exists(reference_path):
        # Report the folder that is actually searched, not the working dir.
        print(f"Error: Reference file '{reference_file}' does not exist in '{folder_path}'")
        print(f"Current directory: {os.getcwd()}")
        sys.exit(1)

    print(f"Loading reference file: {reference_file}")
    reference_fps = load_fingerprints_from_csv(reference_path)
    if reference_fps is None:
        print("Error: Reference file loading failed, please check file format")
        sys.exit(1)

    # Target files are optional: warn, then draw placeholders for them below.
    missing_files = [
        name for name in target_files
        if not os.path.exists(os.path.join(folder_path, name))
    ]
    if missing_files:
        print("Warning: The following files do not exist:")
        for name in missing_files:
            print(f"  - {name}")
        print(f"Please ensure all files are in '{folder_path}'")

    # Prepare canvas
    print("Generating PCA analysis plots...")
    fig, axes = plt.subplots(2, 3, figsize=(24, 16))
    axes = axes.flatten()

    # Unified colors: one for the reference, one shared by every target
    colors = {reference_file: "#007AFF"}
    colors.update({name: "#FFCC00" for name in target_files})

    # Process each target file sequentially
    processed_count = 0
    for ax, panel_label, file_name in zip(axes, subplot_labels, target_files):
        if file_name in missing_files:
            _draw_placeholder_panel(ax, f"File not found:\n{file_name}",
                                    panel_label)
            continue

        fps = load_fingerprints_from_csv(os.path.join(folder_path, file_name))
        if fps is None:
            _draw_placeholder_panel(ax, f"Loading failed:\n{file_name}",
                                    panel_label)
            continue

        # PCA is fitted on reference + target together so that both sets
        # are drawn in the same coordinate system.
        df = perform_pca(reference_fps, fps, reference_file, file_name)
        processed_count += 1

        sns.scatterplot(data=df,
                        x='PCA1', y='PCA2',
                        hue='Label',
                        palette=colors,
                        s=120,
                        alpha=0.7,
                        ax=ax)

        # Title carries the subplot label only
        ax.set_title(f"{panel_label}",
                     fontsize=22, pad=20, loc='left', fontweight='bold')
        ax.set_xlabel("PCA 1", fontsize=20)
        ax.set_ylabel("PCA 2", fontsize=20)
        ax.tick_params(axis='both', labelsize=18)

        # Legend shows dataset names without the file-name decorations
        handles, labels = ax.get_legend_handles_labels()
        legend_labels = [
            label.replace('.csv', '').replace('-filtered', '')
            for label in labels
        ]
        ax.legend(handles=handles,
                  labels=legend_labels,
                  title=None,
                  fontsize=18,
                  loc='best')

    # Save results
    fig.tight_layout()
    out_path = os.path.join(output_folder, "merged_6_panels-1.png")
    fig.savefig(out_path, dpi=600, bbox_inches='tight')
    plt.close(fig)

    print(f"Successfully processed {processed_count} files")
    print(f"PCA analysis plot saved to: {out_path}")
    print("Analysis completed!")

if __name__ == "__main__":
    main()

# Output:
# Starting PCA comparison analysis...
# Loading reference file: AM-I-filtered.csv
# Generating PCA analysis plots...
# Successfully processed 6 files
# PCA analysis plot saved to: ./pca-comparison/merged_6_panels-1.png
# Analysis completed!
