In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Open the Excel workbook once; every sheet used below is parsed from this
# same handle.
file_path = '/mnt/data/For PCA.xlsx'
data = pd.ExcelFile(file_path)

# Load data from the "Stim" sheet
stim_data = data.parse("Stim")

# Collapse the replicate rows into a single mean row per Genotype.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises TypeError instead of silently dropping it.
stim_grouped = stim_data.groupby('Genotype').mean(numeric_only=True)

# Use every remaining numeric column as a PCA feature (Genotype is now the
# index, so it is not part of the feature matrix).
# NOTE(review): the original comment said "columns B-G" — confirm the sheet
# holds no extra numeric columns that should be excluded.
stim_features = stim_grouped.iloc[:, :]

# Standardize each feature to zero mean / unit variance so that no single
# measurement dominates the principal components because of its scale.
scaler = StandardScaler()
stim_scaled = scaler.fit_transform(stim_features)

# Fit PCA with all components retained; only the first two are plotted.
pca_stim = PCA()
stim_pca_result = pca_stim.fit_transform(stim_scaled)

# Keep the first two principal-component scores, labelled by Genotype, for
# the 2D visualization.
stim_pca_df = pd.DataFrame(
    stim_pca_result[:, :2],
    index=stim_grouped.index,
    columns=["PC1", "PC2"]
)

# Reusable helper: beautified 2D PCA scatter plot (called once per sheet)
def beautified_2d_pca(pca_df: pd.DataFrame, title: str) -> None:
    """Draw and display a labelled 2D scatter plot of PC1 against PC2.

    Args:
        pca_df: Frame with "PC1" and "PC2" columns. Each row's index label
            is written next to its point.
        title: Text shown as the figure title.
    """
    plt.figure(figsize=(12, 8))
    plt.scatter(
        pca_df["PC1"],
        pca_df["PC2"],
        s=150, alpha=0.8, color="blue", edgecolors='black', label="Gene Variant", marker='o'
    )
    # Walk labels and coordinates together rather than doing a positional
    # .iloc lookup for every point.
    for label, pc1, pc2 in zip(pca_df.index, pca_df["PC1"], pca_df["PC2"]):
        plt.annotate(
            label,
            # Nudge the text slightly (in data units) off the marker centre.
            (pc1 + 0.02, pc2 + 0.02),
            fontsize=10,
            color="darkblue"
        )
    plt.title(title, fontsize=16, fontweight='bold')
    plt.xlabel("Principal Component 1", fontsize=12)
    plt.ylabel("Principal Component 2", fontsize=12)
    plt.grid(visible=True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.legend()
    plt.show()

# Plot the "Stim" sheet results.
beautified_2d_pca(stim_pca_df, "PCA: Stim Sheet")

# Repeat the process for the "Len Stim" sheet
len_stim_data = data.parse("Len Stim")

# Collapse the replicate rows into a single mean row per Genotype.
# numeric_only=True keeps this working on pandas >= 2.0, where averaging a
# non-numeric column raises TypeError instead of silently dropping it.
len_stim_grouped = len_stim_data.groupby('Genotype').mean(numeric_only=True)

# Use every remaining numeric column as a PCA feature (Genotype is now the
# index, so it is not part of the feature matrix).
# NOTE(review): the original comment said "columns B-G" — confirm the sheet
# holds no extra numeric columns that should be excluded.
len_stim_features = len_stim_grouped.iloc[:, :]

# Standardize the data. fit_transform refits the shared scaler on this
# sheet, so after this line `scaler` holds the "Len Stim" statistics.
len_stim_scaled = scaler.fit_transform(len_stim_features)

# Fit PCA with all components retained; only the first two are plotted.
pca_len_stim = PCA()
len_stim_pca_result = pca_len_stim.fit_transform(len_stim_scaled)

# Keep the first two principal-component scores, labelled by Genotype, for
# the 2D visualization.
len_stim_pca_df = pd.DataFrame(
    len_stim_pca_result[:, :2],
    index=len_stim_grouped.index,
    columns=["PC1", "PC2"]
)

# Beautified 2D PCA plot for "Len Stim" sheet
beautified_2d_pca(len_stim_pca_df, "PCA: Len Stim Sheet")
