<a href="https://colab.research.google.com/github/candpixie/pcos-informatics/blob/main/notebooks/02_feature_correlation_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 02: Feature Correlation & Selection for PCOS Prediction

**Goal:** Identify which clinical features are most strongly associated with PCOS diagnosis.

**Research Question:** Which biomarkers show the strongest correlation with PCOS status?

**Methods:**
- Correlation analysis
- Statistical testing (t-tests)
- Feature importance via Random Forest

---

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Notebook-wide plotting defaults: seaborn's white-grid theme and a 12x8 inch
# default canvas (plotting cells that pass their own figsize override this).
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 8)})

In [None]:
# Read the PCOS infertility CSV directly from the project's GitHub repository
DATA_URL = "https://raw.githubusercontent.com/candpixie/pcos-informatics/main/data/PCOS_infertility.csv"
df = pd.read_csv(DATA_URL)

# Sanity check: report the frame's dimensions and column names,
# then preview the first rows as the cell's rich output.
frame_shape = df.shape
column_names = df.columns.tolist()
print(f"Dataset shape: {frame_shape}")
print(f"\nColumns: {column_names}")
df.head()

## 1. Correlation Analysis

First, let's examine correlations between all features and the PCOS diagnosis.

In [None]:
# Calculate the pairwise correlation matrix (pandas' default method, Pearson).
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises instead, so a single
# text-typed column would break this cell.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Make the exclusion visible so a mis-typed numeric column (e.g. numbers read
# as strings) is noticed rather than silently left out of the analysis.
skipped_cols = [col for col in df.columns if col not in numeric_df.columns]
if skipped_cols:
    print(f"Non-numeric columns excluded from correlation: {skipped_cols}")

# Create correlation heatmap (diverging palette centred on 0 so the sign of
# each correlation is readable at a glance)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix,
            annot=True,
            cmap='coolwarm',
            center=0,
            fmt='.2f',
            square=True,
            linewidths=0.5)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

print("\nCorrelation matrix calculated successfully.")

In [None]:
# Locate the target (PCOS diagnosis) column.
# List every column with its position so the automatic choice below can be
# checked by eye.
print("Available columns:")
for i, col in enumerate(df.columns):
    print(f"{i}: {col}")

# Prefer a column whose name mentions "PCOS" (e.g. 'PCOS (Y/N)'): the target is
# not guaranteed to be the last column of the file. Only when no such column
# exists fall back to the last column. Set target_col by hand if neither
# heuristic picks the right one.
pcos_named_cols = [col for col in df.columns if 'pcos' in str(col).lower()]
target_col = pcos_named_cols[0] if pcos_named_cols else df.columns[-1]
print(f"\nUsing '{target_col}' as target variable")

# Later cells split the data on target == 1 and target == 0; any other coding
# would leave those groups empty, so flag it here.
unexpected_values = set(df[target_col].dropna().unique()) - {0, 1}
if unexpected_values:
    print(f"WARNING: '{target_col}' contains values other than 0/1: "
          f"{sorted(unexpected_values, key=str)}")

In [None]:
# Rank every column by its correlation with the target, most positive first
pcos_correlations = correlation_matrix[target_col].sort_values(ascending=False)

print("\n=== Features Ranked by Correlation with PCOS ===")
print(pcos_correlations)

# Horizontal bar chart of the same ranking; the target's correlation with
# itself is dropped so only the features are drawn.
fig, ax = plt.subplots(figsize=(10, 6))
feature_correlations = pcos_correlations.drop(target_col)
feature_correlations.plot(kind='barh', color='steelblue', ax=ax)
ax.set_title('Feature Correlation with PCOS Status', fontsize=14, fontweight='bold')
ax.set_xlabel('Correlation Coefficient')
ax.set_ylabel('Features')
# Dashed reference line separating negative from positive correlations
ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
fig.tight_layout()
plt.show()

## 2. Statistical Testing (T-Tests)

Compare the mean of each feature between the PCOS and non-PCOS groups using independent two-sample t-tests.

In [None]:
# Separate PCOS positive and negative groups (target coded 1 = PCOS, 0 = not)
pcos_positive = df[df[target_col] == 1]
pcos_negative = df[df[target_col] == 0]

print(f"PCOS Positive samples: {len(pcos_positive)}")
print(f"PCOS Negative samples: {len(pcos_negative)}")

# Candidate features: numeric columns only, excluding the target itself.
# A t-test is not defined for text columns, so passing them to scipy would
# break the loop below.
# NOTE(review): identifier-like columns (serial / patient file numbers), if the
# dataset has any, carry no clinical signal and should also be excluded here —
# confirm against the column list printed earlier.
feature_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col != target_col
]

ttest_results = []

for feature in feature_cols:
    # Missing values are dropped per feature so one sparse column does not
    # shrink the sample used for the others.
    pos_values = pcos_positive[feature].dropna()
    neg_values = pcos_negative[feature].dropna()

    # Welch's t-test (equal_var=False): does not assume the two groups share a
    # variance, which is the safer choice when group sizes differ.
    t_stat, p_value = stats.ttest_ind(pos_values, neg_values, equal_var=False)

    # Effect direction and size on the feature's own scale
    mean_pos = pos_values.mean()
    mean_neg = neg_values.mean()
    mean_diff = mean_pos - mean_neg

    ttest_results.append({
        'Feature': feature,
        'T-Statistic': t_stat,
        'P-Value': p_value,
        'PCOS Mean': mean_pos,
        'Non-PCOS Mean': mean_neg,
        'Mean Difference': mean_diff,
        # A NaN p-value (too few samples) compares False and is reported as 'No'
        'Significant (p<0.05)': 'Yes' if p_value < 0.05 else 'No'
    })

# Create results dataframe; the column list is given explicitly so the frame
# (and the sort below) is still valid when no numeric feature was found.
ttest_df = pd.DataFrame(ttest_results, columns=[
    'Feature', 'T-Statistic', 'P-Value', 'PCOS Mean',
    'Non-PCOS Mean', 'Mean Difference', 'Significant (p<0.05)'
])
ttest_df = ttest_df.sort_values('P-Value')

# NOTE: p-values are not corrected for multiple comparisons; treat the 0.05
# flag as exploratory.
print("\n=== Statistical Significance of Features ===")
print(ttest_df.to_string(index=False))

## 3. Feature Importance via Random Forest

Fit a Random Forest classifier on the full dataset and rank the features by the model's built-in (impurity-based) importance scores.

In [None]:
# Prepare data for modeling.
# Keep only numeric feature columns: both the mean imputation below and the
# forest itself need numeric input, and feature_cols may contain text columns.
X = df[feature_cols].select_dtypes(include='number')
y = df[target_col]

# Rows without a label cannot be used for supervised training
has_label = y.notna()
X = X[has_label]
y = y[has_label]

# Handle missing values: a column with no observed values has a NaN mean and
# could not be imputed, so drop it; fill the remaining gaps with column means.
X = X.dropna(axis=1, how='all')
X = X.fillna(X.mean())

# Train Random Forest (fixed seed so the importances are reproducible)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10
)

# NOTE: the model is fitted on all available rows; the importances describe
# this fit and are not validated on held-out data.
rf_model.fit(X, y)

# Get feature importances, labelled with the columns the model was actually
# trained on (these can be fewer than feature_cols after the filtering above)
feature_importance = pd.DataFrame({
    'Feature': X.columns.tolist(),
    'Importance': rf_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\n=== Random Forest Feature Importance ===")
print(feature_importance.to_string(index=False))

# Visualize
plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'], color='forestgreen')
# barh draws the first row at the bottom; flip the axis so the most important
# feature is at the top, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.title('Random Forest Feature Importance for PCOS Prediction',
          fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 4. Summary of Key Findings

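Combine the results of the correlation analysis, the t-tests, and the Random Forest into a single score per feature: each method ranks the features, and the three ranks are averaged (a lower average rank means a stronger predictor).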

In [None]:
# Per-feature look-up tables built from the earlier analyses
pvalue_by_feature = ttest_df.set_index('Feature')['P-Value']
importance_by_feature = feature_importance.set_index('Feature')['Importance']

# One row per candidate feature
feature_summary = pd.DataFrame({'Feature': feature_cols})

# Absolute correlation with the target; features that never made it into the
# correlation matrix get a score of 0
feature_summary['Correlation'] = [
    abs(pcos_correlations[name]) if name in pcos_correlations.index else 0
    for name in feature_cols
]

# t-test p-value and Random Forest importance, matched by feature name
feature_summary['P-Value'] = feature_summary['Feature'].map(pvalue_by_feature)
feature_summary['RF_Importance'] = feature_summary['Feature'].map(importance_by_feature)

# Rank under each criterion (rank 1 = best): larger is better for correlation
# and importance, smaller is better for the p-value
feature_summary = feature_summary.assign(
    Correlation_Rank=lambda table: table['Correlation'].rank(ascending=False),
    PValue_Rank=lambda table: table['P-Value'].rank(),
    RF_Rank=lambda table: table['RF_Importance'].rank(ascending=False),
)

# Composite score: mean of the three ranks, best (lowest) first
rank_columns = ['Correlation_Rank', 'PValue_Rank', 'RF_Rank']
feature_summary['Average_Rank'] = feature_summary[rank_columns].mean(axis=1)
feature_summary = feature_summary.sort_values('Average_Rank')

divider = "=" * 80
print("\n" + divider)
print("FINAL FEATURE RANKING (Top Predictors of PCOS)")
print(divider)
print(feature_summary.to_string(index=False))
print("\n" + divider)

## Key Takeaways

**Top predictive features identified:**
1. [Will be filled in after running]
2. [Will be filled in after running]
3. [Will be filled in after running]

**Next steps:**
- Build ML classification models using top features
- Validate findings with cross-validation
- Compare model performance