
## Exercise 5: Correlation Matrix & Heatmap

**Problem Description:**  
Detect multicollinearity by identifying highly correlated feature pairs.

**Solution Overview:**  
Compute the Pearson correlation matrix and visualize it with a heatmap.



In [None]:
!pip install scikit-learn pandas matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# 5.1 Compute correlation matrix
# Pairwise Pearson correlation between the columns of X.
# NOTE(review): X is not defined in this cell -- presumably the feature DataFrame
# created earlier in the notebook; confirm it exists on a fresh kernel.
corr = X.corr(method="pearson")

In [None]:
# 5.2 Plot heatmap
# Fixed colour range [-1, 1] so the diverging palette is centred on zero correlation.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Feature Correlation Matrix')
plt.show()

In [None]:
# 5.3 Identify high-correlation pairs
# Single cutoff shared by the filter and the printed label, so the two cannot drift apart.
CORR_THRESHOLD = 0.7

# Collect (feature_a, feature_b, r) for every pair whose |r| exceeds the cutoff.
# The `a < b` ordering test on the column labels keeps exactly one ordering of each
# pair and also skips the diagonal (a feature compared with itself).
high_corr = [
    (a, b, corr.loc[a, b])
    for a in corr.columns
    for b in corr.columns
    if a < b and abs(corr.loc[a, b]) > CORR_THRESHOLD
]
print(f"Highly correlated pairs (|r|>{CORR_THRESHOLD}):", high_corr)

# Analysis
- Identify pairs with |correlation| > 0.7.

- Discuss multicollinearity implications.

- Remove one feature from each high-correlation pair, then retrain a simple model to observe how performance changes.

# Exercise 5: Correlation & Heatmap
## Analysis to Include in Code

In [None]:
# High-Correlation Pairs
# One line per pair, with the coefficient rounded to two decimals.
for pair in high_corr:
    feature_a, feature_b, coef = pair
    print(f"{feature_a} ↔ {feature_b}: r={coef:.2f}")


In [None]:
# Redundancy Impact
# Drop the first feature of every highly correlated pair. A feature can appear in
# several pairs, so de-duplicate while keeping first-seen order.
drop = list(dict.fromkeys(first for first, _, _ in high_corr))
X_reduced = X.drop(columns=drop)

# Train and score on the columns that REMAIN after the drop. (The previous version
# indexed with `drop`, i.e. it fitted the model on only the redundant features and
# reported that as the "without redundant" accuracy.)
# NOTE(review): X_train / X_test / y_train / y_test are not defined in this cell --
# presumably a train/test split of X and y with the same column labels; confirm.
kept = list(X_reduced.columns)
model = LogisticRegression(max_iter=200).fit(X_train[kept], y_train)
print("Accuracy without redundant:", accuracy_score(y_test, model.predict(X_test[kept])))


In [None]:
# Partial Correlation (optional)
# Imported here rather than at the top because pingouin is only needed for this
# optional cell.
import pingouin as pg

# Feature columns with y appended as a 'target' column.
features_with_target = pd.concat([X, y.rename('target')], axis=1)

# Correlation between sepal length and petal length, with the two width columns
# passed as covariates.
pg.partial_corr(
    data=features_with_target,
    x='sepal length (cm)',
    y='petal length (cm)',
    covar=['sepal width (cm)', 'petal width (cm)'],
)
