<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaATIM_8515_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# multivariate_analysis.ipynb

# %%
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

# %%
# 1. Load dataset
# Load Iris through scikit-learn and unpack the pieces the later cells rely on:
# the measurement matrix, the integer class codes, and the column names.
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# Build a DataFrame of the measurements, then append the class label as a
# categorical column derived from the integer codes and their class names.
df = pd.DataFrame(data=X, columns=feature_names)
df['target'] = pd.Categorical.from_codes(codes=y, categories=iris.target_names)

# %%
# 2. Dataset Exploration
# Print the basic shape/type/missing-value/summary overview of the raw frame.
# sep="" stops print() from inserting its default space after the trailing
# "\n" of each label; that space would otherwise shift the first line of the
# pandas output and misalign it with the rows below.
print("Shape:", df.shape)
print(df.dtypes)
print("\nMissing values per column:\n", df.isna().sum(), sep="")
print("\nSummary statistics:\n", df.describe(), sep="")

# Compute correlation matrix for numeric features.
# Columns are selected by name rather than by position so the result does not
# depend on 'target' happening to be the last column of df.
corr = df[feature_names].corr()
print("\nCorrelation matrix:\n", corr, sep="")

# %%
# 3. Data Cleaning
# 3.1 Handle missing values (if any)
# Work on a copy so the raw DataFrame stays untouched. Every feature column has
# its NaNs replaced by that column's mean; when nothing is missing (as in this
# data) the assignment leaves the values unchanged.
df_clean = df.copy()
df_clean[feature_names] = df_clean[feature_names].fillna(
    df_clean[feature_names].mean()
)

# 3.2 Outlier detection via Z-score
# Flag every value whose absolute z-score within its column exceeds 3.
zs = np.abs(zscore(df_clean[feature_names]))
outlier_mask = (zs > 3)
num_outliers = outlier_mask.sum(axis=0)
# Convert the per-column counts to plain ints so the dict prints bare numbers
# instead of NumPy scalar reprs.
print("Number of suspected outliers by column:",
      {name: int(count) for name, count in zip(feature_names, num_outliers)})

# Optionally remove outliers
# df_no_out = df_clean[(zs < 3).all(axis=1)]

# Standardize numeric features for further visualization or modelling
scaler = StandardScaler()
X_std = scaler.fit_transform(df_clean[feature_names])
# Reuse df_clean's index: the 'target' assignment below aligns on index, so a
# fresh default index would mislabel (or NaN-fill) rows whenever df_clean's
# index is not the default 0..n-1 range, e.g. after rows have been filtered out.
df_std = pd.DataFrame(X_std, columns=feature_names, index=df_clean.index)
df_std['target'] = df_clean['target']

# %%
# 4. Multivariate Visualization
# 4.1 Pair-plot
# Scatter/diagonal grid of every feature pair, coloured and marked by class.
sns.pairplot(df_clean, hue='target', markers=["o", "s", "D"])
plt.suptitle("Pairplot of Iris dataset", y=1.02)
# The title is placed at y=1.02, i.e. just above the figure's top edge, so the
# default save area would crop it out of the PNG. bbox_inches="tight" grows the
# saved region to include every artist, the title among them.
plt.savefig("pairplot.png", dpi=300, bbox_inches="tight")
plt.show()

# 4.2 Heatmap of correlation matrix
# Create the 8x6-inch figure and its axes explicitly, draw the annotated
# heatmap onto those axes with the colour scale pinned to [-1, 1], then save
# the figure to disk before displaying it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Heatmap")
fig.savefig("corr_heatmap.png", dpi=300)
plt.show()

# %%
# (Additional multivariate visualizations could be added: e.g., parallel‐coordinates, 3D scatter, PCA biplot.)

# %%
# End of notebook