<a href="https://colab.research.google.com/github/alfredqbit/NU-DDS-8515/blob/main/sepulvedaATIM_8515_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assignment 1: Exploratory Data Analysis of a Multivariate Dataset

DDS-8515 - Multivariate Analysis

- utilize IRIS dataset for exercises
- perform EDA

In [None]:

# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

1. Load dataset
 - well-known IRIS dataset to be utilized for exercises
 - form predictor features dataframe and target feature for algorithms going forward

In [None]:
# Load the Iris data through scikit-learn and unpack predictors, labels and column names.
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# Build the working frame: predictor columns plus a categorical 'target'
# column whose categories are the species names matching the integer codes in y.
df = pd.DataFrame(X, columns=feature_names).assign(
    target=pd.Categorical.from_codes(y, iris.target_names)
)

2. Dataset Exploration
 - find shape of dataset
 - find missing values (if any)
 - derive summary statistics
 - compute correlation matrix

In [None]:
# Structure of the frame: dimensions and per-column dtypes
frame_shape = df.shape
print("Shape:", frame_shape)
print(df.dtypes)

# Data-quality check and descriptive statistics
missing_per_column = df.isna().sum()
print("\nMissing values per column:\n", missing_per_column)
summary_stats = df.describe()
print("\nSummary statistics:\n", summary_stats)

# Pairwise correlations between the predictor columns; the last column
# ('target') is excluded by position.
predictors_only = df.iloc[:, :-1]
corr = predictors_only.corr()
print("\nCorrelation matrix:\n", corr)

3. Data Cleaning
 - if data are missing, impute with the column mean
 - detect outliers using the 3-sigma rule (|z| > 3) and report the count per feature; removing them is optional, and the later steps keep all rows
 - standard scale predictor feature data (z-scores)

In [None]:
# 3.1 Handle missing values (if any)
# In this data there are none, but mean imputation is illustrated anyway.
# fillna leaves columns without NaNs untouched, so no per-column check is needed.
df_clean = df.copy()
df_clean[feature_names] = df_clean[feature_names].fillna(df_clean[feature_names].mean())

# 3.2 Outlier detection via z-score (3-sigma rule)
Z_THRESHOLD = 3  # flag values more than this many standard deviations from the column mean
zs = np.abs(zscore(df_clean[feature_names]))
outlier_mask = (zs > Z_THRESHOLD)
num_outliers = outlier_mask.sum(axis=0)
# Cast the counts to plain int so the dict prints as {'...': 1} instead of
# {'...': np.int64(1)} under NumPy >= 2.
print("Number of suspected outliers by column:",
      {name: int(count) for name, count in zip(feature_names, num_outliers)})

# Rows in which no feature was flagged, built from the same mask as the counts
# above. This is kept as a separate frame: the standardization below still
# uses df_clean, so every observation is retained there.
df_no_out = df_clean[~np.asarray(outlier_mask).any(axis=1)]
print("Rows remaining after outlier removal:", len(df_no_out), "of", len(df_clean))

# 3.3 Standardize numeric features for further visualization or modelling
scaler = StandardScaler()
X_std = scaler.fit_transform(df_clean[feature_names])
# Reuse df_clean's index so the 'target' assignment aligns row-for-row even
# if the frame ever carries a non-default index.
df_std = pd.DataFrame(X_std, columns=feature_names, index=df_clean.index)
df_std['target'] = df_clean['target']

4. Multivariate Visualization
- display pair scatterplots of feature pairs
- display heatmap of feature correlation matrix
- save each plot as a PNG file for the report (written to the current working directory)

In [None]:
# 4.1 Pair plot: pairwise scatter plots of the features, coloured by species
sns.pairplot(df_clean, hue='target', markers=["o", "s", "D"])
plt.suptitle("Pairplot of Iris dataset", y=1.02)
# y=1.02 places the title just above the figure canvas, so a default savefig
# would crop it; bbox_inches="tight" grows the saved area to include it.
plt.savefig("pairplot.png", dpi=300, bbox_inches="tight")
plt.show()

# 4.2 Heatmap of correlation matrix
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Heatmap")
# A tight bounding box keeps the long feature-name tick labels from being
# cropped in the saved image.
fig.savefig("corr_heatmap.png", dpi=300, bbox_inches="tight")
plt.show()


# Multivariate visualizations
 - display parallel-coordinates plot
 - display 3D scatter plot
 - display PCA biplot
 - display t-SNE plot

 save all plots as PNG files for the report (written to the current working directory)

In [None]:
# Imports used only by this cell, grouped at the top so its dependencies are
# visible in one place.
import pandas.plotting as pd_plot
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401  (kept: older Matplotlib needs it to register the '3d' projection)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Integer class codes colour the points in every scatter plot below; the
# category names label the matching legend entries.
species_codes = df_std['target'].cat.codes
species_names = list(df_std['target'].cat.categories)

# Scale factors for the biplot loading arrows and their text labels.
ARROW_SCALE = 3
LABEL_SCALE = 3.2


def _add_species_legend(ax, scatter):
    """Add a legend to `ax` mapping each colour of `scatter` to a species name.

    Assumes every species occurs in the plotted data, so the handles returned
    by `legend_elements()` (one per distinct code, in ascending order) line up
    with `species_names`.
    """
    handles, _ = scatter.legend_elements()
    ax.legend(handles, species_names, title="Species", loc="best")


# 1. Parallel Coordinates
# Plot the standardized frame so the data match the "Scaled Value" axis label
# and features with different ranges share a comparable vertical scale.
fig, ax = plt.subplots()
pd_plot.parallel_coordinates(df_std, 'target',
                             color=sns.color_palette("Set1", 3).as_hex(), ax=ax)
ax.set_title("Parallel Coordinates Plot")
ax.set_xlabel("Features")
ax.set_ylabel("Scaled Value")
fig.savefig("parallel_coordinates.png", dpi=300)
plt.show()

# 2. 3D Scatter Plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(df_std['sepal length (cm)'],
                     df_std['sepal width (cm)'],
                     df_std['petal length (cm)'],
                     c=species_codes, cmap='viridis')
# The plotted values are z-scores from df_std, so the axes are labelled as
# standardized rather than in centimetres.
ax.set_xlabel('Sepal Length (standardized)')
ax.set_ylabel('Sepal Width (standardized)')
ax.set_zlabel('Petal Length (standardized)')
ax.set_title("3D Scatter Plot of Iris Features")
_add_species_legend(ax, scatter)
fig.savefig("3d_scatter.png", dpi=300)
plt.show()

# 3. PCA Biplot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(df_std[feature_names])

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=species_codes, cmap='viridis')
ax.set_title('PCA Biplot')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')

# Add vectors for the loadings of each feature: column i of pca.components_
# holds feature i's weight on the two plotted components.
for i, feature in enumerate(feature_names):
    loading_x, loading_y = pca.components_[0, i], pca.components_[1, i]
    ax.arrow(0, 0, loading_x * ARROW_SCALE, loading_y * ARROW_SCALE,
             color='red', alpha=0.5,
             head_width=0.08, length_includes_head=True)  # explicit head size so the direction is visible
    ax.text(loading_x * LABEL_SCALE, loading_y * LABEL_SCALE,
            feature, color='red', ha='center', va='center')
_add_species_legend(ax, scatter)
fig.savefig("pca_biplot.png", dpi=300)
plt.show()

# 4. TSNE plot (for visualization of non-linear data structures)
# random_state is fixed so the embedding is reproducible across runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_std)

fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=species_codes, cmap='viridis')
ax.set_title("t-SNE Plot")
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
_add_species_legend(ax, scatter)
fig.savefig("tsne_plot.png", dpi=300)
plt.show()