<a target="_blank" href="https://colab.research.google.com/github/ZHAW-ZAV/TSO-FS25-students/blob/main/07_unsupervised_ml/iris.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Loading the Iris dataset

In [None]:
# Plotly Express is both the data source here and the plotting library used further down.
import plotly.express as px
# Load the Iris dataset bundled with Plotly Express; later cells index it by the
# four measurement columns and by "species".
df = px.data.iris()
# A bare last expression makes the notebook render the frame.
df

# Dimensionality reduction
## PCA: Principal Component Analysis

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# PCA is driven by variance, so every feature is first brought to a common scale.
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
# Learn the per-column scaling parameters, then apply them in a second step.
scaler = StandardScaler().fit(df[features])
df_scaled = pd.DataFrame(scaler.transform(df[features]), columns=features)
# Summary statistics of the scaled columns, shown as the cell output.
df_scaled.describe()

In [None]:
# then we calculate the covariance matrix
# `df_scaled` contains only the standardized feature columns at this point, so the
# result is a square (features x features) matrix.
# NOTE: pandas' `cov()` normalizes by N-1 by default, so the diagonal entries need
# not be exactly 1 even though the columns were standardized.
cov_matrix = df_scaled.cov()
cov_matrix

In [None]:
# then we calculate the eigenvalues and eigenvectors
# A covariance matrix is symmetric, so we use `np.linalg.eigh`: it is the routine
# dedicated to symmetric matrices, always returns real values and returns the
# eigenvalues in ascending order. `.to_numpy()` hands NumPy a plain array instead
# of a DataFrame, so the results are plain arrays as well.
# Each COLUMN of `eigenvectors` is the eigenvector for the eigenvalue at the same position.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix.to_numpy())
print(f"Eigenvalues: {eigenvalues}")
print(f"Eigenvectors: {eigenvectors}")

### 1D reduction

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# This cell repeats the full PCA pipeline so that it can be run on its own:
# standardize -> covariance -> eigen-decomposition -> sort -> project.

# first we need to standardize the data
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[features]), columns=features)

# then we calculate the covariance matrix
cov_matrix = df_scaled.cov()

# then we calculate the eigenvalues and eigenvectors
# (the covariance matrix is symmetric, so `eigh` is the appropriate routine: it
# always returns real eigenvalues; passing a plain array keeps the outputs plain arrays)
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix.to_numpy())

# Sort the eigenvalues and corresponding eigenvectors in descending order,
# so that the first column is the direction of largest variance
sorted_idx = np.argsort(eigenvalues)[::-1]
eigenvalues = eigenvalues[sorted_idx]
eigenvectors = eigenvectors[:, sorted_idx]  # sort columns so each column is an eigenvector

# Select the first eigenvector (column) for projection; the `:1` slice keeps it
# two-dimensional (n_features x 1) so the projection below yields one column
eigenvectors_1 = eigenvectors[:, :1]

# Project the data onto the first principal component
df_pca = pd.DataFrame(np.dot(df_scaled, eigenvectors_1), columns=["PC1"])
# All points are drawn at y = 0 so the 1D projection appears as a single line
px.scatter(df_pca, x="PC1", y=[0] * len(df_pca), color=df["species"])

### 2D reduction

**Task**: Obtain the first 2 principal components and visualize the data in 2D with Plotly.

### 3D reduction

**Task**: Obtain the first 3 principal components and visualize the data in 3D with Plotly.

# Clustering
## K-means clustering


In [None]:
# K-means relies on Euclidean distances, so the features are rescaled first.
# Rebuilding `df_scaled` here also gives the clustering a fresh frame to work on.
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    data=scaler.fit_transform(df[features]),
    columns=features,
)
df_scaled.describe()

In [None]:
# randomly choose K centroids
K = 3
# Fixed seed so that "Restart & Run All" always starts from the same centroids
# (change or remove it to explore how the initialization affects the result).
SEED = 42
# Sample from the feature columns only, so that a "cluster" column left over from
# a previous run of the cells below can never become part of a centroid.
centroids = df_scaled[features].sample(n=K, random_state=SEED)
centroids

In [None]:
# calculate the distance between each point and each centroid
# Only the feature columns take part in the distance: on a re-run `df_scaled`
# already carries a "cluster" column that must not influence the result.
points = df_scaled[features].to_numpy()  # (n_samples, n_features)
centers = centroids[features].to_numpy()  # (n_centroids, n_features)
# Broadcasting gives the Euclidean distance of every point to every centroid in
# one step, for any number of centroids: result is (n_samples, n_centroids).
distances = np.sqrt(
    ((points[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
)
# now assign cluster based on closest centroid
# (the label is the row position of the centroid: 0, 1, ..., n_centroids - 1)
clusters = np.argmin(distances, axis=1)
df_scaled["cluster"] = clusters
df_scaled

In [None]:
# recalculate the centroids
# Update step of K-means: each new centroid is the mean of the points currently
# assigned to that cluster. "cluster" becomes the index of the result, so the
# remaining columns are the per-cluster feature means.
centroids = df_scaled.groupby("cluster").mean()
centroids

In [None]:
# repeat distance calculation and cluster assignment
# `df_scaled` now contains the "cluster" column, so the feature columns are
# selected explicitly instead of relying on pandas skipping the mismatched column.
points = df_scaled[features].to_numpy()  # (n_samples, n_features)
centers = centroids[features].to_numpy()  # (n_centroids, n_features)
# Euclidean distance of every point to every centroid: (n_samples, n_centroids).
# This works for any number of centroids, not just three.
distances = np.sqrt(
    ((points[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
)
# now assign cluster based on closest centroid
clusters = np.argmin(distances, axis=1)
df_scaled["cluster"] = clusters
df_scaled

In [148]:
# repeat until centroids don't change
# Each pass performs the two K-means steps in order:
#   1. assignment: every point gets the label of its closest centroid
#   2. update:     every centroid becomes the mean of the points assigned to it
# Assigning before updating means the loop converges properly whether or not
# the assignment cell above was run just before.
MAX_ITER = 100  # safety cap so the cell can never loop forever
points = df_scaled[features].to_numpy()  # (n_samples, n_features), constant in the loop
change = True
n_iter = 0
while change and n_iter < MAX_ITER:
    previous_centroids = centroids.copy()
    # distance of every point to every current centroid: (n_samples, n_centroids);
    # works for any number of centroids and does not fail if a cluster became empty
    centers = previous_centroids[features].to_numpy()
    distances = np.sqrt(
        ((points[:, np.newaxis, :] - centers[np.newaxis, :, :]) ** 2).sum(axis=2)
    )
    #  assign cluster based on closest centroid
    clusters = np.argmin(distances, axis=1)
    df_scaled["cluster"] = clusters
    # recalculate the centroids from the new assignment
    centroids = df_scaled.groupby("cluster")[features].mean()
    # converged once an update leaves the centroids unchanged
    # (np.array_equal is also False when the number of centroids changed)
    change = not np.array_equal(centroids, previous_centroids)
    n_iter += 1



**Task**: Add the obtained clusters to the 2D / 3D PCA projections and visualize the data in 2D / 3D with Plotly.

## DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN

# standardize the data
features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[features]), columns=features)

# apply DBscan
# (fit on the feature columns only, so re-running the cell can never feed a
# previously stored "cluster" column back into the clustering)
dbscan = DBSCAN(eps=0.6, min_samples=4)
dbscan.fit(df_scaled[features])
df_scaled["cluster"] = dbscan.labels_
# we apply PCA to plot in 2D
# `eigenvectors` was sorted by decreasing eigenvalue in the "1D reduction" cell,
# so its first two columns span the 2D PCA plane. Defining the projection matrix
# here keeps this cell from failing with a NameError when the 2D task above has
# not been completed yet.
eigenvectors_2 = eigenvectors[:, :2]
df_dbscan = pd.DataFrame(np.dot(df_scaled[features], eigenvectors_2), columns=["PC1", "PC2"])
df_dbscan["cluster"] = dbscan.labels_
px.scatter(df_dbscan, x="PC1", y="PC2", color="cluster", height=500)
