# Clustering

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [2]:
import plotly.graph_objs as go
from plotly.offline import  init_notebook_mode, iplot

In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("data/heart_disease/heart_disease_combined.csv")

In [5]:
data = data.drop(labels="Unnamed: 0", axis=1)  # drop unnecessary cols

As proposed in Task 2, exercise 1 a), clustering algorithms should be applied to the whole dataset, without imputation or dimensionality reduction. However, the `sklearn.cluster` methods cannot handle missing values (NAs), so we continue by dropping the rows that contain them.

In [6]:
data = data.dropna(axis=0)  # drop rows which contain missing values

In [7]:
data.dtypes

age         float64
sex         float64
cp          float64
trestbps    float64
chol        float64
fbs         float64
restecg     float64
thalach     float64
exang       float64
oldpeak     float64
slope       float64
ca          float64
thal        float64
num           int64
dtype: object

## Prepare Dataset for K-Means

For K-Means clustering we need to feature-scale the numerical variables. K-Means measures the Euclidean distance between data points. If we left the numerical variables unscaled, most of the distance measured between points would be attributed to the variables with the larger numerical ranges rather than to any of the categorical variables.

However, some of our categorical variables hold more values than either $0$ or $1$. Therefore, we also need to convert multiclass labels to binary labels. 

Let's make these two preprocessing steps optional for the user.

In [8]:
# Mark the coded (non-measurement) columns as categorical so they can be
# told apart from the numerical columns by dtype.
dtyp = dict.fromkeys(
    ["sex", "cp", "restecg", "fbs", "exang", "slope", "ca", "thal", "num"],
    "category",
)
dat = data.astype(dtyp)
dat.dtypes

age          float64
sex         category
cp          category
trestbps     float64
chol         float64
fbs         category
restecg     category
thalach      float64
exang       category
oldpeak      float64
slope       category
ca          category
thal        category
num         category
dtype: object

In [9]:
# first divide dataframe into two other dataframes
cater_columns = list(dat.columns[dat.dtypes == 'category'])

In [10]:
cater = dat[cater_columns]

In [11]:
numer_columns = list(dat.columns[dat.dtypes != 'category'])

In [12]:
numer_columns

['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

In [13]:
numer = dat[numer_columns]

### Rescale numerical variables

In [14]:
# initialize scaler
scaler = StandardScaler()

In [15]:
# Keep the original row labels on the scaled frame. `numer` inherits its index
# from the frame left after dropna(), which may be non-contiguous; building the
# DataFrame from the bare array would reset it to 0..n-1, and the later
# index-aligned inner concat with `cater` would then pair or drop the wrong rows.
numer = pd.DataFrame(scaler.fit_transform(numer), index=numer.index)

In [16]:
numer.columns = [sub + "_scaled" for sub in numer_columns]

### Binarize categorical variables

In [17]:
cater = cater.astype("int")

In [18]:
# Collapse the multi-valued codes to 0/1, column by column.
# Statements touching the same column are order-dependent: each one operates
# on the result of the previous one, so do not reorder them.
# sex, fbs and exang are not modified here (presumably already 0/1 — verify).

# cp: every code below 4 becomes 1 first; only afterwards is 4 mapped to 0
cater.loc[cater.cp < 4, "cp"] = 1
cater.loc[cater.cp == 4, "cp"] = 0  # cp val 4 means asymptomatic
# thal: 3 -> 0, 6 and 7 -> 1; any other code is left unchanged
cater.loc[cater.thal == 3, "thal"] = 0  # thal val 3 is normal
cater.loc[cater.thal == 6, "thal"] = 1
cater.loc[cater.thal == 7, "thal"] = 1
# slope: 1 -> 0 first, then everything still above 1 -> 1
cater.loc[cater.slope == 1, "slope"] = 0  # bc upsloping (val 1) is "normal"
cater.loc[cater.slope > 1, "slope"] = 1
# ca, restecg, num: 0 stays 0, any positive code becomes 1
cater.loc[cater.ca > 0, "ca"] = 1  # no major vessels is "normal"
cater.loc[cater.restecg > 0, "restecg"] = 1 # 0 is normal
cater.loc[cater.num > 0, "num"] = 1  # 0 means disease not present

In [19]:
cater.columns = [sub + "_binarized" for sub in cater_columns]

In [20]:
# Column-wise concat: rows are matched on their index labels, and
# join="inner" keeps only labels present in both frames. `numer` and `cater`
# must therefore carry the same row index, otherwise rows are silently
# dropped or paired with the wrong partner.
dat_preproc = pd.concat([numer, cater], axis=1, join="inner")

## K-Means

In [21]:
# define arguments
# should be adjustable by users in App
k = 3
init_method = "k-means++"
# or "random" 

__Initialization__ __Methods__
- _k-means++_
  
  selects initial cluster centroids using sampling based on an empirical probability distribution of the points’ contribution to the overall inertia.
- _random_
  
   choose n_clusters observations (rows) at random from data for the initial centroids.

Let's define the maximum number of clusters, which should be the default value (8).

In [22]:
max_k = 8

In [23]:
kmeans = KMeans(n_clusters=k, init=init_method, random_state=2022) # use random_state to make randomness deterministic

### ... on unprocessed data

In [25]:
# create matrix of shape (n_samples)(n_features)
# orient='records' to enable usage by DictVectorizer
data_dict = data.to_dict(orient='records')

In [24]:
# create vectorizer
vec = DictVectorizer()

In [26]:
matrix = vec.fit_transform(data_dict).toarray()

In [27]:
# note different order of column names!
feature_names = vec.get_feature_names_out()
feature_names

array(['age', 'ca', 'chol', 'cp', 'exang', 'fbs', 'num', 'oldpeak',
       'restecg', 'sex', 'slope', 'thal', 'thalach', 'trestbps'],
      dtype=object)

In [28]:
kmeans_unpr = kmeans.fit(matrix)

In [29]:
clusters = kmeans_unpr.labels_

In [30]:
data["Clusters"] = clusters

### ... on processed data

In [31]:
vec_preproc = DictVectorizer()

In [32]:
# create matrix of shape (n_samples)(n_features)
# orient='records' to enable usage by DictVectorizer
dat_preproc_dict = dat_preproc.to_dict(orient='records')
matrix_preproc = vec_preproc.fit_transform(dat_preproc_dict).toarray()
# note different order of column names!
feature_names_preproc = vec_preproc.get_feature_names_out()
feature_names_preproc

array(['age_scaled', 'ca_binarized', 'chol_scaled', 'cp_binarized',
       'exang_binarized', 'fbs_binarized', 'num_binarized',
       'oldpeak_scaled', 'restecg_binarized', 'sex_binarized',
       'slope_binarized', 'thal_binarized', 'thalach_scaled',
       'trestbps_scaled'], dtype=object)

In [33]:
# Fit a separate estimator configured exactly like `kmeans` (same k, init
# method and random_state, so the run stays deterministic). `fit` returns the
# estimator itself, so calling `kmeans.fit` again here would make `kmeans_pro`
# the very same object as the model fitted on the unprocessed data and
# overwrite that earlier fit.
kmeans_pro = KMeans(**kmeans.get_params()).fit(matrix_preproc)

In [34]:
clusters_proc = kmeans_pro.labels_

In [35]:
dat_preproc["Clusters"] = clusters_proc

## Visualization

In [36]:
init_notebook_mode(connected=True)

We have to apply PCA on our dataset to visualize clusters in 2D space.

In [37]:
pca_2d = PCA(n_components=2)

In [38]:
# One color per cluster: the scatter helper indexes this list by cluster
# number, so it needs at least max_k entries.
colors = ["rgba(255, 128, 255, 0.8)", # pink
          "rgba(255, 128, 2, 0.8)", # orange
          "rgba(0, 255, 200, 0.8)", # turquoise
          "rgba(102, 205, 0, 0.8)" ,# green
          "rgba(255, 215, 0, 0.8)", # yellow
          "rgba(123, 104, 238, 0.8)", # mediumslateblue
          "rgba(51, 161, 201, 0.8)", # peacock
          "rgba(255, 48, 48, 0.8)", # firebrick1
]

In [39]:
title = "Visualizing Clusters in Two Dimensions Using PCA"

In [40]:
# Figure layout shared by both cluster plots: title plus labelled PC axes.
layout = {
    "title": title,
    "xaxis": {"title": "PC1", "ticklen": 5, "zeroline": False},
    "yaxis": {"title": "PC2", "ticklen": 5, "zeroline": False},
}

In [41]:
def SubClusterDF(k, df):
    """Split ``df`` into one sub-frame per cluster label.

    Parameters
    ----------
    k : int
        Number of clusters; the labels ``0 .. k-1`` are looked up.
    df : pandas.DataFrame
        Frame with a ``"Clusters"`` column holding each row's cluster label.

    Returns
    -------
    list of pandas.DataFrame
        ``k`` frames; entry ``i`` holds the rows whose label equals ``i``
        (an empty frame if no row carries that label).
    """
    return [df[df["Clusters"] == label] for label in range(k)]

In [42]:
def SubScatterCluster(clusters, cols):
    """Build one plotly scatter trace per cluster.

    Parameters
    ----------
    clusters : list of pandas.DataFrame
        One frame per cluster, each with ``"PC1"`` and ``"PC2"`` columns.
    cols : list of str
        Marker colors; entry ``i`` is used for cluster ``i``, so it must be
        at least as long as ``clusters``.

    Returns
    -------
    list of plotly.graph_objs.Scatter
        Marker-mode traces named ``"Cluster <i>"``, in cluster order.
    """
    traces = []
    for idx, frame in enumerate(clusters):
        marker_style = dict(color=cols[idx])
        traces.append(
            go.Scatter(
                x=frame["PC1"],
                y=frame["PC2"],
                mode="markers",
                name=f"Cluster {idx}",
                marker=marker_style,
                text=None,
            )
        )
    return traces


### ...for unprocessed data

In [43]:
# Project all columns except the cluster labels onto two principal components.
# Reuse data's row labels for the result: `data` keeps its index from after
# dropna() (possibly non-contiguous), and an index-aligned join back onto
# `data` would otherwise pair projections with the wrong rows or drop them.
PCs_2d = pd.DataFrame(
    pca_2d.fit_transform(data.drop(["Clusters"], axis=1)),
    index=data.index,
)

In [44]:
PCs_2d.columns = ["PC1", "PC2"]

In [45]:
plotData = pd.concat([data, PCs_2d], axis=1, join="inner")

In [46]:
x = SubClusterDF(k, plotData)

In [47]:
traces = SubScatterCluster(clusters=x, cols=colors)

In [48]:
fig = dict(data=traces, layout=layout)

In [49]:
iplot(fig)

### ...for processed data

In [50]:
# Project the preprocessed features (cluster labels excluded) onto two
# principal components. Carry over dat_preproc's row labels so that any
# index-aligned join back onto dat_preproc pairs each projection with the
# row it was computed from.
PCs_2d_proc = pd.DataFrame(
    pca_2d.fit_transform(dat_preproc.drop(["Clusters"], axis=1)),
    index=dat_preproc.index,
)

In [51]:
PCs_2d_proc.columns = ["PC1", "PC2"]

In [52]:
# Attach the PCA projection of the *processed* data (PCs_2d_proc, not the
# unprocessed PCs_2d). PCs_2d_proc was computed from dat_preproc row by row,
# so the values are attached positionally; this keeps every row and does not
# depend on the two frames sharing index labels.
plotData_proc = dat_preproc.assign(
    PC1=PCs_2d_proc["PC1"].to_numpy(),
    PC2=PCs_2d_proc["PC2"].to_numpy(),
)

In [53]:
x_proc = SubClusterDF(k, plotData_proc)

In [54]:
traces_proc = SubScatterCluster(clusters=x_proc, cols=colors)

In [55]:
fig_proc = dict(data=traces_proc, layout=layout)

In [56]:
iplot(fig_proc)

## Some References

Some useful information:
- [Visualizing High Dimensional Clusters](https://www.kaggle.com/code/minc33/visualizing-high-dimensional-clusters/notebook) and role of Scaling numerical values prior to clustering
- [Why do dimensionality reduction before Clustering (normally)](https://stats.stackexchange.com/questions/99171/why-is-euclidean-distance-not-a-good-metric-in-high-dimensions/)

Usage of Cluster Methods:
- [Usage of sklearn.cluster.KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)