# **K-means clustering and K-NN classification**

Instructor: Dr Mario Rosario Guarracino


---

In [None]:
# Optional (Google Colab only): uncomment the lines below to mount Google Drive
# and change into the course notebook folder before running the rest of the notebook.
# from google.colab import drive
# drive.mount('/content/drive')
# % cd "/content/drive/My Drive/2020 09 Cambridge course/PyNotebooks/"

**Import libraries**

In [None]:
import warnings
warnings.filterwarnings('ignore')

import sys
import scipy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA

# For clustering
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import contingency_matrix, adjusted_rand_score, normalized_mutual_info_score

# For classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

**Load data**

In [None]:
# Read the pre-processed heart dataset; the path is relative to the notebook's working directory.
heart_csv = "data/heart_processed.csv"
cleveland_heart = pd.read_csv(heart_csv)

# Preview the first rows as the cell output.
cleveland_heart.head()

**Correlation between features**

In [None]:
# Spearman (rank) correlation between every pair of columns.
corr_heart = cleveland_heart.corr(method='spearman')

# Draw the heatmap on an explicit Axes instead of relying on pyplot's current figure.
sns.set(font_scale=0.9)
fig, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr_heart, vmax=1, linewidths=2, square=True, annot=True,
            cbar_kws={"shrink": 0.5}, cmap='BuPu', ax=ax)
ax.set_title("Spearman correlation between features")

# Absolute correlation of every column with the target variable.
target_col = "Diagnosis_CHD"
cor_target = corr_heart[target_col].abs()

# Highly correlated features (|rho| > 0.4). The target's own entry is dropped:
# its correlation with itself is trivially 1 and it is not a feature.
corr_features = cor_target[cor_target > 0.4].drop(labels=target_col, errors='ignore')
print("\nFeatures highly correlated with the target (Diagnosis_CHD): \n", corr_features)
print()

**Data preparation**

In [None]:
# Feature matrix = every column except the last; target vector = the last column.
X = cleveland_heart.iloc[:, :-1].values
y = cleveland_heart.iloc[:, -1].values

numerical_vars = ['age', 'chol', 'trestbps', 'thalach', 'oldpeak']
cat_vars = ['cp', 'restecg', 'slope', 'thal']

# Index.get_indexer marks unknown names with -1, which would silently address the
# last column of X. Fail loudly instead if any expected column is absent.
missing_cols = [name for name in numerical_vars + cat_vars
                if name not in cleveland_heart.columns]
if missing_cols:
    raise KeyError("Columns not found in the dataset: {}".format(missing_cols))

# Positions are looked up in the full frame; they remain valid for X because
# only the last column (the target) was removed.
numerical_vars_idx = cleveland_heart.columns.get_indexer(numerical_vars)
cat_vars_idx = cleveland_heart.columns.get_indexer(cat_vars)
print(cat_vars_idx)
print(numerical_vars_idx)



---


## **Clustering**
Clustering algorithms are unsupervised machine learning methods for grouping unlabelled data.
The clustering algorithms available in scikit-learn are described at
https://scikit-learn.org/stable/modules/clustering.html

**Standardize** numerical data by subtracting the mean and scaling to unit variance. For other data scaling and pre-processing techniques, refer to https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing.

**One-hot encode** the categorical variables.

In [None]:
# Column-wise preprocessing for clustering:
#   - numerical columns   -> standardized (zero mean, unit variance)
#   - categorical columns -> one-hot encoded
#   - every other column  -> passed through unchanged
trans = make_column_transformer(
    (StandardScaler(), numerical_vars_idx),
    (OneHotEncoder(), cat_vars_idx),
    remainder='passthrough'
)

# Clustering is unsupervised, so the transformer is fitted on the whole feature matrix.
X_transformed = trans.fit_transform(X)

### **K-means clustering**

**Elbow method: Calculate within cluster sum of squares (wcss)**

In [None]:
# Elbow method: record the within-cluster sum of squares (wcss) for k = 1..8
# so that a suitable number of clusters can be read off the curve.
kmeans_settings = dict(init='k-means++', max_iter=300, n_init=10, random_state=1)
wcss_clheart = []

for k in range(1, 9):
    kmeans_clheart = KMeans(n_clusters=k, **kmeans_settings)

    # fit() returns the fitted estimator itself (not cluster labels)
    y_means_clheart = kmeans_clheart.fit(X_transformed)

    # inertia_ is the wcss of the fitted model
    wcss_clheart.append(y_means_clheart.inertia_)

**Elbow plot: Choose k**

In [None]:
# Elbow plot to choose optimal k.
# The x values are derived from the recorded list (which starts at k = 1),
# so the plot stays aligned with the data if the range of k is changed.
k_candidates = range(1, len(wcss_clheart) + 1)

fig, ax = plt.subplots()
ax.plot(k_candidates, wcss_clheart, c='r', marker='o')
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Within-cluster sum of squares (WCSS)")
ax.set_title("Elbow plot")
plt.show()

**K-means clustering** with chosen k

In [None]:
# Number of clusters read off the elbow plot
k = 2

kmeans_clheart = KMeans(
    n_clusters=k,
    init='k-means++',
    n_init=10,
    max_iter=500,
    random_state=1,
)

# Fit the model and obtain the cluster index of every sample in one call
cluster_labels = kmeans_clheart.fit_predict(X_transformed)
centers = kmeans_clheart.cluster_centers_

**Clustering evaluation metrics**

In [None]:
# Compare the cluster assignment with the known diagnosis labels.
contingency = contingency_matrix(y, cluster_labels)
ARI = adjusted_rand_score(y, cluster_labels)
NMI = normalized_mutual_info_score(y, cluster_labels)

# Report all three scores, one heading per metric.
for heading, score in (
    ("Contingency matrix: \n", contingency),
    ("\n Adjusted Rand Index: ", ARI),
    ("\n Normalized Mutual Information: ", NMI),
):
    print(heading, score)

**Visualize clusters**

In [None]:
# PCA is used only to obtain a 2-D view of the samples; it is computed on the
# standardized quantitative variables.
X_subset = cleveland_heart.loc[:, numerical_vars]
scaler = StandardScaler()
X_subset_scaled = scaler.fit_transform(X_subset)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_subset_scaled)

f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(10, 5))

# Left panel: points coloured by the true diagnosis.
scatter1 = ax1.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="Spectral", alpha=0.7)
# Right panel: the same points coloured by the K-means cluster assignment.
# Cluster ids are arbitrary, so cluster 0 need not correspond to diagnosis 0.
scatter2 = ax2.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap="Spectral", alpha=0.7)

# Produce a legend with the unique colors from each scatter
# (Axes.legend already attaches the legend to its axes).
legend1 = ax1.legend(*scatter1.legend_elements(), title="Diagnosis_CHD")
legend2 = ax2.legend(*scatter2.legend_elements(), title="Cluster Members")

for axis in (ax1, ax2):
    axis.set_facecolor('white')
    axis.set_xlabel("PC1")
    axis.set_ylabel("PC2")

ax1.set_title("Original labels: 0-absent, 1-present")
ax2.set_title('K-Means Clustering labels')
plt.show()



---


## **Classification**

### **KNN classification**

**Split and scale data**

In [None]:
# Hold out 20% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1, shuffle=True)

# The encoder is fitted on the training split only, so a category that occurs
# only in the test split would make transform() raise. handle_unknown='ignore'
# encodes such a value as all zeros instead.
trans = make_column_transformer(
    (StandardScaler(), numerical_vars_idx),
    (OneHotEncoder(handle_unknown='ignore'), cat_vars_idx),
    remainder='passthrough'
)

# Learn scaling/encoding from the training data only, then apply it to both splits
# so that no information from the test set leaks into preprocessing.
X_train_transformed = trans.fit_transform(X_train)
X_test_transformed = trans.transform(X_test)

**KNN classification**

In [None]:
# Minkowski metric with p = 2, i.e. Euclidean distance, using 2 neighbours
knn_settings = {'n_neighbors': 2, 'metric': 'minkowski', 'p': 2}

# Build the classifier and fit it to the training data in one step
model = KNeighborsClassifier(**knn_settings).fit(X_train_transformed, y_train)

# Class predictions for the held-out test set
y_predicted = model.predict(X_test_transformed)


**Classification Metrics**

In [None]:
# Evaluate the test-set predictions.
test_accuracy = accuracy_score(y_test, y_predicted)
test_confusion = confusion_matrix(y_test, y_predicted)
test_report = classification_report(y_test, y_predicted)

print('Accuracy:', test_accuracy, '\n')
# The two multi-line results are printed below their heading.
for heading, body in (('Confusion Matrix:', test_confusion),
                      ('Classification Report:', test_report)):
    print(heading, '\n', body, '\n')

**Parameter selection: Choose k**

In [None]:
# Choose k by 5-fold cross-validation on the training data only, so the test set
# stays untouched and its accuracy remains an honest estimate for the final model.
# Candidate models get their own name so the classifier fitted and evaluated
# earlier (`model`, `y_predicted`) is not overwritten.
# NOTE: X_train_transformed was scaled with statistics from the whole training
# split; for a stricter estimate wrap the transformer and classifier in a Pipeline.
cv_accuracy_by_k = {}

for num_neighbor in range(1, 10):
    knn_candidate = KNeighborsClassifier(n_neighbors=num_neighbor, metric='minkowski', p=2)
    fold_scores = cross_val_score(knn_candidate, X_train_transformed, y_train,
                                  cv=5, scoring='accuracy')

    # Mean accuracy over the validation folds
    cv_accuracy_by_k[num_neighbor] = fold_scores.mean()
    print('CV accuracy: k =', num_neighbor, ': {:.2f}'.format(cv_accuracy_by_k[num_neighbor]))

# Smallest k among those with the highest mean cross-validated accuracy
best_k = max(cv_accuracy_by_k, key=cv_accuracy_by_k.get)
print('Best k by cross-validation:', best_k)


**Plotting decision boundaries for KNN classifier**

An example using three features: age and trestbps are shown on the axes, while chol is held within a fixed range.

In [None]:
# mlxtend is needed only for this demonstration.
from mlxtend.plotting import plot_decision_regions

# Features are selected by name rather than by position so the axis labels below
# always match the plotted data. The third feature is the "filler": it is not
# drawn on an axis and must be given a fixed value.
boundary_features = ['age', 'trestbps', 'chol']
X_sub = cleveland_heart.loc[:, boundary_features].values

# Standardize the three selected features; y is the target vector from the
# data-preparation cell.
scaler = StandardScaler()
X_sub_scaled = scaler.fit_transform(X_sub)

# Slice of the filler feature (index 2), in standardized units:
# the decision region is evaluated at `value`, and only samples whose filler
# feature lies within value +/- width are drawn.
value = 1.5
width = 0.75

for num_neighbor in range(1, 4):
    # Fitted on all samples: this is an illustration, not an evaluation.
    model = KNeighborsClassifier(n_neighbors=num_neighbor)
    model = model.fit(X_sub_scaled, y)

    fig, ax = plt.subplots()
    plot_decision_regions(X_sub_scaled, y, clf=model,
                          filler_feature_values={2: value},
                          filler_feature_ranges={2: width},
                          legend=2, ax=ax)

    # Adding axes annotations
    ax.set_xlabel(boundary_features[0] + ' (standardized)')
    ax.set_ylabel(boundary_features[1] + ' (standardized)')
    ax.set_title('{} = {} +/- {} (standardized)'.format(boundary_features[2], value, width))

    fig.suptitle('Decision Boundary: KNN = ' + str(num_neighbor))
    plt.show()




---

