In [None]:
!pip install sklearn_pandas category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.0


In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import plotly.graph_objs as go
import plotly.express as px
from sklearn.metrics import silhouette_score

In [None]:
# Load the raw bank-marketing data into the frame every later cell starts from.
# NOTE(review): '/content/...' looks like a Colab-specific absolute path — adjust when running elsewhere.
df_original = pd.read_csv('/content/European_bank_marketing.csv')

In [None]:
# Bare expression: display the loaded frame as this cell's output.
df_original

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,term_deposit,Ethnicity_African
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,0
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,0
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,0,0
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,1,0


In [None]:
def apply_categorical_encodings(df, threshold=3, target_col='term_deposit'):
  """Encode the categorical/object columns of ``df`` and return a new frame.

  Columns with at most ``threshold`` distinct values are one-hot encoded: the
  original column is removed and one ``<col>_<value>`` column per category is
  appended at the end of the frame. Columns with more distinct values are
  replaced, in their original position, by their target encoding against
  ``df[target_col]``.

  Args:
    df: Input DataFrame. It is never modified; a copy is encoded instead.
    threshold: Largest number of distinct values for which one-hot encoding
      is used.
    target_col: Name of the column used as the target for target encoding.
      This column itself is never encoded, even if it has an object dtype.

  Returns:
    A new DataFrame with the same index as ``df`` in which every
    categorical/object feature column has been encoded numerically.

  Raises:
    KeyError: If a column needs target encoding and ``target_col`` is not a
      column of ``df``.
  """
  # Work on a copy so the caller's frame (e.g. the raw data) is left untouched.
  encoded_df = df.copy()

  # Decide up front which columns to encode; the frame is rebuilt in the loop.
  categorical_cols = [
    col for col in encoded_df.columns
    if col != target_col
    and (isinstance(encoded_df[col].dtype, pd.CategoricalDtype)
         or pd.api.types.is_object_dtype(encoded_df[col]))
  ]

  for col in categorical_cols:
    if encoded_df[col].nunique() <= threshold:
      one_hot_encoder = OneHotEncoder(sparse_output=False)
      values = one_hot_encoder.fit_transform(encoded_df[[col]])
      # Reuse the frame's index so concat aligns row-by-row instead of
      # producing NaN rows when the index is not the default RangeIndex.
      one_hot = pd.DataFrame(
        values,
        columns=[f'{col}_{val}' for val in one_hot_encoder.categories_[0]],
        index=encoded_df.index,
      )
      # Only the one-hot branch removes the source column.
      encoded_df = pd.concat([encoded_df.drop(columns=col), one_hot], axis=1)
    else:
      target_encoder = TargetEncoder()
      encoded = target_encoder.fit_transform(encoded_df[col], encoded_df[target_col])
      # The encoder may hand back a one-column frame; flatten it and keep the
      # encoded values in place of the original column (do NOT drop it).
      encoded_df[col] = np.asarray(encoded, dtype=float).reshape(-1)
  return encoded_df


In [None]:
def _holdout_silhouette(estimator, X, y=None):
    """Score a fitted clusterer by the silhouette of its predictions on ``X``."""
    labels = estimator.predict(X)
    # The silhouette is undefined for a single label; rank such fits last.
    if len(np.unique(labels)) < 2:
        return -1.0
    return silhouette_score(X, labels)


def kmeans_clustering(X, cat_threshold):
    """Preprocess ``X`` and cluster it with K-means, choosing k in 2..10.

    Steps: encode categorical columns via ``apply_categorical_encodings``,
    standardize the columns that were numeric (and non-binary) in the input,
    drop the ``term_deposit`` column, then pick the number of clusters by
    5-fold cross-validated silhouette score and fit K-means on all rows.

    Args:
        X: DataFrame of features that also contains a ``term_deposit`` column.
            It is not modified; all work happens on a copy.
        cat_threshold: Passed to ``apply_categorical_encodings`` as the
            one-hot vs. target-encoding cardinality threshold.

    Returns:
        Tuple ``(X_processed, clusters)``: the encoded, scaled frame without
        ``term_deposit`` and the cluster label assigned to each of its rows.
    """
    # Never touch the caller's frame, whatever the encoder does internally.
    X = X.copy()

    # Identify categorical features (including binary columns)
    cat_cols = [
        col for col in X.columns
        if isinstance(X[col].dtype, pd.CategoricalDtype)
        or pd.api.types.is_object_dtype(X[col])
        or X[col].nunique() == 2
    ]
    # Everything else is treated as numerical
    num_cols = [col for col in X.columns if col not in cat_cols]

    # Apply categorical encodings to categorical features
    X = apply_categorical_encodings(X, cat_threshold)

    # Scale numerical features (skip when there are none: the scaler rejects
    # an empty selection)
    if num_cols:
        X[num_cols] = StandardScaler().fit_transform(X[num_cols])

    # The target must not take part in the clustering
    X = X.drop(columns='term_deposit')

    # n_init is pinned so the result does not depend on the installed
    # scikit-learn's default and matches plot_cluster_performance.
    pipeline = Pipeline([
        ('kmeans', KMeans(n_init=10, random_state=42))
    ])
    grid_params = {
        'kmeans__n_clusters': range(2, 11),
    }

    # KMeans.score is the negative inertia, which keeps improving as k grows,
    # so the default scoring would almost always select the largest k. Rank
    # candidates by silhouette on the held-out fold instead.
    grid_search = GridSearchCV(
        pipeline,
        param_grid=grid_params,
        scoring=_holdout_silhouette,
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X)

    # With the default refit=True the best estimator has already been refit
    # on all of X, so its labels can be used directly.
    best_kmeans = grid_search.best_estimator_.named_steps['kmeans']
    clusters = best_kmeans.labels_

    return X, clusters


In [None]:
def plot_cluster_performance(X, clusters, k_range=range(2, 11), silhouette_sample_size=None):
    """Plot SSE and silhouette coefficient against the number of clusters.

    For every candidate k a fresh K-means model is fitted on ``X``; its
    inertia (SSE) and silhouette coefficient are recorded and drawn as two
    traces. The silhouette trace gets its own right-hand y-axis, because SSE
    is orders of magnitude larger and would flatten it on a shared axis.

    Args:
        X: Numeric feature matrix (DataFrame or array) to cluster.
        clusters: Unused; kept so existing callers keep working.
        k_range: Iterable of cluster counts to evaluate.
        silhouette_sample_size: Optional number of rows to subsample when
            computing the silhouette (it is quadratic in the number of rows).
            ``None`` uses every row.

    Returns:
        A ``plotly.graph_objs.Figure`` with both metric curves.
    """
    k_values = list(k_range)

    # Compute both metrics for each candidate number of clusters
    sse = []
    silhouette = []
    for n_clusters in k_values:
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
        kmeans.fit(X)
        sse.append(kmeans.inertia_)
        silhouette.append(
            silhouette_score(
                X,
                kmeans.labels_,
                sample_size=silhouette_sample_size,
                random_state=42,  # only relevant when subsampling
            )
        )

    # Create plot of performance metrics vs number of clusters
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=k_values, y=sse, mode='lines+markers', name='SSE'))
    fig.add_trace(go.Scatter(x=k_values, y=silhouette, mode='lines+markers',
                             name='Silhouette Coefficient', yaxis='y2'))
    fig.update_layout(
        title='Cluster Performance Metrics vs Number of Clusters',
        xaxis_title='Number of Clusters',
        yaxis=dict(title='SSE'),
        yaxis2=dict(title='Silhouette Coefficient', overlaying='y', side='right'),
        # Keep the legend clear of the right-hand axis.
        legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1),
    )

    return fig


In [None]:
# Preprocess and cluster the raw frame; the second argument (3) is the
# cat_threshold that kmeans_clustering forwards to the categorical encoder.
X, clusters = kmeans_clustering(df_original,3)
# Build the metrics-vs-number-of-clusters figure from the preprocessed features and render it.
fig = plot_cluster_performance(X, clusters)
fig.show()



**K-means uses**
One approach is to use the clusters generated by K-means as features. This can be achieved by assigning a binary label to each cluster, based on the majority class of the data points in that cluster. Then, for each data point, the features are represented by a binary vector indicating the cluster membership.

Another approach is to use K-means to identify potential clusters that consist mostly of one class, and then use a binary classification algorithm to assign a label to each cluster. For example, in our bank-marketing dataset with two classes, we can use K-means to group customers into clusters of similar customers, and then use a binary classifier to label each cluster as either likely to take a term deposit or not.