# Classification using Clustering : KMeans 

Here I will design a pipeline to first do clustering with Kmeans and then apply a classification algorithm on the labels

In [1]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pandas as pd
from src.utils import load_data



In [2]:
dfX, dfy = load_data(return_X_y=True)

In [3]:
from src.preprocessing import NaNDropper

In [6]:
pre1 = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

dfX_pre, dfy_pre = pre1.fit_transform(dfX, dfy)

## Preprocessing

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [8]:
class KMeansPredictor(KMeans):
    
    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True,
                 n_jobs=None, algorithm='auto'):
        
        super().__init__(
            n_clusters, 
            init, 
            n_init, 
            max_iter,
            tol,
            precompute_distances,
            verbose,
            random_state,
            copy_x,
            n_jobs, 
            algorithm)
        
    
    def transform(self, X, y=None):
        return self.predict(X).reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return self.fit_predict(X).reshape(-1, 1)
        
        

In [104]:
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('cluster', KMeansPredictor(n_clusters=8)),
    ('enc', OneHotEncoder(drop='first'))
], 'passthrough')



precomb2 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare'])
], 'passthrough')


## Cluster and Classify

In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [106]:
pipe = Pipeline([
    ('preprocess', precomb2),
    ('scale', StandardScaler()),
    ('ml', KNeighborsClassifier())
])

In [107]:
pipe.steps

[('preprocess',
  ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                    transformer_weights=None,
                    transformers=[('clmn_drpr', 'drop',
                                   ['Name', 'Ticket', 'Cabin']),
                                  ('enc',
                                   OneHotEncoder(categories='auto', drop='first',
                                                 dtype=<class 'numpy.float64'>,
                                                 handle_unknown='error',
                                                 sparse=True),
                                   ['Sex', 'Embarked']),
                                  ('imp_scaler',
                                   Pipeline(memory='passthrough',
                                            steps=[(...
                                                   ('cluster',
                                                    KMeansPredictor(algorithm='auto',
                    

In [108]:
pipe.fit(dfX_pre, dfy_pre)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('clmn_drpr', 'drop',
                                                  ['Name', 'Ticket', 'Cabin']),
                                                 ('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp

In [109]:
from sklearn.model_selection import cross_val_score

In [110]:
pipe.score(dfX_pre, dfy_pre)

0.8447694038245219

In [111]:
cross_val_score(pipe, dfX_pre, dfy_pre).mean()

0.8054592775979179

In [185]:
from src.utils import print_params

## Tune the Hyperparameters

In [186]:
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocess',
 'clf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__clmn_drpr',
 'preprocess__enc',
 'preprocess__imp_scaler',
 'preprocess__enc__categories',
 'preprocess__enc__drop',
 'preprocess__enc__dtype',
 'preprocess__enc__handle_unknown',
 'preprocess__enc__sparse',
 'preprocess__imp_scaler__memory',
 'preprocess__imp_scaler__steps',
 'preprocess__imp_scaler__verbose',
 'preprocess__imp_scaler__imp',
 'preprocess__imp_scaler__scaler',
 'preprocess__imp_scaler__cluster',
 'preprocess__imp_scaler__enc',
 'preprocess__imp_scaler__imp__add_indicator',
 'preprocess__imp_scaler__imp__copy',
 'preprocess__imp_scaler__imp__fill_value',
 'preprocess__imp_scaler__imp__missing_values',
 'preprocess__imp_scaler__imp__strategy',
 'preprocess__imp_scaler__imp__verbose',
 'preprocess__imp_scaler__scaler__copy',
 'preproce

In [69]:
from sklearn.model_selection import GridSearchCV

In [71]:
%%time
param_grid = {'preprocess__imp_scaler__cluster__n_clusters': range(2, 12, 2)}
grid = GridSearchCV(pipe, param_grid)
grid.fit(dfX_pre, dfy_pre)

CPU times: user 3.35 s, sys: 53.4 ms, total: 3.41 s
Wall time: 1.79 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('clmn_drpr',
                                                                         'drop',
                                                                         ['Name',
                                                                          'Ticket',
                                                                          'Cabin']),
                                                                        ('enc',
                                              

In [72]:
print(grid.best_score_)
print(grid.best_params_)

0.8324192217355424
{'preprocess__imp_scaler__cluster__n_clusters': 8}


## Augment KMeans Features using FeatureUnioun

Augment the Feature set using cluster labels

In [73]:
from sklearn.base import BaseEstimator, ClassifierMixin

In [74]:
class ColumnSelector(BaseEstimator, ClassifierMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X


In [75]:
from sklearn.pipeline import FeatureUnion


kmeans_augment = FeatureUnion([
    ("cluster", KMeansPredictor()),
    ("original", ColumnSelector())
])


pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler()),
#     ('augment', kmeans_augment)
], 'passthrough')



precomb2 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scale_augment', pre2, ['Age', 'Fare'])
], 'passthrough')


In [77]:
precomb2.fit_transform(dfX_pre, dfy_pre).shape

(889, 9)

In [85]:
pipe = Pipeline([
    ('preprocess', precomb2),
    ('scale', StandardScaler()),
    ('ml', SVC())
])

In [86]:
pipe.fit(dfX_pre, dfy_pre)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('clmn_drpr', 'drop',
                                                  ['Name', 'Ticket', 'Cabin']),
                                                 ('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp

In [87]:
pipe.score(dfX_pre, dfy_pre)

0.84251968503937

In [88]:
cross_val_score(pipe, dfX_pre, dfy_pre, cv=5).mean()

0.8301847267187202

In [199]:
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocess',
 'clf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__clmn_drpr',
 'preprocess__enc',
 'preprocess__imp_scale_augment',
 'preprocess__enc__categories',
 'preprocess__enc__drop',
 'preprocess__enc__dtype',
 'preprocess__enc__handle_unknown',
 'preprocess__enc__sparse',
 'preprocess__imp_scale_augment__memory',
 'preprocess__imp_scale_augment__steps',
 'preprocess__imp_scale_augment__verbose',
 'preprocess__imp_scale_augment__imp',
 'preprocess__imp_scale_augment__scaler',
 'preprocess__imp_scale_augment__augment',
 'preprocess__imp_scale_augment__imp__add_indicator',
 'preprocess__imp_scale_augment__imp__copy',
 'preprocess__imp_scale_augment__imp__fill_value',
 'preprocess__imp_scale_augment__imp__missing_values',
 'preprocess__imp_scale_augment__imp__strategy',
 'preprocess__imp_scale_augment__imp__ve

In [200]:
%%time
param_grid = {'preprocess__imp_scale_augment__augment__cluster__n_clusters': range(2, 12, 2)}
grid = GridSearchCV(pipe, param_grid)
grid.fit(dfX, dfy)

CPU times: user 12.6 s, sys: 205 ms, total: 12.8 s
Wall time: 5.54 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('clmn_drpr',
                                                                         'drop',
                                                                         ['Name',
                                                                          'Ticket',
                                                                          'Cabin']),
                                                                        ('enc',
                                              

In [201]:
grid.best_score_

0.8122008506316257