# Classification using Clustering : KMeans 

Here I will design a pipeline to first do clustering with Kmeans and then apply a classification algorithm on the labels

In [4]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import pandas as pd



In [5]:
import sys
sys.path.append("../")

In [168]:
df = pd.read_csv('../data/raw/train.csv').drop('PassengerId', axis=1)
dfX = df.drop('Survived', axis=1)
dfy = df.Survived

In [169]:
from titansurv.preprocessing.transformers import NaNDropper

In [170]:
pre1 = Pipeline([
    ('nan_drpr', NaNDropper(['Embarked']))
])

dfX, dfy = pre1.fit_transform(dfX, dfy)

## Preprocessing

In [171]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [172]:
class KMeansPredictor(KMeans):
    
    def __init__(self, n_clusters=8, init='k-means++', n_init=10,
                 max_iter=300, tol=1e-4, precompute_distances='auto',
                 verbose=0, random_state=None, copy_x=True,
                 n_jobs=None, algorithm='auto'):
        
        super().__init__(
            n_clusters, 
            init, 
            n_init, 
            max_iter,
            tol,
            precompute_distances,
            verbose,
            random_state,
            copy_x,
            n_jobs, 
            algorithm)
        
    
    def transform(self, X, y=None):
        return self.predict(X).reshape(-1, 1)
    
    def fit_transform(self, X, y=None):
        return self.fit_predict(X).reshape(-1, 1)
        
        

In [173]:
pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('cluster', KMeansPredictor()),
    ('enc', OneHotEncoder(drop='first'))
], 'passthrough')



precomb2 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scaler', pre2, ['Age', 'Fare'])
], 'passthrough')


## Cluster and Classify

In [174]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [175]:
pipe = Pipeline([
    ('preprocess', precomb2),
    ('clf', LogisticRegression(solver='liblinear'))
])

In [176]:
pipe.steps

[('preprocess',
  ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
                    transformer_weights=None,
                    transformers=[('clmn_drpr', 'drop',
                                   ['Name', 'Ticket', 'Cabin']),
                                  ('enc',
                                   OneHotEncoder(categories='auto', drop='first',
                                                 dtype=<class 'numpy.float64'>,
                                                 handle_unknown='error',
                                                 sparse=True),
                                   ['Sex', 'Embarked']),
                                  ('imp_scaler',
                                   Pipeline(memory='passthrough',
                                            steps=[(...
                                                   ('cluster',
                                                    KMeansPredictor(algorithm='auto',
                    

In [177]:
pipe.fit(dfX, dfy)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('clmn_drpr', 'drop',
                                                  ['Name', 'Ticket', 'Cabin']),
                                                 ('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp

In [178]:
from sklearn.model_selection import cross_val_score

In [183]:
pipe.score(dfX, dfy)

0.8155230596175478

In [184]:
cross_val_score(pipe, dfX, dfy).mean()

0.8054148416174696

In [185]:
from titansurv.utils import print_params

## Tune the Hyperparameters

In [186]:
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocess',
 'clf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__clmn_drpr',
 'preprocess__enc',
 'preprocess__imp_scaler',
 'preprocess__enc__categories',
 'preprocess__enc__drop',
 'preprocess__enc__dtype',
 'preprocess__enc__handle_unknown',
 'preprocess__enc__sparse',
 'preprocess__imp_scaler__memory',
 'preprocess__imp_scaler__steps',
 'preprocess__imp_scaler__verbose',
 'preprocess__imp_scaler__imp',
 'preprocess__imp_scaler__scaler',
 'preprocess__imp_scaler__cluster',
 'preprocess__imp_scaler__enc',
 'preprocess__imp_scaler__imp__add_indicator',
 'preprocess__imp_scaler__imp__copy',
 'preprocess__imp_scaler__imp__fill_value',
 'preprocess__imp_scaler__imp__missing_values',
 'preprocess__imp_scaler__imp__strategy',
 'preprocess__imp_scaler__imp__verbose',
 'preprocess__imp_scaler__scaler__copy',
 'preproce

In [187]:
from sklearn.model_selection import GridSearchCV

In [188]:
%%time
param_grid = {'preprocess__imp_scaler__cluster__n_clusters': range(2, 12, 2)}
grid = GridSearchCV(pipe, param_grid)
grid.fit(dfX, dfy)

CPU times: user 504 ms, sys: 0 ns, total: 504 ms
Wall time: 502 ms


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('clmn_drpr',
                                                                         'drop',
                                                                         ['Name',
                                                                          'Ticket',
                                                                          'Cabin']),
                                                                        ('enc',
                                              

In [190]:
print(grid.best_score_)
print(grid.best_params_)

0.8054148416174696
{'preprocess__imp_scaler__cluster__n_clusters': 8}


## Augment KMeans Features using FeatureUnioun

Augment the Feature set using cluster labels

In [191]:
from sklearn.base import BaseEstimator, ClassifierMixin

In [192]:
class ColumnSelector(BaseEstimator, ClassifierMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X


In [193]:
from sklearn.pipeline import FeatureUnion


kmeans_augment = FeatureUnion([
    ("cluster", KMeansPredictor()),
    ("original", ColumnSelector())
])


pre2 = Pipeline([
    ('imp', SimpleImputer()),
    ('scaler', StandardScaler()),
    ('augment', kmeans_augment)
], 'passthrough')



precomb2 = ColumnTransformer([
    ('clmn_drpr', 'drop', ['Name', 'Ticket', 'Cabin']),
    ('enc', OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    ('imp_scale_augment', pre2, ['Age', 'Fare'])
], 'passthrough')


In [194]:
precomb2.fit_transform(dfX, dfy).shape

(889, 9)

In [195]:
pipe = Pipeline([
    ('preprocess', precomb2),
    ('clf', RandomForestClassifier())
])

In [196]:
pipe.fit(dfX, dfy)

Pipeline(memory=None,
         steps=[('preprocess',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('clmn_drpr', 'drop',
                                                  ['Name', 'Ticket', 'Cabin']),
                                                 ('enc',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked']),
                                                 ('imp

In [197]:
pipe.score(dfX, dfy)

0.9820022497187851

In [198]:
cross_val_score(pipe, dfX, dfy, cv=5).mean()

0.8132863581540024

In [199]:
print_params(pipe)

['memory',
 'steps',
 'verbose',
 'preprocess',
 'clf',
 'preprocess__n_jobs',
 'preprocess__remainder',
 'preprocess__sparse_threshold',
 'preprocess__transformer_weights',
 'preprocess__transformers',
 'preprocess__verbose',
 'preprocess__clmn_drpr',
 'preprocess__enc',
 'preprocess__imp_scale_augment',
 'preprocess__enc__categories',
 'preprocess__enc__drop',
 'preprocess__enc__dtype',
 'preprocess__enc__handle_unknown',
 'preprocess__enc__sparse',
 'preprocess__imp_scale_augment__memory',
 'preprocess__imp_scale_augment__steps',
 'preprocess__imp_scale_augment__verbose',
 'preprocess__imp_scale_augment__imp',
 'preprocess__imp_scale_augment__scaler',
 'preprocess__imp_scale_augment__augment',
 'preprocess__imp_scale_augment__imp__add_indicator',
 'preprocess__imp_scale_augment__imp__copy',
 'preprocess__imp_scale_augment__imp__fill_value',
 'preprocess__imp_scale_augment__imp__missing_values',
 'preprocess__imp_scale_augment__imp__strategy',
 'preprocess__imp_scale_augment__imp__ve

In [200]:
%%time
param_grid = {'preprocess__imp_scale_augment__augment__cluster__n_clusters': range(2, 12, 2)}
grid = GridSearchCV(pipe, param_grid)
grid.fit(dfX, dfy)

CPU times: user 12.6 s, sys: 205 ms, total: 12.8 s
Wall time: 5.54 s


GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocess',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('clmn_drpr',
                                                                         'drop',
                                                                         ['Name',
                                                                          'Ticket',
                                                                          'Cabin']),
                                                                        ('enc',
                                              

In [201]:
grid.best_score_

0.8122008506316257