In [1]:
from sklearn.pipeline import Pipeline


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

class TextClusterTransformer(BaseEstimator, TransformerMixin):
    """Assign text documents to KMeans clusters computed on TF-IDF features.

    ``fit`` learns a TF-IDF vocabulary from the documents and fits KMeans on
    the resulting vectors; ``transform`` vectorizes new documents with the
    learned vocabulary and returns the predicted cluster index for each one.

    Parameters
    ----------
    n_clusters : int, default=5
        Number of clusters passed to ``KMeans``.

    Attributes
    ----------
    vectorizer : TfidfVectorizer
        Builds the TF-IDF representation of the text.
    model : KMeans
        Groups the TF-IDF vectors into clusters.
    """

    def __init__(self, n_clusters=5):
        self.n_clusters = n_clusters
        # Generates the numeric (TF-IDF) representation of the text.
        self.vectorizer = TfidfVectorizer()
        # Groups the documents into clusters based on that representation.
        self.model = KMeans(n_clusters=n_clusters)

    def fit(self, X, y=None):
        """Learn the TF-IDF vocabulary and the cluster centres from ``X``.

        ``X`` is an iterable of raw text documents; ``y`` is ignored and only
        present for Pipeline compatibility. Returns ``self``.
        """
        # set_params(n_clusters=...) (used by GridSearchCV and friends) or a
        # plain attribute assignment only updates self.n_clusters. Push the
        # current value into the KMeans instance so that a change made after
        # construction is actually honoured when fitting.
        self.model.set_params(n_clusters=self.n_clusters)

        tfidf = self.vectorizer.fit_transform(X)
        self.model.fit(tfidf)
        return self

    def transform(self, X, y=None):
        """Return the predicted cluster index for each document in ``X``.

        The result is whatever ``KMeans.predict`` returns for the TF-IDF
        vectors (one label per document).
        """
        # NOTE(review): the labels come back as a 1-D array; a downstream
        # estimator in a Pipeline may expect a 2-D feature matrix -- confirm
        # against the consuming step and reshape there if required.
        tfidf = self.vectorizer.transform(X)
        clusters = self.model.predict(tfidf)
        return clusters

# Example usage:
# pipeline = Pipeline(steps=[('t', TextClusterTransformer()), ('m', SomeModel())])
# pipeline.fit(X_train, y_train)