# Extending TransformerMixin

A transformer is a special type of Estimator that creates a new dataset from an old one based on rules that it has learned from the fitting process.

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

In [4]:
class Transformer(BaseEstimator, TransformerMixin):
    """
    Minimal skeleton showing the two methods a transformer implements.

    ``fit`` learns whatever state is needed from the data and returns
    ``self``; ``transform`` applies that state to produce a new dataset.
    This skeleton learns nothing and passes the data through unchanged, so
    it can be used as a starting point for a real implementation.
    """

    def fit(self, X, y=None):
        """
        Learn how to transform data based on input data, X.

        ``y`` is accepted but not used. Returns ``self``.
        """
        return self

    def transform(self, X):
        """
        Transform X into a new dataset, Xprime and return it.

        The skeleton performs the identity transform; replace the
        assignment below with the real transformation logic.
        """
        # Previously ``Xprime`` was returned without ever being defined,
        # which raised NameError as soon as transform() was called.
        Xprime = X
        return Xprime

In [5]:
import os
from gensim.corpora import Dictionary
from gensim.matutils import sparse2full

In [6]:
class GensimVectorizer(BaseEstimator, TransformerMixin):
    """
    Transformer that wraps a gensim ``Dictionary`` to vectorize documents.

    ``fit`` builds the dictionary from the documents and, when a ``path``
    was given, saves it there. ``transform`` converts each document to a
    bag-of-words with ``doc2bow`` and expands it with ``sparse2full``.

    Parameters
    ----------
    path : str or None
        Location used to load and save the dictionary. When ``None`` (the
        default) nothing is read from or written to disk.
    """

    def __init__(self, path=None):
        self.path = path
        self.id2word = None
        # Pick up a previously saved dictionary if one exists at ``path``.
        self.load()

    def load(self):
        """Load the dictionary from ``self.path`` if that file exists."""
        # ``os.path.exists(None)`` raises TypeError, so the default
        # ``path=None`` must be screened out before touching the disk.
        if self.path is not None and os.path.exists(self.path):
            self.id2word = Dictionary.load(self.path)

    def save(self):
        """Save the dictionary to ``self.path``; no-op when no path is set."""
        if self.path is not None:
            self.id2word.save(self.path)

    def fit(self, documents, labels=None):
        """
        Build the dictionary from ``documents`` and save it.

        ``labels`` is accepted but not used. Returns ``self``.
        """
        self.id2word = Dictionary(documents)
        self.save()
        return self

    def transform(self, documents):
        """
        Lazily yield one ``sparse2full`` vector per document.

        This is a generator: nothing is computed until it is iterated.
        Each vector's length is the dictionary's size at that moment.
        """
        for document in documents:
            docvec = self.id2word.doc2bow(document)
            yield sparse2full(docvec, len(self.id2word))