# In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

# In [2]:
class nlp_pipe:
    """
    A class for pipelining the NLP of the text data. The user provides
    a series of tools, and this class manages the cleaning, fitting,
    and vectorization of the text data.
    ---
    Inputs:
    cleaning_function: callable invoked as
        cleaning_function(text, tokenizer, stemmer); whatever it returns
        is handed directly to the vectorizer's fit/transform
    vectorizer: the model to use for vectorization of text data
        (default: a fresh CountVectorizer for each pipeline)
    tokenizer: the model to use for tokenization of text data
        (default: a fresh TreebankWordTokenizer for each pipeline)
    stemmer: the model to use for stemming of text data
        (default: a fresh PorterStemmer for each pipeline)
    """

    def __init__(self,
                 cleaning_function,
                 vectorizer=None,
                 tokenizer=None,
                 stemmer=None):
        # Defaults are built per instance. Instantiating them in the
        # signature would create them once, so every pipeline relying on
        # the defaults would share (and overwrite) one fitted vectorizer.
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.cleaning_function = cleaning_function
        self.tokenizer = TreebankWordTokenizer() if tokenizer is None else tokenizer
        self.stemmer = PorterStemmer() if stemmer is None else stemmer

    def _clean(self, text):
        """Run the user-supplied cleaning function with this pipeline's tools."""
        return self.cleaning_function(text, self.tokenizer, self.stemmer)

    def fit(self, text):
        """
        Clean the text and fit the vectorizer on the cleaned result.
        ---
        Inputs:
        text: the raw data, in whatever form cleaning_function accepts
        """
        self.vectorizer.fit(self._clean(text))

    def transform(self, text, n_rows=5):
        """
        Clean the text, vectorize it with the fitted vectorizer, and
        return the result as a DataFrame with one column per feature.
        ---
        Inputs:
        text: the raw data, in whatever form cleaning_function accepts
        n_rows: maximum number of leading rows to return. Defaults to 5,
            which matches the previous hard-coded preview; pass None to
            get every row.
        """
        # pandas is not imported in the visible part of this file, so it
        # is imported here to keep transform() self-contained.
        import pandas as pd

        vectorized = self.vectorizer.transform(self._clean(text))

        # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
        # get_feature_names(); prefer the new name, fall back to the old.
        get_names = getattr(self.vectorizer, "get_feature_names_out", None)
        if get_names is None:
            get_names = self.vectorizer.get_feature_names

        frame = pd.DataFrame(vectorized.toarray(), columns=get_names())
        return frame if n_rows is None else frame.head(n_rows)

# In [4]:
def cleaning_function(text, tokenizer, stemmer):
    """
    A function for cleaning the text data. Each document is tokenized,
    every token is lowercased and stemmed, and the resulting tokens are
    joined back together with single spaces.
    ---
    Inputs:
    text: iterable of strings (one per document) to be cleaned
    tokenizer: the model to use for tokenization of text data; either an
        object with a tokenize(string) method (such as NLTK's
        TreebankWordTokenizer) or a plain callable taking a string
    stemmer: the model to use for stemming of text data; must provide a
        stem(word) method
    ---
    Returns:
    list of cleaned strings, one per input document, in the same order
    """
    # NLTK tokenizer objects expose .tokenize() and are not callable
    # themselves, so prefer that method and only fall back to calling
    # the tokenizer directly when it is a plain function.
    tokenize = getattr(tokenizer, "tokenize", tokenizer)

    return [
        ' '.join(stemmer.stem(word.lower()) for word in tokenize(words))
        for words in text
    ]