In [None]:
import re
import unicodedata
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

class cleaningPipeline():
    """Text-cleaning pipeline made of small, individually usable steps.

    ``transform`` chains the steps in a fixed order: non-ASCII stripping,
    lower-casing with date/punctuation normalisation, stop-word removal,
    lemmatization and finally digit removal.
    """

    # Code points deleted outright before Unicode normalisation: the
    # Windows-1252 punctuation range (\x85, \x91-\x96, \x99) plus (R), the
    # degree sign and the masculine ordinal. They are removed first because
    # NFKD would otherwise fold \xba into the letter "o".
    _STRIP_CHARS = re.compile(r'[\x85\x91-\x96\x99\xae\xb0\xba]')

    # Month names and their usual abbreviations. The look-behind stops a
    # match from starting in the middle of a word ("summary 10"), and
    # spelling the names out (instead of "mar" + any letters) stops words
    # that merely begin with an abbreviation ("market 2020") from matching.
    _MONTH = (
        r'(?<![a-z])'
        r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
        r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?'
        r'|dec(?:ember)?)'
    )
    # e.g. "5 january 2020", "january 5, 2020", "jan 2020"
    _DATE_WITH_YEAR = re.compile(
        r'(?:\d{1,2} )?' + _MONTH + r' (?:\d{1,2}, )?\d{2,4}'
    )
    # e.g. "january 5", "jan 5th"
    _DATE_MONTH_DAY = re.compile(_MONTH + r' \d{1,2}[a-z]*')

    _DIGIT_COMMA = re.compile(r'(?<=\d),(?=\d)')  # comma between two digits
    _TAB_NEWLINE = re.compile(r'[\t\n]')
    _MULTI_SPACE = re.compile(r' {2,}')
    _NON_ALNUM = re.compile(r'[^a-zA-Z0-9\s]')
    _DIGITS = re.compile(r'\d+')

    # Lazily created NLTK resources, kept on the class so they are built
    # once instead of on every call and never end up inside a pickled
    # instance.
    _stop_words = None
    _lemmatizer = None

    def __init__(self):
        """Create a pipeline; it holds no per-instance configuration."""

    # The constructor was originally misspelled ``_init_``; keep that name
    # available in case any caller invoked it explicitly.
    _init_ = __init__

    def remove_non_ascii(self, text):
        """Return ``text`` reduced to plain ASCII.

        A fixed set of punctuation/symbol code points is deleted, then the
        text is NFKD-normalised so accented letters decompose into a base
        letter plus combining marks, and every remaining non-ASCII
        character is dropped.
        """
        text = self._STRIP_CHARS.sub('', text)
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def remove_whitespace_and_special_chars(self, text):
        """Lower-case ``text`` and normalise dates, punctuation and spacing.

        Steps, in order: lower-case; replace month-based dates with the
        token ``mdate``; replace a comma between two digits with the token
        ``commaseperatednum``; turn tabs/newlines into spaces; replace every
        character that is not a letter, digit or whitespace with a space;
        collapse runs of spaces. A single leading or trailing space may
        remain in the result.
        """
        formatted_text = text.lower()

        # Dates carrying a year go first so the month/day pattern cannot
        # consume only part of them.
        formatted_text = self._DATE_WITH_YEAR.sub('mdate', formatted_text)
        formatted_text = self._DATE_MONTH_DAY.sub('mdate', formatted_text)

        formatted_text = self._DIGIT_COMMA.sub('commaseperatednum', formatted_text)
        formatted_text = self._TAB_NEWLINE.sub(' ', formatted_text)
        formatted_text = self._MULTI_SPACE.sub(' ', formatted_text)
        formatted_text = self._NON_ALNUM.sub(' ', formatted_text)
        # Punctuation removal can create new runs of spaces; collapse again.
        formatted_text = self._MULTI_SPACE.sub(' ', formatted_text)
        return formatted_text

    def remove_numerics(self, text):
        """Return ``text`` with every run of digits deleted."""
        return self._DIGITS.sub('', text)

    def removeStopWord(self, text):
        """Return ``text`` without English stop words (e.g. "a", "the").

        The text is tokenized with NLTK's ``word_tokenize``; tokens whose
        lower-cased form is in NLTK's English stop-word list are dropped
        and the rest are joined with single spaces.
        """
        if cleaningPipeline._stop_words is None:
            cleaningPipeline._stop_words = frozenset(stopwords.words('english'))
        stop = cleaningPipeline._stop_words
        return " ".join(
            word for word in word_tokenize(text) if word.lower() not in stop
        )

    def word_lemmatization(self, text):
        """Return ``text`` with each token lemmatized by ``WordNetLemmatizer``.

        Tokens come from ``word_tokenize`` and are re-joined with single
        spaces.
        """
        if cleaningPipeline._lemmatizer is None:
            cleaningPipeline._lemmatizer = WordNetLemmatizer()
        lemmatize = cleaningPipeline._lemmatizer.lemmatize
        return " ".join(lemmatize(word) for word in word_tokenize(text))

    def transform(self, text):
        """Run every cleaning step on ``text`` and return the cleaned string.

        Non-string input is converted with ``str()`` first. Digits are
        removed last, after tokenization-based steps have run.
        """
        if not isinstance(text, str):
            text = str(text)
        text = self.remove_non_ascii(text)
        text = self.remove_whitespace_and_special_chars(text)
        text = self.removeStopWord(text)
        text = self.word_lemmatization(text)
        text = self.remove_numerics(text)
        return text

In [None]:
import pickle

In [None]:
import pandas as pd

def save_to_pickle(dataframe, filename):
    """Pickle ``dataframe`` into the file at ``filename``.

    The file is opened in binary write mode, so existing content at that
    path is replaced. A confirmation line is printed once the write is done.
    """
    with open(filename, mode='wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(dataframe)
    print(f'DataFrame saved to {filename}')

In [None]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

class VectorizationPipeline():
    """Thin wrapper around a ``TfidfVectorizer`` stored in ``self.vectorizer``."""

    def __init__(self, max_features=5000):
        """Create the underlying vectorizer with the given ``max_features``."""
        tfidf = TfidfVectorizer(max_features=max_features)
        self.vectorizer = tfidf

    def fit_transform(self, text_data):
        """Fit the vectorizer on ``text_data`` and return its transformed form."""
        fitted_output = self.vectorizer.fit_transform(text_data)
        return fitted_output

    def transform(self, text_data):
        """Return ``text_data`` transformed by the wrapped vectorizer."""
        transformed_output = self.vectorizer.transform(text_data)
        return transformed_output

    def get_feature_names(self):
        """Return the vectorizer's ``get_feature_names_out()`` result."""
        names = self.vectorizer.get_feature_names_out()
        return names

In [None]:
# Example usage
def save_vectorization_pipeline(filename, max_features=5000):
    """Build a new ``VectorizationPipeline`` and pickle it to ``filename``.

    The pipeline is constructed here with the given ``max_features`` and
    written as-is (this function does not fit it). The file is opened in
    binary write mode and a confirmation line is printed afterwards.
    """
    pipeline_to_store = VectorizationPipeline(max_features=max_features)
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(pipeline_to_store)
    print(f'VectorizationPipeline saved to {filename}')

# Write the vectorization pipeline to 'vectorization_pipeline.pkl'
save_vectorization_pipeline('vectorization_pipeline.pkl')

VectorizationPipeline saved to vectorization_pipeline.pkl
