In [None]:
!pip install autocorrect

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l[K     |▌                               | 10 kB 19.5 MB/s eta 0:00:01[K     |█                               | 20 kB 25.8 MB/s eta 0:00:01[K     |█▋                              | 30 kB 31.6 MB/s eta 0:00:01[K     |██                              | 40 kB 27.9 MB/s eta 0:00:01[K     |██▋                             | 51 kB 21.5 MB/s eta 0:00:01[K     |███▏                            | 61 kB 24.0 MB/s eta 0:00:01[K     |███▊                            | 71 kB 25.9 MB/s eta 0:00:01[K     |████▏                           | 81 kB 26.5 MB/s eta 0:00:01[K     |████▊                           | 92 kB 28.4 MB/s eta 0:00:01[K     |█████▎                          | 102 kB 30.6 MB/s eta 0:00:01[K     |█████▉                          | 112 kB 30.6 MB/s eta 0:00:01[K     |██████▎                        

In [None]:
from os import remove
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
from autocorrect import Speller

class TextPreprocessor:
    """Configurable text-cleaning pipeline built from small string transforms.

    Each public method performs one transform; ``preprocess_text`` chains them
    according to its boolean flags.

    Args:
        stopwords: Object exposing ``words('english')`` that returns the
            stopword collection. Required when stopword removal is used.
        stemmer: Object exposing ``stem(word)``. Required when stemming is used.
        spelling_correction: Callable that takes the whole text and returns the
            corrected text. Required when spelling correction is used.
    """

    # Built once at class-definition time instead of on every call.
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)
    _TAG_PATTERN = re.compile('<.*?>')
    # A URL is a whitespace-delimited token starting with http:// or https://.
    # The lookbehind keeps the "token starts with the scheme" rule; IGNORECASE
    # lets it still match after the text has been upper-cased.
    _URL_PATTERN = re.compile(r'(?<!\S)https?://\S+', re.IGNORECASE)

    def __init__(self, stopwords=None, stemmer=None, spelling_correction=None):
        self.stopwords = stopwords
        self.stemmer = stemmer
        self.spelling_correction = spelling_correction
        # spaCy pipeline, loaded lazily on the first lemmatize() call.
        self._nlp = None

    def lowercase(self, input_str):
        """Return ``input_str`` converted to lower case."""
        return input_str.lower()

    def uppercase(self, input_str):
        """Return ``input_str`` converted to upper case."""
        return input_str.upper()

    def remove_punct(self, input_str):
        """Return ``input_str`` with every ``string.punctuation`` character deleted."""
        return input_str.translate(self._PUNCT_TABLE)

    def remove_stopwords(self, input_str):
        """Drop whitespace-separated words found in the English stopword list.

        Matching is case-sensitive. The surviving words are re-joined with
        single spaces.
        """
        # A set gives constant-time membership checks per word.
        stopword_set = set(self.stopwords.words('english'))
        return " ".join(word for word in input_str.split() if word not in stopword_set)

    def remove_freqs(self, input_str):
        """Return the distinct whitespace-separated words of ``input_str`` as a list.

        Words appear in order of first occurrence, so the result is
        deterministic (a plain ``set`` would give an arbitrary order).
        """
        return list(dict.fromkeys(input_str.split()))

    def stem(self, input_str):
        """Apply ``self.stemmer.stem`` to each word and re-join with single spaces."""
        return " ".join(self.stemmer.stem(word) for word in input_str.split())

    def lemmatize(self, input_str):
        """Return the space-joined spaCy lemmas of ``input_str``.

        Uses the ``en_core_web_sm`` pipeline with the parser and NER components
        disabled. The pipeline is loaded on first use and then reused.
        """
        # getattr keeps this working even on instances created without __init__.
        nlp = getattr(self, "_nlp", None)
        if nlp is None:
            nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
            self._nlp = nlp
        doc = nlp(input_str)
        return " ".join(token.lemma_ for token in doc)

    def remove_url(self, input_str):
        """Delete every whitespace-delimited token starting with http:// or https://.

        All such tokens are removed (not just the first), the scheme is matched
        case-insensitively, and the surrounding whitespace is left untouched.
        """
        return self._URL_PATTERN.sub('', input_str)

    def remove_tags(self, input_str):
        """Delete every ``<...>`` span (shortest match) from ``input_str``."""
        return self._TAG_PATTERN.sub('', input_str)

    def word_correct(self, input_str):
        """Return ``input_str`` passed through ``self.spelling_correction``."""
        return self.spelling_correction(input_str)

    def preprocess_text(self, text,
                        lowercase=True,
                        uppercase=False,
                        remove_punct=True,
                        remove_stopwords=True,
                        remove_freqs=False,
                        stemming=True,
                        lemmatize=False,
                        remove_url=True,
                        remove_tags=True,
                        word_correct=True):
        """Run the enabled transforms over ``text`` and return the result.

        Steps run in this fixed order: case folding, URL removal, tag removal,
        punctuation removal, stopword removal, duplicate-word removal, spelling
        correction, then stemming or lemmatization.

        Args:
            text: Input string.
            lowercase / uppercase: Case folding; mutually exclusive.
            remove_punct: Strip punctuation characters.
            remove_stopwords: Drop English stopwords.
            remove_freqs: Keep only the first occurrence of each word.
            stemming / lemmatize: Word normalisation; mutually exclusive.
            remove_url: Drop http(s) URL tokens.
            remove_tags: Drop ``<...>`` tags.
            word_correct: Apply the spelling corrector.

        Returns:
            The processed string, or ``None`` (after printing a message) when
            mutually exclusive options are both enabled.
        """
        # Reject contradictory options before doing any work, so an invalid
        # call never pays for (or fails inside) the earlier pipeline steps.
        if lowercase and uppercase:
            print("Cannot do both lowercasing and uppercasing. Please specify only one of them.")
            return None
        if stemming and lemmatize:
            print("Cannot do both stemming and lemmatizing. Please specify only one of them.")
            return None

        preprocessed_text = text

        if lowercase:
            preprocessed_text = self.lowercase(preprocessed_text)
        elif uppercase:
            preprocessed_text = self.uppercase(preprocessed_text)
        if remove_url:
            preprocessed_text = self.remove_url(preprocessed_text)
        if remove_tags:
            preprocessed_text = self.remove_tags(preprocessed_text)
        if remove_punct:
            preprocessed_text = self.remove_punct(preprocessed_text)
        if remove_stopwords:
            preprocessed_text = self.remove_stopwords(preprocessed_text)
        if remove_freqs:
            # remove_freqs returns a list; re-join it so the following steps
            # (which all operate on strings) keep working.
            preprocessed_text = " ".join(self.remove_freqs(preprocessed_text))
        if word_correct:
            preprocessed_text = self.word_correct(preprocessed_text)
        if stemming:
            preprocessed_text = self.stem(preprocessed_text)
        elif lemmatize:
            preprocessed_text = self.lemmatize(preprocessed_text)

        return preprocessed_text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Sample sentence shared by the demo cells that follow.
text = "welcome to machine learning,"

# Wire the preprocessor with the NLTK stopword corpus, a Porter stemmer
# and an English spell-checker.
preprocessor = TextPreprocessor(
    stopwords=stopwords,
    stemmer=PorterStemmer(),
    spelling_correction=Speller(lang='en'),
)

In [None]:
# Run the pipeline with every option left at its default.
print(preprocessor.preprocess_text(text=text))

welcom machin learn


In [None]:
# Same text, but keep punctuation, skip URL removal and skip spelling correction.
print(
    preprocessor.preprocess_text(
        text,
        remove_url=False,
        remove_punct=False,
        word_correct=False,
    )
)

welcom machin learning,


In [None]:
# Asking for both casings is rejected: a message is printed and None is returned.
print(
    preprocessor.preprocess_text(
        text,
        uppercase=True,
        lowercase=True,
    )
)

Cannot do both lowercasing and uppercasing. Please specify only one of them.
None
