In [1]:
# Installation (run once per environment).
# %pip installs into the environment of the running kernel, whereas "!pip"
# runs whichever pip happens to be first on PATH.
# NOTE(review): the upgrade is unpinned; consider pinning a spaCy version so
# the installed models and the library stay compatible across re-runs.
import sys

%pip install -U spacy
# Use the kernel's own interpreter so the model is installed next to spaCy.
!{sys.executable} -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0m0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-md 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.6.1 which is incompatible.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.6.1 which is incompatible.[0m[31m
[0mSuccessfully installed spacy-3.6.1

[1m[[0m[34;49mnotice[0m[

In [3]:
import spacy
from nltk.tokenize import word_tokenize
from spacy import displacy

In [14]:
class TextPreprocessor:
    """
    Collection of basic text-preprocessing helpers built on a spaCy pipeline

    Every method takes a raw string and returns a new value; the only state
    kept on the instance is the loaded pipeline (self.nlp).
    """

    def __init__(self, model_name="en_core_web_sm"):
        """
        Loads the spaCy pipeline used by all other methods

        Arguments:
        model_name: name of the spaCy model passed to spacy.load; defaults to
                    the small English model
        """
        # loading the language model of spacy (english by default)
        self.nlp = spacy.load(model_name)

    def lower_casing(self, text):
        """
        Accepts text as argument and returns the text in lowercase

        Arguments:
        text: raw text

        Returns:
        text_to_lower: text converted to lower case
        """
        return text.lower()

    def remove_stopwords(self, text):
        """
        Removes stopwords from the text passed as an argument

        The text is split with spaCy's tokenizer and every token whose
        lower-cased form is in the model's default stop-word list is dropped,
        so "The" is removed just like "the". Whitespace-only tokens are
        dropped as well.

        Arguments:
        text: raw text from where stopwords need to be removed

        Returns:
        tokens_without_sw: remaining tokens joined by single spaces (a string,
                           not a list); empty string if nothing remains
        """
        # getting the default stop words of the loaded spaCy model
        stopwords = self.nlp.Defaults.stop_words

        # tokenize only: make_doc does not run the rest of the pipeline, and
        # it keeps tokenization and the stop-word list in the same library
        doc = self.nlp.make_doc(text)

        # the stop-word list is lower case, so compare the lower-cased token
        kept = [
            token.text
            for token in doc
            if not token.is_space and token.text.lower() not in stopwords
        ]

        return " ".join(kept)

    def tokenize_word(self, text):
        """
        Tokenize the text passed as an argument into a list of words (tokens)

        Arguments:
        text: raw text

        Returns:
        words: list containing the text of every token, punctuation included
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        # collect the text of each token
        return [token.text for token in doc]

    def tokenize_sentence(self, text):
        """
        Tokenize the text passed as an argument into a list of sentences

        Arguments:
        text: raw text

        Returns:
        sentences: list of the items yielded by doc.sents; these are spaCy
                   objects rather than plain strings (use .text on an item to
                   get its string)
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        # materialize the sentence iterator so callers get a list
        return list(doc.sents)

    def remove_punctuation(self, text):
        """
        Removes punctuation symbols present in the raw text passed as an argument

        Arguments:
        text: raw text

        Returns:
        not_punctuation: non-punctuation tokens joined by single spaces
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        # keep every token that spaCy does not flag as punctuation
        kept = [token.text for token in doc if not token.is_punct]

        return " ".join(kept)

    def lemmatization(self, text):
        """
        Obtains the lemma of each token in the text and returns them as a list

        Arguments:
        text: raw text

        Returns:
        token_lemma_list: list of (token text, lemma) tuples, one per token
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        return [(token.text, token.lemma_) for token in doc]

    def pos_tagging(self, text):
        """
        Obtains the part-of-speech information of each token in the text

        Arguments:
        text: raw text

        Returns:
        pos_list: list of (token text, token.pos_, token.tag_) tuples, one
                  per token
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        return [(token.text, token.pos_, token.tag_) for token in doc]

    def named_entity_recognition(self, text):
        """
        Returns entity text and entity label for every entity found in the text

        Arguments:
        text: raw text

        Returns:
        entity_text_label: list of (entity text, entity label) tuples
        """
        # passing the text to nlp and initialize an object called 'doc'
        doc = self.nlp(text)

        # named entity recognition using doc.ents
        return [(entity.text, entity.label_) for entity in doc.ents]

In [15]:
def main():
    """
    Demo driver: runs two sample texts through TextPreprocessor and prints
    the result of every stage

    Printed, in order: lower-cased text, text without stopwords, word tokens,
    sentence split of the second sample, text without punctuation, and the
    result of lemmatization.
    """
    book_text = "Books, are on the table. I want to read a book ? "
    chat_text = "Oh man, this is pretty cool. We will do more such things."
    pipeline = TextPreprocessor()

    # first sample: lower-case -> drop stopwords -> word tokens
    lowered = pipeline.lower_casing(book_text)
    print(lowered)
    filtered = pipeline.remove_stopwords(lowered)
    print(filtered)
    print(pipeline.tokenize_word(filtered))

    # second sample is only used to show sentence splitting
    print(pipeline.tokenize_sentence(chat_text))

    # back to the first sample: strip punctuation, then lemmatize what is left
    cleaned = pipeline.remove_punctuation(filtered)
    print(cleaned)
    print(pipeline.lemmatization(cleaned))

In [16]:
# Run the demo defined above; each preprocessing stage prints its result.
main()

books, are on the table. i want to read a book ? 
books , table . want read book ?
['books', ',', 'table', '.', 'want', 'read', 'book', '?']
[Oh man, this is pretty cool., We will do more such things.]
books table want read book
[('books', 'book'), ('table', 'table'), ('want', 'want'), ('read', 'read'), ('book', 'book')]
