In [6]:
# One-time setup (uncomment to run): upgrade the packaging tools first.
# %pip install -U pip setuptools wheel

Note: you may need to restart the kernel to use updated packages.


In [7]:
# One-time setup (uncomment to run): install or upgrade spaCy.
# %pip install -U spacy

Collecting spacy
  Downloading spacy-3.4.1-cp39-cp39-win_amd64.whl (11.8 MB)
     ---------------------------------------- 11.8/11.8 MB 3.5 MB/s eta 0:00:00
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.6-cp39-cp39-win_amd64.whl (36 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Downloading spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4
  Downloading pydantic-1.9.2-cp39-cp39-win_amd64.whl (2.0 MB)
     ---------------------------------------- 2.0/2.0 MB 4.2 MB/s eta 0:00:00
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.6.2-py3-none-any.whl (42 kB)
     ---------------------------------------- 42.8/42.8 kB ? eta 0:00:00
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.1-cp39-cp39-win_amd64.whl (1.3 MB)
     ------------------------------------

In [7]:
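# One-time setup (uncomment to run): download the small English pipeline
# that the next cell loads.
#!python -m spacy download en_core_web_sm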

Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 5.3 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [8]:
import spacy
# Load the small English spaCy pipeline (the model named in the download
# cell above) into the notebook-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [9]:
import spacy
import re


class BasicPreprocess:
    """
    Base class for preprocessing steps that map a text to a text.

    Holds a loaded spaCy pipeline on ``self.nlp`` and any extra keyword
    options on ``self.params``. Subclasses override ``__call__``.
    """

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        Load the spaCy pipeline and store the extra options.

        :param model: model name from spacy, passed to ``spacy.load``.
            NOTE(review): the default is ``en_core_web_md`` while the setup
            cell of this notebook downloads ``en_core_web_sm`` -- confirm the
            medium model is installed, or pass ``model`` explicitly.
        :param params: arbitrary keyword options, kept unchanged on
            ``self.params``
        """
        self.params = params
        self.nlp = spacy.load(model)

    def __call__(self, text) -> str:
        """
        Apply the preprocessing step to ``text``.

        :param text: text to process
        :return: processed text (provided by subclasses)
        :raises NotImplementedError: always, in this base class
        """
        # ``NotImplemented`` is the binary-operator sentinel, not an exception;
        # raising it fails with a TypeError. NotImplementedError is the proper
        # signal for "subclass must override this".
        raise NotImplementedError(
            f"{type(self).__name__} must implement __call__"
        )


class AdvancedPreprocess:
    """
    Base class for preprocessing steps that map a text to a list.

    Holds a loaded spaCy pipeline on ``self.nlp`` and any extra keyword
    options on ``self.params``. Subclasses override ``__call__``.
    """

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        Load the spaCy pipeline and store the extra options.

        :param model: model name from spacy, passed to ``spacy.load``.
            NOTE(review): the default is ``en_core_web_md`` while the setup
            cell of this notebook downloads ``en_core_web_sm`` -- confirm the
            medium model is installed, or pass ``model`` explicitly.
        :param params: arbitrary keyword options, kept unchanged on
            ``self.params``
        """
        self.params = params
        self.nlp = spacy.load(model)

    def __call__(self, text) -> list:
        """
        Apply the preprocessing step to ``text``.

        :param text: text to process
        :return: list of results (provided by subclasses)
        :raises NotImplementedError: always, in this base class
        """
        # ``NotImplemented`` is the binary-operator sentinel, not an exception;
        # raising it fails with a TypeError. NotImplementedError is the proper
        # signal for "subclass must override this".
        raise NotImplementedError(
            f"{type(self).__name__} must implement __call__"
        )


class RemoveStopwords(BasicPreprocess):
    """Preprocessing step that drops tokens flagged ``is_stop`` by the pipeline."""

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        :param model: model name from spacy, forwarded to the base class
        :param params: extra options, forwarded to the base class
        """
        super().__init__(model, **params)

    def __call__(self, text: str, **params) -> str:
        """
        :param text: text to filter
        :param params: accepted for call compatibility; not used here
        :return: remaining tokens joined by single spaces, with outer
            whitespace stripped
        """
        doc = self.nlp(text)
        kept = (str(token) for token in doc if not token.is_stop)
        return " ".join(kept).strip()


class Lemmatize(BasicPreprocess):
    """Preprocessing step that replaces every token by its ``lemma_``."""

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        :param model: model name from spacy, forwarded to the base class
        :param params: extra options, forwarded to the base class
        """
        super().__init__(model, **params)

    def __call__(self, text: str, **params) -> str:
        """
        :param text: text to lemmatize
        :param params: accepted for call compatibility; not used here
        :return: the ``lemma_`` of each token, joined by single spaces
        """
        doc = self.nlp(text)
        lemmas = [str(token.lemma_) for token in doc]
        return " ".join(lemmas)


class RemovePunctuation(BasicPreprocess):
    """Preprocessing step that drops tokens flagged ``is_punct`` by the pipeline."""

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        :param model: model name from spacy, forwarded to the base class
        :param params: extra options, forwarded to the base class
        """
        super().__init__(model, **params)

    def __call__(self, text: str, **params) -> str:
        """
        :param text: text to filter
        :param params: accepted for call compatibility; not used here
        :return: remaining tokens joined by single spaces, with outer
            whitespace stripped
        """
        doc = self.nlp(text)
        kept = (str(token) for token in doc if not token.is_punct)
        return " ".join(kept).strip()


class Lower(BasicPreprocess):
    """Preprocessing step that replaces every token by its ``lower_`` form."""

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        :param model: model name from spacy, forwarded to the base class
        :param params: extra options, forwarded to the base class
        """
        super().__init__(model, **params)

    def __call__(self, text: str, **params) -> str:
        """
        :param text: text to lower-case
        :param params: accepted for call compatibility; not used here
        :return: the ``lower_`` of each token, joined by single spaces
        """
        doc = self.nlp(text)
        lowered = [str(token.lower_) for token in doc]
        return " ".join(lowered)


class Sentensize(AdvancedPreprocess):
    """Preprocessing step that splits a text into the pipeline's sentences."""

    def __init__(self, model: str = "en_core_web_md", **params):
        """
        :param model: model name from spacy, forwarded to the base class
        :param params: extra options, forwarded to the base class
        """
        super().__init__(model, **params)

    def __call__(self, text: str, **params) -> list:
        """
        :param text: text to split
        :param params: accepted for call compatibility; not used here
        :return: list built from the ``sents`` of the processed text
        """
        doc = self.nlp(text)
        return [sentence for sentence in doc.sents]


class RemoveExtras(BasicPreprocess):
    def __init__(self, model: str = "en_core_web_md", **params):
        super(RemoveExtras, self).__init__(model, **params)

    def __call__(self, text: str) -> str:
        result = text.strip()
        result = result.replace("AAm\nmeerriiccaannRRhheettoorriicc..ccoom\nm", "").replace(
            "Property of AmericanRhetoric.com", "").replace(
            "Property of  AmericanRhetoric.com", "")

        result = result.replace("AmericanRhetoric.com", "")
        result = re.sub("Copyright Status: ([a-zA-Z]+( [a-zA-Z]+)+)\.", "", result)
        result = re.sub("©Copyright\s[0-9]+\. ([a-zA-Z]+( [a-zA-Z]+)+)\.", "", result)
        result = re.sub("Copyright Status: [a-zA-Z]+, [a-zA-Z]+ [a-zA-Z]+\.", "", result)
        result = re.sub("Copyright Status = [a-zA-Z]+\s[a-zA-Z]+", "", result)
        result = re.sub("Copyright Status = [a-zA-Z]+", "", result)
        result = re.sub("Copyright Status: [a-zA-Z]+", "", result)


        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]\. [a-zA-Z]+ [a-zA-Z]+", "", result)
        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]\. [a-zA-Z]+\.", "", result)
        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]\. [a-zA-Z]+", "", result)
        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]+\.", "", result)
        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]+ [a-zA-Z]+ [a-zA-Z]+ [a-zA-Z]+\.", "", result)
        result = re.sub("[a-zA-Z]+ by [a-zA-Z]+ [a-zA-Z]+", "", result)
        result = re.sub("Page [0-9]", "", result)
        result = re.sub("Updated [0-9]+/[0-9]+/[0-9]+", "", result)
        result = re.sub("Created [0-9]+/[0-9]+/[0-9]+", "", result)
        result = re.sub("AUTHENTICITY CERTIFIED: ([a-zA-Z]+( [a-zA-Z]+)+)", "", result)
        result = result.strip()

        if len(self.params.keys()) != 0:
            if self.params["names"]:
                return "\n".join(str(item) for item in result.split("\n")[3:])
            else:
                return result