In [1]:
import nltk
import wikipedia
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer, SnowballStemmer
import string

In [2]:
# Step 1: Fetch the "Geoffrey Hinton" article from Wikipedia and lowercase it.
# auto_suggest=False asks for the page with exactly this title. With the
# library default (auto_suggest=True) the title is first replaced by a search
# suggestion / first search hit, which can silently load a different article.
text = wikipedia.page("Geoffrey Hinton", auto_suggest=False).content.lower()

In [3]:
# Step 2: Tokenization.
# Split the lowercased article into word/punctuation tokens with NLTK.
# The language is spelled out explicitly; "english" is word_tokenize's default.
# NOTE(review): word_tokenize needs NLTK tokenizer data to be installed; no
# download step is visible in this notebook - confirm the environment has it.
tokens = word_tokenize(text, language="english")

In [4]:
# Step 3: Remove punctuation and numbers.
# Keep only tokens made up entirely of alphabetic characters; this also drops
# mixed tokens such as those containing digits, hyphens or apostrophes.
tokens = list(filter(str.isalpha, tokens))

In [5]:
# Step 4: Remove stopwords.
# The NLTK English stopword list is turned into a set so that each
# membership test below is a constant-time lookup.
stop_words = {*stopwords.words('english')}
filtered_tokens = list(filter(lambda tok: tok not in stop_words, tokens))

In [7]:
# Step 5a: Porter stemmer.
# Stem every stopword-free token, then join the stems into a single
# space-separated string for display.
porter = PorterStemmer()
porter_stemmed = list(map(porter.stem, filtered_tokens))
porter_text = ' '.join(porter_stemmed)

In [8]:
# Step 5b: Snowball stemmer (English).
# Same procedure as the Porter step, applied to the same filtered tokens,
# so the two stemmers' outputs can be compared side by side.
snowball = SnowballStemmer('english')
snowball_stemmed = list(map(snowball.stem, filtered_tokens))
snowball_text = ' '.join(snowball_stemmed)

In [10]:
# Step 5c: Lemmatization using WordNet.
# Each filtered token is lemmatized individually and the lemmas are joined
# into one space-separated string.
# NOTE(review): lemmatize() is called without a `pos` argument, so the
# lemmatizer's default part of speech applies (documented as noun); verb and
# adjective forms may pass through unchanged - confirm whether POS tagging
# is wanted here.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
nltk_lemmatized_text = ' '.join(lemmatized_tokens)

In [13]:
# Display results.
# For each normalisation method print a banner, then the first 1000
# characters of its output followed by a blank line.
for banner, normalized_text in (
    ("====== Porter Stemmer Result ======", porter_text),
    ("====== Snowball Stemmer Result ======", snowball_text),
    ("====== NLTK Lemmatizer Result ======", nltk_lemmatized_text),
):
    print(banner)
    print(normalized_text[:1000], "\n")

geoffrey everest hinton born comput scientist cognit scientist cognit psychologist known work artifici neural network earn titl godfath ai hinton univers professor emeritu univers toronto divid time work googl googl brain univers toronto publicli announc departur googl may cite concern mani risk artifici intellig ai technolog becam chief scientif advisor vector institut toronto david rumelhart ronald william hinton highli cite paper publish popularis backpropag algorithm train neural network although first propos approach hinton view lead figur deep learn commun mileston alexnet design collabor student alex krizhevski ilya sutskev imagenet challeng breakthrough field comput vision hinton receiv ture award often refer nobel prize comput togeth yoshua bengio yann lecun work deep learn sometim refer godfath deep learn continu give public talk togeth also award along john hopfield nobel prize physic foundat discoveri invent enabl machin learn artifici neural network may hinton announc resi

In [14]:
import spacy

In [15]:
# Load the spaCy pipeline used by the second half of the notebook.
# The model name is kept in one place so it is easy to swap.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [17]:
# Step 1: Fetch the "Geoffrey Hinton" article from Wikipedia and lowercase it.
# auto_suggest=False asks for the page with exactly this title. With the
# library default (auto_suggest=True) the title is first replaced by a search
# suggestion / first search hit, which can silently load a different article.
text = wikipedia.page("Geoffrey Hinton", auto_suggest=False).content.lower()

In [18]:
# Step 2: Process the text with spaCy.
# Runs the loaded pipeline (`nlp`) over the lowercased article text; the
# resulting `doc` is iterated token by token in the next step.
doc = nlp(text)

In [19]:
# Step 3: Remove stopwords, punctuation and non-alphabetic tokens; lemmatize.
# The inner generator keeps only alphabetic tokens; the outer comprehension
# then drops stopwords and collects the lemma of each remaining token.
cleaned_tokens = [
    tok.lemma_
    for tok in (candidate for candidate in doc if candidate.is_alpha)
    if not tok.is_stop
]

In [20]:
# Step 4: Rebuild the cleaned text.
# Join the lemmas into a single space-separated string.
spacy_cleaned_text = str.join(' ', cleaned_tokens)