When processing texts for NLP, we want to convert the words in our corpus to some sort of standard, normalized, or canonical form. There are two main ways to do this: stemming and lemmatization.

In [1]:
import pandas as pd
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

# Small sample vocabulary reused by every stemmer/lemmatizer comparison below.
words = [
    'caving',
    'writing',
    'calve',
]

In [2]:
# One instance of each NLTK stemmer so their outputs can be compared side by side.
lancaster = LancasterStemmer()
porter = PorterStemmer()
# Snowball is language-specific; the language is its first argument.
snowball = SnowballStemmer('english')

In [3]:
# Column order for the comparison table: the source word first, then one column per stemmer.
stem_columns = ['original', 'lancaster', 'porter', 'snowball']

# One record per word, holding the word itself and its stem from each stemmer.
stem_records = [
    {
        'original': word,
        'lancaster': lancaster.stem(word),
        'porter': porter.stem(word),
        'snowball': snowball.stem(word),
    }
    for word in words
]

stemmed = pd.DataFrame.from_records(stem_records).reindex(columns=stem_columns)

In [4]:
stemmed.T

Unnamed: 0,0,1,2
original,caving,writing,calve
lancaster,cav,writ,calv
porter,cave,write,calv
snowball,cave,write,calv


In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
import spacy

In [6]:
WordNetLemmatizer?

In [7]:
# NLTK's WordNet-based lemmatizer.
wordnet_lemmatizer = WordNetLemmatizer()

# Load spaCy's small English pipeline by its full package name.
# The bare 'en' shortcut depended on shortcut links, which spaCy v3 removed,
# so spacy.load('en') fails on a fresh, current install. The package name
# works on both old and new spaCy versions once the model is installed
# (python -m spacy download en_core_web_sm).
spacy_nlp = spacy.load('en_core_web_sm')

In [8]:
# Add a column holding the WordNet lemma of each original word
# (lemmatize is called with the word only, no part-of-speech argument).
stemmed['wordnet_lemmatizer'] = stemmed['original'].apply(wordnet_lemmatizer.lemmatize)

In [9]:
stemmed

Unnamed: 0,original,lancaster,porter,snowball,wordnet_lemmatizer
0,caving,cav,cave,cave,caving
1,writing,writ,write,write,writing
2,calve,calv,calv,calv,calve


In [10]:
# Sanity check on an irregular plural; the cell output shows the lemma 'mouse'.
wordnet_lemmatizer.lemmatize('mice')

'mouse'

In [11]:
# Join the sample words into one space-separated string and run it through the spaCy pipeline.
spacy_processed = spacy_nlp(' '.join(words))

In [12]:
# Pair each token's original text with the lemma spaCy assigned to it.
[(token.orth_, token.lemma_) for token in spacy_processed]

[('caving', 'cave'), ('writing', 'write'), ('calve', 'calve')]

In [13]:
from nltk.corpus import names

In [14]:
names

<WordListCorpusReader in '/home/brian/nltk_data/corpora/names'>

In [15]:
# Open a stream on the female-names file of the names corpus.
# NOTE(review): this handle is never closed and `r` is not used in the cells
# visible here - confirm whether it is still needed; if so, prefer a `with` block.
r = names.open('female.txt') 

In [16]:
# Read each name list inside a `with` block so the corpus file stream is closed
# as soon as the names are loaded, rather than left open until garbage collection.
# Each line is stripped of surrounding whitespace (including the newline).
with names.open('male.txt') as male_file:
    male_names = [line.strip() for line in male_file]

with names.open('female.txt') as female_file:
    female_names = [line.strip() for line in female_file]


In [17]:
len(male_names)

2943

In [18]:
len(female_names)

5001

In [None]:
from ipywidgets import Accordion