In [None]:
!pip install inflect
!python -m spacy download en_core_web_md
!pip install textacy
!pip install neuralcoref

# If neuralcoref fails to install with a "spacy.strings.StringStore size changed" error, install it from source
pip uninstall neuralcoref
git clone https://github.com/huggingface/neuralcoref.git
cd neuralcoref
pip install -r requirements.txt
pip install -e .

# Counting nouns - plural and singular nouns

- determine whether a noun is singular or plural
- turn plural nouns into singular and vice versa

In [4]:
import nltk
from nltk.stem import WordNetLemmatizer
import inflect

In [6]:
filename = './data/sherlock_holmes.txt'
# Read the whole file inside a context manager so the handle is closed as soon
# as the text is in memory (the name `file` stays bound, but to a closed file).
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()
# Replace every newline with a space so the text is one continuous line.
text = text.replace('\n', ' ')
len(text)

562188

In [7]:
# Split the raw text into word tokens, then attach a part-of-speech tag to
# each token; the result is a list of (token, tag) pairs.
words = nltk.word_tokenize(text)
words_with_pos = nltk.pos_tag(words)
# Peek at the first ten tagged tokens.
words_with_pos[:10]

[('The', 'DT'),
 ('Adventures', 'NNP'),
 ('of', 'IN'),
 ('Sherlock', 'NNP'),
 ('Holmes', 'NNP'),
 ('by', 'IN'),
 ('Arthur', 'NNP'),
 ('Conan', 'NNP'),
 ('Doyle', 'NNP'),
 ('Contents', 'NNP')]

In [8]:
def get_nouns(words_with_pos):
    """Return the tagged tokens whose tag marks a common noun.

    Args:
        words_with_pos: iterable of (word, tag) pairs, such as the output of
            ``nltk.pos_tag``.

    Returns:
        A list with the pairs tagged 'NN' or 'NNS', in their original order.
        Pairs with any other tag (including proper-noun tags such as 'NNP')
        are left out.
    """
    wanted_tags = {'NN', 'NNS'}
    selected = []
    for tagged_word in words_with_pos:
        # Index 1 of each pair holds the part-of-speech tag.
        if tagged_word[1] in wanted_tags:
            selected.append(tagged_word)
    return selected

In [19]:
# Filter the tagged tokens down to common nouns, then report the total and
# preview the first ten (word, tag) pairs.
nouns = get_nouns(words_with_pos)
print(f'nouns: {len(nouns)}')
print(f'first 10: {nouns[:10]}')

nouns: 19029
first 10: [('Case', 'NN'), ('Man', 'NN'), ('Adventure', 'NN'), ('Adventure', 'NN'), ('Adventure', 'NN'), ('Adventure', 'NN'), ('Adventure', 'NN'), ('Adventure', 'NN'), ('woman', 'NN'), ('name', 'NN')]


In [14]:
def is_plural_nltk(noun_info):
    """Tell whether a tagged noun is plural according to its POS tag.

    Args:
        noun_info: a (word, tag) pair; only the tag at index 1 is inspected.

    Returns:
        True when the tag is 'NNS' (plural noun), False for any other tag,
        including 'NN' (singular noun).
    """
    return noun_info[1] == 'NNS'


def is_plural_wn(noun):
    """Tell whether a noun is plural by comparing it with its WordNet lemma.

    The word is lemmatized as a noun; if the lemma differs from the word,
    the word is treated as a plural form.

    Args:
        noun: the word to test, as a string.

    Returns:
        True when the lemma differs from ``noun``, False when they are equal.
        A plural that the lemmatizer leaves unchanged is therefore reported
        as singular.
    """
    wnl = WordNetLemmatizer()
    lemma = wnl.lemmatize(noun, 'n')
    # Compare by value: an identity check (`is not`) only works when the
    # lemmatizer happens to hand back the very same string object.
    return noun != lemma

In [15]:
def get_plural(singular_noun):
    """Return the plural form of a singular noun.

    Args:
        singular_noun: the noun to pluralise, as a string.

    Returns:
        Whatever a freshly created inflect engine's ``plural`` method
        produces for the given word.
    """
    engine = inflect.engine()
    pluralised = engine.plural(singular_noun)
    return pluralised

def get_singular(plural_noun):
    """Return the singular form of a plural noun.

    Args:
        plural_noun: the noun to singularise, as a string.

    Returns:
        The result of the inflect engine's ``singular_noun`` when that
        result is truthy; otherwise the input word unchanged.
    """
    engine = inflect.engine()
    singular = engine.singular_noun(plural_noun)
    # A falsy result means the engine produced no singular form, so the
    # original word is handed back.
    return singular or plural_noun

In [17]:
def plurals_wn(words_with_pos):
    """Flip the grammatical number of each noun using the WordNet check.

    For every (word, tag) pair, the word is tested with ``is_plural_wn``:
    words judged plural are passed to ``get_singular``, all others to
    ``get_plural``. The tag in each pair is not consulted.

    Args:
        words_with_pos: iterable of (word, tag) pairs.

    Returns:
        A list of the converted words, one per input pair, in input order.

    Note:
        A plural that ``is_plural_wn`` does not detect is pluralised again;
        the notebook's spot check shows 'men' turning into 'mens'.
    """
    return [
        get_singular(entry[0]) if is_plural_wn(entry[0]) else get_plural(entry[0])
        for entry in words_with_pos
    ]

In [20]:
# Flip the grammatical number of every extracted noun with the WordNet-based
# helper, then preview the first ten converted words.
other_nouns_wn = plurals_wn(nouns)
other_nouns_wn[:10]

['Cases',
 'Men',
 'Adventures',
 'Adventures',
 'Adventures',
 'Adventures',
 'Adventures',
 'Adventures',
 'women',
 'names']

In [27]:
# Spot-check an irregular pair (man/men) next to a regular one (car/cars).
# The tagger's labels are printed first for comparison; plurals_wn itself
# only looks at the word in each pair, not at the tag.
print(nltk.pos_tag(['man', 'men']))
plurals_wn([
    ('men', 'NNS'),
    ('man', 'NN'),
    ('cars', 'NNS'),
    ('car', 'NN'),
])

[('man', 'NN'), ('men', 'NNS')]


['mens', 'men', 'car', 'cars']

In [30]:
# Check how the WordNet-based test classifies the singular form 'man'.
is_plural_wn('man')

False

In [34]:
# Open question: why does the lemmatizer return 'men' instead of 'man'?
# Background discussion:
# https://stackoverflow.com/questions/22333392/stemming-some-plurals-with-wordnet-lemmatizer-doesnt-work
# Note that no pos argument is passed to lemmatize() in this call.
wnl = WordNetLemmatizer()
lemma = wnl.lemmatize('men')
lemma

'men'