In [17]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk.stem import WordNetLemmatizer
import string
import matplotlib.pyplot as plt

# Read the Moby Dick file from the Gutenberg dataset
nltk.download('gutenberg')
moby_dick = gutenberg.raw('melville-moby_dick.txt')

# Tokenization (without punctuation)
tokens = word_tokenize(moby_dick)
tokens = [token for token in tokens if token not in string.punctuation]
tokens



[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\12552\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


['Moby',
 'Dick',
 'by',
 'Herman',
 'Melville',
 '1851',
 'ETYMOLOGY',
 'Supplied',
 'by',
 'a',
 'Late',
 'Consumptive',
 'Usher',
 'to',
 'a',
 'Grammar',
 'School',
 'The',
 'pale',
 'Usher',
 '--',
 'threadbare',
 'in',
 'coat',
 'heart',
 'body',
 'and',
 'brain',
 'I',
 'see',
 'him',
 'now',
 'He',
 'was',
 'ever',
 'dusting',
 'his',
 'old',
 'lexicons',
 'and',
 'grammars',
 'with',
 'a',
 'queer',
 'handkerchief',
 'mockingly',
 'embellished',
 'with',
 'all',
 'the',
 'gay',
 'flags',
 'of',
 'all',
 'the',
 'known',
 'nations',
 'of',
 'the',
 'world',
 'He',
 'loved',
 'to',
 'dust',
 'his',
 'old',
 'grammars',
 'it',
 'somehow',
 'mildly',
 'reminded',
 'him',
 'of',
 'his',
 'mortality',
 '``',
 'While',
 'you',
 'take',
 'in',
 'hand',
 'to',
 'school',
 'others',
 'and',
 'to',
 'teach',
 'them',
 'by',
 'what',
 'name',
 'a',
 'whale-fish',
 'is',
 'to',
 'be',
 'called',
 'in',
 'our',
 'tongue',
 'leaving',
 'out',
 'through',
 'ignorance',
 'the',
 'letter',
 'H'

In [18]:
# Stopwords filtering
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words ]



['Moby',
 'Dick',
 'Herman',
 'Melville',
 '1851',
 'ETYMOLOGY',
 'Supplied',
 'Late',
 'Consumptive',
 'Usher',
 'Grammar',
 'School',
 'pale',
 'Usher',
 '--',
 'threadbare',
 'coat',
 'heart',
 'body',
 'brain',
 'see',
 'ever',
 'dusting',
 'old',
 'lexicons',
 'grammars',
 'queer',
 'handkerchief',
 'mockingly',
 'embellished',
 'gay',
 'flags',
 'known',
 'nations',
 'world',
 'loved',
 'dust',
 'old',
 'grammars',
 'somehow',
 'mildly',
 'reminded',
 'mortality',
 '``',
 'take',
 'hand',
 'school',
 'others',
 'teach',
 'name',
 'whale-fish',
 'called',
 'tongue',
 'leaving',
 'ignorance',
 'letter',
 'H',
 'almost',
 'alone',
 'maketh',
 'signification',
 'word',
 'deliver',
 'true',
 "''",
 '--',
 'HACKLUYT',
 "''",
 'WHALE',
 '...',
 'Sw.',
 'Dan',
 'HVAL',
 'animal',
 'named',
 'roundness',
 'rolling',
 'Dan',
 'HVALT',
 'arched',
 'vaulted',
 "''",
 '--',
 "WEBSTER'S",
 'DICTIONARY',
 "''",
 'WHALE',
 '...',
 'immediately',
 'Dut',
 'Ger',
 'WALLEN',
 'A.S.',
 'WALW-IAN',
 

In [28]:

pos_tags = nltk.pos_tag(filtered_tokens)

# POS frequency
pos_freq = FreqDist([tag[1] for tag in pos_tags])
print(pos_freq)
most_common_pos = pos_freq.most_common(5)
most_common_pos

<FreqDist with 36 samples and 115345 outcomes>


[('NN', 27640), ('JJ', 20466), ('NNP', 10374), ('NNS', 10128), ('RB', 8397)]

In [44]:
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter

# Define a mapping of POS tags from pos_tag to WordNetLemmatizer
pos_tag_mapping = {
    'NNS': 'n',  # Noun, plural
    'VBG': 'v',  # Verb, gerund or present participle
    'NN': 'n',   # Noun, singular or mass
    'VBD': 'v',  # Verb, past tense
    'VBN': 'v',  # Verb, past participle
    'JJ': 'a',   # Adjective
    'VBZ': 'v',  # Verb, 3rd person singular present
    'VBP': 'v',  # Verb, non-3rd person singular present
    'RB': 'r',   # Adverb
    'NNP': 'n',  # Proper noun, singular
    'VB': 'v',   # Verb, base form
    'IN': 'n',   # Preposition or subordinating conjunction
    'PRP': 'n',  # Personal pronoun
    'PRP$': 'n', # Possessive pronoun
    'JJR': 'a',  # Adjective, comparative
    'JJS': 'a',  # Adjective, superlative
    'CD': 'n',   # Cardinal number
    'MD': 'v',   # Modal
    'VBG': 'v',  # Verb, gerund or present participle
    'RBR': 'r',  # Adverb, comparative
    'RBS': 'r',  # Adverb, superlative
    'WP': 'n',   # Wh-pronoun
    'WRB': 'r',  # Wh-adverb
    ':': 'n',    # Colon
    "''": 'n',   # Closing quotation mark
    '``': 'n',   # Opening quotation mark
    'like': 'n', # Inexact numeric modifier
    'POS' : 'n', # Possessive ending
}

# Assuming you have already defined 'pos_tagged_tokens' list

# Extract the top 20 tokens
top_20_tokens = Counter(pos_tags).most_common(20)

# Create a lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Lemmatize the top 20 tokens and remove punctuation
lemmatized_tokens = [lemmatizer.lemmatize(token, pos=pos_tag_mapping[pos]) for (token, pos), count in top_20_tokens if pos in pos_tag_mapping ]

# Print the lemmatized tokens
print(lemmatized_tokens)

['--', "''", "'s", '``', 'one', 'like', 'Ahab', 'upon', 'man', 'old', 'would', 'whale', 'sea', 'whale', 'ship', 'though', 'time', 'say', 'still', 'yet']
