In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import gutenberg

# Raw text of 'melville-moby_dick.txt' from NLTK's Gutenberg corpus, loaded once
# and read by the answer_* functions below.
moby_raw = gutenberg.raw('melville-moby_dick.txt') 

def answer_one():
    """Return the lexical diversity of ``moby_raw``.

    The text is split with ``word_tokenize`` and the result is the number of
    distinct tokens divided by the total number of tokens. No filtering or
    case folding is applied before counting.

    Returns:
        float: ratio of unique tokens to all tokens.
    """
    all_tokens = word_tokenize(moby_raw)
    total_count = len(all_tokens)
    distinct_count = len(set(all_tokens))
    return distinct_count / total_count

# Evaluate answer_one() once and print the unique-to-total token ratio.
lexical_diversity = answer_one()
print(lexical_diversity)

0.08133224587104161


In [6]:
def answer_two():
    """Return the percentage of tokens that are exactly 'whale' or 'Whale'.

    Only these two spellings are counted; other forms (for example plurals or
    all-caps) are not matched.

    Returns:
        float: matching tokens as a percentage (0-100) of all tokens.
    """
    whale_forms = ('whale', 'Whale')
    all_tokens = word_tokenize(moby_raw)
    # Each membership test is a bool, so summing them counts the matches.
    matches = sum(tok in whale_forms for tok in all_tokens)
    return (matches / len(all_tokens)) * 100

# Evaluate answer_two() once and print the result (a percentage, not a fraction).
whale_percentage = answer_two()
print(whale_percentage)

0.4125037250811676


In [7]:
from collections import Counter

def answer_three():
    """Return the ten most frequent tokens in ``moby_raw`` with their counts.

    Tokens are counted exactly as produced by ``word_tokenize``; nothing is
    filtered out, so punctuation tokens can appear in the result.

    Returns:
        list[tuple[str, int]]: ``(token, count)`` pairs, most frequent first.
    """
    token_counts = Counter(word_tokenize(moby_raw))
    top_ten = token_counts.most_common(10)
    return top_ten

# Evaluate answer_three() once and print the (token, count) pairs.
top_tokens = answer_three()
print(top_tokens)

[(',', 19204), ('the', 13715), ('.', 7306), ('of', 6513), ('and', 6010), ('a', 4545), ('to', 4515), (';', 4173), ('in', 3908), ('that', 2978)]


In [9]:
def answer_four():
    """Return the long, frequent tokens of ``moby_raw`` in sorted order.

    A token qualifies when it is longer than 5 characters and occurs more
    than 150 times.

    Returns:
        list[str]: qualifying tokens, sorted with the default string ordering
        (so capitalised tokens come before lowercase ones).
    """
    occurrences = Counter(word_tokenize(moby_raw))
    selected = []
    for candidate, count in occurrences.items():
        if count > 150 and len(candidate) > 5:
            selected.append(candidate)
    selected.sort()
    return selected

# Evaluate answer_four() once and print the sorted list of qualifying tokens.
custom_tokens = answer_four()
print(custom_tokens)

['Captain', 'Pequod', 'Queequeg', 'Starbuck', 'almost', 'before', 'himself', 'little', 'seemed', 'should', 'though', 'through', 'whales', 'without']


In [10]:
def answer_five():
    """Return the longest token in ``moby_raw`` together with its length.

    When several tokens share the maximum length, the one that appears first
    in the token stream is returned.

    Returns:
        tuple[str, int]: ``(token, len(token))`` for the longest token.
    """
    all_tokens = word_tokenize(moby_raw)
    token_lengths = [len(tok) for tok in all_tokens]
    max_length = max(token_lengths)
    # list.index returns the first position, preserving first-occurrence ties.
    first_position = token_lengths.index(max_length)
    return (all_tokens[first_position], max_length)

# NOTE: despite its name, `longest_word` holds the (word, length) tuple
# returned by answer_five(), not just the word.
longest_word = answer_five()
print(longest_word)

("twelve-o'clock-at-night", 23)


In [13]:
def answer_six():
    """Return the purely alphabetic tokens that occur more than 2000 times.

    Tokens are kept only if ``str.isalpha()`` is true for them, which drops
    punctuation and tokens containing digits or hyphens.

    Returns:
        list[tuple[int, str]]: ``(count, word)`` pairs sorted in descending
        order (by count, then by word).
    """
    alpha_counts = Counter(
        tok for tok in word_tokenize(moby_raw) if tok.isalpha()
    )
    frequent_pairs = [
        (count, word) for word, count in alpha_counts.items() if count > 2000
    ]
    frequent_pairs.sort(reverse=True)
    return frequent_pairs

# Evaluate answer_six() once and print the (count, word) pairs.
frequent_words = answer_six()
print(frequent_words)

[(13715, 'the'), (6513, 'of'), (6010, 'and'), (4545, 'a'), (4515, 'to'), (3908, 'in'), (2978, 'that'), (2459, 'his'), (2196, 'it'), (2113, 'I')]


In [14]:
from nltk.tokenize import sent_tokenize, word_tokenize
import numpy as np

def answer_seven():
    """Return the average number of tokens per sentence in ``moby_raw``.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    then tokenized on its own with ``word_tokenize`` and the token counts are
    averaged with ``np.mean``.

    Returns:
        numpy.float64: mean token count per sentence.
    """
    tokens_per_sentence = []
    for sentence in sent_tokenize(moby_raw):
        tokens_per_sentence.append(len(word_tokenize(sentence)))
    return np.mean(tokens_per_sentence)

# Evaluate answer_seven() once and print the mean tokens-per-sentence value.
avg_tokens = answer_seven()
print(avg_tokens)

25.88591149005278


In [18]:
from nltk import pos_tag
from collections import Counter

def answer_eight():
    """Return the five most frequent part-of-speech tags in ``moby_raw``.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    only the tag of each ``(word, tag)`` pair is counted.

    Returns:
        list[tuple[str, int]]: ``(tag, count)`` pairs, most frequent first.
    """
    tagged_tokens = pos_tag(word_tokenize(moby_raw))
    tag_counts = Counter()
    for _, tag in tagged_tokens:
        tag_counts[tag] += 1
    return tag_counts.most_common(5)

# Evaluate answer_eight() once and print the (tag, count) pairs.
parts_of_speech = answer_eight()
print(parts_of_speech)

[('NN', 32727), ('IN', 28662), ('DT', 25879), (',', 19204), ('JJ', 17613)]


In [22]:
from nltk.corpus import words as nltk_words
import nltk

def answer_nine(default_words=('cormulent', 'incendenece', 'validrate')):
    """Recommend a correctly spelled word for each possibly misspelled input.

    For every input word, the candidates are the entries of the NLTK ``words``
    corpus that start with the same (lowercased) first letter. The
    recommendation is the candidate with the smallest edit distance to the
    input, computed by ``nltk.edit_distance`` with transpositions allowed.
    When several candidates tie, the one that appears first in the corpus wins.

    Inputs are lowercased before comparison so that capitalisation does not
    inflate the distance against the lowercase-initial candidates.

    Args:
        default_words: iterable of words to correct. The default is an
            immutable tuple so it cannot be altered between calls.

    Returns:
        list[str]: one recommendation per input word, in input order. An
        input for which no candidate exists (an empty string, or a first
        character that no selected corpus word starts with) is returned
        unchanged instead of raising.
    """
    originals = list(default_words)
    targets = [word.lower() for word in originals]

    # Nothing to look up (no inputs, or only empty strings): skip loading the
    # large word list entirely.
    if not any(targets):
        return originals

    # Bucket the corpus by first letter in a single pass, keeping corpus order
    # inside each bucket so tie-breaking matches a plain front-to-back scan.
    needed_letters = {target[0] for target in targets if target}
    candidates_by_letter = {letter: [] for letter in needed_letters}
    for candidate in nltk_words.words():
        bucket = candidates_by_letter.get(candidate[:1])
        if bucket is not None:
            bucket.append(candidate)

    recommendations = []
    for original, target in zip(originals, targets):
        pool = candidates_by_letter.get(target[:1], [])
        if not pool:
            # No candidate shares the first letter; hand the word back as-is.
            recommendations.append(original)
            continue
        # min() keeps the first minimal element, i.e. earliest in the corpus.
        closest = min(
            pool,
            key=lambda cand: nltk.edit_distance(target, cand, transpositions=True),
        )
        recommendations.append(closest)

    return recommendations

# Evaluate answer_nine() with its default inputs and print the recommendations.
spelling_recommendations = answer_nine()
print(spelling_recommendations)

['corpulent', 'intendence', 'validate']
