In [2]:
import nltk

In [8]:
# Load NLTK's English stopword list into a set for membership tests.
stopwords = set(nltk.corpus.stopwords.words('english'))
filename = './data/sherlock_holmes.txt'

# Read the whole text; the context manager closes the file handle
# even if reading raises.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# Replace newlines with spaces before tokenizing.
text = text.replace('\n', ' ')
words = nltk.tokenize.word_tokenize(text)

# NOTE(review): this membership test is case-sensitive. If the stopword list
# is all lowercase, capitalized tokens such as 'The' are kept -- confirm
# whether that is intended.
words_without_stopwords = [word for word in words if word not in stopwords]

# Token counts before and after stopword removal.
print(len(words), len(words_without_stopwords))

125119 74158


## Compiling stopwords using FreqDist

In [9]:
import nltk
from nltk.probability import FreqDist

In [10]:
# Re-read the source text; the context manager closes the file handle
# even if reading raises.
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()

# Remove newlines for better readability.
text = text.replace('\n', ' ')

# Tokenize the text.
words = nltk.tokenize.word_tokenize(text)

In [11]:
# Count lowercased tokens, then build a list of (word, count) pairs
# and order it by ascending count.
freq_dist = FreqDist(token.lower() for token in words)
words_with_frequencies = list(freq_dist.items())
sorted_words = sorted(words_with_frequencies, key=lambda pair: pair[1])
print(sorted_words)



In [16]:
# Two options for choosing stopwords.
# 1. Use a frequency cutoff: every word that appears more than
#    FREQUENCY_CUTOFF times is treated as a stopword.
FREQUENCY_CUTOFF = 100

# Unpack each (word, count) pair instead of naming the loop variable
# `tuple`, which would shadow the builtin.
stopwords = [word for word, count in sorted_words if count > FREQUENCY_CUTOFF]
print('len:', len(stopwords))
print('first 5:', stopwords[:5])

len: 131
first 5: ['away', 'never', 'good', 'nothing', 'case']


In [21]:
# 2. Take the n% most frequent words.
# TOP_FRACTION is the share of distinct words to keep as stopwords.
TOP_FRACTION = 0.02
length_cutoff = int(TOP_FRACTION * len(sorted_words))
print('length_cutoff:', length_cutoff)

# sorted_words is ordered by ascending count, so the most frequent words are
# at the end. Slice from an explicit start index rather than a negative one:
# sorted_words[-0:] is the whole list, which would turn every word into a
# stopword whenever the cutoff rounds down to zero.
first_stopword_index = len(sorted_words) - length_cutoff
stopwords = [word for word, _count in sorted_words[first_stopword_index:]]
print('len:', len(stopwords))
print('first 5:', stopwords[:5])

length_cutoff: 181
len: 181
first 5: ['make', 'myself', 'night', 'until', 'street']
