1. Download "Alice in Wonderland" by Lewis Carroll from Gutenberg project http://www.gutenberg.org/files/11/11-0.txt
2. Perform all the necessary preprocessing, including lowercasing, removing stopwords, numbers/non-alhabhetic symbols, etc.
3. Find Top-10 most important (in terms of count vectorizer or TF-IDF, for example) words from every chapter in the text (not "Alice"); how could you name each chapter according to the evaluated tokens?
4. Find the Top-10 most frequently used verbs in sentences with Alice. What does Alice do most often?

In [1]:
import nltk
import string

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import warnings

warnings.simplefilter('ignore')

### Read text

In [2]:
filename = 'alice.txt'
with open(filename, 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
chapters = text.split('CHAPTER')
# delete introcudtion
chapters = chapters[1:]
# delete text after 'THE END'
last_chapter = chapters[-1]
last_chapter = last_chapter.split('THE END')
chapters[-1] = last_chapter[0]
print(len(chapters))

12


### Preprocessing for chapters

In [4]:
def clean_text(text):
    tokens = word_tokenize(text)
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word.lower() for word in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [WordNetLemmatizer().lemmatize(word, 'v') for word in tokens]
    return tokens

In [5]:
chapters = [clean_text(chapter) for chapter in chapters]
#join words in chapter for CountVectorizer()
chapters = [(' ').join(chapter) for chapter in chapters]

### Top-10 most important words in every chapter

In [6]:
for j in range(len(chapters)):
    vectorizer = CountVectorizer()
    vectorizer_fit = vectorizer.fit_transform(chapters[j].split(' '))
    cur_names = vectorizer.get_feature_names()
    freq = vectorizer_fit.toarray().sum(axis = 0)
    cur_dict = {cur_names[i]:freq[i] for i in range(len(cur_names))}
    best_list = sorted(list(cur_dict.items()), key = lambda x: x[1], reverse = True)
    best_list = [item for item in best_list if item[0] != 'alice']
    print('10 Most important wrods for ' + str(j + 1) + ' chapter')
    for item in best_list[:10]:
        print(item)

10 Most important wrods for 1 chapter
('think', 19)
('say', 16)
('go', 15)
('little', 15)
('get', 14)
('see', 13)
('find', 11)
('like', 11)
('way', 11)
('eat', 10)
10 Most important wrods for 2 chapter
('go', 21)
('mouse', 20)
('say', 19)
('little', 17)
('think', 14)
('oh', 13)
('come', 10)
('cry', 10)
('dear', 10)
('like', 9)
10 Most important wrods for 3 chapter
('say', 40)
('mouse', 20)
('know', 13)
('dodo', 12)
('get', 9)
('think', 8)
('find', 7)
('one', 7)
('soon', 7)
('bird', 6)
10 Most important wrods for 4 chapter
('little', 23)
('say', 19)
('go', 17)
('get', 16)
('come', 15)
('one', 15)
('rabbit', 15)
('bill', 14)
('make', 14)
('grow', 13)
10 Most important wrods for 5 chapter
('say', 57)
('caterpillar', 26)
('think', 13)
('pigeon', 12)
('get', 11)
('little', 11)
('well', 10)
('serpent', 9)
('try', 9)
('begin', 8)
10 Most important wrods for 6 chapter
('say', 53)
('cat', 24)
('go', 23)
('think', 17)
('like', 16)
('duchess', 14)
('little', 14)
('get', 13)
('know', 13)
('baby', 

### Top-10 most frequently words in sentence with Alice

In [7]:
result_words = Counter()
sentences = sent_tokenize(text)

for sentence in sentences:
    sentence = clean_text(sentence)
    if 'alice' in sentence:
        sentence_words = nltk.pos_tag(sentence)
        for word, pos_tag in sentence_words:
            if pos_tag == 'VB':
                result_words[word] += 1
                
print(result_words.most_common(10))

[('go', 51), ('get', 40), ('say', 39), ('think', 23), ('take', 22), ('see', 18), ('know', 13), ('keep', 13), ('tell', 13), ('find', 12)]
