In [None]:
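# One-time setup: uncomment and run these lines once to install NLTK and to
# download the stop-word list, WordNet and the POS-tagger model used below.
#%pip install nltk
#import nltk
#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')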

In [1]:
import re
import numpy as np
import nltk
import collections
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words from the NLTK stopwords corpus
stopwords_list = stopwords.words('english')
# WordNet lemmatizer used when normalising the words of each sentence
lemmatizer = WordNetLemmatizer()

## Preprocessing

In [2]:
def sanitize_line(s):
    """
    Normalise one raw line of text for sentence splitting.

    Hyphens become spaces and the sentence endings '?' and '!' become full
    stops. After that every character other than an ASCII letter, the space
    character or '.' is removed (this includes newlines, tabs, digits and
    apostrophes), and the result is converted to lower case.
    """
    # one pass over the string for the three single-character substitutions
    normalised = s.translate(str.maketrans({'-': ' ', '?': '.', '!': '.'}))
    letters_spaces_stops = re.sub(r'[^a-zA-Z \.]', '', normalised)
    return letters_spaces_stops.lower()

In [3]:
# read the lines
# The encoding is given explicitly so the result does not depend on the
# platform's locale default. Bytes that are not valid UTF-8 are replaced
# instead of raising; sanitize_line removes the replacement character together
# with every other non-latin character.
# NOTE(review): assumes the file is UTF-8 or plain ASCII - confirm.
with open('alice_in_wonderland.txt', encoding='utf-8', errors='replace') as f:
    lines = f.readlines()
# remove all characters except latin letters, spaces and full stops
lines = [sanitize_line(s) for s in lines]
# strip the leading and trailing spaces and drop the lines that end up empty
# (after sanitize_line the only whitespace left in a line is the space character)
lines = [stripped for stripped in (s.strip() for s in lines) if stripped]
# remove the preamble and the postamble
# NOTE(review): 32 and 307 are counted in non-empty lines and presumably match
# one specific copy of the text file - confirm when the input file changes.
lines = lines[32:-307]

In [4]:
# locate the chapter headings: positions of the lines that begin with "chapter"
chapter_num_indices = []
for line_idx, text in enumerate(lines):
    if text.startswith('chapter'):
        chapter_num_indices.append(line_idx)

In [5]:
# extract the chapters and sentences
# Each chapter runs from its heading up to the next heading; the end of the
# text closes the last one. The boundaries are built as a new list rather than
# appended to chapter_num_indices, so re-running this cell cannot add a second
# end marker (which would produce a spurious empty chapter).
chapter_bounds = chapter_num_indices + [len(lines)]

chapters = []
for heading_idx, next_heading_idx in zip(chapter_bounds, chapter_bounds[1:]):
    # the first two lines of every chapter are skipped
    # NOTE(review): presumably the "chapter N" line and the chapter title line -
    # confirm against the layout of the input file.
    chapter_lines = lines[heading_idx + 2:next_heading_idx]
    chapters.append(' '.join(chapter_lines))

# split every chapter into sentences at "full stop + space"
sentences_by_chapter_ = [chapter.split('. ') for chapter in chapters]

In [6]:
# membership tests against a set are constant-time; the list would be scanned
# from the start for every single word
stopword_set = set(stopwords_list)
# NOTE(review): sanitize_line strips apostrophes, so stop words that contain one
# (e.g. "don't") presumably never match their sanitized form ("dont") - confirm
# and consider normalising the stop words the same way.

sentences_by_chapter = []
for raw_sentences in sentences_by_chapter_:
    cleaned_sentences = []
    for s in raw_sentences:
        # remove stop words, then lemmatize the words that are left
        words = [lemmatizer.lemmatize(w) for w in s.split() if w not in stopword_set]
        # reconstruct the sentence; sentences left without any word are dropped
        if words:
            cleaned_sentences.append(' '.join(words))
    sentences_by_chapter.append(cleaned_sentences)
# NOTE(review): this discards the last extracted chapter - confirm that it is
# intended and not a leftover workaround.
sentences_by_chapter = sentences_by_chapter[:-1]
# rebuild each chapter as one string from its cleaned sentences
chapters = [' '.join(sentences) for sentences in sentences_by_chapter]

## Most important words by chapter

In [7]:
# fit the tf-idf model on the preprocessed chapters (one document per chapter)
tfidf = TfidfVectorizer()
tfidf.fit(chapters)
# reverse lookup for decoding: feature index -> term
inverse_vocab = dict(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))

In [8]:
# NOTE(review): every iteration overwrites tfidf_enc and nothing reads it before
# the next cell reassigns it from tfidf.transform(chapters), so this loop looks
# redundant - confirm and consider deleting the cell.
for chapter in chapters:
    tfidf_enc = tfidf.transform([chapter]).todense()

In [9]:
# dense array of tf-idf weights, one row per chapter
# (.toarray() yields a plain ndarray, whereas .todense() yields an np.matrix)
tfidf_enc = tfidf.transform(chapters).toarray()
# sort a negated copy so that every row lists the feature indices in order of
# descending weight; tfidf_enc itself keeps the real (non-negated) weights
argsort = np.argsort(-tfidf_enc, axis=1)
# keep 11 candidates per chapter: 10 words plus one spare, because "alice" is
# filtered out before the first 10 are taken
topk_indices = argsort[:, :11]

In [10]:
# decode each chapter's candidate indices back into words, leave out "alice"
# and keep the first 10 of what remains
topk_words_by_chapter = [
    [term for term in (inverse_vocab[idx] for idx in chapter_indices) if term != 'alice'][:10]
    for chapter_indices in topk_indices
]

So here are the most important words for each chapter: the ten words with the highest tf-idf weights, with "alice" left out.

In [11]:
# one line per chapter: the 1-based chapter number followed by its top words
for chapter_no, words in enumerate(topk_words_by_chapter, start=1):
    print(f"Chapter {chapter_no}: {', '.join(words)}")

Chapter 1: little, bat, way, door, key, rabbit, eat, like, think, either
Chapter 2: mouse, little, pool, im, dear, swam, cat, said, foot, cried
Chapter 3: mouse, said, dodo, prize, lory, race, dry, thimble, know, course
Chapter 4: bill, window, little, rabbit, puppy, bottle, fan, glove, chimney, said
Chapter 5: caterpillar, said, pigeon, serpent, im, youth, egg, size, father, little
Chapter 6: said, footman, cat, baby, mad, duchess, wow, like, pig, cook
Chapter 7: hatter, dormouse, said, hare, march, tea, twinkle, time, draw, treacle
Chapter 8: queen, said, king, hedgehog, gardener, rose, soldier, cat, executioner, procession
Chapter 9: turtle, mock, said, gryphon, duchess, moral, queen, went, school, day
Chapter 10: turtle, mock, gryphon, said, lobster, dance, soup, beautiful, join, whiting
Chapter 11: king, hatter, court, said, dormouse, witness, jury, queen, juror, officer


**Proposed chapter names (some)**:

- Chapter 1: Little door and little way
- Chapter 7: Tea time
- Chapter 8: Queen's gardener and executioner
- Chapter 11: Queen's jury

Sounds stupid and awfully out-of-context :)

### Most used verbs in sentences with Alice

In [12]:
# flatten the per-chapter sentence lists into one list, keeping chapter order
all_sentences = []
for chapter_sentences in sentences_by_chapter:
    all_sentences.extend(chapter_sentences)

In [13]:
# count how often each verb occurs in the sentences that mention "alice"
# NOTE(review): the tagger receives sentences whose stop words were already
# removed and whose words were lemmatized, which may affect the tags - confirm.
verbs = collections.defaultdict(int)
for sentence in all_sentences:
    tokens = sentence.split()
    if 'alice' not in tokens:
        continue
    for token, tag in nltk.pos_tag(tokens):
        # tags that start with "VB" are counted as verbs
        if tag.startswith('VB'):
            verbs[token] += 1

In [14]:
# rank the verbs by count, highest first (ties keep their first-seen order),
# and keep the first ten
most_used_verbs = sorted(verbs.items(), key=lambda item: item[1], reverse=True)[:10]

In [15]:
# report each verb with its count, in the order of most_used_verbs
for verb, count in most_used_verbs:
    print(f'{verb}: {count} times')

said: 153 times
thought: 33 times
went: 29 times
say: 20 times
looked: 20 times
began: 19 times
got: 17 times
think: 16 times
know: 16 times
go: 15 times


Looks like Alice mostly says something, thinks about something, walks somewhere and looks at something. Well, we all do.