In [45]:
import nltk
import re
import requests
import string

from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

1. Download Alice in Wonderland

In [46]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text

2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [47]:
def preprocessing(text: str) -> str:

    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special character
    
    # Remove non-alphabetic characters
    # text = re.sub(r'[^a-z\s\.]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize
    tokens = WhitespaceTokenizer().tokenize(text)

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    # stemmer = PorterStemmer()
    text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    # text = ' '.join([stemmer.stem(token) for token in tokens])

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words and len(word) > 1])
    
    return text.replace('_', '').strip()

In [48]:
text = preprocessing(text)

In [49]:
len(text.split('chapter'))

25

3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?

In [50]:
chapter_pattern = r"chapter\s+\w+"
chapters = re.split(chapter_pattern, text, flags=re.IGNORECASE)
chapters = chapters[13:]
# chapters = list(map(lambda x: x.strip(), chapters))

In [51]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [52]:
## FIXED ROW
tfs = tfidf_vectorizer.fit_transform(chapters)

In [53]:
chapters_dict = {}
for chapter_num, chapter in tqdm(enumerate(chapters)):
    ### FIXED ROW BELOW
    tfidf_matrix = tfidf_vectorizer.transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words = sorted(zip(feature_names, tfidf_matrix.toarray()[0]), key=lambda x: x[1], reverse=True)[:11]
    chapters_dict[chapter_num + 1] = ", ".join([word for index, (word, score) in enumerate(top_words)
                                                         if word != 'alice' and index < 11])

12it [00:00, 295.64it/s]


Hello!
Here almost all the chapters contain the words "said" and "wa", this is incorrect, since in the tf-idf approach the weight of words that are contained in all chapters should decrease. 

First, fit all chapters to the model, second, sequentially transform the text for chapters

Honestly, my fantasy is working not so good, but I've tried to name the chapters.

1. Thinking was like litte
2. Oh, mouse was little
3. Mouse said dodo
4. There was one little rabbit
5. Caterpillar, pigeon, serpent
6. Duchees Cat
7. Dormouse Hatter
8. Queen and King
9. Mock Turtle
10. Gryphon Turtle
11. King Hatter
12. King likes jury

4. Find the Top 10 most used verbs in sentences with Alice

In [55]:
sentences = sent_tokenize(text)

In [56]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text
sentences = sent_tokenize(text)

In [57]:
sentences = list(map(preprocessing, sentences))    
sentences = list(map(lambda x: x.replace('.', ' '), sentences))    

In [58]:
alice_sentences = [sentence for sentence in sentences if 'alice' in sentence.lower()]

In [59]:
verb_counts = Counter()

In [60]:
for sentence in alice_sentences:
    for word, tag in nltk.pos_tag(sentence.split()):
        if tag in ['VB', 'VBP']:
            verb_counts[word] += 1

top_verbs = verb_counts.most_common(10)

In [61]:
top_verbs

[('say', 35),
 ('know', 32),
 ('go', 27),
 ('see', 23),
 ('think', 21),
 ('get', 17),
 ('make', 15),
 ('come', 14),
 ('take', 12),
 ('wa', 11)]

What does Alice do most often?

Alice thinks, knows, says, goes, and sees.