In [78]:
import nltk
import re
import requests
import string

from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

1. Download Alice in Wonderland

In [79]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text

ConnectTimeout: HTTPConnectionPool(host='www.gutenberg.org', port=80): Max retries exceeded with url: /files/11/11-0.txt (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x0000027A1EF95070>, 'Connection to www.gutenberg.org timed out. (connect timeout=None)'))

2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [None]:
def preprocessing(text: str) -> str:

    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special character
    
    # Remove non-alphabetic characters
    # text = re.sub(r'[^a-z\s\.]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize
    tokens = WhitespaceTokenizer().tokenize(text)

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    # stemmer = PorterStemmer()
    text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    # text = ' '.join([stemmer.stem(token) for token in tokens])

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words and len(word) > 1])
    
    return text.replace('_', '').strip()

In [None]:
text = preprocessing(text)

In [None]:
len(text.split('chapter'))

3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?

In [None]:
chapter_pattern = r"chapter\s+\w+"
chapters = re.split(chapter_pattern, text, flags=re.IGNORECASE)
chapters = chapters[13:]
# chapters = list(map(lambda x: x.strip(), chapters))

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
## FIXED ROW
tfs = tfidf_vectorizer.fit_transform(chapters)

In [None]:
chapters_dict = {}
for chapter_num, chapter in tqdm(enumerate(chapters)):
    ### FIXED ROW BELOW
    tfidf_matrix = tfidf_vectorizer.transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words = sorted(zip(feature_names, tfidf_matrix.toarray()[0]), key=lambda x: x[1], reverse=True)[:11]
    chapters_dict[chapter_num + 1] = ", ".join([word for index, (word, score) in enumerate(top_words)
                                                         if word != 'alice' and index < 11])

In [80]:
chapters_dict

{1: 'wa, little, bat, door, key, eat, like, think, way, bottle',
 2: 'mouse, wa, pool, little, oh, swam, cat, dear, said, foot',
 3: 'mouse, said, dodo, wa, prize, lory, dry, thimble, know, bird',
 4: 'wa, little, window, rabbit, puppy, glove, bottle, chimney, fan, said',
 5: 'caterpillar, said, pigeon, serpent, wa, egg, youth, size, father, little',
 6: 'said, cat, wa, footman, baby, mad, pig, duchess, wow, like',
 7: 'hatter, dormouse, said, march, hare, wa, twinkle, time, tea, draw',
 8: 'queen, said, wa, hedgehog, king, gardener, soldier, cat, executioner, procession',
 9: 'turtle, said, mock, gryphon, duchess, moral, wa, queen, went, say',
 10: 'turtle, mock, gryphon, said, dance, lobster, soup, join, beautiful, whiting',
 11: 'king, hatter, said, court, dormouse, wa, witness, queen, juror, officer',
 12: 'said, king, jury, queen, sister, dream, unimportant, wa, rabbit, fit'}

Honestly, my fantasy is working not so good, but I've tried to name the chapters.

1. Thinking was like litte
2. Oh, mouse was little
3. Mouse said dodo
4. There was one little rabbit
5. Caterpillar, pigeon, serpent
6. Duchees Cat
7. Dormouse Hatter
8. Queen and King
9. Mock Turtle
10. Gryphon Turtle
11. Hatter King
12. Jury King 

4. Find the Top 10 most used verbs in sentences with Alice

In [None]:
sentences = sent_tokenize(text)

In [None]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text
sentences = sent_tokenize(text)

In [None]:
sentences = list(map(preprocessing, sentences))    
sentences = list(map(lambda x: x.replace('.', ' '), sentences))    

In [None]:
alice_sentences = [sentence for sentence in sentences if 'alice' in sentence.lower()]

In [None]:
verb_counts = Counter()

In [None]:
for sentence in alice_sentences:
    for word, tag in nltk.pos_tag(sentence.split()):
        if tag in ['VB', 'VBP']:
            verb_counts[word] += 1

top_verbs = verb_counts.most_common(10)

In [None]:
top_verbs

What does Alice do most often?

Alice thinks, knows, says, goes, and sees.