In [1]:
import nltk
import re
import requests
import string

from collections import Counter

from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer, WhitespaceTokenizer, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm

1. Download Alice in Wonderland

In [2]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text

2. Perform any necessary preprocessing on the text, including converting to lower case, removing stop words, numbers / non-alphabetic characters, lemmatization.

In [3]:
def preprocessing(text: str) -> str:

    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\W', ' ', text)  # Remove special character

    
    # Remove non-alphabetic characters
    # text = re.sub(r'[^a-z\s\.]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize
    tokens = WhitespaceTokenizer().tokenize(text)

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    # stemmer = PorterStemmer()
    text = ' '.join([lemmatizer.lemmatize(token) for token in tokens])
    # text = ' '.join([stemmer.stem(token) for token in tokens])

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words and len(word) > 1])
    
    return text.replace('_', '').strip()

In [4]:
text = preprocessing(text)

In [5]:
len(text.split('chapter'))

25

3. Find Top 10 most important (for example, in terms of TF-IDF metric) words from each chapter in the text (not "Alice"); how would you name each chapter according to the identified tokens?

In [6]:
chapter_pattern = r"chapter\s+\w+"
chapters = re.split(chapter_pattern, text, flags=re.IGNORECASE)
chapters = chapters[13:]
# chapters = list(map(lambda x: x.strip(), chapters))

In [7]:
tfidf_vectorizer = TfidfVectorizer()

In [8]:
chapters_dict = {}
for chapter_num, chapter in tqdm(enumerate(chapters)):
    tfidf_matrix = tfidf_vectorizer.fit_transform([chapter])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_words = sorted(zip(feature_names, tfidf_matrix.toarray()[0]), key=lambda x: x[1], reverse=True)[:11]
    chapters_dict[chapter_num + 1] = ", ".join([word for index, (word, score) in enumerate(top_words)
                                                         if word != 'alice' and index < 11])

12it [00:00, 500.89it/s]


In [9]:
chapters_dict

{1: 'wa, little, like, think, way, see, door, one, could, said',
 2: 'wa, mouse, little, oh, said, dear, go, thing, foot, like',
 3: 'said, mouse, wa, dodo, know, one, soon, bird, dry, long',
 4: 'wa, little, one, rabbit, bill, said, get, heard, sure, thought',
 5: 'said, wa, caterpillar, pigeon, serpent, little, well, know, minute, one',
 6: 'said, wa, cat, like, duchess, little, baby, footman, mad, much',
 7: 'said, hatter, dormouse, wa, march, hare, time, know, thing, well',
 8: 'said, wa, queen, head, king, cat, three, hedgehog, like, one',
 9: 'said, turtle, mock, wa, gryphon, duchess, queen, went, never, little',
 10: 'said, gryphon, turtle, mock, would, dance, lobster, wa, soup, beautiful',
 11: 'said, wa, king, hatter, court, dormouse, one, witness, queen, began',
 12: 'said, king, would, wa, jury, little, queen, know, head, one'}

4. Find the Top 10 most used verbs in sentences with Alice

In [10]:
sentences = sent_tokenize(text)

In [11]:
url = "http://www.gutenberg.org/files/11/11-0.txt"
response = requests.get(url)
text = response.text
sentences = sent_tokenize(text)

In [12]:
sentences = list(map(preprocessing, sentences))    
sentences = list(map(lambda x: x.replace('.', ' '), sentences))    

In [13]:
alice_sentences = [sentence for sentence in sentences if 'alice' in sentence.lower()]

In [14]:
verb_counts = Counter()

In [15]:
for sentence in alice_sentences:
    for word, tag in nltk.pos_tag(sentence.split()):
        if tag in ['VB', 'VBP']:
            verb_counts[word] += 1
top_verbs = verb_counts.most_common(10)

In [16]:
top_verbs

[('say', 35),
 ('know', 32),
 ('go', 27),
 ('see', 23),
 ('think', 21),
 ('get', 17),
 ('make', 15),
 ('come', 14),
 ('take', 12),
 ('wa', 11)]