In [43]:
import os
import urllib.request
import cv2
import numpy as np
from pathlib import Path
import nltk
from nltk.text import TextCollection
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
snowball_stemmer =  nltk.stem.SnowballStemmer('english')

urllib.request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", 'text.txt')
text = Path('text.txt').read_text().lower()[1437:-18966] # Remove warnings and copyright notes


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:

def tokenize_and_clean_text_chunk(text):
  tokens = nltk.word_tokenize(text)
  tokens = [word for word in tokens if word.isalpha()]
  table = str.maketrans('', '', string.punctuation)
  tokens = [w.translate(table) for w in tokens]
  stop_words = nltk.corpus.stopwords.words('english')
  stop_words.append('alice') #Remove "Alice" from consideration
  tokens = [w for w in tokens if not w in stop_words]
  tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens] 
  return tokens

chapters = text.split("chapter")[1:]

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

for idx, chapter in enumerate(chapters):
  vectorizer = TfidfVectorizer(tokenizer=tokenize_and_clean_text_chunk)
  response = vectorizer.fit_transform([chapter])
  terms = vectorizer.get_feature_names()
  # sum tfidf frequency of each term through documents
  sums = response.sum(axis=0)
  # connecting term to its sums frequency
  data = []
  for col, term in enumerate(terms):
      data.append( (term, sums[0,col] ))
  ranking = pd.DataFrame(data, columns=['term','tf_idf'])
  print("CHAPTER {}".format(idx+1))
  print("==========")
  print(ranking.sort_values('tf_idf', ascending=False)[:10])
  print("\n")

CHAPTER 1
        term    tf_idf
239   little  0.265331
445      way  0.194576
234     like  0.194576
354      see  0.176887
409    think  0.159199
95      door  0.159199
288      one  0.141510
69     could  0.141510
414     time  0.141510
411  thought  0.141510


CHAPTER 2
       term    tf_idf
212  little  0.267897
235   mouse  0.267897
317    said  0.200923
146      go  0.184179
72     dear  0.184179
125    foot  0.167436
398   thing  0.167436
261      oh  0.150692
438    went  0.150692
238    must  0.150692


CHAPTER 3
      term    tf_idf
317   said  0.575034
241  mouse  0.355168
105   dodo  0.202953
195   know  0.186040
260    one  0.135302
348   soon  0.118389
313  round  0.101477
109    dry  0.101477
213   long  0.101477
217   lory  0.101477


CHAPTER 4
        term    tf_idf
251   little  0.335098
350   rabbit  0.218542
373     said  0.203972
307      one  0.203972
34      bill  0.174834
453  thought  0.131125
159      get  0.131125
190    heard  0.131125
435     sure  0.13112

Chapter 1: Little Alice thinks of doors and time

Chapter 2: Little Alice sees mice and thinks

Chapter 3: Little Alice knows a bird Dodo

Chapter 4: Little Alice hears a rabbit

Chapter 5: Little Alice thinks of a caterpillar

Chapter 6: Little Alice likes cats

Chapter 7: Little Alice matches with hatters

Chapter 8: Little Alice goes to the queen

Chapter 9: Little Alice mocks turtles

Chapter 10: Little Alice dances with lobsters

Chapter 11: Little Alice thoughts of hatters, dormice, rabbits, kings

Chapter 12: Little Alice headed to jury, King and Queen with white rabbit


In [46]:
nltk.download('averaged_perceptron_tagger')

sentences = nltk.sent_tokenize(text)
print(len(sentences))
sentences_with_alice = []
sentences_with_alice = [_ for _ in sentences if 'alice' in _]
print(len(sentences_with_alice))
all_verbs = []
for sentence in sentences_with_alice:
  tokens = tokenize_and_clean_text_chunk(sentence)
  pos_tagged = nltk.pos_tag(tokens)
  verbs = list(filter(lambda x:x[1]=='VB',pos_tagged))
  verbs = [_[0] for _ in verbs]
  for verb in verbs:
    all_verbs.append(verb)
print(all_verbs)

def CountFrequency(my_list): 
  
    # Creating an empty dictionary  
    freq = {} 
    for item in my_list: 
        if (item in freq): 
            freq[item] += 1
        else: 
            freq[item] = 1

    freq = dict(sorted(freq.items(), key=lambda item: item[1], reverse=True))
    for key, value in freq.items(): 
        print ("{}: {}".format(key, value)) 

CountFrequency(all_verbs) 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
970
358
['get', 'hot', 'worth', 'watch', 'see', 'like', 'think', 'see', 'thousand', 'see', 'knowledge', 'get', 'get', 'belong', 'get', 'go', 'little', 'knew', 'see', 'find', 'drink', 'say', 'wise', 'shrink', 'end', 'advise', 'get', 'feel', 'go', 'manage', 'go', 'say', 'glove', 'go', 'glove', 'mabel', 'go', 'tell', 'existence', 'come', 'find', 'use', 'right', 'wink', 'come', 'like', 'offended', 'let', 'name', 'say', 'go', 'get', 'go', 'say', 'know', 'allow', 'kept', 'catch', 'speak', 'prize', 'think', 'offended', 'go', 'wasting', 'try', 'please', 'come', 'see', 'glove', 'happen', 'miss', 'get', 'let', 'stop', 'glove', 'become', 'grow', 'get', 'take', 'hear', 'thought', 'know', 'come', 'burn', 'set', 'take', 'found', 'find', 'hungry', 'eat', 'keep', 'changed', 'feel', 'feel', 'tell', 'think', 'wait', 't