In [None]:
import os
import urllib.request
import cv2
import numpy as np
from pathlib import Path
import nltk
from nltk.text import TextCollection
import string
# Fetch the NLTK resources used below: the 'punkt' tokenizer models, the
# stop-word lists, and the WordNet data needed by the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
snowball_stemmer =  nltk.stem.SnowballStemmer('english')

BOOK_URL = "http://www.gutenberg.org/files/11/11-0.txt"
BOOK_PATH = Path('text.txt')
# Number of leading characters dropped from the lowercased file.
# NOTE(review): presumably this skips the Project Gutenberg header / front
# matter -- confirm against the downloaded file, the offset is edition-specific.
HEADER_CHARS = 1437

# Only hit the network when there is no usable local copy, so that
# "Restart & Run All" does not re-download the book every time.
if not BOOK_PATH.exists() or BOOK_PATH.stat().st_size == 0:
  urllib.request.urlretrieve(BOOK_URL, str(BOOK_PATH))

# Decode explicitly as UTF-8 instead of relying on the platform default
# encoding, which is not UTF-8 everywhere.
text = BOOK_PATH.read_text(encoding='utf-8').lower()[HEADER_CHARS:]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

def tokenize_and_clean_text_chunk(text):
  """Tokenize ``text`` and return its cleaned, lemmatized word tokens.

  The text is split with ``nltk.word_tokenize``; tokens that are not purely
  alphabetic or that appear in NLTK's English stop-word list are dropped, and
  the remaining tokens are lemmatized with the module-level
  ``wordnet_lemmatizer``.

  Args:
    text: The string to tokenize. Stop-word matching is case-sensitive, so
      callers are expected to pass lowercased text.

  Returns:
    A list of lemmatized tokens in their original order.
  """
  # A set gives O(1) membership tests; the corpus reader returns a list.
  stop_words = set(nltk.corpus.stopwords.words('english'))
  tokens = []
  for word in nltk.word_tokenize(text):
    # isalpha() already rejects any token containing punctuation, so no
    # separate punctuation-stripping pass is needed.
    if not word.isalpha() or word in stop_words:
      continue
    tokens.append(wordnet_lemmatizer.lemmatize(word))
  return tokens

# The downloaded file ends with the Project Gutenberg license text. Cut it off
# before splitting, otherwise it is counted as part of the last chapter and
# license vocabulary shows up in that chapter's top terms.
# NOTE(review): the exact wording of the end-of-book marker has varied between
# Gutenberg releases -- confirm these candidates against the downloaded file.
_footer_markers = (
    "end of the project gutenberg",
    "end of this project gutenberg",
    "end of project gutenberg",
)
_footer_hits = [pos for pos in map(text.find, _footer_markers) if pos != -1]
# If no marker is found the text is used unchanged.
book_body = text[:min(_footer_hits)].rstrip("* \r\n\t") if _footer_hits else text

# Everything before the first occurrence of "chapter" is discarded.
chapters = book_body.split("chapter")[1:]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Number of top-ranked terms printed per chapter: ten, +1 for Alice.
TOP_N_TERMS = 11

# fit_transform() re-learns the vocabulary on every call, so a single
# vectorizer instance can be reused for all chapters.
vectorizer = TfidfVectorizer(tokenizer=tokenize_and_clean_text_chunk)

for idx, chapter in enumerate(chapters):
  # NOTE(review): the vectorizer is fitted on one chapter at a time, so every
  # term has the same document frequency and the score is effectively a
  # normalized term frequency. Fit on all chapters at once if a real
  # inverse-document-frequency weighting is wanted.
  response = vectorizer.fit_transform([chapter])
  # get_feature_names() was removed in scikit-learn 1.2; use the replacement
  # when it exists and fall back to the old accessor on older versions.
  if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out()
  else:
    terms = vectorizer.get_feature_names()
  # Sum each term's score over the (single) document row and flatten the
  # 1 x n_terms result into a plain 1-D array aligned with `terms`.
  scores = np.asarray(response.sum(axis=0)).ravel()
  ranking = pd.DataFrame({'term': terms, 'tf_idf': scores})
  print("CHAPTER {}".format(idx+1))
  print("==========")
  print(ranking.sort_values('tf_idf', ascending=False)[:TOP_N_TERMS])
  print("\n")

0.0
CHAPTER 1
       term    tf_idf
8     alice  0.443830
240  little  0.237766
446     way  0.174362
235    like  0.174362
355     see  0.158511
410   think  0.142660
96     door  0.142660
289     one  0.126809
70    could  0.126809
415    time  0.126809


CHAPTER 2
        term    tf_idf
7      alice  0.386126
213   little  0.247121
236    mouse  0.247121
318     said  0.185341
73      dear  0.169896
147       go  0.169896
126     foot  0.154451
399    thing  0.154451
239     must  0.139005
403  thought  0.139005


CHAPTER 3
      term    tf_idf
318   said  0.535915
13   alice  0.362531
242  mouse  0.331006
106   dodo  0.189146
196   know  0.173384
261    one  0.126098
349   soon  0.110335
47    bird  0.094573
314  round  0.094573
372  thing  0.094573


CHAPTER 4
        term    tf_idf
4      alice  0.411617
252   little  0.305393
351   rabbit  0.199170
308      one  0.185892
374     said  0.185892
35      bill  0.159336
454  thought  0.119502
191    heard  0.119502
436     sure  0.1

Chapter 1: Little Alice thinks of doors and time

Chapter 2: Little Alice sees mice and thinks

Chapter 3: Little Alice knows a bird Dodo

Chapter 4: Little Alice hears a rabbit

Chapter 5: Little Alice thinks of a caterpillar

Chapter 6: Little Alice likes cats

Chapter 7: Little Alice matches with hatters

Chapter 8: Little Alice goes to the queen

Chapter 9: Little Alice mocks turtles

Chapter 10: Little Alice dances with lobsters

Chapter 11: Little Alice thinks of hatters, dormice, rabbits, kings

Chapter 12: Little Alice works on a project

In [None]:
# Model data required by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

sentences = nltk.sent_tokenize(text)
print(len(sentences))
# Substring match on the lowercased text.
sentences_with_alice = [sentence for sentence in sentences if 'alice' in sentence]
print(len(sentences_with_alice))

english_stop_words = set(nltk.corpus.stopwords.words('english'))
all_verbs = []
for sentence in sentences_with_alice:
  # Tag the complete, unmodified sentence: the tagger relies on surrounding
  # words, so tagging tokens that were already stripped of stop words and
  # lemmatized as nouns mislabels many non-verbs as verbs.
  pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  for word, tag in pos_tagged:
    # Penn Treebank verb tags all start with 'VB' (VB, VBD, VBG, VBN, VBP, VBZ).
    if tag.startswith('VB') and word.isalpha() and word not in english_stop_words:
      # Lemmatize as a verb so inflected forms are counted together.
      all_verbs.append(wordnet_lemmatizer.lemmatize(word, pos='v'))
print(all_verbs)

def CountFrequency(my_list):
    """Print how often each item occurs in ``my_list``, most frequent first.

    One line is printed per distinct item in the form ``item: count``. Items
    with equal counts are printed in the order they first appear in
    ``my_list``. Nothing is printed for an empty input.

    Args:
        my_list: An iterable of hashable items.

    Returns:
        None. The result is only printed.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    # most_common() sorts by count, descending, and keeps first-seen order
    # among ties.
    for key, value in Counter(my_list).most_common():
        print("{}: {}".format(key, value))

# Print the frequency of each extracted verb, most frequent first.
CountFrequency(all_verbs) 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
1080
359
['get', 'hot', 'worth', 'watch', 'see', 'like', 'think', 'see', 'thousand', 'knowledge', 'get', 'get', 'belong', 'get', 'go', 'little', 'knew', 'see', 'find', 'drink', 'say', 'wise', 'shrink', 'end', 'advise', 'get', 'feel', 'go', 'manage', 'walk', 'go', 'say', 'glove', 'go', 'glove', 'mabel', 'go', 'tell', 'existence', 'come', 'find', 'use', 'right', 'wink', 'come', 'like', 'offended', 'let', 'name', 'say', 'go', 'get', 'go', 'find', 'say', 'know', 'allow', 'catch', 'speak', 'prize', 'think', 'offended', 'go', 'wasting', 'try', 'please', 'come', 'see', 'glove', 'happen', 'miss', 'get', 'stop', 'glove', 'grow', 'get', 'take', 'hear', 'thought', 'know', 'come', 'burn', 'set', 'take', 'found', 'find', 'hungry', 'eat', 'keep', 'changed', 'feel', 'feel', 'tell', 'think', 'wait', 'tell', 'remember