In [2]:
import nltk
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk import pos_tag
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import math
import pandas as pd
from collections import Counter
import gensim
import plotly.express as px
from sklearn.manifold import TSNE
# import all the resources for Natural Language Processing with Python
#nltk.download("book")
#nltk.download('punkt')
#nltk.download('words')
#nltk.download('stopwords')


# Import text data

In [3]:
filename = "Alice in Wonderland.txt"
f = open(str(filename), 'r')
text = f.read()

# Preprocessing data

In [4]:
#Removing unnecessary text fragments
text = text.split(" CHAPTER XII.   Alice’s Evidence",1) [1]
text = text.split("THE END", 1) [0]

In [5]:
text_clean = re.sub("<.*?>", " ", text)
# Example : _very_
text_clean= re.sub("_", " ", text_clean)
text_clean = re.sub(r"[^\w\s]", "", text_clean)
#Tokenization - splitting into word 
tokens = word_tokenize(text_clean)
#Lower case
tokens = [w.lower() for w in tokens]
#Stop words
stop_words = set(stopwords.words('english'))
stop_words_2 = set(ENGLISH_STOP_WORDS)
my_stop_words = set(['wow','yer','oh','im'])
stop_words.update(stop_words_2)
stop_words.update(my_stop_words)
tokens = [w for w in tokens if not w in stop_words ]
#Lemmatization
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(i) for i in tokens]

In [6]:
len(set(ENGLISH_STOP_WORDS))

318

In [7]:
len(set(stopwords.words('english')))

179

# Find Top 10 most important words from each chapter

In [8]:
# Split on chapters
chapter_index = [ i for i,word in enumerate(tokens) if word == 'chapter']
chapter_index.append(len(tokens))
chapters = []
for i in range(len(chapter_index)-1):
    if i ==1: 
        #chapter i -> chapter after removed stop words
        chapters.append(tokens[chapter_index[i]+1:chapter_index[i+1]])
    else:
        chapters.append(tokens[(chapter_index[i]+2):chapter_index[i+1]])

In [9]:
def compute_tf(text):
    tf_text = Counter(text)
    for i in tf_text:
        tf_text[i] = tf_text[i]/float(len(text))
    return tf_text

def compute_idf(word, corpus):
#the word is taken as input, for which we consider IDF
#and corpus of documents as a list of word lists
#number of documents where the search term occurs
        return math.log10(len(corpus)/sum([1.0 for i in corpus if word in i]))

def compute_tfidf(corpus):
    documents_list = []
    for text in corpus:
        tf_idf_dictionary = {}
        computed_tf = compute_tf(text)
        for word in computed_tf:
            tf_idf_dictionary[word] = computed_tf[word] * compute_idf(word, corpus)
        documents_list.append(tf_idf_dictionary)
    return documents_list

In [10]:
tfidf = compute_tfidf(chapters)
df_tfidf = pd.DataFrame.from_records(tfidf).fillna(0).transpose()

df_tfidf.rename(columns = {"index":"words"},inplace = True)

In [11]:
for i in range(len(chapter_index)-1): 
    print('-------------------------')
    print ('CHAPTER {}'.format(i+1))
    print(df_tfidf[[i]].nlargest(10,[i]).rename(columns = {i:"tf-idf"}))

-------------------------
CHAPTER 1
           tf-idf
bat      0.005829
key      0.004510
dark     0.004042
poison   0.004042
candle   0.004042
bottle   0.003886
eat      0.003323
fell     0.003007
marked   0.003007
passage  0.002914
-------------------------
CHAPTER 2
           tf-idf
mouse    0.007312
swam     0.006485
pool     0.005789
mabel    0.005188
glove    0.003741
fan      0.003741
dog      0.003741
cat      0.003656
kid      0.002806
capital  0.002806
-------------------------
CHAPTER 3
              tf-idf
dodo        0.013264
mouse       0.011342
prize       0.009198
lory        0.006632
thimble     0.006132
dry         0.005131
caucusrace  0.004599
tale        0.003316
bird        0.003240
wet         0.003066
-------------------------
CHAPTER 4
           tf-idf
window   0.008472
puppy    0.007413
bottle   0.004582
glove    0.004582
fan      0.004582
mary     0.004236
ann      0.004236
honour   0.004236
chimney  0.003818
rabbit   0.003216
-------------------------
CHAPT

# Find the Top 10 most used verbs in sentences with Alice

For this task, it is necessary to perform another data preprocessing by breaking the text into sentences and POS Tagging

 POS Tagging is a task of labelling each word in a sentence with its appropriate part of speech

In [12]:
text_sentences = sent_tokenize(text)

In [13]:
sentences_with_alice = [sen.lower() for sen in text_sentences if 'alice' in sen.lower()]

<li> VB  verb, base form take</li>
<li>VBD verb, past tense took</li>
<li>VBG verb, gerund/present participle taking</li>
<li>VBN verb, past participle taken</li>
<li>VBP verb, sing. present, non-3d take</li>
<li>VBZ verb, 3rd person sing. present takes</li>

In [14]:
verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP','VBZ' ]
sentences_verb = []
for sen in sentences_with_alice:
    sen = re.sub("<.*?>", " ", sen)
    sen = re.sub("_", " ", sen)
    sen = re.sub(r"[^\w\s]", "", sen)
    tokens_sen = word_tokenize(sen)
    tokens_sen = [w for w in tokens_sen if not w in stop_words ]
    tokens_sen = pos_tag(tokens_sen)
    sen_verb = [verb for verb,tag in tokens_sen if tag in verb_tags]
    tokens_sen = [lemmatizer.lemmatize(i,'v') for i in sen_verb]
    sentences_verb.append(tokens_sen)  
verbs=[j for i in sentences_verb for j in i]

In [15]:
count_verb = Counter(verbs)

In [30]:
print('Most frequent verbs:')
count_verb.most_common(10)

Most frequent verbs:


[('say', 295),
 ('go', 67),
 ('think', 65),
 ('look', 46),
 ('know', 41),
 ('begin', 40),
 ('get', 40),
 ('come', 40),
 ('make', 20),
 ('take', 20)]

# Find Top 100 most used verbs in sentences with Alice (word2vec)

Pretrain model was downloaded from:

https://code.google.com/archive/p/word2vec/

In [17]:
model = gensim.models.KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin", binary=True)

In [18]:
verb_100 = count_verb.most_common(100)

In [19]:
embeddings = []
words = []
clusters = []

for word  in verb_100:
    for similar_word, _ in model.most_similar(word[0], topn=30):
        words.append(similar_word)
        embeddings.append(model[similar_word])
        clusters.append(word[0])

In [21]:
tsne = TSNE(perplexity=15, n_components=2,n_iter=3500, random_state=32)
embeddings = tsne.fit_transform(embeddings)

In [28]:
fig = px.scatter(
    x=embeddings[:, 0],
    y=embeddings[:, 1],
    color=clusters,
    template="plotly_white",
# )
    text=words)
fig.update_traces(textposition="top center")
fig.show()