In [None]:
import re
import nltk
import spacy
import operator
import pandas as pd
from itertools import islice
import gensim

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from urllib.request import urlopen
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK data packages used further down: the 'punkt' tokenizer models
# and the 'stopwords' corpus. Each call returns True on success, which is why
# the cell displays `True` (the value of the last expression).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import sys
# Install spaCy with the interpreter that runs this kernel (sys.executable),
# rather than whichever `pip` happens to be first on PATH.
!{sys.executable} -m pip install spacy
# Download spaCy's 'en' model. In the recorded run this installed
# en_core_web_sm 2.2.5 and linked it under the shortcut name 'en'.
!{sys.executable} -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 7.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


#### Data reading

In [None]:
# Plain-text (UTF-8) edition of the book on Project Gutenberg.
url = "https://www.gutenberg.org/files/11/11-0.txt"

# The context manager closes the HTTP response even if reading or decoding
# fails; reading the body in one go avoids repeated string concatenation.
with urlopen(url) as data_url:
  data_raw = data_url.read().decode("utf-8")

# Keep only the text that precedes the first 'THE END' marker.
data_raw = data_raw.split('THE END')[0]

# Split on the chapter heading keyword; every piece starts right after 'CHAPTER'.
data_raw = data_raw.split('CHAPTER')

# Drop the leading 13 pieces so the list starts at " I.\r\nDown the Rabbit-Hole".
# With 12 chapters left afterwards, these are presumably the front matter plus
# the 12 table-of-contents entries -- TODO confirm against the source text.
del data_raw[0:13]
data_raw

[' I.\r\nDown the Rabbit-Hole\r\n\r\n\r\nAlice was beginning to get very tired of sitting by her sister on the\r\nbank, and of having nothing to do: once or twice she had peeped into\r\nthe book her sister was reading, but it had no pictures or\r\nconversations in it, “and what is the use of a book,” thought Alice\r\n“without pictures or conversations?”\r\n\r\nSo she was considering in her own mind (as well as she could, for the\r\nhot day made her feel very sleepy and stupid), whether the pleasure of\r\nmaking a daisy-chain would be worth the trouble of getting up and\r\npicking the daisies, when suddenly a White Rabbit with pink eyes ran\r\nclose by her.\r\n\r\nThere was nothing so _very_ remarkable in that; nor did Alice think it\r\nso _very_ much out of the way to hear the Rabbit say to itself, “Oh\r\ndear! Oh dear! I shall be late!” (when she thought it over afterwards,\r\nit occurred to her that she ought to have wondered at this, but at the\r\ntime it all seemed quite natural); 

#### Data Preprocessing

In [None]:
# spaCy pipelines cached by model name, so the model is loaded once per kernel
# instead of once per data_preprocessing() call (i.e. once per chapter).
_SPACY_MODELS = {}


def _get_lemmatizer(model_name='en_core_web_sm'):
  """Return a cached spaCy pipeline with the parser and NER components disabled."""
  if model_name not in _SPACY_MODELS:
    _SPACY_MODELS[model_name] = spacy.load(model_name, disable=['parser', 'ner'])
  return _SPACY_MODELS[model_name]


def data_preprocessing(data):
  """Clean and lemmatize a raw text.

  Steps, in order: strip URLs, turn line breaks into spaces, drop digits,
  drop punctuation, lower-case, lemmatize with spaCy, and remove the
  '-PRON-' placeholder that spaCy 2.x emits as the lemma of pronouns.

  Args:
    data: the raw text as a single string.

  Returns:
    A string of lemmas separated by single spaces.
  """
  # remove urls
  data = re.sub(r"http\S+", ' ', data)
  data = re.sub(r"www\S+", ' ', data)

  # replace line breaks (and leading/trailing whitespace runs) with a space;
  # tabs inside the text are left untouched by this pattern
  data = re.sub(r"^\s+|\n|\r|\s+$", ' ', data)

  # remove digits
  data = re.sub(r"\d+", ' ', data)

  # Em dashes join clauses without surrounding spaces ("distance—but"), so
  # deleting them would glue two words together; turn them into spaces first.
  data = data.replace('—', ' ')

  # remove punctuation marks (the backslash is escaped explicitly: a bare "\,"
  # is an invalid escape sequence and triggers a warning on recent Pythons)
  punct_mark = '''!()-[]{};:'"\\,<>./?@#'“”’‘—$%^&*_~'''
  data = "".join([char for char in data if char not in punct_mark and not char.isdigit()])

  # lowercase conversion
  data = data.lower()

  # lemmatization: 'en_core_web_sm' is the package that the 'en' shortcut
  # pointed to, and the name that the verb-counting cell already loads
  doc = _get_lemmatizer()(data)
  # extract the lemma for each token and join
  data = " ".join([token.lemma_ for token in doc])
  # remove -PRON-
  data = data.replace('-PRON-', ' ')

  return data

In [None]:
# tokenization function
def tokenization(data, flag='w'):
  """Tokenize a text with NLTK.

  Args:
    data: the text to split.
    flag: 'w' for word tokens (default), 's' for sentences.

  Returns:
    A list of word tokens or of sentences, depending on `flag`.

  Raises:
    ValueError: if `flag` is neither 'w' nor 's'. Previously such a call
      silently returned None, which only failed later in the pipeline.
  """
  if flag == 'w':
    # word tokenization
    return word_tokenize(data)
  if flag == 's':
    # sentences tokenization
    return sent_tokenize(data)
  raise ValueError(f"flag must be 'w' (words) or 's' (sentences), got {flag!r}")

In [None]:
# remove stop-words
def stop_words(data):
  """Return the tokens of `data` that are not NLTK English stop words.

  Args:
    data: an iterable of word tokens (strings).

  Returns:
    A list with the remaining tokens in their original order.
  """
  # Build the lookup once per call: the original re-evaluated
  # stopwords.words('english') for every token and searched it as a list.
  english_stop_words = set(stopwords.words('english'))
  return [word for word in data if word not in english_stop_words]

#### Most important words in each chapter

In [None]:
# data_preprocessing

# Per chapter: clean + lemmatize the text, split it into word tokens,
# then drop the stop words. `data` ends up as one token list per chapter.
data = [
  stop_words(tokenization(data_preprocessing(chapter_text), 'w'))
  for chapter_text in data_raw
]

In [None]:
# Sanity check: number of token lists, one per chapter (12 in the recorded run).
len(data)

12

In [None]:
# TF-IDF 
# The chapters are already lists of tokens, so the vectorizer gets an identity
# tokenizer and must not lowercase (str.lower does not exist on a list).
# token_pattern=None silences scikit-learn's "token_pattern will not be used"
# warning that is raised when a custom tokenizer is supplied.
tfidf = TfidfVectorizer(lowercase=False, tokenizer=lambda tokens: tokens, token_pattern=None)

# rows = chapters, columns = vocabulary words, values = TF-IDF weights
data_array = tfidf.fit_transform(data).toarray()
vocab = tfidf.vocabulary_

# column index -> word
reverse_vocab = {v:k for k,v in vocab.items()}
# show only a few entries; the full mapping has one item per vocabulary word
print(dict(islice(reverse_vocab.items(), 10)))

feature_names = tfidf.get_feature_names_out()

data_tfidf = pd.DataFrame(data_array, columns = feature_names)
print(data_tfidf)

words = [word for word in data_tfidf.columns.values.tolist()]

# how many of the highest-weighted words to report per chapter
TOP_N = 10

for num_chapter, chapter_scores in enumerate(data_array):
  # pair every word with its weight in this chapter and rank by weight;
  # reading the row straight from the array avoids a chained
  # DataFrame lookup for every single word
  sorted_words = sorted(zip(words, chapter_scores), key=operator.itemgetter(1), reverse=True)

  # rows are 0-based, the book's chapters are numbered from 1
  print(f'CHAPTER {num_chapter + 1}: ')
  # slicing (instead of indexing 0..9) also copes with fewer than TOP_N words
  for word, score in sorted_words[:TOP_N]:
    print(f'\t{word}: {round(score, 2)}', end='  ')
  print()

       abide      able   absence  ...    youth   zealand   zigzag
0   0.000000  0.000000  0.000000  ...  0.00000  0.029697  0.00000
1   0.000000  0.029254  0.000000  ...  0.00000  0.000000  0.00000
2   0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
3   0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
4   0.000000  0.000000  0.000000  ...  0.13422  0.000000  0.02237
5   0.020519  0.000000  0.000000  ...  0.00000  0.000000  0.00000
6   0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
7   0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
8   0.000000  0.000000  0.018471  ...  0.00000  0.000000  0.00000
9   0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
10  0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000
11  0.000000  0.000000  0.000000  ...  0.00000  0.000000  0.00000

[12 rows x 2024 columns]
CHAPTER 0: 
	alice: 0.29  	fall: 0.18  	eat: 0.18  	think: 0.18  	say: 0.17  	go: 0.16  	little: 0.16  	see: 0.16  	

#### Top 10 most used verbs in sentences with Alice

Algorithm:

Tokenize text by sentences

Delete sentences without Alice

Find the verbs that stand directly before or after "alice" and count how often each one occurs




In [None]:
# data preprocessing
# Sentence-level corpus: lower-case the chapters, split them into sentences,
# clean and lemmatize every sentence, then keep those that mention "alice".
data = [chapter.lower() for chapter in data_raw]

# replace line breaks (and leading/trailing whitespace runs) with a space
data = [re.sub(r"^\s+|\n|\r|\s+$", ' ', chapter) for chapter in data]

# tokenization by sentences
data = [tokenization(chapter, 's') for chapter in data]

# merging lists: flatten the per-chapter sentence lists into one list
data_merged = [sentence for chapter_sentences in data for sentence in chapter_sentences]

# remove digits
data = [re.sub(r"\d+", ' ', sentence) for sentence in data_merged]

# Em dashes join words without surrounding spaces ("alice—and"); deleting them
# would fuse the neighbours and hide "alice" from the adjacency search below,
# so they become spaces before the remaining punctuation is stripped.
data = [sentence.replace('—', ' ') for sentence in data]

# remove punctuation marks (backslash escaped explicitly: "\," is an invalid escape)
punct_mark = '''!()-[]{};:'"\\,<>./?@#'“”’‘—$%^&*_~'''
data = ["".join([char for char in sentence if char not in punct_mark and not char.isdigit()]) for sentence in data]

# removing stop-words; the stop-word set is built once instead of once per word
english_stop_words = set(stopwords.words('english'))
data = [" ".join(word for word in sentence.split(" ") if word not in english_stop_words) for sentence in data]

# lemmatization

# Load the model under its full package name (the one the verb-counting cell
# uses as well), keeping only the components needed for lemmatization.
load_model = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# extract the lemma for each token, join, and remove the -PRON- placeholder
data = [
  " ".join(token.lemma_ for token in load_model(sentence)).replace('-PRON-', ' ')
  for sentence in data
]

# map "could" to "can" as a whole word only, so that other words containing
# the letters "could" are not rewritten
data = [re.sub(r"\bcould\b", 'can', sentence) for sentence in data]

# keep every sentence that mentions alice; the previous index loop stopped at
# len(data)-1 and therefore never looked at the last sentence
data_clean = [sentence for sentence in data if "alice" in sentence]

# removing whitespace 
data_clean = [re.sub(r'\s+', ' ', sentence) for sentence in data_clean]

data_clean

[' rabbithole alice begin get tired sit sister bank nothing twice peep book sister reading picture conversation use book think alice without picture conversation consider mind well can hot day make feel sleepy stupid whether pleasure make daisychain would worth trouble get pick daisy suddenly white rabbit pink eye run close',
 'nothing remarkable alice think much way hear rabbit say oh dear',
 'shall late thought afterwards occur ought wonder time seem quite natural rabbit actually take watch waistcoatpocket look hurried alice start foot flash across mind never see rabbit either waistcoatpocket watch take burn curiosity run across field fortunately time see pop large rabbithole hedge',
 'another moment go alice never consider world get',
 'rabbithole go straight like tunnel way dip suddenly suddenly alice moment think stop find fall deep well',
 'well think alice fall shall think nothing tumble stair',
 'let see would four thousand mile think see alice learnt several thing sort lesson 

In [None]:
'''
  Find combinations: 
    alice + VERB
    VERB + alice
'''

# compiling a dictionary of verbs: verb text -> number of times it occurs
# immediately before or after the token "alice"
verbs_dict = {}

nlp = spacy.load('en_core_web_sm')
for sentence in data_clean:
  doc = nlp(sentence)
  # walk over every pair of neighbouring tokens (left, right)
  for idx in range(len(doc) - 1):
    left, right = doc[idx], doc[idx + 1]
    if left.text == "alice" and right.pos_ == "VERB":
      verb = right.text
    elif left.pos_ == "VERB" and right.text == "alice":
      verb = left.text
    else:
      continue
    verbs_dict[verb] = verbs_dict.get(verb, 0) + 1

In [None]:
# rank the (verb, count) pairs by count, most frequent first

sorted_tuples = sorted(verbs_dict.items(), key=lambda pair: pair[1], reverse=True)
top_verbs = sorted_tuples[:10]
print(top_verbs)

[('say', 129), ('think', 30), ('go', 15), ('look', 13), ('begin', 12), ('can', 12), ('feel', 9), ('cry', 7), ('turn', 6), ('know', 5)]


In [None]:
# # checking non-lemmatized words
# nlp = spacy.load('en_core_web_sm')
# sentence = nlp("could thought hurried learnt noticed thinking")
# for word in sentence:
#     print(word.text,  word.lemma_)