## 0. Import Dependencies

In [7]:
import urllib.request
import os

## 1. Download text from Gutenberg

In [10]:
# Pride and Prejudice
URL = "" 
book_name = "PrideAndPrejudice"
urllib.request.urlretrieve(URL, './' + book_name + '.txt')

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1129)>

## 2. Divide Text into Paragraphs

In [12]:
# open and read text file
with open('PrideAndPrejudice.txt', 'r') as text_file:
    text = text_file.read()

##text

In [17]:
paragraphs = text.split('\n\n')
paragraphs[1:2]

['This eBook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this eBook or online at\nwww.gutenberg.org. If you are not located in the United States, you\nwill have to check the laws of the country where you are located before\nusing this eBook.']

## 3. Find and clean 1st Paragraph

In [6]:
def find_first_para(toto):
    for para_index, para in enumerate(paragraphs):
        if "\nChapter 1" in para and paragraphs[para_index-1]=='':
            return para_index+1

In [7]:
first_para_id = find_first_para(paragraphs)
first_para = paragraphs[first_para_id]
first_para

'      It is a truth universally acknowledged, that a single man in\n      possession of a good fortune, must be in want of a wife.'

In [8]:
first_para = first_para.split(' ')
first_para = [token for token in first_para if token != '']
first_para = ' '.join(first_para)
first_para = first_para.replace('\n', '')
first_para

'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife.'

## 4. Installing Spacy

In [4]:
!pip3 install spacy

Collecting spacy
  Using cached spacy-3.1.3-cp39-cp39-macosx_10_9_x86_64.whl (6.2 MB)
Collecting tqdm<5.0.0,>=4.38.0
  Using cached tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Collecting requests<3.0.0,>=2.13.0
  Using cached requests-2.26.0-py2.py3-none-any.whl (62 kB)
Collecting numpy>=1.15.0
  Using cached numpy-1.21.2-cp39-cp39-macosx_10_9_x86_64.whl (17.0 MB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl (32 kB)
Collecting thinc<8.1.0,>=8.0.9
  Using cached thinc-8.0.10-cp39-cp39-macosx_10_9_x86_64.whl (612 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.5-cp39-cp39-macosx_10_9_x86_64.whl (18 kB)
Collecting typer<0.5.0,>=0.3.0
  Using cached typer-0.4.0-py3-none-any.whl (27 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Using cached pydantic-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl (2.7 MB)
Collecting spacy-legacy<3.1.0,>=3.0.8
  Using cached spacy_legacy-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting preshed<3.1

In [6]:
# charger le modèle 
!python3 -m spacy download 'en_core_web_sm'

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 18.0 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.1.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


 ## 5. Using Spacy to Analyze Text 
 https://spacy.io/
 
 https://realpython.com/natural-language-processing-spacy-python/#using-spacy for a detailed tutorial

In [11]:
import spacy

In [12]:
# instatiate model and wrap it around the 1st para
nlp = spacy.load ('en_core_web_sm')
doc = nlp(first_para) # you can replace this with a different longer text to expermient with spacy functionalities

### 5.1 Tokenization
Tokenization allows you to identify the basic units in your text. 
These basic units are called tokens. 

In [13]:
for token in doc :
     print(token.text, token.idx)

It 0
is 3
a 6
truth 8
universally 14
acknowledged 26
, 38
that 40
a 45
single 47
man 54
in 58
possession 61
of 72
a 75
good 77
fortune 82
, 89
must 91
be 96
in 99
want 102
of 107
a 110
wife 112
. 116


In [14]:
# Same thing using a list comprehension
tokens = [(token.text, token.idx) for token in doc]
tokens

[('It', 0),
 ('is', 3),
 ('a', 6),
 ('truth', 8),
 ('universally', 14),
 ('acknowledged', 26),
 (',', 38),
 ('that', 40),
 ('a', 45),
 ('single', 47),
 ('man', 54),
 ('in', 58),
 ('possession', 61),
 ('of', 72),
 ('a', 75),
 ('good', 77),
 ('fortune', 82),
 (',', 89),
 ('must', 91),
 ('be', 96),
 ('in', 99),
 ('want', 102),
 ('of', 107),
 ('a', 110),
 ('wife', 112),
 ('.', 116)]

### 5.2 Lemmatization
Lemmatization is the process of reducing inflected forms of a word while still ensuring that the reduced form belongs to the language. This reduced form or root word is called a lemma.
For example, organizes, organized and organizing are all forms of organize. Here, organize is the lemma. 


In [15]:
for token in doc :
    print(token, token.lemma_)

It it
is be
a a
truth truth
universally universally
acknowledged acknowledge
, ,
that that
a a
single single
man man
in in
possession possession
of of
a a
good good
fortune fortune
, ,
must must
be be
in in
want want
of of
a a
wife wife
. .


### 5.3 POS (Part of Speech) Tagging
Here, two attributes of the Token class are accessed:

* tag_ lists the fine-grained part of speech.
* pos_ lists the coarse-grained part of speech.


In [16]:
for token in doc:
    print((token, token.tag_, token.pos_, spacy.explain(token.tag_)))

(It, 'PRP', 'PRON', 'pronoun, personal')
(is, 'VBZ', 'AUX', 'verb, 3rd person singular present')
(a, 'DT', 'DET', 'determiner')
(truth, 'NN', 'NOUN', 'noun, singular or mass')
(universally, 'RB', 'ADV', 'adverb')
(acknowledged, 'VBD', 'VERB', 'verb, past tense')
(,, ',', 'PUNCT', 'punctuation mark, comma')
(that, 'IN', 'SCONJ', 'conjunction, subordinating or preposition')
(a, 'DT', 'DET', 'determiner')
(single, 'JJ', 'ADJ', 'adjective (English), other noun-modifier (Chinese)')
(man, 'NN', 'NOUN', 'noun, singular or mass')
(in, 'IN', 'ADP', 'conjunction, subordinating or preposition')
(possession, 'NN', 'NOUN', 'noun, singular or mass')
(of, 'IN', 'ADP', 'conjunction, subordinating or preposition')
(a, 'DT', 'DET', 'determiner')
(good, 'JJ', 'ADJ', 'adjective (English), other noun-modifier (Chinese)')
(fortune, 'NN', 'NOUN', 'noun, singular or mass')
(,, ',', 'PUNCT', 'punctuation mark, comma')
(must, 'MD', 'AUX', 'verb, modal auxiliary')
(be, 'VB', 'VERB', 'verb, base form')
(in, 'IN',

### 5.4 Visualization: Using displaCy
spaCy comes with a built-in visualizer called displaCy. You can use it to visualize a dependency parse or named entities in a browser or a Jupyter notebook.

In [17]:
from spacy import displacy

In [None]:
displacy.serve(doc, style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [03/Oct/2021 20:25:01] "GET / HTTP/1.1" 200 19281
