# spaCy & NLTK

## Install and setup 

In [3]:
# !pip install -U spacy
# !python -m spacy download en
# !python -m spacy download en_core_web_lg

## Woring with spacy

In [4]:
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)

Tesla PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.S. PROPN compound
startup NOUN dobj
for ADP prep
$ SYM quantmod
6 NUM compound
million NUM pobj


## Working with NLTK 
*More info: https://www.nltk.org/*

In [27]:
import nltk
sentence = """At eight o'clock on Thursday morning, Arthur didn't feel very good."""
tokens = nltk.word_tokenize(sentence)
np.array(tokens)

array(['At', 'eight', "o'clock", 'on', 'Thursday', 'morning', ',',
       'Arthur', 'did', "n't", 'feel', 'very', 'good', '.'], dtype='<U8')

In [29]:
# nltk.download('averaged_perceptron_tagger')
# https://www.nltk.org/book/ch05.html
tagged = nltk.pos_tag(tokens)
tagged[0:6]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aqi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN')]

In [35]:
# nltk.download('treebank')
from nltk.corpus import treebank
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t.draw()

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\aqi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


KeyboardInterrupt: 

# Pipeline 

<img src="https://spacy.io/usage/spacy-101#pipelines/pipeline1.png" width="600">

- Text - > tokenizer -> tagger - > parser -> ner -> doc 
        ---------------nlp-----------

*More info about spaCy pipeline: https://spacy.io/usage/spacy-101#pipelines*

## Tokenization 

In [15]:
doc2 = nlp(u"Tesla isn't   looking into startups anymore.")

for token in doc2:
    print(f"Token:{token.text}")

Token:Tesla
Token:is
Token:n't
Token:  
Token:looking
Token:into
Token:startups
Token:anymore
Token:.


In [30]:
text = "This is New York, isn't it?"

In [32]:
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'New', 'York,', "isn't", 'it?']

In [33]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'New', 'York', ',', 'is', "n't", 'it', '?']

In [36]:
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['This', 'is', 'New', 'York', ',', 'isn', "'", 't', 'it', '?']

## Part-of-Speech Tagging (POS)
`Apple` was recognized to be a ***proper noun***


*more information https://spacy.io/api/annotation#pos-tagging* 

In [16]:
doc2 = nlp(u"I bought a new iphone.")

for token in doc2:
    print(f"token:{token.text}; POS:{token.pos_};")

token:I; POS:PRON;
token:bought; POS:VERB;
token:a; POS:DET;
token:new; POS:ADJ;
token:iphone; POS:NOUN;
token:.; POS:PUNCT;


## Dependencies

*more information https://spacy.io/api/annotation#dependency-parsing*

In [19]:
doc2 = nlp(u"I bought a new iphone.")

for token in doc2:
    print(f"Token:{token.text}; Dependency:{token.dep_}")

Token:I; Dependency:nsubj
Token:bought; Dependency:ROOT
Token:a; Dependency:det
Token:new; Dependency:amod
Token:iphone; Dependency:dobj
Token:.; Dependency:punct


In [20]:
spacy.explain('PROPN')

'proper noun'

In [21]:
spacy.explain('nsubj')

'nominal subject'

# Token normalization 

## stemming 

**Function**: remove and replacing suffixes to get the root form of the word 

example 

cats -> cat

cares -> cat

talked -> talk

In [39]:
## stemming 
text = "feet cats wolves talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

In [40]:
stemmer = nltk.stem.PorterStemmer()
" ".join(stemmer.stem(token) for token in tokens)

'feet cat wolv talk'

## Lemmatization 

**Function**: return the base or dictionary form of words 
Example: 

feet -> foot

wolve -> wolf

In [43]:
nltk.download('wordnet')
stemmer = nltk.stem.WordNetLemmatizer()
" ".join(stemmer.lemmatize(token) for token in tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aqi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


'foot cat wolf talked'

# BOW : Bag of Words 

## Ngram 

In [51]:
import re
import pandas as pd
def pre_process(text):
    # lowercase
    text=text.lower()
    #remove tags
    text=re.sub("<!--?.*?-->","",text)
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    # change white space to one 
    text=re.sub("\s+"," ",text)
    # remove start and end white spaces 
    text = text.strip()
    return text

In [52]:
corpus = ['i love apple',
         'i love sunshine',
         'the sky is blue',
         'the gass is green',
          'Giraff is tall',
         'the cat is so cute',
          'i hate chocolate',
        'snake is horrible']
text_sents_clean = [pre_process(s) for s in corpus]
text_sents_clean

['i love apple',
 'i love sunshine',
 'the sky is blue',
 'the gass is green',
 'giraff is tall',
 'the cat is so cute',
 'i hate chocolate',
 'snake is horrible']

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 1000, # keep top 1000 terms 
max_df = 0.5, 
smooth_idf=True,
ngram_range=(1,2))

X = vectorizer.fit_transform(text_sents_clean)

X.shape # check shape of the document-term matrix

(8, 23)

In [67]:
pd.DataFrame(X.todense(), columns=vectorizer.get_feature_names())

Unnamed: 0,apple,blue,cat,cat cute,chocolate,cute,gass,gass green,giraff,giraff tall,...,horrible,love,love apple,love sunshine,sky,sky blue,snake,snake horrible,sunshine,tall
0,0.608313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.509814,0.608313,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.509814,0.0,0.608313,0.0,0.0,0.0,0.0,0.608313,0.0
2,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735
5,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.57735,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0
