# spaCy & NLTK

## Install and setup 

In [1]:
# Uncomment and run once to install spaCy and the English models.
# %pip install -U spacy
# !python -m spacy download en_core_web_sm   # small model passed to spacy.load() below
# !python -m spacy download en_core_web_lg

## Working with spaCy

In [2]:
# Bring in spaCy and load the small English model
import spacy
nlp = spacy.load('en_core_web_sm')

# Running the pipeline on a string yields a Doc of annotated tokens
doc = nlp('Tesla is looking at buying U.S. startup for $6 million')

# One line per token: text, coarse part-of-speech tag, dependency label
for token in doc:
    print(token.text, token.pos_, token.dep_)

ModuleNotFoundError: No module named 'spacy'

## Working with NLTK 
*More info: https://www.nltk.org/*

In [None]:
import nltk
import numpy as np  # required by np.array() below; without it the cell raises NameError

# nltk.download('punkt')  # tokenizer data used by word_tokenize (newer NLTK releases ask for 'punkt_tab')
sentence = """At eight o'clock on Thursday morning, Arthur didn't feel very good."""
# Split the sentence into word and punctuation tokens
tokens = nltk.word_tokenize(sentence)
# Wrap in an array only for a compact display of the token list
np.array(tokens)

In [None]:
# nltk.download('averaged_perceptron_tagger')
# Tag reference: https://www.nltk.org/book/ch05.html
# Tag every token with its part of speech, then preview the first six results
tagged = nltk.pos_tag(tokens)
tagged[:6]

In [None]:
# nltk.download('treebank')
from nltk.corpus import treebank

# First parsed sentence of the sample Penn Treebank file
t = treebank.parsed_sents('wsj_0001.mrg')[0]

# Tree.draw() opens a separate Tk window: it blocks the kernel until the window is
# closed and fails on headless servers. pretty_print() renders the same tree as
# text directly in the cell output, so the notebook survives "Run All".
t.pretty_print()

# Pipeline 

<img src="https://spacy.io/usage/spacy-101#pipelines/pipeline1.png" width="600">

- Text -> tokenizer -> tagger -> parser -> ner -> doc 
        ---------------nlp-----------

*More info about spaCy pipeline: https://spacy.io/usage/spacy-101#pipelines*

## Tokenization 

In [None]:
# The contraction and the irregular spacing make the tokenizer's decisions visible
doc2 = nlp("Tesla isn't   looking into startups anymore.")

# Print the text of each token on its own line
for token in doc2:
    print(f"Token:{token.text}")

In [None]:
# Sample sentence reused by the NLTK tokenizer cells that follow
text = "This is New York, isn't it?"

In [None]:
# Split on whitespace only, so punctuation stays attached to the neighbouring word
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

In [None]:
# Tokenize following the Penn Treebank conventions (punctuation and contractions are split off)
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenizer.tokenize(text)

In [None]:
# Split into runs of word characters and runs of punctuation
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

## Part-of-Speech Tagging (POS)
For example, a name such as `Apple` is recognized as a ***proper noun*** (`PROPN`).


*more information https://spacy.io/api/annotation#pos-tagging* 

In [None]:
# Parse a short sentence and list the coarse part-of-speech tag of every token
doc2 = nlp("I bought a new iphone.")

for token in doc2:
    print(f"token:{token.text}; POS:{token.pos_};")

## Dependencies

*more information https://spacy.io/api/annotation#dependency-parsing*

In [None]:
# Parse the same sentence and list the dependency label of every token
doc2 = nlp("I bought a new iphone.")

for token in doc2:
    print(f"Token:{token.text}; Dependency:{token.dep_}")

In [None]:
# Look up the human-readable description of the 'PROPN' part-of-speech tag
spacy.explain('PROPN')

In [None]:
# Look up the human-readable description of the 'nsubj' dependency label
spacy.explain('nsubj')

# Token normalization 

## stemming 

**Function**: removing or replacing suffixes to get the root form of a word 

example 

cats -> cat

cares -> care

talked -> talk

In [None]:
# Stemming input: a few inflected words, tokenized for the stemmer/lemmatizer cells.
# NOTE: this rebinds `text`, `tokenizer` and `tokens` defined in earlier cells.
text = "feet cats wolves talked"
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

In [None]:
# Apply the Porter stemmer to every token and show the stems as one string
stemmer = nltk.stem.PorterStemmer()
" ".join(map(stemmer.stem, tokens))

## Lemmatization 

**Function**: return the base or dictionary form of words 

Example: 

feet -> foot

wolves -> wolf

In [None]:
# The lemmatizer needs the WordNet corpus; nltk.download() is a no-op when it is already up to date
nltk.download('wordnet')
# Use a dedicated name: binding the lemmatizer to `stemmer` would clobber the
# Porter stemmer from the previous cell and mislabel what this object does.
lemmatizer = nltk.stem.WordNetLemmatizer()
" ".join(lemmatizer.lemmatize(token) for token in tokens)

# BOW : Bag of Words 

## Ngram 

In [None]:
import re
import pandas as pd
def pre_process(text: str) -> str:
    """Normalize a raw string into lowercase words separated by single spaces.

    The steps run in this order: lowercase the text, delete ``<!-- ... -->``
    markup, replace every run of digits and non-word characters with one
    space, collapse whitespace, and strip leading/trailing spaces.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string. Underscores are kept because the regex treats them
        as word characters.
    """
    # lowercase
    text = text.lower()
    # remove "<!-- ... -->" markup (the pattern does not match ordinary tags such as <b>)
    text = re.sub(r"<!--?.*?-->", "", text)
    # replace each run of digits / non-word characters with a single space
    text = re.sub(r"(\d|\W)+", " ", text)
    # collapse whitespace runs; the raw string avoids the invalid "\s" escape
    # that newer Python versions warn about in ordinary string literals
    text = re.sub(r"\s+", " ", text)
    # remove leading and trailing spaces
    return text.strip()

In [None]:
# Toy corpus: each string is treated as one document
corpus = [
    'i love apple',
    'i love sunshine',
    'the sky is blue',
    'the gass is green',
    'Giraff is tall',
    'the cat is so cute',
    'i hate chocolate',
    'snake is horrible',
]
# Run every document through pre_process before vectorizing
text_sents_clean = list(map(pre_process, corpus))
text_sents_clean

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams of the cleaned corpus
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,   # keep top 1000 terms
    max_df=0.5,
    smooth_idf=True,
    ngram_range=(1, 2),
)

# Learn the vocabulary and build the document-term matrix in one step
X = vectorizer.fit_transform(text_sents_clean)

X.shape  # check shape of the document-term matrix

In [None]:
# Show the TF-IDF matrix as a table with one column per term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() (available since 1.0) replaces it.
# toarray() gives a plain ndarray rather than the legacy np.matrix returned by todense().
pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())