<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/01a_intro_NLP_and_word_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to Natural Language Processing and Word Embeddings



In [0]:
# Install spaCy pinned to version 2.2.3, then download the small English
# model (en_core_web_sm) that is loaded with spacy.load() further down.
!pip install spacy==2.2.3
!python -m spacy download en_core_web_sm

In [0]:
import spacy

# English pipeline used by the part-of-speech and sentence-boundary examples.
nlp_en = spacy.load('en_core_web_sm')

### Part-of-Speech Tagging

In [0]:
sentence = "Many Japanese children refuse to go to school"
doc = nlp_en(sentence)

# One line per token: surface text, then its pos_ and tag_ attributes.
for tok in doc:
    print(tok.text, tok.pos_, tok.tag_)

Many ADJ JJ
Japanese ADJ JJ
children NOUN NNS
refuse VERB VBP
to PART TO
go VERB VB
to ADP IN
school NOUN NN


### Sentence Boundary Disambiguation

In [0]:
paragraph = (
    'Multi-agent planning uses the cooperation and competition of many agents to achieve a given goal. '
    'Emergent behavior such as this is used by evolutionary algorithms and swarm intelligence.'
)
doc = nlp_en(paragraph)

# doc.sents yields the sentences the pipeline detected, in order.
for sentence in doc.sents:
    print(sentence)

Multi-agent planning uses the cooperation and competition of many agents to achieve a given goal.
Emergent behavior such as this is used by evolutionary algorithms and swarm intelligence.


In [0]:
text = " ".join([
    "Facebook, Inc. is an American social media and technology company",
    "based in Menlo Park, California.",
])
doc = nlp_en(text)

# "Inc." contains a period; the recorded output shows the text is still
# kept together as a single sentence (index 0).
for index, sentence in enumerate(doc.sents):
    print(index, sentence)

0 Facebook, Inc. is an American social media and technology company based in Menlo Park, California.


### spaCy's Language Support

In [0]:
# Download the small Portuguese model; it is imported as a package below.
!python -m spacy download pt_core_news_sm

In [0]:
import pt_core_news_sm as language_pt

# Build the Portuguese pipeline from the imported model package.
nlp_pt = language_pt.load()

In [0]:
headline = "China anuncia redução de tarifas de importação de mais de 850 produtos"
doc = nlp_pt(headline)

# Print each token of the Portuguese headline with its pos_ attribute.
for tok in doc:
    print(tok.text, tok.pos_)

China PROPN
anuncia VERB
redução NOUN
de ADP
tarifas NOUN
de ADP
importação NOUN
de ADP
mais ADV
de ADP
850 NUM
produtos SYM


## Word Embeddings

In [0]:
# Download the large English model, used below for the word-vector examples.
!python -m spacy download en_core_web_lg

In [0]:
import en_core_web_lg as model


In [0]:
# Download the same model from Python rather than through the shell.
# NOTE(review): this looks redundant with the earlier
# `!python -m spacy download en_core_web_lg` cell — confirm which one is intended.
spacy.cli.download('en_core_web_lg')

In [0]:
# Load the large English model by name.
# NOTE(review): the next cell assigns `nlp` again via `model.load()`;
# only one of the two loads appears to be needed.
nlp = spacy.load('en_core_web_lg')

In [0]:
# Load the pipeline through the imported `en_core_web_lg` package (aliased `model`).
# NOTE(review): this rebinds `nlp`, which the previous cell already set with
# spacy.load('en_core_web_lg').
nlp = model.load()

In [0]:
sample = "dog banana chuta"
tokens = nlp(sample)

# Per token: text, whether it has a vector, the vector's norm, and the
# out-of-vocabulary flag.
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

dog True 7.0336733 False
banana True 6.700014 False
chuta False 0.0 True


In [0]:
# Look the lexeme's vector up once, then show its shape and first five components.
banana_vector = nlp.vocab['banana'].vector

print(banana_vector.shape)
print(banana_vector[:5])

(300,)
[ 0.20228  -0.076618  0.37032   0.032845 -0.41957 ]


In [0]:
# Two vocabulary entries whose vectors are compared.
word1, word2 = nlp.vocab['country'], nlp.vocab['nation']

similarity = word1.similarity(word2)
print(similarity)

0.7488121


In [0]:
import numpy as np

# Query the vector table with a single row (the vector of `word1`).
queries = np.array([word1.vector])
n_neighbours = 10

# Unpack into separate names instead of binding the tuple to `most_similar`:
# a later cell defines a function with that name, and re-running this cell
# afterwards would otherwise replace the function with a tuple.
keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=n_neighbours)

print((keys, best_rows, scores))

# One row per query, one column per requested neighbour.
print(keys.shape)

# Key of the last neighbour returned for the (only) query row.
last_key = keys.item((0, n_neighbours - 1))

# Resolve the hash back to its string; as the cell's last expression it is displayed.
nlp.vocab.strings[last_key]

(array([[14089949856107250945, 12290671265767728302,  4000319556510314152,
        12410946551316419195,  4185035456824371994,  4877621044794520897,
        11647056933970380037, 13957816561748780037,  2988880774438019688,
         1499857299936515533]], dtype=uint64), array([[ 9700,   514, 35589, 68168, 11937,  2287, 23526,  1076, 96257,
         3709]], dtype=int32), array([[1.    , 1.    , 1.    , 0.7488, 0.7488, 0.7488, 0.7155, 0.7155,
        0.7155, 0.6419]], dtype=float32))
(1, 10)


'nations'

In [0]:
def most_similar(*words, n=1):
    """Return the vocabulary strings most similar to each of the given words.

    Each word is looked up in ``nlp.vocab`` and its vector is used as one
    query row for ``nlp.vocab.vectors.most_similar``.

    Args:
        *words: Words to query. May be empty.
        n: Number of neighbours requested per word.

    Returns:
        A flat list of strings. Results are grouped by query word in the
        order the words were passed: the neighbours of the first word come
        first, then those of the second word, and so on. An empty list is
        returned when no words are given.
    """
    if not words:
        return []

    # One query row per word.
    queries = np.array([nlp.vocab[word].vector for word in words])
    keys, _best_rows, _scores = nlp.vocab.vectors.most_similar(queries, n=n)

    # `keys` holds one row per query word. Flatten it row by row so every
    # word's neighbours are included (indexing only row 0 with a flat index
    # fails as soon as more than one word is passed). `tolist()` converts
    # the hashes to plain Python ints before the string lookup.
    return [nlp.vocab.strings[key] for key in keys.ravel().tolist()]

# Ten nearest entries to "cold"; the recorded output shows that case
# variants of the same word are returned as separate entries.
print(most_similar('cold', n=10))



['cOLD', 'COLD', 'Cold', 'cold', 'chilly', 'Chilly', 'CHILLY', 'WARM', 'warm', 'Warm']
