In [3]:
import spacy
from spacy.lang.en import English
from spacy import displacy
import pandas as pd

In [4]:
nlp = spacy.load('en_core_web_md')

In [44]:
test_sent = "Pakistan got independence in 1947. Karachi, Lahore and Islamabad are few of the major cities of Pakistan."

In [47]:
spacy.explain("GPE")

'Countries, cities, states'

In [48]:
parsed_sent = nlp(test_sent)
spacy.displacy.render(parsed_sent, style='ent',jupyter=True)

In [49]:
sentence_spans = list(parsed_sent.sents)                                                                      #connection between words and how they are related
displacy.render(sentence_spans, style='dep', jupyter=True)

In [50]:
for token in parsed_sent:
    print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)")

Pakistan GPE
got (not an entity)
independence (not an entity)
in (not an entity)
1947 DATE
. (not an entity)
Karachi GPE
, (not an entity)
Lahore GPE
and (not an entity)
Islamabad GPE
are (not an entity)
few (not an entity)
of (not an entity)
the (not an entity)
major (not an entity)
cities (not an entity)
of (not an entity)
Pakistan GPE
. (not an entity)


In [52]:
df_token = pd.DataFrame()

for i, token in enumerate(parsed_sent):
    df_token.loc[i, 'text'] = token.text
    df_token.loc[i, 'lemma'] = token.lemma_,                                                                #Lemmatization -->(got lemmatized form is get & are --> be)
    df_token.loc[i, 'pos'] = token.pos_
    df_token.loc[i, 'tag'] = token.tag_
    df_token.loc[i, 'dep'] = token.dep_
    df_token.loc[i, 'shape'] = token.shape_
    df_token.loc[i, 'is_alpha'] = token.is_alpha
    df_token.loc[i, 'is_stop'] = token.is_stop
    
print(df_token)

            text            lemma    pos  tag    dep  shape is_alpha is_stop
0       Pakistan         Pakistan  PROPN  NNP  nsubj  Xxxxx     True   False
1            got           (get,)   VERB  VBD   ROOT    xxx     True   False
2   independence  (independence,)   NOUN   NN   dobj   xxxx     True   False
3             in            (in,)    ADP   IN   prep     xx     True    True
4           1947          (1947,)    NUM   CD   pobj   dddd    False   False
5              .             (.,)  PUNCT    .  punct      .    False   False
6        Karachi       (Karachi,)  PROPN  NNP  nsubj  Xxxxx     True   False
7              ,             (,,)  PUNCT    ,  punct      ,    False   False
8         Lahore        (Lahore,)  PROPN  NNP   conj  Xxxxx     True   False
9            and           (and,)  CCONJ   CC     cc    xxx     True    True
10     Islamabad     (Islamabad,)  PROPN  NNP   conj  Xxxxx     True   False
11           are            (be,)    AUX  VBP   ROOT    xxx     True    True

In [53]:
import nltk                                                                                                 #Library for Stemming

In [54]:
from nltk.stem import PorterStemmer

In [55]:
porter = PorterStemmer()

print(porter.stem("cats"))
print(porter.stem("trouble"))                                                                           #stem of all trouble,troubling,troubled words --> troubl
print(porter.stem("troubling"))
print(porter.stem("troubled"))
print(porter.stem("connections"))
print(porter.stem("connected"))
print(porter.stem("connecting"))
print(porter.stem("connection"))

cat
troubl
troubl
troubl
connect
connect
connect
connect


In [56]:
print(porter.stem("stabilize"))
print(porter.stem("destabilize"))
print(porter.stem("football"))
print(porter.stem("studies"))
print(porter.stem("studying"))
print(porter.stem("beautiful"))
print(porter.stem("beauty"))

stabil
destabil
footbal
studi
studi
beauti
beauti


In [61]:
test_token = "stablize destablize football studies studying beautiful beauty"
parsed_sent = nlp(test_token)
for token in parsed_sent:
    print(token.text, token.lemma_)                                                                                 #Lemmatization(spacy) --> studi stem of studies whereas when lemmatize = study

stablize stablize
destablize destablize
football football
studies study
studying study
beautiful beautiful
beauty beauty


In [5]:
raw_text = "My favorite dog is fluffy and tan. My cat is white with brown spots."

In [6]:
spacy_text = nlp(raw_text)
for sentence in spacy_text.sents:                                                                                   #Break in sentences
    print(sentence)

My favorite dog is fluffy and tan.
My cat is white with brown spots.


In [7]:
token_list = []
for sentence in spacy_text.sents:
    for tokens in sentence:
        token_list.append(tokens)                                                                                                       #Tokenization
print(token_list)

[My, favorite, dog, is, fluffy, and, tan, ., My, cat, is, white, with, brown, spots, .]


In [12]:
import textacy

In [17]:
ngrams = list(textacy.extract.ngrams(spacy_text, 2, filter_stops=False))                                                                #bigram tokenization (pairs)
print(ngrams)

[My favorite, favorite dog, dog is, is fluffy, fluffy and, and tan, My cat, cat is, is white, white with, with brown, brown spots]


In [15]:
ngrams = list(textacy.extract.ngrams(spacy_text, 1, filter_stops=False))                                                                #unigram tokenization
print(ngrams)

[My, favorite, dog, is, fluffy, and, tan, My, cat, is, white, with, brown, spots]


In [18]:
ngrams = list(textacy.extract.ngrams(spacy_text, 1, filter_stops=True))                                                                #unigram tokenization with removig stop words
print(ngrams)

[favorite, dog, fluffy, tan, cat, white, brown, spots]
