In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [7]:
import spacy
import spacy.displacy as displacy

In [8]:
nlp= spacy.load('en_core_web_sm')

### spaCy’s Processing Pipeline

In [9]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

### Tokenization:
Word tokens are the basic units of text involved in any NLPlabeling task. The first step, when processing text, is to split it into tokens.

In [10]:
doc = nlp('Entrepreneur and business magnate, Elon Musk is the CEO and founder of the aerospace company SpaceX and co-founder of Tesla Motors. He also hopes to change the world, advancing private spaceflight with the aim of landing humans on other planets like Mars, and developing a new tube-transport system called the Hyperloop. Live Science will keep you up on Musk’s latest ideas and developments so you can keep your two feet on Earth while looking up at the stars.')

In [11]:
token = [token.text for token in doc]
print(token)

['Entrepreneur', 'and', 'business', 'magnate', ',', 'Elon', 'Musk', 'is', 'the', 'CEO', 'and', 'founder', 'of', 'the', 'aerospace', 'company', 'SpaceX', 'and', 'co', '-', 'founder', 'of', 'Tesla', 'Motors', '.', 'He', 'also', 'hopes', 'to', 'change', 'the', 'world', ',', 'advancing', 'private', 'spaceflight', 'with', 'the', 'aim', 'of', 'landing', 'humans', 'on', 'other', 'planets', 'like', 'Mars', ',', 'and', 'developing', 'a', 'new', 'tube', '-', 'transport', 'system', 'called', 'the', 'Hyperloop', '.', 'Live', 'Science', 'will', 'keep', 'you', 'up', 'on', 'Musk', '’s', 'latest', 'ideas', 'and', 'developments', 'so', 'you', 'can', 'keep', 'your', 'two', 'feet', 'on', 'Earth', 'while', 'looking', 'up', 'at', 'the', 'stars', '.']


### **Sentence Detection**

In [12]:
for sent in doc.sents:
    print(sent)

Entrepreneur and business magnate, Elon Musk is the CEO and founder of the aerospace company SpaceX and co-founder of Tesla Motors.
He also hopes to change the world, advancing private spaceflight with the aim of landing humans on other planets like Mars, and developing a new tube-transport system called the Hyperloop.
Live Science will keep you up on Musk’s latest ideas and developments so you can keep your two feet on Earth while looking up at the stars.


### Stop Words
Stop words are the most common words in a language. In the English language, some examples of stop words are the, are, but, and they. Most sentences need to contain stop words in order to be full sentences that make sense.

In [13]:
all_stop_words = nlp.Defaults.stop_words
print(len(all_stop_words))
print(all_stop_words)

326
{'along', 'among', 'thereafter', 'until', 'get', 'since', 'must', 'sometime', 'perhaps', 'wherein', 'being', 're', 'been', 'about', 'much', 'something', 'least', 'doing', 'through', 'what', 'formerly', 'whence', 'see', 'whom', 'against', 'between', 'this', 'whereby', '’ve', 'therefore', 'indeed', 'in', 'ca', 'with', 'because', 'always', "'ll", 'three', 'well', 'hereupon', 'back', 'of', 'together', 'besides', 'eight', 'ten', 'around', 'herein', 'otherwise', 'more', 'last', 'third', 'myself', 'moreover', 'somehow', 'its', 'make', 'mine', 'five', 'keep', 'neither', 'anyone', 'be', 'if', "n't", 'why', 'elsewhere', 'his', 'when', 'these', '’d', 'such', 'each', 'whereafter', 'ours', 'are', 'none', 'has', 'top', 'their', 'same', 'whoever', 'except', 'take', 'thence', 'per', 'thereby', 'other', 'you', 'him', 'did', 'upon', 'should', 'n’t', 'thru', 'her', 'put', 'most', 'may', 'nobody', 'itself', 'hers', 'alone', 'himself', 'themselves', 'does', 'either', 'but', 'above', 'our', '’s', 'had',

In [14]:
tokenized_words = [word.text for word in doc]
print(tokenized_words)

['Entrepreneur', 'and', 'business', 'magnate', ',', 'Elon', 'Musk', 'is', 'the', 'CEO', 'and', 'founder', 'of', 'the', 'aerospace', 'company', 'SpaceX', 'and', 'co', '-', 'founder', 'of', 'Tesla', 'Motors', '.', 'He', 'also', 'hopes', 'to', 'change', 'the', 'world', ',', 'advancing', 'private', 'spaceflight', 'with', 'the', 'aim', 'of', 'landing', 'humans', 'on', 'other', 'planets', 'like', 'Mars', ',', 'and', 'developing', 'a', 'new', 'tube', '-', 'transport', 'system', 'called', 'the', 'Hyperloop', '.', 'Live', 'Science', 'will', 'keep', 'you', 'up', 'on', 'Musk', '’s', 'latest', 'ideas', 'and', 'developments', 'so', 'you', 'can', 'keep', 'your', 'two', 'feet', 'on', 'Earth', 'while', 'looking', 'up', 'at', 'the', 'stars', '.']


In [15]:
removed_stop_words = [word for word in tokenized_words if word not in all_stop_words]
print(removed_stop_words)

['Entrepreneur', 'business', 'magnate', ',', 'Elon', 'Musk', 'CEO', 'founder', 'aerospace', 'company', 'SpaceX', 'co', '-', 'founder', 'Tesla', 'Motors', '.', 'He', 'hopes', 'change', 'world', ',', 'advancing', 'private', 'spaceflight', 'aim', 'landing', 'humans', 'planets', 'like', 'Mars', ',', 'developing', 'new', 'tube', '-', 'transport', 'system', 'called', 'Hyperloop', '.', 'Live', 'Science', 'Musk', 'latest', 'ideas', 'developments', 'feet', 'Earth', 'looking', 'stars', '.']


### Lemmatization

In [16]:
for word in doc:
    if not word.is_stop and not word.is_punct:
        print(word.text,"|",word.lemma_)

Entrepreneur | Entrepreneur
business | business
magnate | magnate
Elon | Elon
Musk | Musk
CEO | ceo
founder | founder
aerospace | aerospace
company | company
SpaceX | SpaceX
co | co
founder | founder
Tesla | Tesla
Motors | Motors
hopes | hope
change | change
world | world
advancing | advance
private | private
spaceflight | spaceflight
aim | aim
landing | land
humans | human
planets | planet
like | like
Mars | Mars
developing | develop
new | new
tube | tube
transport | transport
system | system
called | call
Hyperloop | Hyperloop
Live | Live
Science | Science
Musk | Musk
latest | late
ideas | idea
developments | development
feet | foot
Earth | Earth
looking | look
stars | star


### Word Frequency

In [17]:
import string
punctuation = string.punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [18]:
words = list(doc)
wordList = [word.lemma_ for word in words if not word.is_stop and not word.is_punct ]
print(wordList)

['Entrepreneur', 'business', 'magnate', 'Elon', 'Musk', 'ceo', 'founder', 'aerospace', 'company', 'SpaceX', 'co', 'founder', 'Tesla', 'Motors', 'hope', 'change', 'world', 'advance', 'private', 'spaceflight', 'aim', 'land', 'human', 'planet', 'like', 'Mars', 'develop', 'new', 'tube', 'transport', 'system', 'call', 'Hyperloop', 'Live', 'Science', 'Musk', 'late', 'idea', 'development', 'foot', 'Earth', 'look', 'star']


In [19]:
from collections import Counter
listOfWords = Counter(wordList)
print(listOfWords)

Counter({'Musk': 2, 'founder': 2, 'Entrepreneur': 1, 'business': 1, 'magnate': 1, 'Elon': 1, 'ceo': 1, 'aerospace': 1, 'company': 1, 'SpaceX': 1, 'co': 1, 'Tesla': 1, 'Motors': 1, 'hope': 1, 'change': 1, 'world': 1, 'advance': 1, 'private': 1, 'spaceflight': 1, 'aim': 1, 'land': 1, 'human': 1, 'planet': 1, 'like': 1, 'Mars': 1, 'develop': 1, 'new': 1, 'tube': 1, 'transport': 1, 'system': 1, 'call': 1, 'Hyperloop': 1, 'Live': 1, 'Science': 1, 'late': 1, 'idea': 1, 'development': 1, 'foot': 1, 'Earth': 1, 'look': 1, 'star': 1})


In [20]:
print(listOfWords.most_common(5))

[('Musk', 2), ('founder', 2), ('Entrepreneur', 1), ('business', 1), ('magnate', 1)]


### Part of Speech Tagging
Part of speech or POS is a grammatical role that explains how a particular word is used in a sentence

In [21]:
for word in doc:
    print(word.text,'|',word.tag_,'|',spacy.explain(word.tag_))

Entrepreneur | NNP | noun, proper singular
and | CC | conjunction, coordinating
business | NN | noun, singular or mass
magnate | NN | noun, singular or mass
, | , | punctuation mark, comma
Elon | NNP | noun, proper singular
Musk | NNP | noun, proper singular
is | VBZ | verb, 3rd person singular present
the | DT | determiner
CEO | NN | noun, singular or mass
and | CC | conjunction, coordinating
founder | NN | noun, singular or mass
of | IN | conjunction, subordinating or preposition
the | DT | determiner
aerospace | NN | noun, singular or mass
company | NN | noun, singular or mass
SpaceX | '' | closing quotation mark
and | CC | conjunction, coordinating
co | NN | noun, singular or mass
- | NN | noun, singular or mass
founder | NN | noun, singular or mass
of | IN | conjunction, subordinating or preposition
Tesla | NNP | noun, proper singular
Motors | NNPS | noun, proper plural
. | . | punctuation mark, sentence closer
He | PRP | pronoun, personal
also | RB | adverb
hopes | VBZ | verb, 3r

### Dependency Parsing using spaCy
Every sentence has a grammatical structure to it and with the help of dependency parsing, we can extract this structure. 

In [22]:
for word in doc:
    print(word.text,'|',word.dep_)

Entrepreneur | nmod
and | cc
business | conj
magnate | nsubj
, | punct
Elon | compound
Musk | nsubj
is | ROOT
the | det
CEO | attr
and | cc
founder | conj
of | prep
the | det
aerospace | compound
company | pobj
SpaceX | punct
and | cc
co | conj
- | conj
founder | appos
of | prep
Tesla | compound
Motors | pobj
. | punct
He | nsubj
also | advmod
hopes | ROOT
to | aux
change | xcomp
the | det
world | dobj
, | punct
advancing | advcl
private | amod
spaceflight | dobj
with | prep
the | det
aim | pobj
of | prep
landing | pcomp
humans | dobj
on | prep
other | amod
planets | pobj
like | prep
Mars | pobj
, | punct
and | cc
developing | conj
a | det
new | amod
tube | compound
- | punct
transport | compound
system | dobj
called | acl
the | det
Hyperloop | oprd
. | punct
Live | intj
Science | nsubj
will | aux
keep | ROOT
you | dobj
up | prt
on | prep
Musk | poss
’s | case
latest | amod
ideas | pobj
and | cc
developments | conj
so | mark
you | nsubj
can | aux
keep | advcl
your | poss
two | nummod
f

In [23]:
displacy.serve(doc,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### Named Entity Recognition using spaCy
Entities are the words or groups of words that represent information about common things such as persons, locations, organizations, etc.

In [24]:
for word in doc.ents:
    print(word.text,"|",word.label_)

Elon Musk | PERSON
Tesla Motors | ORG
Mars | LOC
Hyperloop | ORG
Musk | PERSON
two feet | QUANTITY
Earth | LOC


In [25]:
displacy.serve(doc,style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [26]:
for word in doc:
    pass
#     print(word.i,word.text)
    
from spacy.tokens import Span
s1 = Span(doc,16,17,label='ORG')
doc.set_ents([s1],default='unmodified')

In [27]:
displacy.serve(doc,style='ent')


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


### Document similarities

In [29]:
text1 = nlp('hello ,I am vishvanath')
text2 = nlp('how are you')
print("similarity",text1.similarity(text2))

similarity 0.5655234729474474


  This is separate from the ipykernel package so we can avoid doing imports until
