# Load the library

In [1]:
import pandas as pd
import numpy as np
import nltk

## Tokenization

#### 2 Types - a. Sent tokenization b. Word Tokenization
- a. Sent Tokenization - It is used to extract complete sentences from the text
- b. Word Tokenization - It is used to extract words from the text

In [2]:
text = 'Hello Everyone, how are you? I hope everything is going well. Today isa good day, see you dude.'

### a. Word Tokenization

In [3]:
from nltk.tokenize import word_tokenize
word_tokenize(text)

['Hello',
 'Everyone',
 ',',
 'how',
 'are',
 'you',
 '?',
 'I',
 'hope',
 'everything',
 'is',
 'going',
 'well',
 '.',
 'Today',
 'isa',
 'good',
 'day',
 ',',
 'see',
 'you',
 'dude',
 '.']

### Sentence Tokenization

In [4]:
from nltk.tokenize import sent_tokenize
sent_tokenize(text)

['Hello Everyone, how are you?',
 'I hope everything is going well.',
 'Today isa good day, see you dude.']

## Tokenize non - English Language text

In [5]:
text_1 = "Bonjour M. Adam, comment allez-vous? J'espere que tout va bien. Aujoud'hui est un bon jour"

In [8]:
from nltk.tokenize import sent_tokenize
sent_tokenize(text_1, 'french')

['Bonjour M. Adam, comment allez-vous?',
 "J'espere que tout va bien.",
 "Aujoud'hui est un bon jour"]

In [9]:
from nltk.tokenize import sent_tokenize
sent_tokenize(text_1)

['Bonjour M. Adam, comment allez-vous?',
 "J'espere que tout va bien.",
 "Aujoud'hui est un bon jour"]

## Get Synonyms from WordNet

- synonyms - words that have similar meaning eg: pain, grief. lucky, fortunate, 

In [14]:
from nltk.corpus import wordnet

syn = wordnet.synsets('pain')
syn

[Synset('pain.n.01'),
 Synset('pain.n.02'),
 Synset('pain.n.03'),
 Synset('pain.n.04'),
 Synset('annoyance.n.04'),
 Synset('trouble.v.05'),
 Synset('pain.v.02')]

In [20]:
syn[0].definition()

'a symptom of some physical hurt or disorder'

In [21]:
syn[0].examples()

['the patient developed severe pain and distension']

In [22]:
syn[4]

Synset('annoyance.n.04')

In [23]:
syn[4].definition()

'something or someone that causes trouble; a source of unhappiness'

In [24]:
syn[4].examples()

['washing dishes was a nuisance before we got a dish washer',
 'a bit of a bother',
 "he's not a friend, he's an infliction"]

In [26]:
syn = wordnet.synsets('NLP')
syn

[Synset('natural_language_processing.n.01')]

In [29]:
syn[0].definition()

'the branch of information science that deals with natural language information'

In [30]:
syn = wordnet.synsets('Python')
syn

[Synset('python.n.01'), Synset('python.n.02'), Synset('python.n.03')]

In [33]:
syn[1].definition()

'a soothsaying spirit or a person who is possessed by such a spirit'

## Printing the the actual synonymns for the words

In [34]:
syn = wordnet.synsets('Computer')
syn

[Synset('computer.n.01'), Synset('calculator.n.01')]

In [42]:
syn.lemmas()

[Lemma('calculator.n.01.calculator'),
 Lemma('calculator.n.01.reckoner'),
 Lemma('calculator.n.01.figurer'),
 Lemma('calculator.n.01.estimator'),
 Lemma('calculator.n.01.computer')]

In [37]:
for syn in wordnet.synsets('Computer'):
    for word in syn.lemmas():
        #print(word)
        print(word.name())

computer
computing_machine
computing_device
data_processor
electronic_computer
information_processing_system
calculator
reckoner
figurer
estimator
computer


## Get Antonyms from WordNet

- Antonyms means opposite words

In [44]:
syn = wordnet.synsets('small')
syn

[Synset('small.n.01'),
 Synset('small.n.02'),
 Synset('small.a.01'),
 Synset('minor.s.10'),
 Synset('little.s.03'),
 Synset('small.s.04'),
 Synset('humble.s.01'),
 Synset('little.s.07'),
 Synset('little.s.05'),
 Synset('small.s.08'),
 Synset('modest.s.02'),
 Synset('belittled.s.01'),
 Synset('small.r.01')]

In [60]:
for syn in wordnet.synsets('small'):
    for word in syn.lemmas():
        if word.antonyms():
            print(word.antonyms())

[Lemma('large.a.01.large')]
[Lemma('large.a.01.big')]
[Lemma('big.r.03.big')]


In [64]:
for syn in wordnet.synsets('same'):
    for word in syn.lemmas():
        if word.antonyms():
            print(word.antonyms())

[Lemma('other.a.01.other')]
[Lemma('different.a.01.different')]
[Lemma('unlike.a.02.unlike')]


## Stemming

- Word Stemming means removing the extra suffixes from the text and return the root words. The roots words derived from stemming do not have any meaning

In [66]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [67]:
stemmer.stem('playing')

'play'

In [69]:
stemmer.stem('habbits')

'habbit'

In [72]:
stemmer.stem('arts')

'art'

In [73]:
stemmer.stem('adding')

'ad'

In [74]:
stemmer.stem('increases')

'increas'

In [76]:
stemmer.stem('habbitual')

'habbitu'

## Lemmitization

- Lemmitization means removing the extra suffixes from the text and return the root words. The roots words derived from lemmitization will have meaning

In [77]:
from nltk.stem import WordNetLemmatizer
lemmitizer = WordNetLemmatizer()

In [83]:
lemmitizer.lemmatize('playing', pos = 'n')

'playing'

In [79]:
lemmitizer.lemmatize('habbitual')

'habbitual'

In [80]:
lemmitizer.lemmatize('increases')

'increase'

In [81]:
lemmitizer.lemmatize('playing', pos = "v")

'play'

In [100]:
lemmitizer.lemmatize('adding', pos = 'v')

'add'

In [105]:
lemmitizer.lemmatize('easily', pos = 'n')

'easily'

## Parts of Speech

- nouns, verbs, adverbs, adjectives are the part of speech present in any text
- a. JJ = adjective,   
- b. NN = noun,   
- c. RBR, RB = adverb,  
- d. VBD = verbs, 

In [117]:
text = ['natural', 'language', 'processing', 'better', 'research', 'field', 'played', 'nicely']

In [118]:
from nltk import pos_tag
pos_tag(text)

[('natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('better', 'RBR'),
 ('research', 'NN'),
 ('field', 'NN'),
 ('played', 'VBD'),
 ('nicely', 'RB')]

In [108]:
nltk.download('tagsets')

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\AMANT\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [109]:
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [110]:
nltk.help.upenn_tagset('RBR')

RBR: adverb, comparative
    further gloomier grander graver greater grimmer harder harsher
    healthier heavier higher however larger later leaner lengthier less-
    perfectly lesser lonelier longer louder lower more ...


In [111]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


In [116]:
nltk.help.upenn_tagset('VBD')

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


In [119]:
nltk.help.upenn_tagset('RB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...


# N-grams

- n-grams means sequence of n-words
- N - grams are continuous sequence of words or symbols, or tokens in a document
- In technical terms, they can be defined as the neighboring sequences of items in a document
- They are used for text data in NLP tasks
- Applications - language models, semantic features, spelling corrections, machine translations or text mining, etc

In [121]:
my_text = 'Hari is very good in mathematics but he is not that much good in science'
my_text

'Hari is very good in mathematics but he is not that much good in science'

In [127]:
from nltk.tokenize import word_tokenize
my_text = word_tokenize(my_text)

- ngrams functions takes in 2 mandatory parameters 1. text data 2. number of grams(specify as a integer)

In [128]:
from nltk.util import ngrams
list(ngrams(my_text, 1))

[('Hari',),
 ('is',),
 ('very',),
 ('good',),
 ('in',),
 ('mathematics',),
 ('but',),
 ('he',),
 ('is',),
 ('not',),
 ('that',),
 ('much',),
 ('good',),
 ('in',),
 ('science',)]

In [129]:
from nltk.util import ngrams
list(ngrams(my_text, 2))

[('Hari', 'is'),
 ('is', 'very'),
 ('very', 'good'),
 ('good', 'in'),
 ('in', 'mathematics'),
 ('mathematics', 'but'),
 ('but', 'he'),
 ('he', 'is'),
 ('is', 'not'),
 ('not', 'that'),
 ('that', 'much'),
 ('much', 'good'),
 ('good', 'in'),
 ('in', 'science')]

# from nltk.util import ngrams
list(ngrams(my_text, 3))

In [131]:
from nltk.util import ngrams
list(ngrams(my_text, 4))

[('Hari', 'is', 'very', 'good'),
 ('is', 'very', 'good', 'in'),
 ('very', 'good', 'in', 'mathematics'),
 ('good', 'in', 'mathematics', 'but'),
 ('in', 'mathematics', 'but', 'he'),
 ('mathematics', 'but', 'he', 'is'),
 ('but', 'he', 'is', 'not'),
 ('he', 'is', 'not', 'that'),
 ('is', 'not', 'that', 'much'),
 ('not', 'that', 'much', 'good'),
 ('that', 'much', 'good', 'in'),
 ('much', 'good', 'in', 'science')]

## Scraping the data from Wikipedia

- Steps 1 : Entering the website using python from jupyter notebook
- Steps 2 : Get the website content(data) into this jupyter notebook for analysis

In [134]:
from urllib import request

In [150]:
response = request.urlopen('https://www.php.net/')
response

<http.client.HTTPResponse at 0x2d3756f8f10>

In [151]:
html = response.read()
html

b'<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhtml" lang="en">\n<head>\n\n  <meta charset="utf-8">\n  <meta name="viewport" content="width=device-width, initial-scale=1.0">\n\n  <title>PHP: Hypertext Preprocessor</title>\n\n <link rel="icon" type="image/svg+xml" sizes="any" href="https://www.php.net/favicon.svg?v=2">\n <link rel="icon" type="image/png" sizes="196x196" href="https://www.php.net/favicon-196x196.png?v=2">\n <link rel="icon" type="image/png" sizes="32x32" href="https://www.php.net/favicon-32x32.png?v=2">\n <link rel="icon" type="image/png" sizes="16x16" href="https://www.php.net/favicon-16x16.png?v=2">\n <link rel="shortcut icon" href="https://www.php.net/favicon.ico?v=2">\n\n <link rel="search" type="application/opensearchdescription+xml" href="http://php.net/phpnetimprovedsearch.src" title="Add PHP.net search">\n <link rel="alternate" type="application/atom+xml" href="https://www.php.net/releases/feed.php" title="PHP Release feed">\n <link rel="alternate" type=

In [153]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html5lib')
text = soup.get_text(strip = True)
print(text)

PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic syntaxTypesVariablesConstantsExpressionsOperatorsControl StructuresFunctionsClasses and ObjectsNamespacesEnumerationsErrorsExceptionsFibersGeneratorsAttributesReferences ExplainedPredefined VariablesPredefined ExceptionsPredefined Interfaces and ClassesPredefined AttributesContext options and parametersSupported Protocols and WrappersSecurityIntroductionGeneral considerationsInstalled as CGI binaryInstalled as an Apache moduleSession SecurityFilesystem SecurityDatabase SecurityError ReportingUser Submitted DataHiding PHPKeeping CurrentFeaturesHTTP authentication with PHPCookiesSessionsDealing with XFormsHandling file uploadsUsing remote filesConnection handlingPersistent Database ConnectionsCommand line usageGarbage CollectionDTrace Dynamic TracingFunction ReferenceAffecting PHP's BehaviourAudio Formats ManipulationAuthentication ServicesCommand Line Spec

In [154]:
text = """
<html><head><title>The NLP story</title></head>
<body>
<p class="title"><b>The NLP story</b></p>
<p class="story">Once upon a time there were three little  techniques; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived till the next conference.</p>
<p class="story">...</p>
"""

print(text)


<html><head><title>The NLP story</title></head>
<body>
<p class="title"><b>The NLP story</b></p>
<p class="story">Once upon a time there were three little  techniques; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived till the next conference.</p>
<p class="story">...</p>



In [156]:
from bs4 import BeautifulSoup
text = BeautifulSoup(text, "html").text# for HTML decoding
print(text)

The NLP story

The NLP story
Once upon a time there were three little  techniques; and their names were
Elsie,
Lacie and
Tillie;
and they lived till the next conference.
...



## Removing the urls

In [157]:
text = 'Shall i search the answer on www.google.com ?'
text

'Shall i search the answer on www.google.com ?'

In [158]:
import re
re.sub(r'https?://\S+|www\.\S+', "", text)

'Shall i search the answer on  ?'