# Text Processing with Regular Expressions

In [1]:
import re

In [2]:
# Sample sentence containing two 10-digit mobile numbers.
data = "my moble number os 9898989898 and your is 8998989898 thank you"

# Match a run of exactly ten digits. The lookarounds reject ten-digit slices
# of a longer number (e.g. a 12-digit order id), which the bare "[0-9]{10}"
# would match and therefore partially mask or delete.
pattern = r"(?<![0-9])[0-9]{10}(?![0-9])"
re.findall(pattern, data)

['9898989898', '8998989898']

In [3]:
# Mask every match of `pattern` in `data` with a fixed placeholder.
re.sub(pattern, repl="*****", string=data)

'my moble number os ***** and your is ***** thank you'

In [4]:
# Drop every match of `pattern` by substituting the empty string; the spaces
# that surrounded each match are left untouched.
re.sub(pattern, repl="", string=data)

'my moble number os  and your is  thank you'

In [6]:
# Sample message mixing e-mail addresses with dates written in different styles.
# It is assembled one line at a time so that the line breaks (and the trailing
# space after "we have") are explicit in the source.
data = (
    " my email is anshu@mycompany.com and your email is manager@mycompany.com we have \n"
    "recived a complain from cliento on 11/05/2023 for the product they have ordered via an account with t\n"
    "the email id customer11@outlook.com on date 11/4/2023 and returned on 15-04-2023 please forward\n"
    "this to support team at mysupport_11@mycompany.com thank you"
)

# Slash-separated dates only: 1-2 digits / 1-2 digits / 2-4 digits.
datepattern = "[0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}"
re.findall(datepattern, data)

['11/05/2023', '11/4/2023']

In [7]:
# Add a second alternative so hyphen-separated dates (e.g. 15-04-2023) are
# found as well as slash-separated ones. The two adjacent literals are joined
# by the parser into a single pattern string.
datepattern = (
    "[0-9]{1,2}/[0-9]{1,2}/[0-9]{2,4}"
    "|[0-9]{1,2}-[0-9]{1,2}-[0-9]{2,4}"
)
re.findall(datepattern, data)

['11/05/2023', '11/4/2023', '15-04-2023']

In [8]:
# E-mail matcher.
# - Local part: letters, digits, '.', '_' plus '%', '+' and '-'.
# - Domain: one label optionally followed by ".label" groups. Writing the dots
#   as separators (instead of allowing '.' anywhere in the domain) stops a
#   sentence-ending full stop from being captured ("... at a@b.com." used to
#   match "a@b.com."), and the labels now accept hyphens.
# The group is non-capturing so re.findall still returns whole addresses.
email = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*"
re.findall(email, data)

['anshu@mycompany.com',
 'manager@mycompany.com',
 'customer11@outlook.com',
 'mysupport_11@mycompany.com']

# Text Processing with NLTK

In [9]:
import nltk
# Fetch the NLTK resources requested by this notebook, in the original order.
for resource in ("wordnet", "punkt", "tagsets"):
    nltk.download(resource)
# The last download stays a bare expression so the cell still displays its
# return value.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package wordnet to C:\Users\Anshu
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Anshu
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to C:\Users\Anshu
[nltk_data]     Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Anshu Pandey\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
# Sample message punctuated into sentences. It is assembled one line at a time
# so that the line breaks (and the trailing space after "We have") are
# explicit in the source.
data = (
    " my email is anshu@mycompany.com and your email is manager@mycompany.com. We have \n"
    "recived a complain from cliento on 11/05/2023 for the product they have ordered via an account with t\n"
    "the email id customer11@outlook.com on date 11/4/2023. The customer returned the product on 15-04-2023. Please forward\n"
    "this to support team at mysupport_11@mycompany.com. Thank you!"
)

# Sentence tokenization: split the text into a list of sentences.
nltk.sent_tokenize(data)

[' my email is anshu@mycompany.com and your email is manager@mycompany.com.',
 'We have \nrecived a complain from cliento on 11/05/2023 for the product they have ordered via an account with t\nthe email id customer11@outlook.com on date 11/4/2023.',
 'The customer returned the product on 15-04-2023.',
 'Please forward\nthis to support team at mysupport_11@mycompany.com.',
 'Thank you!']

In [11]:
# Word tokenization: split the text into word and punctuation tokens.
# Note in the output that every e-mail address is broken apart at the '@'.
nltk.word_tokenize(data)

['my',
 'email',
 'is',
 'anshu',
 '@',
 'mycompany.com',
 'and',
 'your',
 'email',
 'is',
 'manager',
 '@',
 'mycompany.com',
 '.',
 'We',
 'have',
 'recived',
 'a',
 'complain',
 'from',
 'cliento',
 'on',
 '11/05/2023',
 'for',
 'the',
 'product',
 'they',
 'have',
 'ordered',
 'via',
 'an',
 'account',
 'with',
 't',
 'the',
 'email',
 'id',
 'customer11',
 '@',
 'outlook.com',
 'on',
 'date',
 '11/4/2023',
 '.',
 'The',
 'customer',
 'returned',
 'the',
 'product',
 'on',
 '15-04-2023',
 '.',
 'Please',
 'forward',
 'this',
 'to',
 'support',
 'team',
 'at',
 'mysupport_11',
 '@',
 'mycompany.com',
 '.',
 'Thank',
 'you',
 '!']

In [12]:
# Lemmatization: reduce a word to its root (dictionary) form.

from nltk.stem import WordNetLemmatizer

# Lemmatizer instance reused by the cells that follow.
wd = WordNetLemmatizer()

# No part-of-speech argument is passed; the plural "boxes" comes back as "box".
wd.lemmatize("boxes")

'box'

In [13]:
# Irregular plural: "knives" is reduced to "knife".
wd.lemmatize("knives")

'knife'

In [14]:
# Irregular plural: "children" is reduced to "child".
wd.lemmatize("children")

'child'

In [15]:
# The second argument is the part of speech ('v' for verb, per the NLTK docs);
# with it the past-tense form "went" is reduced to "go".
wd.lemmatize("went",'v')

'go'

In [16]:
# Part-of-speech tagging: tokenize the sentence first, then label each token
# with its tag.
data = "John visited Microsoft Inc yesterday and met Kelly in the office."
tokens = nltk.word_tokenize(data)
nltk.pos_tag(tokens)

[('John', 'NNP'),
 ('visited', 'VBD'),
 ('Microsoft', 'NNP'),
 ('Inc', 'NNP'),
 ('yesterday', 'NN'),
 ('and', 'CC'),
 ('met', 'VBD'),
 ('Kelly', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('office', 'NN'),
 ('.', '.')]

In [17]:
# Print the description and example words for the "VBD" tag seen in the
# tagging output above.
nltk.help.upenn_tagset("VBD")

VBD: verb, past tense
    dipped pleaded swiped regummed soaked tidied convened halted registered
    cushioned exacted snubbed strode aimed adopted belied figgered
    speculated wore appreciated contemplated ...


In [18]:
# Entity Recognition

In [1]:
import spacy

# Load the small English pipeline once; it is applied to the text below.
nlp = spacy.load("en_core_web_sm")

# Sample text naming people, organisations, a place, a day and an amount of
# money. Split into one literal per sentence; the pieces join into one string.
data = (
    "John visited Microsoft Inc yesterday and met Kelly in the office. "
    "John gave a gift on saturday which is a watch worth $500 and Kelly was happy. "
    "John works at Alphabet Inc and previously he was part of JP Morgan and Chase in Mumbai."
)

# Run the pipeline; `doc` is what the rendering cell below consumes.
doc = nlp(data)




In [3]:
from spacy import displacy
# Visualize the entities spaCy found in `doc`: style='ent' selects the
# entity view and jupyter=True asks for the result to be shown in the notebook.
displacy.render(doc,style='ent',jupyter=True)