# Text Processing

In [1]:
import re

In [3]:
# Sample sentence containing two 10-digit phone numbers and two e-mail addresses.
data = (
    "my mobile number is 8989898989 and yours is 5656565656 "
    "and my email is anshu@abccompany.com "
    "and his email is nobody@googlemail13.com what is yours?"
)
print(data)

my mobile number is 8989898989 and yours is 5656565656 and my email is anshu@abccompany.com and his email is nobody@googlemail13.com what is yours?


In [4]:
# Exactly ten consecutive digits. The lookarounds stop the pattern from matching
# a 10-digit slice of a longer digit run (e.g. a 12-digit account number), which
# the bare "[0-9]{10}" would report as a phone number.
pattern = r"(?<![0-9])[0-9]{10}(?![0-9])"
re.findall(pattern,data)

['8989898989', '5656565656']

In [5]:
# Mask every match of `pattern` with a fixed-width placeholder.
re.compile(pattern).sub("********", data)

'my mobile number is ******** and yours is ******** and my email is anshu@abccompany.com and his email is nobody@googlemail13.com what is yours?'

In [6]:
# Substituting the empty string removes the matches entirely; the spaces
# around each match remain, hence the double spaces in the result.
re.compile(pattern).sub("", data)

'my mobile number is  and yours is  and my email is anshu@abccompany.com and his email is nobody@googlemail13.com what is yours?'

In [7]:
# E-mail addresses.
#   local part: letters, digits and . _ % + -
#   domain:     dot-separated labels of letters, digits and hyphens
# Every dot in the domain must be followed by another label, so sentence
# punctuation ("... write to a@b.com.") is no longer swallowed into the match.
# NOTE: this rebinds `pattern`, which the phone-number cells above also use;
# re-running those cells after this one would act on e-mails instead.
pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+"
re.findall(pattern,data)

['anshu@abccompany.com', 'nobody@googlemail13.com']

# Popular NLP packages

- NLTK = basic text processing and cleaning, with bundled dictionaries/corpora
- spaCy = high-level package for text processing, entity recognition, text cleaning
- gensim = text processing with deep learning, using pretrained NLP models, topic modelling
- transformers (Hugging Face API) = implementing high-level techniques such as transformers, vectorizers
- TextBlob = text processing
- librosa = speech (audio) processing

In [8]:
import nltk

In [11]:
# Fetch the NLTK resources into the user's nltk_data directory.
# The value displayed under the cell is the return value of the last call only.
# Tokenizer models (unzipped under tokenizers/).
nltk.download("punkt")
# WordNet corpus (unzipped under corpora/).
nltk.download("wordnet")
# Tag-set help files (unzipped under help/); not used by the cells shown here.
nltk.download("tagsets")
# Tagger model (unzipped under taggers/); not used by the cells shown here.
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [12]:
# Two-sentence passage used for the tokenization examples below.
# NOTE: this rebinds `data`, replacing the phone/e-mail sample defined earlier.
data = (
    "Fujitsu Limited is a Japanese multinational information and communications "
    "technology equipment and services corporation, established in 1935 and "
    "headquartered in Tokyo. "
    "Fujitsu is the world's sixth-largest IT services provider by annual revenue, "
    "and the largest in Japan, in 2021."
)
print(data)

Fujitsu Limited is a Japanese multinational information and communications technology equipment and services corporation, established in 1935 and headquartered in Tokyo. Fujitsu is the world's sixth-largest IT services provider by annual revenue, and the largest in Japan, in 2021.


In [13]:
# Sentence tokenization: split the passage into a list of sentences.
nltk.tokenize.sent_tokenize(data)

['Fujitsu Limited is a Japanese multinational information and communications technology equipment and services corporation, established in 1935 and headquartered in Tokyo.',
 "Fujitsu is the world's sixth-largest IT services provider by annual revenue, and the largest in Japan, in 2021."]

In [14]:
# Word tokenization: punctuation marks and the possessive "'s" come out as
# separate tokens, while hyphenated words such as "sixth-largest" stay whole.
nltk.tokenize.word_tokenize(data)

['Fujitsu',
 'Limited',
 'is',
 'a',
 'Japanese',
 'multinational',
 'information',
 'and',
 'communications',
 'technology',
 'equipment',
 'and',
 'services',
 'corporation',
 ',',
 'established',
 'in',
 '1935',
 'and',
 'headquartered',
 'in',
 'Tokyo',
 '.',
 'Fujitsu',
 'is',
 'the',
 'world',
 "'s",
 'sixth-largest',
 'IT',
 'services',
 'provider',
 'by',
 'annual',
 'revenue',
 ',',
 'and',
 'the',
 'largest',
 'in',
 'Japan',
 ',',
 'in',
 '2021',
 '.']

### Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer

# One lemmatizer instance, reused by the following cells.
wd = WordNetLemmatizer()
# pos="n" is the lemmatizer's default: the word is looked up as a noun.
wd.lemmatize("children", pos="n")

'child'

In [18]:
# pos="v": look the word up as a verb, so the irregular past tense maps to its base form.
wd.lemmatize("went", pos="v")

'go'

In [19]:
# pos="v": look the word up as a verb, so the -ing form maps to its base form.
wd.lemmatize("going", pos="v")

'go'

# Spelling Correction

In [20]:
# Jaccard distance between the words' *character sets*: 1 - |A ∩ B| / |A ∪ B|.
# Letter order and repeats are ignored. Here 3 letters are shared out of 9
# distinct ones, giving 6/9.
nltk.metrics.jaccard_distance(set("apple"), set("bangalore"))

0.6666666666666666

In [23]:
# A one-letter difference: 3 letters shared out of 5 distinct ones, giving 2/5.
nltk.metrics.jaccard_distance(set("Apple"), set("Applo"))

0.4

In [24]:
import numpy as np

In [25]:
mydict = ['apple','mango','grapes','orange','banana']

def correct(w, vocabulary=None):
    """Return the vocabulary word whose character set is closest to ``w``.

    Closeness is the Jaccard distance between the *sets* of characters of the
    two words, so letter order and repeated letters are ignored. This is a
    rough similarity heuristic, not an edit distance.

    Args:
        w: The (possibly misspelled) word to correct.
        vocabulary: Candidate words to choose from. Defaults to the
            module-level ``mydict``.

    Returns:
        The candidate with the smallest distance to ``w``. On ties the earliest
        candidate wins, so a candidate is returned even when no characters
        overlap at all.

    Raises:
        ValueError: If the vocabulary is empty.
    """
    candidates = mydict if vocabulary is None else list(vocabulary)
    if not candidates:
        raise ValueError("vocabulary must contain at least one word")

    # Compare case-insensitively: otherwise "MANGO" shares no characters with
    # "mango" and the first vocabulary entry would be returned arbitrarily.
    target = set(w.lower())

    def distance(candidate):
        return nltk.jaccard_distance(target, set(candidate.lower()))

    # min() keeps the first of several equally close candidates.
    return min(candidates, key=distance)

In [26]:
# "mongo" shares four of its letters with "mango", the closest vocabulary entry.
correct(w="mongo")

'mango'

In [27]:
# Only the trailing letter differs from "banana".
correct(w="banano")

'banana'

In [28]:
# The doubled "e" is invisible to a character-set comparison, so this matches "grapes" exactly.
correct(w="grapees")

'grapes'

# Named Entity Recognition

In [29]:
# One-time setup (run from a terminal, in the same environment as this kernel):
#   pip install spacy
#   python -m spacy download en_core_web_sm

In [36]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     ---------------------------------------- 13.9/13.9 MB 1.3 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


2022-06-04 16:46:03.846767: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'cudart64_110.dll'; dlerror: cudart64_110.dll not found
2022-06-04 16:46:03.846818: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
You should consider upgrading via the 'C:\Users\admin\anaconda3\python.exe -m pip install --upgrade pip' command.


In [37]:
import spacy
from spacy import displacy

In [43]:
# Load the small English pipeline and run it over a sample passage.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "John went London to meet his friend Jessy Harper and talk about her work at Microsoft Inc. "
    "He also met his old friend David Cameron a week back on his birthday on 04th June 2022 "
    "bought a gift worth $50000 for him. "
    "John lost his watch."
)
# style="ent" selects displacy's named-entity view; jupyter=True renders it in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [46]:
# "dep" is displacy's default style: it draws the dependency parse of the same doc.
displacy.render(doc, style="dep")