## NLP Basics: Text Pre-processing

**Author: Abhishek Dey**

### Import Libraries

In [19]:
from textacy import preprocessing as tp
import emoji
import contractions
import string

## Text Pre-processing

### Lower case - Upper case

In [3]:
text = "THIS is My Sample Text"

In [4]:
text.lower()

'this is my sample text'

In [5]:
text.upper()

'THIS IS MY SAMPLE TEXT'

### Contractions

In [6]:
text = "Isn't it ? Let's see ! Don't go there. I Won't talk to you. Haven't they called you ?"

In [7]:
contractions.fix(text)

'Is not it ? Let us see ! Do not go there. I Will not talk to you. Have not they called you ?'

### Remove Emoji

In [13]:
text = "I want to remove the emojis with @ symbol 💼 💡 📚 🤖 🔥"

In [14]:
emoji.replace_emoji(text,'')

'I want to remove the emojis with @ symbol     '

### Remove Punctuation

In [16]:
# The backslash is written as "\\": a bare "\ " is an invalid escape sequence
# that Python only tolerates with a warning. The string value is unchanged.
text = "I want to remove all my punctuation marks : ; ? , . / ! < > \\ { ' } | ` ~  ( ) @ # $ % ^ * - _ = +"

In [20]:
text_1 = text.translate(str.maketrans("","",string.punctuation))

text_1

'I want to remove all my punctuation marks                             '

### Remove White Space

In [31]:
text = "This is   a    Sample Text"

tp.normalize.whitespace(text)

'This is a Sample Text'

### Remove text within Brackets

In [33]:
text = " This [is] a ( Sample ) {text} "

tp.remove.brackets(text)

' This  a   '

### Remove HTML Tags

In [35]:
text = "<!DOCTYPE>  Defines the document type. <html> Root of an HTML document. <head> Contains metadata. <title> Title of the document. <body> Body of the document"

In [36]:
tp.remove.html_tags(text)

'Defines the document type.  Root of an HTML document.  Contains metadata.  Title of the document.  Body of the document'

### Remove URLs

In [30]:
text = " Welcome to youtube https://www.youtube.com/"

text_1 = tp.replace.urls(text,'')

text_1

' Welcome to youtube '

### Replace Currency Symbols

In [37]:
text = "$ US Dollar (USD), € Euro (EUR), £ British Pound Sterling (GBP), ₹ Indian Rupee (INR)"

tp.replace.currency_symbols(text,'')

' US Dollar (USD),  Euro (EUR),  British Pound Sterling (GBP),  Indian Rupee (INR)'

### Replace Email Ids

In [38]:
text = "Name: Peter Parker, Email-id: peter@gmail.com "

tp.replace.emails(text,'')

'Name: Peter Parker, Email-id:  '

### Replace Hashtags in text 

In [40]:
text = "This is a Sample text #ChakdeIndia #MakeInIndia"

tp.replace.hashtags(text,'')

'This is a Sample text  '

### Replace numbers

In [43]:
text = " These are the odd numbers : 1, 3, 5, 7, 9"

tp.replace.numbers(text,'*')

' These are the odd numbers : *, *, *, *, *'

### Replace Phone numbers

In [44]:
text = "Hi ! Here is my contact number +91 9802837268"

tp.replace.phone_numbers(text,'*')

'Hi ! Here is my contact number +91 *'

## Documents and Corpus

### Documents

In [79]:
# Ten deliberately noisy sample documents; each one exercises a different
# pre-processing step (punctuation, e-mails/URLs, hashtags, numbers, emojis,
# contractions, HTML tags, ...).
d0 = "........ This is my    FIRST line of my sample @@@@ TEXT DATA."
d1 = "Email sfjkjf@gmail.com and URLS : https:www.google.com"
d2 = " Here I am learning basic #text pre-processing in NLP. "
d3 = " i AM INCLUDING    some  special characters like +++ --- ****** "
d4 = " numbers -34382+3423--34,2115 672  and phone numbers like 9839428392 ///// ;;;; <<< "
d5 = " Hashtag(# )>>>, Dollar( $) ???, Percentage (%), And (&)"
d6 = "I am including some    of the emojis such as  💼, 💡, 📚, 🤖, 🔥"
d7 = "  Adding contractions like don't, won't, didn't. ..."
d8 = "HTML Tags <!DOCTYPE>, <body>,  <html> <title>"
# The backslash is escaped as "\\": a bare "\ " is an invalid escape sequence
# that Python only tolerates with a warning. The string value is unchanged.
d9 = "  Lastly i am adding some puctuation marks like : ; ? , . / ! < > \\ { ' } | ` ~  ( ) @ # $ % ^ * - _ = +"

### Corpus

In [80]:
corpus = [d0,d1,d2,d3,d4,d5,d6,d7,d8,d9]

In [81]:
corpus

['........ This is my    FIRST line of my sample @@@@ TEXT DATA.',
 'Email sfjkjf@gmail.com and URLS : https:www.google.com',
 ' Here I am learning basic #text pre-processing in NLP. ',
 ' i AM INCLUDING    some  special characters like +++ --- ****** ',
 ' numbers -34382+3423--34,2115 672  and phone numbers like 9839428392 ///// ;;;; <<< ',
 ' Hashtag(# )>>>, Dollar( $) ???, Percentage (%), And (&)',
 'I am including some    of the emojis such as  💼, 💡, 📚, 🤖, 🔥',
 "  Adding contractions like don't, won't, didn't. ...",
 'HTML Tags <!DOCTYPE>, <body>,  <html> <title>',
 "  Lastly i am adding some puctuation marks like : ; ? , . / ! < > \\ { ' } | ` ~  ( ) @ # $ % ^ * - _ = +"]

In [82]:
type(corpus)

list

### Preprocessing function and text filtering

In [83]:
def text_preprocess(text: str) -> str:
    """Clean a raw string with the notebook's full pre-processing pipeline.

    Steps, in order: remove HTML tags, blank out URLs, currency symbols,
    e-mail addresses, phone numbers, hashtags and numbers, expand
    contractions, strip ASCII punctuation, remove emojis, lower-case the
    text and normalize whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    text = tp.remove.html_tags(text)
    text = tp.replace.urls(text, '')
    text = tp.replace.currency_symbols(text, '')
    text = tp.replace.emails(text, '')
    # The '*' placeholders used for phone numbers and numbers are themselves
    # removed by the punctuation step further down.
    text = tp.replace.phone_numbers(text, '*')
    text = tp.replace.hashtags(text, '')
    text = tp.replace.numbers(text, '*')
    # Expand contractions while the apostrophes are still present: once
    # punctuation is stripped, "we'll" / "it's" become "well" / "its" and can
    # no longer be told apart from the ordinary words.
    text = contractions.fix(text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = emoji.replace_emoji(text, '')
    text = text.lower()
    text = tp.normalize.whitespace(text)
    return text


# NOTE(review): str(corpus) hands the *repr* of the whole list (brackets,
# quotes and separating commas included) to the pipeline, so filtered_corpus
# is one merged string rather than one cleaned entry per document. Consider
# `[text_preprocess(doc) for doc in corpus]` if per-document results are needed.
filtered_corpus = text_preprocess(str(corpus))

## Comparison

### Original Text

In [84]:
corpus

['........ This is my    FIRST line of my sample @@@@ TEXT DATA.',
 'Email sfjkjf@gmail.com and URLS : https:www.google.com',
 ' Here I am learning basic #text pre-processing in NLP. ',
 ' i AM INCLUDING    some  special characters like +++ --- ****** ',
 ' numbers -34382+3423--34,2115 672  and phone numbers like 9839428392 ///// ;;;; <<< ',
 ' Hashtag(# )>>>, Dollar( $) ???, Percentage (%), And (&)',
 'I am including some    of the emojis such as  💼, 💡, 📚, 🤖, 🔥',
 "  Adding contractions like don't, won't, didn't. ...",
 'HTML Tags <!DOCTYPE>, <body>,  <html> <title>',
 "  Lastly i am adding some puctuation marks like : ; ? , . / ! < > \\ { ' } | ` ~  ( ) @ # $ % ^ * - _ = +"]

### Filtered Text

In [85]:
filtered_corpus

'this is my first line of my sample text data email and urls https here i am learning basic preprocessing in nlp i am including some special characters like numbers and phone numbers like hashtag dollar percentage and i am including some of the emojis such as adding contractions like do not will not did not html tags lastly i am adding some puctuation marks like'

### Reference:


1. Textacy Documentation : https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html