**TEXT CLEANING/PREPROCESSING TECHNIQUES IN NLP**

In [None]:
# importing necessary libraries

import os
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')

In [None]:
# importing/reading raw text data

data = pd.read_csv("/content/SMSSpamCollection", sep="\t", header=None)

data.columns = ["category", "text"]

data.head()

Unnamed: 0,category,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
data['category'].value_counts()

ham     4825
spam     747
Name: category, dtype: int64

**TECHNIQUE 1 :- TO LOWER CASE**

In [None]:
def convert_to_lower(text):
    return text.lower()

data["text"] = data["text"].apply(lambda x: convert_to_lower(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


**TECHNIQUE 2 :- REMOVING HTML TAGS**

In [None]:
import re

punc = list(string.punctuation)

def remove_html_tags(text):
    html_pattern = r'<.*?>'
    without_html = re.sub(pattern=html_pattern, repl=' ', string=text)
    return without_html

data["text"] = data["text"].apply(lambda x: remove_html_tags(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


**TECHNIQUE 3 :- REMOVING URLS**

In [None]:
import re

def remove_urls(text):
    url_pattern = r'https?://\S+|www\.\S+'
    without_urls = re.sub(pattern=url_pattern, repl=' ', string=text)
    return without_urls

data["text"] = data["text"].apply(lambda x: remove_urls(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


**TECHNIQUE 4 :- REMOVING NUMBERS**

In [None]:
import re

def remove_numbers(text):
    number_pattern = r'\d+'
    without_number = re.sub(pattern=number_pattern, repl=" ", string=text)
    return without_number

data["text"] = data["text"].apply(lambda x: remove_numbers(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


**TECHNIQUE 5 :- CONVERTING NUMBERS TO WORDS**

In [None]:
# !pip install num2words

from num2words import num2words

def convert_num_2_words(text):
    splittedText = text.split()
    for i in range(len(splittedText)):
        if splittedText[i].isdigit():
            splittedText[i] = num2words(splittedText[i])
    num_2_words = ' '.join(splittedText)
    return num_2_words

data["text"] = data["text"].apply(lambda x: convert_num_2_words(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in two a wkly comp to win fa cup fi...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


**TECHNIQUE 6 :- SPELLING CORRECTION**

In [85]:
# !pip install autocorrect

from autocorrect import Speller
import nltk

nltk.download('punkt')

def spell_checker(text):
    spellChecker = Speller(lang="en")
    correct_words = []
    for word in nltk.word_tokenize(text):
        correct_word = spellChecker(word)
        correct_words.append(correct_word)
    correct_spell_text = " ".join(correct_words)
    return correct_spell_text

example_text = "tihs is a expmle of spel chekcer"
print(f"Original sentence: {example_text}")
print(f"Autocorrect sentence: {spell_checker(example_text)}")

data["text"] = data["text"].apply(lambda x: spell_checker(x))

data.head()

Original sentence: tihs is a expmle of spel chekcer
Autocorrect sentence: this is a example of spell checker


**TECHNIQUE 7 :- CONVERTING ACCENTED CHARS TO ASCII CHARS**

In [89]:
# !pip install unidecode

import unidecode

def convert_accented_2_ascii(text):
    return unidecode.unidecode(text)

example_text = "This is an example text with accented characters like dèèp lèarning ánd cömputer vísíön etc"
print(f"Original sentence: {example_text}")
print(f"Converted sentence: {convert_accented_2_ascii(example_text)}")

Original sentence: This is an example text with accented characters like dèèp lèarning ánd cömputer vísíön etc
Converted sentence: This is an example text with accented characters like deep learning and computer vision etc


**TECHNIQUE 8 :- EXPANDING CONTRACTION**

In [107]:
# !pip install contractions

import contractions

def expand_contractions(text):
    expanded_text = []
    for word in text.split():
        expanded_text.append(contractions.fix(word))
    return " ".join(expanded_text)

example_text = "Sometimes our mind doesn't work properly. I've tried everything."
print(f"Original text: {example_text}")
print(f"Expanded text: {expand_contractions(example_text)}")

Original text: Sometimes our mind doesn't work properly. I've tried everything.
Expanded text: Sometimes our mind does not work properly. I have tried everything.


**TECHNIQUE 9 :- STEMMING**

In [109]:
from nltk.stem import PorterStemmer
from nltk import word_tokenize

def stemming(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        stem_word = stemmer.stem(tokens[i])
        tokens[i] = stem_word
    return " ".join(tokens)

data["text"] = data["text"].apply(lambda x: stemming(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point , crazy.. avail onli in ..."
1,ham,ok lar ... joke wif u oni ...
2,spam,free entri in two a wkli comp to win fa cup fi...
3,ham,u dun say so earli hor ... u c alreadi then sa...
4,ham,"nah i do n't think he goe to usf , he live aro..."


**TECHNIQUE 10 :- LEMMATIZING**

In [113]:
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

nltk.download("wordnet")

def lemmatizing(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        lemma_word = lemmatizer.lemmatize(tokens[i])
        tokens[i] = lemma_word
    return " ".join(tokens)

data["text"] = data["text"].apply(lambda x: lemmatizing(x))

data.head()

Unnamed: 0,category,text
0,ham,"go until jurong point , crazy.. avail onli in ..."
1,ham,ok lar ... joke wif u oni ...
2,spam,free entri in two a wkli comp to win fa cup fi...
3,ham,u dun say so earli hor ... u c alreadi then sa...
4,ham,"nah i do n't think he goe to usf , he live aro..."


**TECHNIQUE 11 :- EMOJI REMOVAL**

In [114]:
import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    removeEmoji = emoji_pattern.sub(r'', text)
    return removeEmoji

example_text = "This is a test 😻 "
print(f"Original text: {example_text}")
print(f"Removed emoji: {remove_emoji(example_text)}")

Original text: This is a test 😻 
Removed emoji: This is a test  


**TECHNIQUE 12 :- EMOTICONS REMOVAL**

In [116]:
from emo_unicode import *

def remove_emoticons(text):
    emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
    without_emoticons = emoticon_pattern.sub(r'', text)
    return without_emoticons

example_text = "This is a test :) yooo *_* "
print(f"Original text: {example_text}")
print(f"Removed emoticons: {remove_emoticons(example_text)}")

Original text: This is a test :) yooo *_* 
Removed emoticons: This is a test  yooo *_* 


**TECHNIQUE 13 :- REMOVING PUNCTUATIONS OR SPECIAL CHARS**

In [118]:
import string

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data["text"] = data["text"].apply(lambda x: remove_punctuation(x))

data.head()

Unnamed: 0,category,text
0,ham,go until jurong point crazy avail onli in bug...
1,ham,ok lar joke wif u oni
2,spam,free entri in two a wkli comp to win fa cup fi...
3,ham,u dun say so earli hor u c alreadi then say
4,ham,nah i do nt think he goe to usf he live aroun...


**TECHNIQUE 14 :- REMOVING STOPWORDS**

In [121]:
from nltk.corpus import stopwords
from nltk import word_tokenize

nltk.download("stopwords")

def remove_stopwords(text):
    removed = []
    stop_words = list(stopwords.words("english"))
    tokens = word_tokenize(text)
    for i in range(len(tokens)):
        if tokens[i] not in stop_words:
            removed.append(tokens[i])
    return " ".join(removed)

data["text"] = data["text"].apply(lambda x: remove_stopwords(x))

data.head()

Unnamed: 0,category,text
0,ham,go jurong point crazy avail onli bugi n great ...
1,ham,ok lar joke wif u oni
2,spam,free entri two wkli comp win fa cup final tkt ...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah nt think goe usf live around though


**TECHNIQUE 15 :- REMOVING FREQUENT WORDS**

In [120]:
from collections import Counter
from nltk import word_tokenize

counter = Counter()

# first a function that will find most common/frequent words in a corpus
def findMostFrequentWords(corpus):
    tokens = word_tokenize(corpus)
    for word in tokens:
        counter[word] += 1
    freq_words = []
    for (word, word_count) in counter.most_common(10):
        freq_words.append(word)
    return freq_words

# now remove the most common/frequent words from text
def remove_frequent_words(freq_words, text):
    without_freq_words = []
    tokens = word_tokenize(text)
    for word in tokens:
        if word not in freq_words:
            without_freq_words.append(word)
    return " ".join(without_freq_words)

**TECHNIQUE 16 :- REMOVING RARE WORDS**

In [None]:
from collections import Counter
from nltk import word_tokenize

counter = Counter()

def findRareWords(corpus):
    tokens = word_tokenize(corpus)
    for word in tokens:
        counter[word] += 1
    rare_words = []
    num_rare_words = 10
    frequentWords = counter.most_common()
	for (word, word_count) in frequentWords[:-num_rare_words:-1]:
		rare_words.append(word)
    return rare_words

def remove_rare_words(rare_words, text):
    without_rare_words = []
    tokens = word_tokenize(text)
    for word in tokens:
        if word not in rare_words:
            without_rare_words.append(word)
    return " ".join(without_rare_words)

**TECHNIQUE 17 :- REMOVING SINGLE CHARS**

In [123]:
def remove_single_chars(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_single_char = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_single_char

data["text"] = data["text"].apply(lambda x: remove_single_chars(x))

data.head()

Unnamed: 0,category,text
0,ham,go jurong point crazy avail onli bugi great wo...
1,ham,ok lar joke wif oni
2,spam,free entri two wkli comp win fa cup final tkt ...
3,ham,u dun say earli hor c alreadi say
4,ham,nah nt think goe usf live around though


**TECHNIQUE 18 :- REMOVING EXTRA WHITE SPACES**

In [124]:
def remove_extra_white_spaces(text):
    single_char_pattern = r'\s+[a-zA-Z]\s+'
    without_sc = re.sub(pattern=single_char_pattern, repl=" ", string=text)
    return without_sc

data["text"] = data["text"].apply(lambda x: remove_extra_white_spaces(x))

data.head()

Unnamed: 0,category,text
0,ham,go jurong point crazy avail onli bugi great wo...
1,ham,ok lar joke wif oni
2,spam,free entri two wkli comp win fa cup final tkt ...
3,ham,u dun say earli hor alreadi say
4,ham,nah nt think goe usf live around though
