### Text Cleaning in Python

In [1]:
# Build a small corpus of example sentences to run the cleaning steps on.
# The sentences contain mixed casing, punctuation, underscores and '#'
# characters.
raw_docs = [
    "I am writing some very basic english sentences",
    "I'm just writing it for the demo PURPOSE to make audience understand the basics .",
    "The point is to _learn HOW it works_ on #simple # data.",
]

print(raw_docs)

['I am writing some very basic english sentences', "I'm just writing it for the demo PURPOSE to make audience understand the basics .", 'The point is to _learn HOW it works_ on #simple # data.']


### Step:1- Convert to lower case

In [2]:
# Lower-case every document so that tokens differing only in case
# (e.g. "PURPOSE" / "purpose") become identical.
raw_docs2 = list(map(str.lower, raw_docs))

print(raw_docs2)

['i am writing some very basic english sentences', "i'm just writing it for the demo purpose to make audience understand the basics .", 'the point is to _learn how it works_ on #simple # data.']


### Step:2- Tokenization

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
# Word tokenization: one list of tokens per lower-cased document.
word_tokenized_docs = list(map(word_tokenize, raw_docs2))
print(word_tokenized_docs)

# Visual separator between the two printed results.
print("#"*100)

# Sentence tokenization: one list of sentences per lower-cased document.
sent_tokenized_docs = list(map(sent_tokenize, raw_docs2))
print(sent_tokenized_docs)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', "'m", 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics', '.'], ['the', 'point', 'is', 'to', '_learn', 'how', 'it', 'works_', 'on', '#', 'simple', '#', 'data', '.']]
####################################################################################################
[['i am writing some very basic english sentences'], ["i'm just writing it for the demo purpose to make audience understand the basics ."], ['the point is to _learn how it works_ on #simple # data.']]


### Step:3- Punctuation Removal

In [7]:
import re
import string

In [9]:
# Character class matching any single character from string.punctuation
# (escaped so the characters are taken literally inside the brackets).
regex = re.compile('[%s]' % re.escape(string.punctuation))

# For every document, strip punctuation characters out of each token and
# drop tokens that end up empty (i.e. tokens made only of punctuation).
tokenized_docs_no_punctuation = [
    [cleaned
     for cleaned in (regex.sub(u'', token) for token in doc_tokens)
     if cleaned]
    for doc_tokens in word_tokenized_docs
]

print(tokenized_docs_no_punctuation)

[['i', 'am', 'writing', 'some', 'very', 'basic', 'english', 'sentences'], ['i', 'm', 'just', 'writing', 'it', 'for', 'the', 'demo', 'purpose', 'to', 'make', 'audience', 'understand', 'the', 'basics'], ['the', 'point', 'is', 'to', 'learn', 'how', 'it', 'works', 'on', 'simple', 'data']]


### Step:4- Removing Stopwords

In [10]:
from nltk.corpus import stopwords

In [11]:
# Fetch the English stopword list once and keep it as a set. The original
# called stopwords.words('english') for every single token, which repeats
# loop-invariant work and then does a linear scan of the resulting list;
# a set built once gives the same membership answers with constant-time
# lookups.
english_stopwords = set(stopwords.words('english'))

# Keep, document by document, only the tokens that are not stopwords.
tokenized_docs_no_stopwords = []

for doc in tokenized_docs_no_punctuation:
    new_term_vector = [word for word in doc if word not in english_stopwords]
    tokenized_docs_no_stopwords.append(new_term_vector)

print(tokenized_docs_no_stopwords)

[['writing', 'basic', 'english', 'sentences'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basics'], ['point', 'learn', 'works', 'simple', 'data']]


### Step:5- Stemming and Lemmatization

In [12]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [13]:
# Instantiate both normalizers; only the WordNet lemmatizer is applied below.
porter = PorterStemmer()
wordnet = WordNetLemmatizer()

# Lemmatize every remaining token (no part-of-speech argument is passed),
# keeping one list per document. To stem instead, replace
# wordnet.lemmatize(token) with porter.stem(token).
preprocessed_docs = [
    [wordnet.lemmatize(token) for token in doc_tokens]
    for doc_tokens in tokenized_docs_no_stopwords
]

print(preprocessed_docs)

[['writing', 'basic', 'english', 'sentence'], ['writing', 'demo', 'purpose', 'make', 'audience', 'understand', 'basic'], ['point', 'learn', 'work', 'simple', 'data']]
