In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk

In [3]:
#nltk.download()

# NLP PIPELINE

Step 1 : <b>Raw Text</b> => Model can't distinguish words

Step 2 : <b>Tokenize</b> => Tell the Model what to look at.

Step 3 : <b>Clean Text</b> => Removal of stop words / Punchuation / Stemming. etc.

Step 4 : <b>Vectorize</b> => Converting word to numeric form.

Step 5 : <b>ML Classifier Model</b> => Apply ML classification algo to fit/train model

# Video 1

## Testing NLTK package with stopwords as example.

In [4]:
from nltk.corpus import stopwords
stpwds = stopwords.words('english')

print("Total Stopword in English : {}".format(len(stpwds)))

Total Stopword in English : 179


In [5]:
stpwds[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [6]:
stpwds[0:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

# Video 2

## Reading in text data & why do we need to clean the text data?

#### Read in semi-structured text data

In [7]:
# Reading the raw text
rawData = open("SMSSpamCollection.tsv").read()

# Printing the raw data
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [8]:
# so data is tab seperated and also there a multiple line replacing tab with new line.
parsedData = rawData.replace('\t','\n').split("\n")

In [9]:
parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [10]:
labelList = parsedData[0::2] 

In [11]:
labelList[0:5]

['ham', 'spam', 'ham', 'ham', 'ham']

In [12]:
textList = parsedData[1::2]

In [13]:
textList[0:5]

["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 "Nah I don't think he goes to usf, he lives around here though",
 'Even my brother is not like to speak with me. They treat me like aids patent.',
 'I HAVE A DATE ON SUNDAY WITH WILL!!']

In [14]:
# Checking the created list size
print("labelList size => {}".format(len(labelList)))
print("textList size => {}".format(len(textList)))

labelList size => 5571
textList size => 5570


### Note: 
    Since the size of both list is not same it will give us error while creating the Dataframe
    
    So, checking tail value of labelList

In [15]:
labelList[-5:]

['ham', 'ham', 'ham', 'ham', '']

In [16]:
# we need to get rid of last element from labelList while creating the Dataframe

dataframe = pd.DataFrame({
    "LabelList":labelList[:-1],
    "TextList":textList
}) 

In [17]:
dataframe.head()

Unnamed: 0,LabelList,TextList
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### All the above tranformation can be done via Pandas pd.read_csv()  

In [18]:
full_corpus_df = pd.read_csv("SMSSpamCollection.tsv",sep="\t",header=None)

In [19]:
full_corpus_df.columns = ['label','body_text']
full_corpus_df.head(5)

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


# Video 3

## Explore the Dataset

#### Since dataframe is created, now need to do data exploration to find insights from the data. 

In [20]:
# What is the shape of the data 
print("Shape of our Dataframe => {}".format(full_corpus_df.shape))

Shape of our Dataframe => (5568, 2)


In [21]:
# Metadata about dataframe 
print(full_corpus_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5568 entries, 0 to 5567
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      5568 non-null   object
 1   body_text  5568 non-null   object
dtypes: object(2)
memory usage: 87.1+ KB
None


#### Note:
    1. Both Columns are of type Object
    2. No Null entries in dataset

In [22]:
print(full_corpus_df.describe())

       label               body_text
count   5568                    5568
unique     2                    5165
top      ham  Sorry, I'll call later
freq    4822                      30


#### Note:
    1. Label column is having 2 uniqe value, whereas body_column is not having any unique value.
    2. 4822 times ham value is found.
    3. Sorry, I'll call later is repeated 30 times.

In [23]:
full_corpus_df['label'].value_counts()

ham     4822
spam     746
Name: label, dtype: int64

# Video 4

## Implementing a pipeline to clean text

### <u>Pre-Processing text data</u>

 - Cleaning of the text data is necessary to highlight atributes that you are going to want your ML System to pick up on.
- Cleaning or (Pre-processing) the data typically consists of a number of steps:

    1.Remove Punctuation 
    2.Tokenization
    3.Remove Stopwords
    4.Lemmatization / Stemming

In [24]:
pd.set_option('display.max_colwidth',100)

#### Note: To remove punctuation we can use nltks regexTokenizer or String class in python

In [25]:
import re
import string

In [26]:
def remove_punctuation(text):
    return "".join([char for char in text if char not in string.punctuation])

In [27]:
full_corpus_df['clean_body_text'] = full_corpus_df.body_text.apply(lambda sentence: remove_punctuation(sentence))

In [28]:
full_corpus_df.head(5)

Unnamed: 0,label,body_text,clean_body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


# Video 5

### Tokenization performed on clean_body_text

In [29]:
from nltk import tokenize
full_corpus_df['tokenize_clean_body_text'] = full_corpus_df.clean_body_text.apply(lambda sentence : sentence.split(" "))

In [31]:
full_corpus_df.head(5)

Unnamed: 0,label,body_text,clean_body_text,tokenize_clean_body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[Ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, I, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[Even, my, brother, is, not, like, to, speak, with, me, They, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]"


# Video 6

## StopWords Removal

### Removed StopWords from tokenize_clean_body_te

In [33]:
def remove_stopwords(token_ls):
    return [word for word in token_ls if word not in stpwds]
    
full_corpus_df['text_without_stpwrds'] =  full_corpus_df.tokenize_clean_body_text.apply(lambda ls : remove_stopwords(ls))

In [34]:
full_corpus_df.head()

Unnamed: 0,label,body_text,clean_body_text,tokenize_clean_body_text,text_without_stpwrds
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[Ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, I, promise, ...","[Ive, searching, right, words, thank, breather, I, promise, wont, take, help, granted, fulfil, p..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[Free, entry, in, 2, a, wkly, comp, to, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, to...","[Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[Nah, I, dont, think, he, goes, to, usf, he, lives, around, here, though]","[Nah, I, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[Even, my, brother, is, not, like, to, speak, with, me, They, treat, me, like, aids, patent]","[Even, brother, like, speak, They, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]","[I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]"


# Video 7

### Performing Stemming / Lemmatization

#### Will create 2 dataframe -> Stemming_DF & Lemma_DF

In [44]:
def perform_stemming_on_text(text_ls):
    ps = nltk.PorterStemmer()
    return [ps.stem(word) for word in text_ls]

In [42]:
full_corpus_df['stemmed_data'] = full_corpus_df.text_without_stpwrds.apply(lambda ls:perform_stemming_on_text(ls))
stemming_df = full_corpus_df.iloc[:,[0,-1]]

In [43]:
stemming_df.head(5)

Unnamed: 0,label,stemmed_data
0,ham,"[ive, search, right, word, thank, breather, I, promis, wont, take, help, grant, fulfil, promis, ..."
1,spam,"[free, entri, 2, wkli, comp, win, FA, cup, final, tkt, 21st, may, 2005, text, FA, 87121, receiv,..."
2,ham,"[nah, I, dont, think, goe, usf, live, around, though]"
3,ham,"[even, brother, like, speak, they, treat, like, aid, patent]"
4,ham,"[I, have, A, date, ON, sunday, with, will]"


In [45]:
def perform_lemmatizing_on_text(text_ls):
    lemma = nltk.WordNetLemmatizer()
    return [lemma.lemmatize(word) for word in text_ls]

In [46]:
lemma_df = full_corpus_df.text_without_stpwrds.apply(lambda ls: perform_lemmatizing_on_text(ls))
lemma_df.head(5)

0    [Ive, searching, right, word, thank, breather, I, promise, wont, take, help, granted, fulfil, pr...
1    [Free, entry, 2, wkly, comp, win, FA, Cup, final, tkts, 21st, May, 2005, Text, FA, 87121, receiv...
2                                                   [Nah, I, dont, think, go, usf, life, around, though]
3                                           [Even, brother, like, speak, They, treat, like, aid, patent]
4                                                             [I, HAVE, A, DATE, ON, SUNDAY, WITH, WILL]
Name: text_without_stpwrds, dtype: object