In [1]:
import nltk
import numpy as np
import pandas as pd

In [2]:
# Load data in a dataframe
dt = pd.read_csv('SPAM-210331-134237.csv')

# Snapshot of the data - 10 items
dt.head(10)

Unnamed: 0,type,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# Normalization: Mapping 'spam' to 1 (int) and 'ham' to 0 (int)
dt['spam'] = dt['type'].map({'spam':1, 'ham':0}).astype(int)

dt.head() # Snapshot of updated dataframe

Unnamed: 0,type,text,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
print('Columns in the given data:')
for col in dt.columns:
    print(col)

Columns in the given data:
type
text
spam


In [5]:
type_len = len(dt['type'])
print('Number of rows in the review column:', type_len)

text_len = len(dt['text'])
print('Number of rows in the liked column:', text_len)

Number of rows in the review column: 116
Number of rows in the liked column: 116


## 2. Tokenization

In [6]:
dt['text'][1] # before

'Ok lar... Joking wif u oni...'

In [7]:
def tokenizer(text):
    return text.split()

In [8]:
dt['text'] = dt['text'].apply(tokenizer)

In [9]:
dt['text'][1] # after

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

## 3. Stemming

In [10]:
dt['text'][1] # before

['Ok', 'lar...', 'Joking', 'wif', 'u', 'oni...']

In [11]:
from nltk.stem.snowball import SnowballStemmer
porter = SnowballStemmer('english', ignore_stopwords = False)

In [12]:
def stem_it(text):
    return [porter.stem(word) for word in text]

In [14]:
dt['text'] = dt['text'].apply(stem_it)

In [15]:
dt['text'][1] # after stemming

['ok', 'lar...', 'joke', 'wif', 'u', 'oni...']

## 4. Lemmitization

In [24]:
dt['text'][92] # before

['smile',
 'in',
 'pleasur',
 'smile',
 'in',
 'pain',
 'smile',
 'when',
 'troubl',
 'pour',
 'like',
 'rain',
 'smile',
 'when',
 'sum1',
 'hurt',
 'u',
 'smile',
 'becoz',
 'someon',
 'still',
 'love',
 'to',
 'see',
 'u',
 'smiling!!']

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [18]:
def lemmatize_it(text):
    return [lemmatizer.lemmatize(word, pos = 'a') for word in text]

In [20]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aman.singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [21]:
dt['text'] = dt['text'].apply(lemmatize_it)

In [22]:
dt['text'][92] # after

['smile',
 'in',
 'pleasur',
 'smile',
 'in',
 'pain',
 'smile',
 'when',
 'troubl',
 'pour',
 'like',
 'rain',
 'smile',
 'when',
 'sum1',
 'hurt',
 'u',
 'smile',
 'becoz',
 'someon',
 'still',
 'love',
 'to',
 'see',
 'u',
 'smiling!!']

## 5. Stopword Removal

In [25]:
dt['text'][34] # before

['thank',
 'for',
 'your',
 'subscript',
 'to',
 'rington',
 'uk',
 'your',
 'mobil',
 'will',
 'be',
 'charg',
 '£5/month',
 'pleas',
 'confirm',
 'by',
 'repli',
 'yes',
 'or',
 'no.',
 'if',
 'you',
 'repli',
 'no',
 'you',
 'will',
 'not',
 'be',
 'charg']

In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aman.singh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [29]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [30]:
def stop_it(text):
    review = [word for word in text if not word in stop_words]
    return review

In [31]:
dt['text'] = dt['text'].apply(stop_it)

In [32]:
dt['text'][34] # after

['thank',
 'subscript',
 'rington',
 'uk',
 'mobil',
 'charg',
 '£5/month',
 'pleas',
 'confirm',
 'repli',
 'yes',
 'no.',
 'repli',
 'charg']

In [33]:
dt.head(10)

Unnamed: 0,type,text,spam
0,ham,"[go, jurong, point,, crazy.., avail, onli, bug...",0
1,ham,"[ok, lar..., joke, wif, u, oni...]",0
2,spam,"[free, entri, 2, wkli, comp, win, fa, cup, fin...",1
3,ham,"[u, dun, say, earli, hor..., u, c, alreadi, sa...",0
4,ham,"[nah, think, goe, usf,, live, around, though]",0
5,spam,"[freemsg, hey, darl, 3, week, word, back!, i'd...",1
6,ham,"[even, brother, like, speak, me., treat, like,...",0
7,ham,"[per, request, mell, mell, (oru, minnaminungin...",0
8,spam,"[winner!!, valu, network, custom, select, rece...",1
9,spam,"[mobil, 11, month, more?, u, r, entitl, updat,...",1
