In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
file_path = 'sms+spam+collection/SMSSpamCollection'

df = pd.read_csv(file_path, sep='\t', header=None, names=['label', 'message'])
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [5]:
df.isna().sum()

label      0
message    0
dtype: int64

In [6]:
len(df)

5572

# Preprocessing

In [7]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [8]:
# Changing spam label('spam') to 1 and non-spam label('ham') to 0
df['label'] = df['label'].replace({'spam': 1, 'ham': 0})
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
type(df)

pandas.core.frame.DataFrame

In [10]:
# converting messages to lowercases since for instance 'N' is computationally != 'n'
df['message'] = df['message'].apply(lambda a: a.lower())
df.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [11]:
# removing url's from messages
df['message'] = df['message'].str.replace(r'http\S+', '', regex=True)

df['message']

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                 will ü b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: message, Length: 5572, dtype: object

**Tokenization**
* splitting each message to individual words 

import nltk
from nltk.tokenize import word_tokenize

In [12]:
import nltk
from nltk.tokenize import word_tokenize

In [15]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\ATTAH KUMAH
[nltk_data]     MENSAH\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to C:\Users\ATTAH KUMAH
[nltk_data]     MENSAH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
def tokenize_message(messages):
    tokens = []

    tokenized_words = word_tokenize(messages)
    tokens.extend(tokenized_words)

    return tokens

df['tokenized'] = df['message'].apply(tokenize_message)


In [20]:
df.head(10)

Unnamed: 0,label,message,tokenized
0,0,"go until jurong point, crazy.. available only ...","[go, until, jurong, point, ,, crazy, .., avail..."
1,0,ok lar... joking wif u oni...,"[ok, lar, ..., joking, wif, u, oni, ...]"
2,1,free entry in 2 a wkly comp to win fa cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,u dun say so early hor... u c already then say...,"[u, dun, say, so, early, hor, ..., u, c, alrea..."
4,0,"nah i don't think he goes to usf, he lives aro...","[nah, i, do, n't, think, he, goes, to, usf, ,,..."
5,1,freemsg hey there darling it's been 3 week's n...,"[freemsg, hey, there, darling, it, 's, been, 3..."
6,0,even my brother is not like to speak with me. ...,"[even, my, brother, is, not, like, to, speak, ..."
7,0,as per your request 'melle melle (oru minnamin...,"[as, per, your, request, 'melle, melle, (, oru..."
8,1,winner!! as a valued network customer you have...,"[winner, !, !, as, a, valued, network, custome..."
9,1,had your mobile 11 months or more? u r entitle...,"[had, your, mobile, 11, months, or, more, ?, u..."
