In [28]:
import pandas as pd
# Load the tab-separated SMS dataset. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): read_csv's default quoting treats '"' as a quote character, which can
# merge lines when a message contains an unbalanced quote — confirm the row count
# against the raw file's line count.
messages = pd.read_csv(
    "data/SMSSpamCollection.txt",
    sep="\t",
    names=["label", "message"],
)


In [29]:
# Sanity check: (rows, columns) of the loaded frame.
messages.shape

(5572, 2)

In [30]:
# Preview the first rows to confirm the label/message columns parsed as expected.
messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [31]:
## Data cleaning & preprocessing
# `re` is used for character filtering; `nltk` supplies the stop-word list and stemmer.
import re
import nltk
# Fetch the stop-word corpus into the local nltk_data directory
# (reported as already up-to-date when it is present).
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bhaveshg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer instance, reused for every token during corpus cleaning.
ps = PorterStemmer()

In [33]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per message.
#
# Per message:
#   1. replace every character that is not an ASCII letter with a space,
#   2. lower-case and split on whitespace,
#   3. drop English stop words and Porter-stem what remains.

# The stop-word set is loop-invariant, so build it once instead of once per token.
stop_words = set(stopwords.words("english"))

corpus = []
# Iterate over the column values directly rather than looking rows up by label,
# so the loop does not depend on the frame having a default 0..n-1 index.
for message in messages["message"]:
    tokens = re.sub("[^a-zA-Z]", " ", message).lower().split()
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(stems))

corpus[:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [37]:
## Creating the Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
# max_features=100 caps the vocabulary at 100 terms (see the (5572, 100) matrix below);
# binary=True records presence/absence of a term instead of its count.
cv = CountVectorizer(max_features=100, binary=True)

In [38]:
# Learn the vocabulary and vectorise the corpus in a single pass; separate
# fit() and transform() calls would tokenise every document twice.
# toarray() densifies the result so it can be inspected directly.
X = cv.fit_transform(corpus).toarray()
X.shape


(5572, 100)

In [39]:
# Inspect the learned mapping from each term to its column index in X.
cv.vocabulary_

{'go': np.int64(22),
 'great': np.int64(25),
 'got': np.int64(24),
 'wat': np.int64(90),
 'ok': np.int64(56),
 'free': np.int64(18),
 'win': np.int64(94),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(67),
 'alreadi': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(28),
 'week': np.int64(92),
 'back': np.int64(3),
 'like': np.int64(38),
 'still': np.int64(73),
 'send': np.int64(69),
 'even': np.int64(15),
 'friend': np.int64(19),
 'prize': np.int64(62),
 'claim': np.int64(7),
 'call': np.int64(4),
 'mobil': np.int64(47),
 'co': np.int64(8),
 'home': np.int64(30),
 'want': np.int64(89),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(12),
 'repli': np.int64(64),
 'www': np.int64(96),
 'right': np.int64(65),
 'thank': np.int64(78),
 'take': np.int64(75),
 'time': np.int64(81),
 'use': np.int64(87),
 'messag': np.int64(44),
 'oh': np.int64(55),
 'ye': np.int64(97),
 'make': np.int64(42),
 'way': np.int64(91),
 'feel': np.int64(16),
 'dont': np.int64(14

In [42]:
## N-Gram model example: unigram baseline
# ngram_range=(1, 1) keeps single words only, which is CountVectorizer's default,
# so this reproduces the bag-of-words model above and serves as the reference
# point for the wider n-gram ranges.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=100, binary=True, ngram_range=(1, 1))
# fit_transform learns the vocabulary and vectorises in one pass over the corpus.
X = cv.fit_transform(corpus).toarray()

In [43]:
# Inspect the term -> column-index mapping learned with ngram_range=(1, 1).
cv.vocabulary_

{'go': np.int64(22),
 'great': np.int64(25),
 'got': np.int64(24),
 'wat': np.int64(90),
 'ok': np.int64(56),
 'free': np.int64(18),
 'win': np.int64(94),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(67),
 'alreadi': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(28),
 'week': np.int64(92),
 'back': np.int64(3),
 'like': np.int64(38),
 'still': np.int64(73),
 'send': np.int64(69),
 'even': np.int64(15),
 'friend': np.int64(19),
 'prize': np.int64(62),
 'claim': np.int64(7),
 'call': np.int64(4),
 'mobil': np.int64(47),
 'co': np.int64(8),
 'home': np.int64(30),
 'want': np.int64(89),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(12),
 'repli': np.int64(64),
 'www': np.int64(96),
 'right': np.int64(65),
 'thank': np.int64(78),
 'take': np.int64(75),
 'time': np.int64(81),
 'use': np.int64(87),
 'messag': np.int64(44),
 'oh': np.int64(55),
 'ye': np.int64(97),
 'make': np.int64(42),
 'way': np.int64(91),
 'feel': np.int64(16),
 'dont': np.int64(14

In [55]:
## N-Gram model example: unigrams + bigrams
# ngram_range=(1, 2) extracts single words and adjacent word pairs; the vocabulary
# cap is raised to 500 features for this model.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500, binary=True, ngram_range=(1, 2))
# fit_transform learns the vocabulary and vectorises in one pass over the corpus.
X = cv.fit_transform(corpus).toarray()

In [56]:
# Inspect the term -> column-index mapping learned with ngram_range=(1, 2).
cv.vocabulary_

{'go': np.int64(155),
 'point': np.int64(332),
 'great': np.int64(165),
 'world': np.int64(483),
 'got': np.int64(163),
 'wat': np.int64(463),
 'ok': np.int64(297),
 'lar': np.int64(216),
 'wif': np.int64(473),
 'free': np.int64(142),
 'entri': np.int64(126),
 'win': np.int64(475),
 'final': np.int64(135),
 'st': np.int64(399),
 'may': np.int64(252),
 'text': np.int64(414),
 'receiv': np.int64(353),
 'question': np.int64(344),
 'txt': np.int64(445),
 'rate': np.int64(346),
 'appli': np.int64(18),
 'dun': np.int64(115),
 'say': np.int64(367),
 'earli': np.int64(117),
 'alreadi': np.int64(9),
 'think': np.int64(420),
 'goe': np.int64(157),
 'live': np.int64(234),
 'around': np.int64(20),
 'though': np.int64(422),
 'hey': np.int64(187),
 'week': np.int64(466),
 'word': np.int64(481),
 'back': np.int64(29),
 'like': np.int64(230),
 'fun': np.int64(149),
 'still': np.int64(402),
 'xxx': np.int64(490),
 'send': np.int64(375),
 'even': np.int64(127),
 'brother': np.int64(46),
 'speak': np.int